1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2000-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: ucnvmbcs.cpp
12 * tab size: 8 (not used)
15 * created on: 2000jul03
16 * created by: Markus W. Scherer
18 * The current code in this file replaces the previous implementation
19 * of conversion code from multi-byte codepages to Unicode and back.
20 * This implementation supports the following:
21 * - legacy variable-length codepages with up to 4 bytes per character
22 * - all Unicode code points (up to 0x10ffff)
23 * - efficient distinction of unassigned vs. illegal byte sequences
24 * - it is possible in fromUnicode() to directly deal with simple
25 * stateful encodings (used for EBCDIC_STATEFUL)
26 * - it is possible to convert Unicode code points
27 * to a single zero byte (but not as a fallback except for SBCS)
29 * Remaining limitations in fromUnicode:
30 * - byte sequences must not have leading zero bytes
31 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
32 * - limitation to up to 4 bytes per character
34 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
35 * limitations and adds m:n character mappings and other features.
36 * See ucnv_ext.h for details.
40 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
41 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
42 * macros to ucnvmbcs.h file
45 #include "unicode/utypes.h"
47 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
49 #include "unicode/ucnv.h"
50 #include "unicode/ucnv_cb.h"
51 #include "unicode/udata.h"
52 #include "unicode/uset.h"
53 #include "unicode/utf8.h"
54 #include "unicode/utf16.h"
64 /* control optimizations according to the platform */
65 #define MBCS_UNROLL_SINGLE_TO_BMP 1
66 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
69 * _MBCSHeader versions 5.3 & 4.3
70 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
72 * This version is optional. Version 5 is used for incompatible data format changes.
73 * makeconv will continue to generate version 4 files if possible.
75 * Changes from version 4:
77 * The main difference is an additional _MBCSHeader field with
78 * - the length (number of uint32_t) of the _MBCSHeader
79 * - flags for further incompatible data format changes
80 * - flags for further, backward compatible data format changes
82 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
83 * the file and needs to be reconstituted at load time.
84 * This requires a utf8Friendly format with an additional mbcsIndex table for fast
85 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
86 * (For details about these structures see below, and see ucnvmbcs.h.)
88 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
89 * of the Unicode code points. (This requires that the .ucm file has the |0 etc.
90 * precision markers for all mappings.)
92 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
93 * omitted data that can be reconstituted from the toUnicode data.
95 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
96 * With only roundtrip mappings in the base fromUnicode data, this part is fully
97 * redundant with the mbcsIndex and will be reconstituted from that (also using the
98 * stage 1 table which contains the information about how stage 2 was compacted).
100 * The rest of the stage 2 table, the part for code points above maxFastUChar,
101 * is stored in the file and will be appended to the reconstituted part.
103 * The entire fromUBytes array is omitted from the file and will be reconstituted.
104 * This is done by enumerating all toUnicode roundtrip mappings, performing
105 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
106 * writing instead of reading the byte values.
108 * _MBCSHeader version 4.3
110 * Change from version 4.2:
111 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
112 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
113 * files which can be used instead of stages 1 & 2.
114 * Faster lookups for roundtrips from most commonly used characters,
115 * and lookups from UTF-8 byte sequences with a natural bit distribution.
116 * See ucnvmbcs.h for more details.
118 * Change from version 4.1:
119 * - Added an optional extension table structure at the end of the .cnv file.
120 * It is present if the upper bits of the header flags field contains a non-zero
122 * Files that contain only a conversion table and no base table
123 * use the special outputType MBCS_OUTPUT_EXT_ONLY.
124 * These contain the base table name between the MBCS header and the extension
127 * Change from version 4.0:
128 * - Replace header.reserved with header.fromUBytesLength so that all
129 * fields in the data have length.
131 * Changes from version 3 (for performance improvements):
132 * - new bit distribution for state table entries
133 * - reordered action codes
134 * - new data structure for single-byte fromUnicode
135 * + stage 2 only contains indexes
136 * + stage 3 stores 16 bits per character with classification bits 15..8
137 * - no multiplier for stage 1 entries
138 * - stage 2 for non-single-byte codepages contains the index and the flags in
140 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
142 * For more details about old versions of the MBCS data structure, see
143 * the corresponding versions of this file.
145 * Converting stateless codepage data ---------------------------------------***
146 * (or codepage data with simple states) to Unicode.
148 * Data structure and algorithm for converting from complex legacy codepages
149 * to Unicode. (Designed before 2000-may-22.)
151 * The basic idea is that the structure of legacy codepages can be described
153 * When reading a byte stream, each input byte causes a state transition.
154 * Some transitions result in the output of a code point, some result in
155 * "unassigned" or "illegal" output.
156 * This is used here for character conversion.
158 * The data structure begins with a state table consisting of a row
159 * per state, with 256 entries (columns) per row for each possible input
161 * Each entry is 32 bits wide, with two formats distinguished by
162 * the sign bit (bit 31):
164 * One format for transitional entries (bit 31 not set) for non-final bytes, and
165 * one format for final entries (bit 31 set).
166 * Both formats contain the number of the next state in the same bit
168 * State 0 is the initial state.
170 * Most of the time, the offset values of subsequent states are added
171 * up to a scalar value. This value will eventually be the index of
172 * the Unicode code point in a table that follows the state table.
173 * The effect is that the code points for final state table rows
174 * are contiguous. The code points of final state rows follow each other
175 * in the order of the references to those final states by previous
178 * For some terminal states, the offset is itself the output Unicode
179 * code point (16 bits for a BMP code point, or 20 bits for a supplementary
180 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
181 * For others, the code point in the Unicode table is stored with either
182 * one or two code units: one for BMP code points, two for a pair of
184 * All code points for a final state entry take up the same number of code
185 * units, regardless of whether they all actually _use_ the same number
186 * of code units. This is necessary for simple array access.
188 * An additional feature comes in with what in ICU is called "fallback"
191 * In addition to round-trippable, precise, 1:1 mappings, there are often
192 * mappings defined between similar, though not the same, characters.
193 * Typically, such mappings occur only in fromUnicode mapping tables because
194 * Unicode has a superset repertoire of most other codepages. However, it
195 * is possible to provide such mappings in the toUnicode tables, too.
196 * In this case, the fallback mappings are partly integrated into the
197 * general state tables because the structure of the encoding includes their
199 * For final entries in an initial state, fallback mappings are stored in
200 * the entry itself like with roundtrip mappings.
201 * For other final entries, they are stored in the code units table if
202 * the entry is for a pair of code units.
203 * For single-unit results in the code units table, there is no space to
204 * alternatively hold a fallback mapping; in this case, the code unit
205 * is stored as U+fffe (unassigned), and the fallback mapping needs to
206 * be looked up by the scalar offset value in a separate table.
208 * "Unassigned" state entries really mean "structurally unassigned",
209 * i.e., such a byte sequence will never have a mapping result.
211 * The interpretation of the bits in each entry is as follows:
213 * Bit 31 not set, not a terminal entry ("transitional"):
215 * 23..0 offset delta, to be added up
217 * Bit 31 set, terminal ("final") entry:
218 * 30..24 next state (regardless of action code)
219 * 23..20 action code:
220 * action codes 0 and 1 result in precise-mapping Unicode code points
221 * 0 valid byte sequence
223 * 15..0 16-bit Unicode BMP code point
224 * never U+fffe or U+ffff
225 * 1 valid byte sequence
226 * 19..0 20-bit Unicode supplementary code point
227 * never U+fffe or U+ffff
229 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
230 * 2 valid byte sequence (fallback)
232 * 15..0 16-bit Unicode BMP code point as fallback result
233 * 3 valid byte sequence (fallback)
234 * 19..0 20-bit Unicode supplementary code point as fallback result
236 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
237 * depending on the code units they result in
238 * 4 valid byte sequence
240 * 8..0 final offset delta
241 * pointing to one 16-bit code unit which may be
242 * fffe unassigned -- look for a fallback for this offset
244 * 5 valid byte sequence
246 * 8..0 final offset delta
247 * pointing to two 16-bit code units
248 * (typically UTF-16 surrogates)
249 * the result depends on the first code unit as follows:
250 * 0000..d7ff roundtrip BMP code point (1st alone)
251 * d800..dbff roundtrip surrogate pair (1st, 2nd)
252 * dc00..dfff fallback surrogate pair (1st-400, 2nd)
253 * e000 roundtrip BMP code point (2nd alone)
254 * e001 fallback BMP code point (2nd alone)
257 * (the final offset deltas are at most 255 * 2,
258 * times 2 because of storing code unit pairs)
260 * 6 unassigned byte sequence
262 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
263 * this does not contain a final offset delta because the main
264 * purpose of this action code is to save scalar offset values;
265 * therefore, fallback values cannot be assigned to byte
266 * sequences that result in this action code
267 * 7 illegal byte sequence
269 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
270 * 8 state change only
272 * useful for state changes in simple stateful encodings,
273 * at Shift-In/Shift-Out codes
276 * 9..15 reserved for future use
277 * current implementations will only perform a state change
278 * and ignore bits 19..0
280 * An encoding with contiguous ranges of unassigned byte sequences, like
281 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
282 * at least two states for the trail bytes:
283 * One trail byte state that results in code points, and one that only
284 * has "unassigned" and "illegal" terminal states.
286 * Note: partly by accident, this data structure supports simple stateful
287 * encodings without any additional logic.
288 * Currently, only simple Shift-In/Shift-Out schemes are handled with
289 * appropriate state tables (especially EBCDIC_STATEFUL!).
291 * MBCS version 2 added:
292 * unassigned and illegal action codes have U+fffe and U+ffff
293 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
295 * Converting from Unicode to codepage bytes --------------------------------***
297 * The conversion data structure for fromUnicode is designed for the known
298 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
299 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
300 * a roundtrip mapping.
302 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
303 * like in the character properties table.
304 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
305 * with the resulting bytes is at offsetFromUBytes.
307 * Beginning with version 4, single-byte codepages have a significantly different
308 * trie compared to other codepages.
309 * In all cases, the entry in stage 1 is directly the index of the block of
310 * 64 entries in stage 2.
312 * Single-byte lookup:
314 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
315 * Stage 3 contains one 16-bit word per result:
316 * Bits 15..8 indicate the kind of result:
318 * c fallback result from private-use code point
319 * 8 fallback result from other code points
321 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
323 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
324 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
325 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
326 * ASCII code points can be looked up with a linear array access into stage 3.
327 * See maxFastUChar and other details in ucnvmbcs.h.
331 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
332 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
333 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
334 * If this test is false, then a non-zero result will be interpreted as
335 * a fallback mapping.
336 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
338 * Stage 3 contains 2, 3, or 4 bytes per result.
339 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
340 * while 3 bytes are stored as bytes in big-endian order.
341 * Leading zero bytes are ignored, and the number of bytes is counted.
342 * A zero byte mapping result is possible as a roundtrip result.
343 * For some output types, the actual result is processed from this;
344 * see ucnv_MBCSFromUnicodeWithOffsets().
346 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
347 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
349 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
350 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
351 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
352 * ASCII code points can be looked up with a linear array access into stage 3.
353 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
355 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
357 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
358 * may overlap by any number of entries.
360 * MBCS version 2 added:
361 * the converter checks for known output types, which allows
362 * adding new ones without crashing an unaware converter
366 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
367 * consecutive sequences of bytes, starting from the one encoded in value,
368 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
369 * Does not currently support m:n mappings or reverse fallbacks.
370 * This function will not be called for sequences of bytes with leading zeros.
372 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
373 * @param value contains 1..4 bytes of the first byte sequence, right-aligned
374 * @param codePoints resulting Unicode code points, or negative if a byte sequence does
375 * not map to anything
376 * @return TRUE to continue enumeration, FALSE to stop
/*
 * Function type of the enumeration callback: it takes the opaque context
 * pointer, a uint32_t value and an array of 32 UChar32 code points, and
 * returns a UBool.
 */
378 typedef UBool U_CALLCONV
379 UConverterEnumToUCallback(const void *context
, uint32_t value
, UChar32 codePoints
[32]);
/*
 * Forward declarations of file-local (static) converter functions.
 * Some of them (ucnv_MBCSGetNextUChar, ucnv_MBCSGetStarters,
 * ucnv_MBCSGetUnicodeSet) are referenced by the UConverterImpl tables that
 * follow these declarations.
 */
381 static void U_CALLCONV
382 ucnv_MBCSLoad(UConverterSharedData
*sharedData
,
383 UConverterLoadArgs
*pArgs
,
385 UErrorCode
*pErrorCode
);
387 static void U_CALLCONV
388 ucnv_MBCSUnload(UConverterSharedData
*sharedData
);
390 static void U_CALLCONV
391 ucnv_MBCSOpen(UConverter
*cnv
,
392 UConverterLoadArgs
*pArgs
,
393 UErrorCode
*pErrorCode
);
395 static UChar32 U_CALLCONV
396 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
397 UErrorCode
*pErrorCode
);
399 static void U_CALLCONV
400 ucnv_MBCSGetStarters(const UConverter
* cnv
,
402 UErrorCode
*pErrorCode
);
405 static const char* U_CALLCONV
406 ucnv_MBCSGetName(const UConverter
*cnv
);
409 static void U_CALLCONV
410 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs
*pArgs
,
412 UErrorCode
*pErrorCode
);
/*
 * NOTE(review): ucnv_MBCSGetNextUChar is already declared earlier in this
 * list with the same signature; this second declaration is redundant.
 */
414 static UChar32 U_CALLCONV
415 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
416 UErrorCode
*pErrorCode
);
418 static void U_CALLCONV
419 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
420 UConverterToUnicodeArgs
*pToUArgs
,
421 UErrorCode
*pErrorCode
);
423 static void U_CALLCONV
424 ucnv_MBCSGetUnicodeSet(const UConverter
*cnv
,
426 UConverterUnicodeSet which
,
427 UErrorCode
*pErrorCode
);
429 static void U_CALLCONV
430 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
431 UConverterToUnicodeArgs
*pToUArgs
,
432 UErrorCode
*pErrorCode
);
/*
 * UConverterImpl function table named _SBCSUTF8Impl.
 * The entries visible here are the ucnv_MBCS* to-Unicode, from-Unicode,
 * get-next-UChar, get-starters and get-Unicode-set functions.
 * NOTE(review): the remaining slots of the initializer are not visible in
 * this view; confirm them against the full source.
 */
434 static const UConverterImpl _SBCSUTF8Impl
={
444 ucnv_MBCSToUnicodeWithOffsets
,
445 ucnv_MBCSToUnicodeWithOffsets
,
446 ucnv_MBCSFromUnicodeWithOffsets
,
447 ucnv_MBCSFromUnicodeWithOffsets
,
448 ucnv_MBCSGetNextUChar
,
450 ucnv_MBCSGetStarters
,
454 ucnv_MBCSGetUnicodeSet
,
/*
 * UConverterImpl function table named _DBCSUTF8Impl.
 * The entries visible here are the ucnv_MBCS* to-Unicode, from-Unicode,
 * get-next-UChar, get-starters and get-Unicode-set functions.
 * NOTE(review): the remaining slots of the initializer are not visible in
 * this view; confirm them against the full source.
 */
460 static const UConverterImpl _DBCSUTF8Impl
={
470 ucnv_MBCSToUnicodeWithOffsets
,
471 ucnv_MBCSToUnicodeWithOffsets
,
472 ucnv_MBCSFromUnicodeWithOffsets
,
473 ucnv_MBCSFromUnicodeWithOffsets
,
474 ucnv_MBCSGetNextUChar
,
476 ucnv_MBCSGetStarters
,
480 ucnv_MBCSGetUnicodeSet
,
/*
 * UConverterImpl function table named _MBCSImpl; _MBCSData stores its
 * address.
 * The entries visible here are the ucnv_MBCS* to-Unicode, from-Unicode,
 * get-next-UChar, get-starters and get-Unicode-set functions.
 * NOTE(review): the remaining slots of the initializer are not visible in
 * this view; confirm them against the full source.
 */
486 static const UConverterImpl _MBCSImpl
={
496 ucnv_MBCSToUnicodeWithOffsets
,
497 ucnv_MBCSToUnicodeWithOffsets
,
498 ucnv_MBCSFromUnicodeWithOffsets
,
499 ucnv_MBCSFromUnicodeWithOffsets
,
500 ucnv_MBCSGetNextUChar
,
502 ucnv_MBCSGetStarters
,
506 ucnv_MBCSGetUnicodeSet
,
511 /* Static data is in tools/makeconv/ucnvstat.c for data-based
512 * converters. Be sure to update it as well.
/*
 * Non-static UConverterSharedData object for the MBCS converter type.
 * Its initializer starts with sizeof(UConverterSharedData), stores the address
 * of the _MBCSImpl function table, and uses UCNV_MBCS_TABLE_INITIALIZER for
 * the embedded table.
 * NOTE(review): the meaning of the 1/NULL/FALSE/TRUE/0 fields is defined by
 * the UConverterSharedData declaration, which is not visible here.
 */
515 const UConverterSharedData _MBCSData
={
516 sizeof(UConverterSharedData
), 1,
517 NULL
, NULL
, FALSE
, TRUE
, &_MBCSImpl
,
518 0, UCNV_MBCS_TABLE_INITIALIZER
522 /* GB 18030 data ------------------------------------------------------------ */
524 /* helper macros for linear values for GB 18030 four-byte sequences */
/*
 * Combine the four byte values (a, b, c, d) of a GB 18030 four-byte sequence
 * into a single linear integer, using the multipliers 10, 126 and 10.
 * Every argument is parenthesized, so arbitrary expressions may be passed.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

/* Linear value of the four-byte sequence 81 30 81 30. */
#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * Linear value of a four-byte sequence packed into one 32-bit integer
 * (first byte in bits 31..24, last byte in bits 7..0).
 * The argument is parenthesized at every use so that an expression such as
 * LINEAR(hi|lo) is split into bytes correctly; without the parentheses,
 * operator precedence would apply the shifts and masks to only part of it.
 */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
532 * Some ranges of GB 18030 where both the Unicode code points and the
533 * GB four-byte sequences are contiguous and are handled algorithmically by
534 * the special callback functions below.
535 * The values are start & end of Unicode & GB codes.
537 * Note that single surrogates are not mapped by GB 18030
538 * as of the re-released mapping tables from 2000-nov-30.
/*
 * Table of 14 rows with 4 values each:
 *   [0] first Unicode code point of the range
 *   [1] last Unicode code point of the range
 *   [2] LINEAR() value of the first GB 18030 four-byte sequence
 *   [3] LINEAR() value of the last GB 18030 four-byte sequence
 * The first row covers the supplementary code points 0x10000..0x10FFFF.
 */
540 static const uint32_t
541 gb18030Ranges
[14][4]={
542 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
543 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
544 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
545 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
546 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
547 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
548 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
549 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
550 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
551 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
552 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
553 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
554 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
555 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
558 /* bit flag for UConverter.options indicating GB 18030 special handling */
559 #define _MBCS_OPTION_GB18030 0x8000
561 /* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
562 #define _MBCS_OPTION_KEIS 0x01000
563 #define _MBCS_OPTION_JEF 0x02000
564 #define _MBCS_OPTION_JIPS 0x04000
/*
 * Byte values that getSISOBytes() writes for each of the three options above.
 * Presumably SO = Shift-Out and SI = Shift-In -- confirm against the
 * SISO_Option enum. KEIS and JIPS use two bytes each, JEF uses one byte.
 */
/* KEIS */
566 #define KEIS_SO_CHAR_1 0x0A
567 #define KEIS_SO_CHAR_2 0x42
568 #define KEIS_SI_CHAR_1 0x0A
569 #define KEIS_SI_CHAR_2 0x41
/* JEF */
571 #define JEF_SO_CHAR 0x28
572 #define JEF_SI_CHAR 0x29
/* JIPS */
574 #define JIPS_SO_CHAR_1 0x1A
575 #define JIPS_SO_CHAR_2 0x70
576 #define JIPS_SI_CHAR_1 0x1A
577 #define JIPS_SI_CHAR_2 0x71
/* Type alias so that the enum can be referred to as plain SISO_Option. */
583 typedef enum SISO_Option SISO_Option
;
/*
 * Stores the shift bytes for the KEIS, JEF or JIPS flag found in cnvOption
 * into value[]: two bytes for KEIS and JIPS, one byte for JEF.
 * The first if/else chain stores the *_SI_* constants and the second chain
 * stores the *_SO_* constants.
 *
 * @param option    selects which of the two chains applies
 *                  (NOTE(review): the selecting code is not visible here)
 * @param cnvOption converter option bits tested against _MBCS_OPTION_KEIS,
 *                  _MBCS_OPTION_JEF and _MBCS_OPTION_JIPS
 * @param value     output buffer; at most value[0] and value[1] are written
 * @return int32_t; SISOLength starts at 0 -- presumably the number of bytes
 *         written, TODO confirm against the full source
 */
585 static int32_t getSISOBytes(SISO_Option option
, uint32_t cnvOption
, uint8_t *value
) {
586 int32_t SISOLength
= 0;
590 if ((cnvOption
&_MBCS_OPTION_KEIS
)!=0) {
591 value
[0] = KEIS_SI_CHAR_1
;
592 value
[1] = KEIS_SI_CHAR_2
;
594 } else if ((cnvOption
&_MBCS_OPTION_JEF
)!=0) {
595 value
[0] = JEF_SI_CHAR
;
597 } else if ((cnvOption
&_MBCS_OPTION_JIPS
)!=0) {
598 value
[0] = JIPS_SI_CHAR_1
;
599 value
[1] = JIPS_SI_CHAR_2
;
607 if ((cnvOption
&_MBCS_OPTION_KEIS
)!=0) {
608 value
[0] = KEIS_SO_CHAR_1
;
609 value
[1] = KEIS_SO_CHAR_2
;
611 } else if ((cnvOption
&_MBCS_OPTION_JEF
)!=0) {
612 value
[0] = JEF_SO_CHAR
;
614 } else if ((cnvOption
&_MBCS_OPTION_JIPS
)!=0) {
615 value
[0] = JIPS_SO_CHAR_1
;
616 value
[1] = JIPS_SO_CHAR_2
;
624 /* Should never happen. */
631 /* Miscellaneous ------------------------------------------------------------ */
633 /* similar to ucnv_MBCSGetNextUChar() but recursive */
/*
 * Walks one row of the toUnicode state table (mbcsTable->stateTable[state])
 * and collects a code point for each byte value b into codePoints[b&0x1f].
 * The starting byte is taken from bits 5..3 of stateProps[state] and the limit
 * from bits 2..0. Whenever b reaches a multiple of 32 and at least one mapping
 * was seen (anyCodePoints>=0), the 32 collected code points are handed to
 * callback.
 *
 * Transition entries lead to a recursive call with the accumulated offset when
 * the next state has non-ignorable actions (stateProps[nextState]>=0).
 * Final entries are decoded according to their action code:
 *   MBCS_STATE_VALID_DIRECT_16  code point taken from the entry itself
 *   MBCS_STATE_VALID_16         one code unit read from unicodeCodeUnits[]
 *   MBCS_STATE_VALID_16_PAIR    one or two code units from unicodeCodeUnits[]
 *   MBCS_STATE_VALID_DIRECT_20  entry value + 0x10000
 *
 * NOTE(review): several declarations and branches (including the `value`
 * parameter used in the callback call) are not visible in this view; confirm
 * details against the full source.
 */
635 enumToU(UConverterMBCSTable
*mbcsTable
, int8_t stateProps
[],
636 int32_t state
, uint32_t offset
,
638 UConverterEnumToUCallback
*callback
, const void *context
,
639 UErrorCode
*pErrorCode
) {
640 UChar32 codePoints
[32];
642 const uint16_t *unicodeCodeUnits
;
643 UChar32 anyCodePoints
;
646 row
=mbcsTable
->stateTable
[state
];
647 unicodeCodeUnits
=mbcsTable
->unicodeCodeUnits
;
650 anyCodePoints
=-1; /* becomes non-negative if there is a mapping */
/* first byte value to look at: bits 5..3 of the state properties, times 32 */
652 b
=(stateProps
[state
]&0x38)<<2;
653 if(b
==0 && stateProps
[state
]>=0x40) {
654 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
655 codePoints
[0]=U_SENTINEL
;
/* exclusive upper limit: (bits 2..0 of the state properties + 1) times 32 */
658 limit
=((stateProps
[state
]&7)+1)<<5;
660 int32_t entry
=row
[b
];
661 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
662 int32_t nextState
=MBCS_ENTRY_TRANSITION_STATE(entry
);
663 if(stateProps
[nextState
]>=0) {
664 /* recurse to a state with non-ignorable actions */
666 mbcsTable
, stateProps
, nextState
,
667 offset
+MBCS_ENTRY_TRANSITION_OFFSET(entry
),
674 codePoints
[b
&0x1f]=U_SENTINEL
;
680 * An if-else-if chain provides more reliable performance for
681 * the most common cases compared to a switch.
683 action
=MBCS_ENTRY_FINAL_ACTION(entry
);
684 if(action
==MBCS_STATE_VALID_DIRECT_16
) {
685 /* output BMP code point */
686 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
687 } else if(action
==MBCS_STATE_VALID_16
) {
688 int32_t finalOffset
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
689 c
=unicodeCodeUnits
[finalOffset
];
691 /* output BMP code point */
695 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
696 int32_t finalOffset
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
697 c
=unicodeCodeUnits
[finalOffset
++];
699 /* output BMP code point below 0xd800 */
700 } else if(c
<=0xdbff) {
701 /* output roundtrip or fallback supplementary code point */
702 c
=((c
&0x3ff)<<10)+unicodeCodeUnits
[finalOffset
]+(0x10000-0xdc00);
703 } else if(c
==0xe000) {
704 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
705 c
=unicodeCodeUnits
[finalOffset
];
709 } else if(action
==MBCS_STATE_VALID_DIRECT_20
) {
710 /* output supplementary code point */
711 c
=(UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
716 codePoints
[b
&0x1f]=c
;
/* after every 32nd byte value, report the collected group if it has any mapping */
719 if(((++b
)&0x1f)==0) {
720 if(anyCodePoints
>=0) {
721 if(!callback(context
, value
|(uint32_t)(b
-0x20), codePoints
)) {
732 * Only called if stateProps[state]==-1.
733 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
734 * MBCS_STATE_CHANGE_ONLY.
/*
 * Computes stateProps[state] for one row of the state table and returns it.
 * Any next state whose property is still -1 is computed first by a recursive
 * call.
 *
 * Three passes are visible:
 *   1. find the first byte value (min) with a non-ignorable entry and store
 *      min>>5 in bits 5..3; on one path, stateProps[state] is instead set to
 *      -0x40 and returned (presumably when no such byte exists -- the
 *      condition is not visible here);
 *   2. find the last such byte value (max), scanning down from 0xff, and
 *      store max>>5 in bits 2..0;
 *   3. for final entries, set 0x40 on the next state, and set 0x40 on this
 *      state when the action is <= MBCS_STATE_FALLBACK_DIRECT_20.
 *
 * A transition entry counts as non-ignorable when stateProps[nextState]>=0;
 * a final entry counts when its action is < MBCS_STATE_UNASSIGNED.
 *
 * @param stateTable rows of 256 int32_t entries, one row per state
 * @param stateProps per-state property bytes, read and written here
 * @param state      index of the row to analyze
 * @return the new value of stateProps[state]
 */
737 getStateProp(const int32_t (*stateTable
)[256], int8_t stateProps
[], int state
) {
739 int32_t min
, max
, entry
, nextState
;
741 row
=stateTable
[state
];
744 /* find first non-ignorable state */
747 nextState
=MBCS_ENTRY_STATE(entry
);
748 if(stateProps
[nextState
]==-1) {
749 getStateProp(stateTable
, stateProps
, nextState
);
751 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
752 if(stateProps
[nextState
]>=0) {
755 } else if(MBCS_ENTRY_FINAL_ACTION(entry
)<MBCS_STATE_UNASSIGNED
) {
759 stateProps
[state
]=-0x40; /* (int8_t)0xc0 */
760 return stateProps
[state
];
/* record min/32 in bits 5..3 */
763 stateProps
[state
]|=(int8_t)((min
>>5)<<3);
765 /* find last non-ignorable state */
766 for(max
=0xff; min
<max
; --max
) {
768 nextState
=MBCS_ENTRY_STATE(entry
);
769 if(stateProps
[nextState
]==-1) {
770 getStateProp(stateTable
, stateProps
, nextState
);
772 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
773 if(stateProps
[nextState
]>=0) {
776 } else if(MBCS_ENTRY_FINAL_ACTION(entry
)<MBCS_STATE_UNASSIGNED
) {
/* record max/32 in bits 2..0 */
780 stateProps
[state
]|=(int8_t)(max
>>5);
782 /* recurse further and collect direct-state information */
785 nextState
=MBCS_ENTRY_STATE(entry
);
786 if(stateProps
[nextState
]==-1) {
787 getStateProp(stateTable
, stateProps
, nextState
);
789 if(MBCS_ENTRY_IS_FINAL(entry
)) {
790 stateProps
[nextState
]|=0x40;
791 if(MBCS_ENTRY_FINAL_ACTION(entry
)<=MBCS_STATE_FALLBACK_DIRECT_20
) {
792 stateProps
[state
]|=0x40;
797 return stateProps
[state
];
801 * Internal function enumerating the toUnicode data of an MBCS converter.
802 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
803 * table, but could also be used for a future ucnv_getUnicodeSet() option
804 * that includes reverse fallbacks (after updating this function's implementation).
805 * Currently only handles roundtrip mappings.
806 * Does not currently handle extensions.
809 ucnv_MBCSEnumToUnicode(UConverterMBCSTable
*mbcsTable
,
810 UConverterEnumToUCallback
*callback
, const void *context
,
811 UErrorCode
*pErrorCode
) {
813 * Properties for each state, to speed up the enumeration.
814 * Ignorable actions are unassigned/illegal/state-change-only:
815 * They do not lead to mappings.
818 * 1 direct/initial state (stateful converters have multiple)
819 * 0 non-initial state with transitions or with non-ignorable result actions
820 * -1 final state with only ignorable actions
823 * The lowest byte value with non-ignorable actions is
824 * value<<5 (rounded down).
827 * The highest byte value with non-ignorable actions is
828 * (value<<5)|0x1f (rounded up).
830 int8_t stateProps
[MBCS_MAX_STATE_COUNT
];
833 uprv_memset(stateProps
, -1, sizeof(stateProps
));
835 /* recurse from state 0 and set all stateProps */
836 getStateProp(mbcsTable
->stateTable
, stateProps
, 0);
838 for(state
=0; state
<mbcsTable
->countStates
; ++state
) {
839 /*if(stateProps[state]==-1) {
840 printf("unused/unreachable <icu:state> %d\n", state);
842 if(stateProps
[state
]>=0x40) {
843 /* start from each direct state */
845 mbcsTable
, stateProps
, state
, 0, 0,
/*
 * Enumerates the fromUnicode trie of sharedData->mbcs while tracking the
 * current code point in c, and finally calls ucnv_extGetUnicodeSet() with the
 * same sharedData/sa/which/filter/pErrorCode arguments.
 *
 * Two paths are visible:
 *  - outputType==MBCS_OUTPUT_1: 16-bit stage 2 and stage 3 entries; a stage 3
 *    value is tested against the threshold minValue, which depends on `which`.
 *  - other output types: 32-bit stage 2 entries and byte-addressed stage 3
 *    data scaled by st3Multiplier. The UCNV_SET_FILTER_* cases decide which
 *    code points are passed to sa->add(); each case tests the roundtrip flag
 *    bit (st3&1) or useFallback together with a filter-specific condition on
 *    the stored bytes.
 * Empty stage 3 blocks advance c by 16 and empty stage 2 blocks by 1024.
 * *pErrorCode is set to U_INTERNAL_PROGRAM_ERROR after the filter cases
 * (presumably the default case of the filter switch -- TODO confirm).
 *
 * NOTE(review): parts of this function, including the declaration of the `sa`
 * parameter and several loop headers, are not visible in this view; confirm
 * details against the full source.
 */
853 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData
*sharedData
,
855 UConverterUnicodeSet which
,
856 UConverterSetFilter filter
,
857 UErrorCode
*pErrorCode
) {
858 const UConverterMBCSTable
*mbcsTable
;
859 const uint16_t *table
;
862 uint16_t st1
, maxStage1
, st2
;
866 /* enumerate the from-Unicode trie table */
867 mbcsTable
=&sharedData
->mbcs
;
868 table
=mbcsTable
->fromUnicodeTable
;
869 if(mbcsTable
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
875 c
=0; /* keep track of the current code point while enumerating */
877 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
878 const uint16_t *stage2
, *stage3
, *results
;
881 results
=(const uint16_t *)mbcsTable
->fromUnicodeBytes
;
884 * Set a threshold variable for selecting which mappings to use.
885 * See ucnv_MBCSSingleFromBMPWithOffsets() and
886 * MBCS_SINGLE_RESULT_FROM_U() for details.
888 if(which
==UCNV_ROUNDTRIP_SET
) {
889 /* use only roundtrips */
891 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
892 /* use all roundtrip and fallback results */
896 for(st1
=0; st1
<maxStage1
; ++st1
) {
900 for(st2
=0; st2
<64; ++st2
) {
901 if((st3
=stage2
[st2
])!=0) {
902 /* read the stage 3 block */
906 if(*stage3
++>=minValue
) {
909 } while((++c
&0xf)!=0);
911 c
+=16; /* empty stage 3 block */
915 c
+=1024; /* empty stage 2 block */
919 const uint32_t *stage2
;
920 const uint8_t *stage3
, *bytes
;
921 uint32_t st3Multiplier
;
925 bytes
=mbcsTable
->fromUnicodeBytes
;
927 useFallback
=(UBool
)(which
==UCNV_ROUNDTRIP_AND_FALLBACK_SET
);
929 switch(mbcsTable
->outputType
) {
931 case MBCS_OUTPUT_4_EUC
:
942 for(st1
=0; st1
<maxStage1
; ++st1
) {
944 if(st2
>(maxStage1
>>1)) {
945 stage2
=(const uint32_t *)table
+st2
;
946 for(st2
=0; st2
<64; ++st2
) {
947 if((st3
=stage2
[st2
])!=0) {
948 /* read the stage 3 block */
949 stage3
=bytes
+st3Multiplier
*16*(uint32_t)(uint16_t)st3
;
951 /* get the roundtrip flags for the stage 3 block */
955 * Add code points for which the roundtrip flag is set,
956 * or which map to non-zero bytes if we use fallbacks.
957 * See ucnv_MBCSFromUnicodeWithOffsets() for details.
960 case UCNV_SET_FILTER_NONE
:
964 stage3
+=st3Multiplier
;
965 } else if(useFallback
) {
967 switch(st3Multiplier
) {
975 b
|=stage3
[0]|stage3
[1];
986 } while((++c
&0xf)!=0);
988 case UCNV_SET_FILTER_DBCS_ONLY
:
989 /* Ignore single-byte results (<0x100). */
991 if(((st3
&1)!=0 || useFallback
) && *((const uint16_t *)stage3
)>=0x100) {
995 stage3
+=2; /* +=st3Multiplier */
996 } while((++c
&0xf)!=0);
998 case UCNV_SET_FILTER_2022_CN
:
999 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
1001 if(((st3
&1)!=0 || useFallback
) && ((value
=*stage3
)==0x81 || value
==0x82)) {
1002 sa
->add(sa
->set
, c
);
1005 stage3
+=3; /* +=st3Multiplier */
1006 } while((++c
&0xf)!=0);
1008 case UCNV_SET_FILTER_SJIS
:
1009 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
1011 if(((st3
&1)!=0 || useFallback
) && (value
=*((const uint16_t *)stage3
))>=0x8140 && value
<=0xeffc) {
1012 sa
->add(sa
->set
, c
);
1015 stage3
+=2; /* +=st3Multiplier */
1016 } while((++c
&0xf)!=0);
1018 case UCNV_SET_FILTER_GR94DBCS
:
1019 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
1021 if( ((st3
&1)!=0 || useFallback
) &&
1022 (uint16_t)((value
=*((const uint16_t *)stage3
)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
1023 (uint8_t)(value
-0xa1)<=(0xfe - 0xa1)
1025 sa
->add(sa
->set
, c
);
1028 stage3
+=2; /* +=st3Multiplier */
1029 } while((++c
&0xf)!=0);
1031 case UCNV_SET_FILTER_HZ
:
1032 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
1034 if( ((st3
&1)!=0 || useFallback
) &&
1035 (uint16_t)((value
=*((const uint16_t *)stage3
))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
1036 (uint8_t)(value
-0xa1)<=(0xfe - 0xa1)
1038 sa
->add(sa
->set
, c
);
1041 stage3
+=2; /* +=st3Multiplier */
1042 } while((++c
&0xf)!=0);
1045 *pErrorCode
=U_INTERNAL_PROGRAM_ERROR
;
1049 c
+=16; /* empty stage 3 block */
1053 c
+=1024; /* empty stage 2 block */
1058 ucnv_extGetUnicodeSet(sharedData
, sa
, which
, filter
, pErrorCode
);
1062 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData
*sharedData
,
1063 const USetAdder
*sa
,
1064 UConverterUnicodeSet which
,
1065 UErrorCode
*pErrorCode
) {
1066 ucnv_MBCSGetFilteredUnicodeSetForUnicode(
1067 sharedData
, sa
, which
,
1068 sharedData
->mbcs
.outputType
==MBCS_OUTPUT_DBCS_ONLY
?
1069 UCNV_SET_FILTER_DBCS_ONLY
:
1070 UCNV_SET_FILTER_NONE
,
1074 static void U_CALLCONV
1075 ucnv_MBCSGetUnicodeSet(const UConverter
*cnv
,
1076 const USetAdder
*sa
,
1077 UConverterUnicodeSet which
,
1078 UErrorCode
*pErrorCode
) {
1079 if(cnv
->options
&_MBCS_OPTION_GB18030
) {
1080 sa
->addRange(sa
->set
, 0, 0xd7ff);
1081 sa
->addRange(sa
->set
, 0xe000, 0x10ffff);
1083 ucnv_MBCSGetUnicodeSetForUnicode(cnv
->sharedData
, sa
, which
, pErrorCode
);
/* conversion extensions for input not in the main table -------------------- */

/*
 * Hardcoded extension handling for GB 18030.
 * For the definitions of the LINEAR macros and of gb18030Ranges, see near the
 * beginning of the file.
 *
 * In the future, conversion extensions may handle m:n mappings and delta tables,
 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
 *
 * If an input character cannot be mapped, then these functions set an error
 * code. The framework will then call the callback function.
 */
1101 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
1102 * else return 0 after output has been written to the target
1105 _extFromU(UConverter
*cnv
, const UConverterSharedData
*sharedData
,
1107 const UChar
**source
, const UChar
*sourceLimit
,
1108 uint8_t **target
, const uint8_t *targetLimit
,
1109 int32_t **offsets
, int32_t sourceIndex
,
1111 UErrorCode
*pErrorCode
) {
1114 cnv
->useSubChar1
=FALSE
;
1116 if( (cx
=sharedData
->mbcs
.extIndexes
)!=NULL
&&
1117 ucnv_extInitialMatchFromU(
1119 cp
, source
, sourceLimit
,
1120 (char **)target
, (char *)targetLimit
,
1121 offsets
, sourceIndex
,
1125 return 0; /* an extension mapping handled the input */
1129 if((cnv
->options
&_MBCS_OPTION_GB18030
)!=0) {
1130 const uint32_t *range
;
1133 range
=gb18030Ranges
[0];
1134 for(i
=0; i
<UPRV_LENGTHOF(gb18030Ranges
); range
+=4, ++i
) {
1135 if(range
[0]<=(uint32_t)cp
&& (uint32_t)cp
<=range
[1]) {
1136 /* found the Unicode code point, output the four-byte sequence for it */
1140 /* get the linear value of the first GB 18030 code in this range */
1141 linear
=range
[2]-LINEAR_18030_BASE
;
1143 /* add the offset from the beginning of the range */
1144 linear
+=((uint32_t)cp
-range
[0]);
1146 /* turn this into a four-byte sequence */
1147 bytes
[3]=(char)(0x30+linear%10
); linear
/=10;
1148 bytes
[2]=(char)(0x81+linear%126
); linear
/=126;
1149 bytes
[1]=(char)(0x30+linear%10
); linear
/=10;
1150 bytes
[0]=(char)(0x81+linear
);
1152 /* output this sequence */
1153 ucnv_fromUWriteBytes(cnv
,
1154 bytes
, 4, (char **)target
, (char *)targetLimit
,
1155 offsets
, sourceIndex
, pErrorCode
);
1162 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1167 * Input sequence: cnv->toUBytes[0..length[
1168 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
1169 * else return 0 after output has been written to the target
1172 _extToU(UConverter
*cnv
, const UConverterSharedData
*sharedData
,
1174 const uint8_t **source
, const uint8_t *sourceLimit
,
1175 UChar
**target
, const UChar
*targetLimit
,
1176 int32_t **offsets
, int32_t sourceIndex
,
1178 UErrorCode
*pErrorCode
) {
1181 if( (cx
=sharedData
->mbcs
.extIndexes
)!=NULL
&&
1182 ucnv_extInitialMatchToU(
1184 length
, (const char **)source
, (const char *)sourceLimit
,
1185 target
, targetLimit
,
1186 offsets
, sourceIndex
,
1190 return 0; /* an extension mapping handled the input */
1194 if(length
==4 && (cnv
->options
&_MBCS_OPTION_GB18030
)!=0) {
1195 const uint32_t *range
;
1199 linear
=LINEAR_18030(cnv
->toUBytes
[0], cnv
->toUBytes
[1], cnv
->toUBytes
[2], cnv
->toUBytes
[3]);
1200 range
=gb18030Ranges
[0];
1201 for(i
=0; i
<UPRV_LENGTHOF(gb18030Ranges
); range
+=4, ++i
) {
1202 if(range
[2]<=linear
&& linear
<=range
[3]) {
1203 /* found the sequence, output the Unicode code point for it */
1204 *pErrorCode
=U_ZERO_ERROR
;
1206 /* add the linear difference between the input and start sequences to the start code point */
1207 linear
=range
[0]+(linear
-range
[2]);
1209 /* output this code point */
1210 ucnv_toUWriteCodePoint(cnv
, linear
, target
, targetLimit
, offsets
, sourceIndex
, pErrorCode
);
1218 *pErrorCode
=U_INVALID_CHAR_FOUND
;
/* EBCDIC swap LF<->NL ------------------------------------------------------ */

/*
 * This code modifies a standard EBCDIC<->Unicode mapping table for
 * OS/390 (z/OS) Unix System Services (Open Edition).
 * The difference is in the mapping of the Line Feed and New Line control codes.
 * Standard EBCDIC maps
 *
 *   <U000A> \x25 |0
 *   <U0085> \x15 |0
 *
 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 * mapping
 *
 *   <U000A> \x15 |0
 *   <U0085> \x25 |0
 *
 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
 * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
 */

/* standard EBCDIC byte values for Line Feed and New Line */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/* the same byte values with the roundtrip flag, as stored in Unicode-to-single-byte tables */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* Unicode code points for Line Feed and New Line (Next Line) */
#define U_LF 0x0a
#define U_NL 0x85
1258 _EBCDICSwapLFNL(UConverterSharedData
*sharedData
, UErrorCode
*pErrorCode
) {
1259 UConverterMBCSTable
*mbcsTable
;
1261 const uint16_t *table
, *results
;
1262 const uint8_t *bytes
;
1264 int32_t (*newStateTable
)[256];
1265 uint16_t *newResults
;
1269 uint32_t stage2Entry
;
1270 uint32_t size
, sizeofFromUBytes
;
1272 mbcsTable
=&sharedData
->mbcs
;
1274 table
=mbcsTable
->fromUnicodeTable
;
1275 bytes
=mbcsTable
->fromUnicodeBytes
;
1276 results
=(const uint16_t *)bytes
;
1279 * Check that this is an EBCDIC table with SBCS portion -
1280 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1282 * If not, ignore the option. Options are always ignored if they do not apply.
1285 (mbcsTable
->outputType
==MBCS_OUTPUT_1
|| mbcsTable
->outputType
==MBCS_OUTPUT_2_SISO
) &&
1286 mbcsTable
->stateTable
[0][EBCDIC_LF
]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_LF
) &&
1287 mbcsTable
->stateTable
[0][EBCDIC_NL
]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_NL
)
1292 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
1294 EBCDIC_RT_LF
==MBCS_SINGLE_RESULT_FROM_U(table
, results
, U_LF
) &&
1295 EBCDIC_RT_NL
==MBCS_SINGLE_RESULT_FROM_U(table
, results
, U_NL
)
1299 } else /* MBCS_OUTPUT_2_SISO */ {
1300 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_LF
);
1302 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, U_LF
)!=0 &&
1303 EBCDIC_LF
==MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, U_LF
)
1308 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_NL
);
1310 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, U_NL
)!=0 &&
1311 EBCDIC_NL
==MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, U_NL
)
1317 if(mbcsTable
->fromUBytesLength
>0) {
1319 * We _know_ the number of bytes in the fromUnicodeBytes array
1320 * starting with header.version 4.1.
1322 sizeofFromUBytes
=mbcsTable
->fromUBytesLength
;
1326 * There used to be code to enumerate the fromUnicode
1327 * trie and find the highest entry, but it was removed in ICU 3.2
1328 * because it was not tested and caused a low code coverage number.
1329 * See Jitterbug 3674.
1330 * This affects only some .cnv file formats with a header.version
1331 * below 4.1, and only when swaplfnl is requested.
1333 * ucnvmbcs.c revision 1.99 is the last one with the
1334 * ucnv_MBCSSizeofFromUBytes() function.
1336 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
1341 * The table has an appropriate format.
1342 * Allocate and build
1343 * - a modified to-Unicode state table
1344 * - a modified from-Unicode output array
1345 * - a converter name string with the swap option appended
1348 mbcsTable
->countStates
*1024+
1350 UCNV_MAX_CONVERTER_NAME_LENGTH
+20;
1351 p
=(uint8_t *)uprv_malloc(size
);
1353 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1357 /* copy and modify the to-Unicode state table */
1358 newStateTable
=(int32_t (*)[256])p
;
1359 uprv_memcpy(newStateTable
, mbcsTable
->stateTable
, mbcsTable
->countStates
*1024);
1361 newStateTable
[0][EBCDIC_LF
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_NL
);
1362 newStateTable
[0][EBCDIC_NL
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_LF
);
1364 /* copy and modify the from-Unicode result table */
1365 newResults
=(uint16_t *)newStateTable
[mbcsTable
->countStates
];
1366 uprv_memcpy(newResults
, bytes
, sizeofFromUBytes
);
1368 /* conveniently, the table access macros work on the left side of expressions */
1369 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
1370 MBCS_SINGLE_RESULT_FROM_U(table
, newResults
, U_LF
)=EBCDIC_RT_NL
;
1371 MBCS_SINGLE_RESULT_FROM_U(table
, newResults
, U_NL
)=EBCDIC_RT_LF
;
1372 } else /* MBCS_OUTPUT_2_SISO */ {
1373 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_LF
);
1374 MBCS_VALUE_2_FROM_STAGE_2(newResults
, stage2Entry
, U_LF
)=EBCDIC_NL
;
1376 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_NL
);
1377 MBCS_VALUE_2_FROM_STAGE_2(newResults
, stage2Entry
, U_NL
)=EBCDIC_LF
;
1380 /* set the canonical converter name */
1381 name
=(char *)newResults
+sizeofFromUBytes
;
1382 uprv_strcpy(name
, sharedData
->staticData
->name
);
1383 uprv_strcat(name
, UCNV_SWAP_LFNL_OPTION_STRING
);
1385 /* set the pointers */
1386 icu::umtx_lock(NULL
);
1387 if(mbcsTable
->swapLFNLStateTable
==NULL
) {
1388 mbcsTable
->swapLFNLStateTable
=newStateTable
;
1389 mbcsTable
->swapLFNLFromUnicodeBytes
=(uint8_t *)newResults
;
1390 mbcsTable
->swapLFNLName
=name
;
1394 icu::umtx_unlock(NULL
);
1396 /* release the allocated memory if another thread beat us to it */
1397 if(newStateTable
!=NULL
) {
1398 uprv_free(newStateTable
);
1403 /* reconstitute omitted fromUnicode data ------------------------------------ */
1405 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
1406 static UBool U_CALLCONV
1407 writeStage3Roundtrip(const void *context
, uint32_t value
, UChar32 codePoints
[32]) {
1408 UConverterMBCSTable
*mbcsTable
=(UConverterMBCSTable
*)context
;
1409 const uint16_t *table
;
1415 table
=mbcsTable
->fromUnicodeTable
;
1416 bytes
=(uint8_t *)mbcsTable
->fromUnicodeBytes
;
1418 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1419 switch(mbcsTable
->outputType
) {
1420 case MBCS_OUTPUT_3_EUC
:
1422 /* short sequences are stored directly */
1423 /* code set 0 or 1 */
1424 } else if(value
<=0x8effff) {
1427 } else /* first byte is 0x8f */ {
1432 case MBCS_OUTPUT_4_EUC
:
1433 if(value
<=0xffffff) {
1434 /* short sequences are stored directly */
1435 /* code set 0 or 1 */
1436 } else if(value
<=0x8effffff) {
1439 } else /* first byte is 0x8f */ {
1448 for(i
=0; i
<=0x1f; ++value
, ++i
) {
1454 /* locate the stage 2 & 3 data */
1455 stage2
=((uint32_t *)table
)+table
[c
>>10]+((c
>>4)&0x3f);
1457 st3
=(int32_t)(uint16_t)*stage2
*16+(c
&0xf);
1459 /* write the codepage bytes into stage 3 */
1460 switch(mbcsTable
->outputType
) {
1462 case MBCS_OUTPUT_4_EUC
:
1464 p
[0]=(uint8_t)(value
>>16);
1465 p
[1]=(uint8_t)(value
>>8);
1466 p
[2]=(uint8_t)value
;
1469 ((uint32_t *)p
)[st3
]=value
;
1472 /* 2 bytes per character */
1473 ((uint16_t *)p
)[st3
]=(uint16_t)value
;
1477 /* set the roundtrip flag */
1478 *stage2
|=(1UL<<(16+(c
&0xf)));
1484 reconstituteData(UConverterMBCSTable
*mbcsTable
,
1485 uint32_t stage1Length
, uint32_t stage2Length
,
1486 uint32_t fullStage2Length
, /* lengths are numbers of units, not bytes */
1487 UErrorCode
*pErrorCode
) {
1490 uint32_t dataLength
=stage1Length
*2+fullStage2Length
*4+mbcsTable
->fromUBytesLength
;
1491 mbcsTable
->reconstitutedData
=(uint8_t *)uprv_malloc(dataLength
);
1492 if(mbcsTable
->reconstitutedData
==NULL
) {
1493 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1496 uprv_memset(mbcsTable
->reconstitutedData
, 0, dataLength
);
1498 /* copy existing data and reroute the pointers */
1499 stage1
=(uint16_t *)mbcsTable
->reconstitutedData
;
1500 uprv_memcpy(stage1
, mbcsTable
->fromUnicodeTable
, stage1Length
*2);
1502 stage2
=(uint32_t *)(stage1
+stage1Length
);
1503 uprv_memcpy(stage2
+(fullStage2Length
-stage2Length
),
1504 mbcsTable
->fromUnicodeTable
+stage1Length
,
1507 mbcsTable
->fromUnicodeTable
=stage1
;
1508 mbcsTable
->fromUnicodeBytes
=(uint8_t *)(stage2
+fullStage2Length
);
1510 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1511 stage2
=(uint32_t *)stage1
;
1513 /* reconstitute the initial part of stage 2 from the mbcsIndex */
1515 int32_t stageUTF8Length
=((int32_t)mbcsTable
->maxFastUChar
+1)>>6;
1516 int32_t stageUTF8Index
=0;
1517 int32_t st1
, st2
, st3
, i
;
1519 for(st1
=0; stageUTF8Index
<stageUTF8Length
; ++st1
) {
1521 if(st2
!=(int32_t)stage1Length
/2) {
1522 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1523 for(i
=0; i
<16; ++i
) {
1524 st3
=mbcsTable
->mbcsIndex
[stageUTF8Index
++];
1526 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1529 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1530 * allocated together as a single 64-block for access from the mbcsIndex
1532 stage2
[st2
++]=st3
++;
1533 stage2
[st2
++]=st3
++;
1534 stage2
[st2
++]=st3
++;
1537 /* no stage 3 block, skip */
1542 /* no stage 2 block, skip */
1548 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1549 ucnv_MBCSEnumToUnicode(mbcsTable
, writeStage3Roundtrip
, mbcsTable
, pErrorCode
);
1552 /* MBCS setup functions ----------------------------------------------------- */
1554 static void U_CALLCONV
1555 ucnv_MBCSLoad(UConverterSharedData
*sharedData
,
1556 UConverterLoadArgs
*pArgs
,
1558 UErrorCode
*pErrorCode
) {
1560 UConverterMBCSTable
*mbcsTable
=&sharedData
->mbcs
;
1561 _MBCSHeader
*header
=(_MBCSHeader
*)raw
;
1563 uint32_t headerLength
;
1564 UBool noFromU
=FALSE
;
1566 if(header
->version
[0]==4) {
1567 headerLength
=MBCS_HEADER_V4_LENGTH
;
1568 } else if(header
->version
[0]==5 && header
->version
[1]>=3 &&
1569 (header
->options
&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK
)==0) {
1570 headerLength
=header
->options
&MBCS_OPT_LENGTH_MASK
;
1571 noFromU
=(UBool
)((header
->options
&MBCS_OPT_NO_FROM_U
)!=0);
1573 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1577 mbcsTable
->outputType
=(uint8_t)header
->flags
;
1578 if(noFromU
&& mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
1579 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1583 /* extension data, header version 4.2 and higher */
1584 offset
=header
->flags
>>8;
1586 mbcsTable
->extIndexes
=(const int32_t *)(raw
+offset
);
1589 if(mbcsTable
->outputType
==MBCS_OUTPUT_EXT_ONLY
) {
1590 UConverterLoadArgs args
=UCNV_LOAD_ARGS_INITIALIZER
;
1591 UConverterSharedData
*baseSharedData
;
1592 const int32_t *extIndexes
;
1593 const char *baseName
;
1595 /* extension-only file, load the base table and set values appropriately */
1596 if((extIndexes
=mbcsTable
->extIndexes
)==NULL
) {
1597 /* extension-only file without extension */
1598 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1602 if(pArgs
->nestedLoads
!=1) {
1603 /* an extension table must not be loaded as a base table */
1604 *pErrorCode
=U_INVALID_TABLE_FILE
;
1608 /* load the base table */
1609 baseName
=(const char *)header
+headerLength
*4;
1610 if(0==uprv_strcmp(baseName
, sharedData
->staticData
->name
)) {
1611 /* forbid loading this same extension-only file */
1612 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1616 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1617 args
.size
=sizeof(UConverterLoadArgs
);
1619 args
.onlyTestIsLoadable
=pArgs
->onlyTestIsLoadable
;
1620 args
.reserved
=pArgs
->reserved
;
1621 args
.options
=pArgs
->options
;
1622 args
.pkg
=pArgs
->pkg
;
1624 baseSharedData
=ucnv_load(&args
, pErrorCode
);
1625 if(U_FAILURE(*pErrorCode
)) {
1628 if( baseSharedData
->staticData
->conversionType
!=UCNV_MBCS
||
1629 baseSharedData
->mbcs
.baseSharedData
!=NULL
1631 ucnv_unload(baseSharedData
);
1632 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1635 if(pArgs
->onlyTestIsLoadable
) {
1637 * Exit as soon as we know that we can load the converter
1638 * and the format is valid and supported.
1639 * The worst that can happen in the following code is a memory
1642 ucnv_unload(baseSharedData
);
1646 /* copy the base table data */
1647 uprv_memcpy(mbcsTable
, &baseSharedData
->mbcs
, sizeof(UConverterMBCSTable
));
1649 /* overwrite values with relevant ones for the extension converter */
1650 mbcsTable
->baseSharedData
=baseSharedData
;
1651 mbcsTable
->extIndexes
=extIndexes
;
1654 * It would be possible to share the swapLFNL data with a base converter,
1655 * but the generated name would have to be different, and the memory
1656 * would have to be free'd only once.
1657 * It is easier to just create the data for the extension converter
1658 * separately when it is requested.
1660 mbcsTable
->swapLFNLStateTable
=NULL
;
1661 mbcsTable
->swapLFNLFromUnicodeBytes
=NULL
;
1662 mbcsTable
->swapLFNLName
=NULL
;
1665 * The reconstitutedData must be deleted only when the base converter
1668 mbcsTable
->reconstitutedData
=NULL
;
1671 * Set a special, runtime-only outputType if the extension converter
1672 * is a DBCS version of a base converter that also maps single bytes.
1674 if( sharedData
->staticData
->conversionType
==UCNV_DBCS
||
1675 (sharedData
->staticData
->conversionType
==UCNV_MBCS
&&
1676 sharedData
->staticData
->minBytesPerChar
>=2)
1678 if(baseSharedData
->mbcs
.outputType
==MBCS_OUTPUT_2_SISO
) {
1679 /* the base converter is SI/SO-stateful */
1682 /* get the dbcs state from the state table entry for SO=0x0e */
1683 entry
=mbcsTable
->stateTable
[0][0xe];
1684 if( MBCS_ENTRY_IS_FINAL(entry
) &&
1685 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_CHANGE_ONLY
&&
1686 MBCS_ENTRY_FINAL_STATE(entry
)!=0
1688 mbcsTable
->dbcsOnlyState
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
);
1690 mbcsTable
->outputType
=MBCS_OUTPUT_DBCS_ONLY
;
1693 baseSharedData
->staticData
->conversionType
==UCNV_MBCS
&&
1694 baseSharedData
->staticData
->minBytesPerChar
==1 &&
1695 baseSharedData
->staticData
->maxBytesPerChar
==2 &&
1696 mbcsTable
->countStates
<=127
1698 /* non-stateful base converter, need to modify the state table */
1699 int32_t (*newStateTable
)[256];
1703 /* allocate a new state table and copy the base state table contents */
1704 count
=mbcsTable
->countStates
;
1705 newStateTable
=(int32_t (*)[256])uprv_malloc((count
+1)*1024);
1706 if(newStateTable
==NULL
) {
1707 ucnv_unload(baseSharedData
);
1708 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1712 uprv_memcpy(newStateTable
, mbcsTable
->stateTable
, count
*1024);
1714 /* change all final single-byte entries to go to a new all-illegal state */
1715 state
=newStateTable
[0];
1716 for(i
=0; i
<256; ++i
) {
1717 if(MBCS_ENTRY_IS_FINAL(state
[i
])) {
1718 state
[i
]=MBCS_ENTRY_TRANSITION(count
, 0);
1722 /* build the new all-illegal state */
1723 state
=newStateTable
[count
];
1724 for(i
=0; i
<256; ++i
) {
1725 state
[i
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL
, 0);
1727 mbcsTable
->stateTable
=(const int32_t (*)[256])newStateTable
;
1728 mbcsTable
->countStates
=(uint8_t)(count
+1);
1729 mbcsTable
->stateTableOwned
=TRUE
;
1731 mbcsTable
->outputType
=MBCS_OUTPUT_DBCS_ONLY
;
1736 * unlike below for files with base tables, do not get the unicodeMask
1737 * from the sharedData; instead, use the base table's unicodeMask,
1738 * which we copied in the memcpy above;
1739 * this is necessary because the static data unicodeMask, especially
1740 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1743 /* conversion file with a base table; an additional extension table is optional */
1744 /* make sure that the output type is known */
1745 switch(mbcsTable
->outputType
) {
1750 case MBCS_OUTPUT_3_EUC
:
1751 case MBCS_OUTPUT_4_EUC
:
1752 case MBCS_OUTPUT_2_SISO
:
1756 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1759 if(pArgs
->onlyTestIsLoadable
) {
1761 * Exit as soon as we know that we can load the converter
1762 * and the format is valid and supported.
1763 * The worst that can happen in the following code is a memory
1769 mbcsTable
->countStates
=(uint8_t)header
->countStates
;
1770 mbcsTable
->countToUFallbacks
=header
->countToUFallbacks
;
1771 mbcsTable
->stateTable
=(const int32_t (*)[256])(raw
+headerLength
*4);
1772 mbcsTable
->toUFallbacks
=(const _MBCSToUFallback
*)(mbcsTable
->stateTable
+header
->countStates
);
1773 mbcsTable
->unicodeCodeUnits
=(const uint16_t *)(raw
+header
->offsetToUCodeUnits
);
1775 mbcsTable
->fromUnicodeTable
=(const uint16_t *)(raw
+header
->offsetFromUTable
);
1776 mbcsTable
->fromUnicodeBytes
=(const uint8_t *)(raw
+header
->offsetFromUBytes
);
1777 mbcsTable
->fromUBytesLength
=header
->fromUBytesLength
;
1780 * converter versions 6.1 and up contain a unicodeMask that is
1781 * used here to select the most efficient function implementations
1783 info
.size
=sizeof(UDataInfo
);
1784 udata_getInfo((UDataMemory
*)sharedData
->dataMemory
, &info
);
1785 if(info
.formatVersion
[0]>6 || (info
.formatVersion
[0]==6 && info
.formatVersion
[1]>=1)) {
1786 /* mask off possible future extensions to be safe */
1787 mbcsTable
->unicodeMask
=(uint8_t)(sharedData
->staticData
->unicodeMask
&3);
1789 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1790 mbcsTable
->unicodeMask
=UCNV_HAS_SUPPLEMENTARY
|UCNV_HAS_SURROGATES
;
1794 * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1795 * Check for the header version, SBCS vs. MBCS, and for whether the
1796 * data structures are optimized for code points as high as what the
1797 * runtime code is designed for.
1798 * The implementation does not handle mapping tables with entries for
1799 * unpaired surrogates.
1801 if( header
->version
[1]>=3 &&
1802 (mbcsTable
->unicodeMask
&UCNV_HAS_SURROGATES
)==0 &&
1803 (mbcsTable
->countStates
==1 ?
1804 (header
->version
[2]>=(SBCS_FAST_MAX
>>8)) :
1805 (header
->version
[2]>=(MBCS_FAST_MAX
>>8))
1808 mbcsTable
->utf8Friendly
=TRUE
;
1810 if(mbcsTable
->countStates
==1) {
1812 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1813 * Build a table with indexes to each block, to be used instead of
1814 * the regular stage 1/2 table.
1817 for(i
=0; i
<(SBCS_FAST_LIMIT
>>6); ++i
) {
1818 mbcsTable
->sbcsIndex
[i
]=mbcsTable
->fromUnicodeTable
[mbcsTable
->fromUnicodeTable
[i
>>4]+((i
<<2)&0x3c)];
1820 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1821 mbcsTable
->maxFastUChar
=SBCS_FAST_MAX
;
1824 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1825 * The .cnv file is prebuilt with an additional stage table with indexes
1828 mbcsTable
->mbcsIndex
=(const uint16_t *)
1829 (mbcsTable
->fromUnicodeBytes
+
1830 (noFromU
? 0 : mbcsTable
->fromUBytesLength
));
1831 mbcsTable
->maxFastUChar
=(((UChar
)header
->version
[2])<<8)|0xff;
1835 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1837 uint32_t asciiRoundtrips
=0xffffffff;
1840 for(i
=0; i
<0x80; ++i
) {
1841 if(mbcsTable
->stateTable
[0][i
]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, i
)) {
1842 asciiRoundtrips
&=~((uint32_t)1<<(i
>>2));
1845 mbcsTable
->asciiRoundtrips
=asciiRoundtrips
;
1849 uint32_t stage1Length
=
1850 mbcsTable
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
?
1852 uint32_t stage2Length
=
1853 (header
->offsetFromUBytes
-header
->offsetFromUTable
)/4-
1855 reconstituteData(mbcsTable
, stage1Length
, stage2Length
, header
->fullStage2Length
, pErrorCode
);
1859 /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1860 if(mbcsTable
->utf8Friendly
) {
1861 if(mbcsTable
->countStates
==1) {
1862 sharedData
->impl
=&_SBCSUTF8Impl
;
1864 if(mbcsTable
->outputType
==MBCS_OUTPUT_2
) {
1865 sharedData
->impl
=&_DBCSUTF8Impl
;
1870 if(mbcsTable
->outputType
==MBCS_OUTPUT_DBCS_ONLY
|| mbcsTable
->outputType
==MBCS_OUTPUT_2_SISO
) {
1872 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1873 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1875 mbcsTable
->asciiRoundtrips
=0;
1879 static void U_CALLCONV
1880 ucnv_MBCSUnload(UConverterSharedData
*sharedData
) {
1881 UConverterMBCSTable
*mbcsTable
=&sharedData
->mbcs
;
1883 if(mbcsTable
->swapLFNLStateTable
!=NULL
) {
1884 uprv_free(mbcsTable
->swapLFNLStateTable
);
1886 if(mbcsTable
->stateTableOwned
) {
1887 uprv_free((void *)mbcsTable
->stateTable
);
1889 if(mbcsTable
->baseSharedData
!=NULL
) {
1890 ucnv_unload(mbcsTable
->baseSharedData
);
1892 if(mbcsTable
->reconstitutedData
!=NULL
) {
1893 uprv_free(mbcsTable
->reconstitutedData
);
1897 static void U_CALLCONV
1898 ucnv_MBCSOpen(UConverter
*cnv
,
1899 UConverterLoadArgs
*pArgs
,
1900 UErrorCode
*pErrorCode
) {
1901 UConverterMBCSTable
*mbcsTable
;
1902 const int32_t *extIndexes
;
1904 int8_t maxBytesPerUChar
;
1906 if(pArgs
->onlyTestIsLoadable
) {
1910 mbcsTable
=&cnv
->sharedData
->mbcs
;
1911 outputType
=mbcsTable
->outputType
;
1913 if(outputType
==MBCS_OUTPUT_DBCS_ONLY
) {
1914 /* the swaplfnl option does not apply, remove it */
1915 cnv
->options
=pArgs
->options
&=~UCNV_OPTION_SWAP_LFNL
;
1918 if((pArgs
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1919 /* do this because double-checked locking is broken */
1922 icu::umtx_lock(NULL
);
1923 isCached
=mbcsTable
->swapLFNLStateTable
!=NULL
;
1924 icu::umtx_unlock(NULL
);
1927 if(!_EBCDICSwapLFNL(cnv
->sharedData
, pErrorCode
)) {
1928 if(U_FAILURE(*pErrorCode
)) {
1929 return; /* something went wrong */
1932 /* the option does not apply, remove it */
1933 cnv
->options
=pArgs
->options
&=~UCNV_OPTION_SWAP_LFNL
;
1938 if(uprv_strstr(pArgs
->name
, "18030")!=NULL
) {
1939 if(uprv_strstr(pArgs
->name
, "gb18030")!=NULL
|| uprv_strstr(pArgs
->name
, "GB18030")!=NULL
) {
1940 /* set a flag for GB 18030 mode, which changes the callback behavior */
1941 cnv
->options
|=_MBCS_OPTION_GB18030
;
1943 } else if((uprv_strstr(pArgs
->name
, "KEIS")!=NULL
) || (uprv_strstr(pArgs
->name
, "keis")!=NULL
)) {
1944 /* set a flag for KEIS converter, which changes the SI/SO character sequence */
1945 cnv
->options
|=_MBCS_OPTION_KEIS
;
1946 } else if((uprv_strstr(pArgs
->name
, "JEF")!=NULL
) || (uprv_strstr(pArgs
->name
, "jef")!=NULL
)) {
1947 /* set a flag for JEF converter, which changes the SI/SO character sequence */
1948 cnv
->options
|=_MBCS_OPTION_JEF
;
1949 } else if((uprv_strstr(pArgs
->name
, "JIPS")!=NULL
) || (uprv_strstr(pArgs
->name
, "jips")!=NULL
)) {
1950 /* set a flag for JIPS converter, which changes the SI/SO character sequence */
1951 cnv
->options
|=_MBCS_OPTION_JIPS
;
1954 /* fix maxBytesPerUChar depending on outputType and options etc. */
1955 if(outputType
==MBCS_OUTPUT_2_SISO
) {
1956 cnv
->maxBytesPerUChar
=3; /* SO+DBCS */
1959 extIndexes
=mbcsTable
->extIndexes
;
1960 if(extIndexes
!=NULL
) {
1961 maxBytesPerUChar
=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes
);
1962 if(outputType
==MBCS_OUTPUT_2_SISO
) {
1963 ++maxBytesPerUChar
; /* SO + multiple DBCS */
1966 if(maxBytesPerUChar
>cnv
->maxBytesPerUChar
) {
1967 cnv
->maxBytesPerUChar
=maxBytesPerUChar
;
1973 * documentation of UConverter fields used for status
1974 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1978 cnv
->toUnicodeStatus
=0; /* offset */
1979 cnv
->mode
=0; /* state */
1980 cnv
->toULength
=0; /* byteIndex */
1984 cnv
->fromUnicodeStatus
=1; /* prevLength */
1990 static const char* U_CALLCONV
1991 ucnv_MBCSGetName(const UConverter
*cnv
) {
1992 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0 && cnv
->sharedData
->mbcs
.swapLFNLName
!=NULL
) {
1993 return cnv
->sharedData
->mbcs
.swapLFNLName
;
1995 return cnv
->sharedData
->staticData
->name
;
2001 /* MBCS-to-Unicode conversion functions ------------------------------------- */
2003 static UChar32 U_CALLCONV
2004 ucnv_MBCSGetFallback(UConverterMBCSTable
*mbcsTable
, uint32_t offset
) {
2005 const _MBCSToUFallback
*toUFallbacks
;
2006 uint32_t i
, start
, limit
;
2008 limit
=mbcsTable
->countToUFallbacks
;
2010 /* do a binary search for the fallback mapping */
2011 toUFallbacks
=mbcsTable
->toUFallbacks
;
2013 while(start
<limit
-1) {
2015 if(offset
<toUFallbacks
[i
].offset
) {
2022 /* did we really find it? */
2023 if(offset
==toUFallbacks
[start
].offset
) {
2024 return toUFallbacks
[start
].codePoint
;
2031 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
2033 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
2034 UErrorCode
*pErrorCode
) {
2036 const uint8_t *source
, *sourceLimit
;
2038 const UChar
*targetLimit
;
2041 const int32_t (*stateTable
)[256];
2043 int32_t sourceIndex
;
2049 /* set up the local pointers */
2050 cnv
=pArgs
->converter
;
2051 source
=(const uint8_t *)pArgs
->source
;
2052 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
2053 target
=pArgs
->target
;
2054 targetLimit
=pArgs
->targetLimit
;
2055 offsets
=pArgs
->offsets
;
2057 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2058 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
2060 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
2063 /* sourceIndex=-1 if the current character began in the previous buffer */
2066 /* conversion loop */
2067 while(source
<sourceLimit
) {
2069 * This following test is to see if available input would overflow the output.
2070 * It does not catch output of more than one code unit that
2071 * overflows as a result of a surrogate pair or callback output
2072 * from the last source byte.
2073 * Therefore, those situations also test for overflows and will
2074 * then break the loop, too.
2076 if(target
>=targetLimit
) {
2077 /* target is full */
2078 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2082 entry
=stateTable
[0][*source
++];
2083 /* MBCS_ENTRY_IS_FINAL(entry) */
2085 /* test the most common case first */
2086 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2087 /* output BMP code point */
2088 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2090 *offsets
++=sourceIndex
;
2093 /* normal end of action codes: prepare for a new character */
2099 * An if-else-if chain provides more reliable performance for
2100 * the most common cases compared to a switch.
2102 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2103 if(action
==MBCS_STATE_VALID_DIRECT_20
||
2104 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
2106 entry
=MBCS_ENTRY_FINAL_VALUE(entry
);
2107 /* output surrogate pair */
2108 *target
++=(UChar
)(0xd800|(UChar
)(entry
>>10));
2110 *offsets
++=sourceIndex
;
2112 c
=(UChar
)(0xdc00|(UChar
)(entry
&0x3ff));
2113 if(target
<targetLimit
) {
2116 *offsets
++=sourceIndex
;
2119 /* target overflow */
2120 cnv
->UCharErrorBuffer
[0]=c
;
2121 cnv
->UCharErrorBufferLength
=1;
2122 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2128 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2129 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2130 /* output BMP code point */
2131 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2133 *offsets
++=sourceIndex
;
2139 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2140 /* just fall through */
2141 } else if(action
==MBCS_STATE_ILLEGAL
) {
2142 /* callback(illegal) */
2143 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2145 /* reserved, must never occur */
2150 if(U_FAILURE(*pErrorCode
)) {
2151 /* callback(illegal) */
2153 } else /* unassigned sequences indicated with byteIndex>0 */ {
2154 /* try an extension mapping */
2155 pArgs
->source
=(const char *)source
;
2156 cnv
->toUBytes
[0]=*(source
-1);
2157 cnv
->toULength
=_extToU(cnv
, cnv
->sharedData
,
2158 1, &source
, sourceLimit
,
2159 &target
, targetLimit
,
2160 &offsets
, sourceIndex
,
2163 sourceIndex
+=1+(int32_t)(source
-(const uint8_t *)pArgs
->source
);
2165 if(U_FAILURE(*pErrorCode
)) {
2166 /* not mappable or buffer overflow */
2172 /* write back the updated pointers */
2173 pArgs
->source
=(const char *)source
;
2174 pArgs
->target
=target
;
2175 pArgs
->offsets
=offsets
;
2179 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
2180 * that only map to and from the BMP.
2181 * In addition to single-byte optimizations, the offset calculations
2182 * become much easier.
2185 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs
*pArgs
,
2186 UErrorCode
*pErrorCode
) {
2188 const uint8_t *source
, *sourceLimit
, *lastSource
;
2190 int32_t targetCapacity
, length
;
2193 const int32_t (*stateTable
)[256];
2195 int32_t sourceIndex
;
2200 /* set up the local pointers */
2201 cnv
=pArgs
->converter
;
2202 source
=(const uint8_t *)pArgs
->source
;
2203 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
2204 target
=pArgs
->target
;
2205 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
2206 offsets
=pArgs
->offsets
;
2208 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2209 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
2211 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
2214 /* sourceIndex=-1 if the current character began in the previous buffer */
2219 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
2220 * for the minimum of the sourceLength and targetCapacity
2222 length
=(int32_t)(sourceLimit
-source
);
2223 if(length
<targetCapacity
) {
2224 targetCapacity
=length
;
2227 #if MBCS_UNROLL_SINGLE_TO_BMP
2228 /* unrolling makes it faster on Pentium III/Windows 2000 */
2229 /* unroll the loop with the most common case */
2231 if(targetCapacity
>=16) {
2232 int32_t count
, loops
, oredEntries
;
2234 loops
=count
=targetCapacity
>>4;
2236 oredEntries
=entry
=stateTable
[0][*source
++];
2237 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2238 oredEntries
|=entry
=stateTable
[0][*source
++];
2239 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2240 oredEntries
|=entry
=stateTable
[0][*source
++];
2241 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2242 oredEntries
|=entry
=stateTable
[0][*source
++];
2243 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2244 oredEntries
|=entry
=stateTable
[0][*source
++];
2245 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2246 oredEntries
|=entry
=stateTable
[0][*source
++];
2247 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2248 oredEntries
|=entry
=stateTable
[0][*source
++];
2249 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2250 oredEntries
|=entry
=stateTable
[0][*source
++];
2251 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2252 oredEntries
|=entry
=stateTable
[0][*source
++];
2253 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2254 oredEntries
|=entry
=stateTable
[0][*source
++];
2255 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2256 oredEntries
|=entry
=stateTable
[0][*source
++];
2257 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2258 oredEntries
|=entry
=stateTable
[0][*source
++];
2259 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2260 oredEntries
|=entry
=stateTable
[0][*source
++];
2261 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2262 oredEntries
|=entry
=stateTable
[0][*source
++];
2263 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2264 oredEntries
|=entry
=stateTable
[0][*source
++];
2265 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2266 oredEntries
|=entry
=stateTable
[0][*source
++];
2267 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2269 /* were all 16 entries really valid? */
2270 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries
)) {
2271 /* no, return to the first of these 16 */
2278 targetCapacity
-=16*count
;
2281 lastSource
+=16*count
;
2283 *offsets
++=sourceIndex
++;
2284 *offsets
++=sourceIndex
++;
2285 *offsets
++=sourceIndex
++;
2286 *offsets
++=sourceIndex
++;
2287 *offsets
++=sourceIndex
++;
2288 *offsets
++=sourceIndex
++;
2289 *offsets
++=sourceIndex
++;
2290 *offsets
++=sourceIndex
++;
2291 *offsets
++=sourceIndex
++;
2292 *offsets
++=sourceIndex
++;
2293 *offsets
++=sourceIndex
++;
2294 *offsets
++=sourceIndex
++;
2295 *offsets
++=sourceIndex
++;
2296 *offsets
++=sourceIndex
++;
2297 *offsets
++=sourceIndex
++;
2298 *offsets
++=sourceIndex
++;
2305 /* conversion loop */
2306 while(targetCapacity
> 0 && source
< sourceLimit
) {
2307 entry
=stateTable
[0][*source
++];
2308 /* MBCS_ENTRY_IS_FINAL(entry) */
2310 /* test the most common case first */
2311 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2312 /* output BMP code point */
2313 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2319 * An if-else-if chain provides more reliable performance for
2320 * the most common cases compared to a switch.
2322 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2323 if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2324 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2325 /* output BMP code point */
2326 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2330 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2331 /* just fall through */
2332 } else if(action
==MBCS_STATE_ILLEGAL
) {
2333 /* callback(illegal) */
2334 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2336 /* reserved, must never occur */
2340 /* set offsets since the start or the last extension */
2342 int32_t count
=(int32_t)(source
-lastSource
);
2344 /* predecrement: do not set the offset for the callback-causing character */
2346 *offsets
++=sourceIndex
++;
2348 /* offset and sourceIndex are now set for the current character */
2351 if(U_FAILURE(*pErrorCode
)) {
2352 /* callback(illegal) */
2354 } else /* unassigned sequences indicated with byteIndex>0 */ {
2355 /* try an extension mapping */
2357 cnv
->toUBytes
[0]=*(source
-1);
2358 cnv
->toULength
=_extToU(cnv
, cnv
->sharedData
,
2359 1, &source
, sourceLimit
,
2360 &target
, pArgs
->targetLimit
,
2361 &offsets
, sourceIndex
,
2364 sourceIndex
+=1+(int32_t)(source
-lastSource
);
2366 if(U_FAILURE(*pErrorCode
)) {
2367 /* not mappable or buffer overflow */
2371 /* recalculate the targetCapacity after an extension mapping */
2372 targetCapacity
=(int32_t)(pArgs
->targetLimit
-target
);
2373 length
=(int32_t)(sourceLimit
-source
);
2374 if(length
<targetCapacity
) {
2375 targetCapacity
=length
;
2379 #if MBCS_UNROLL_SINGLE_TO_BMP
2380 /* unrolling makes it faster on Pentium III/Windows 2000 */
2385 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
2386 /* target is full */
2387 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2390 /* set offsets since the start or the last callback */
2392 size_t count
=source
-lastSource
;
2394 *offsets
++=sourceIndex
++;
2399 /* write back the updated pointers */
2400 pArgs
->source
=(const char *)source
;
2401 pArgs
->target
=target
;
2402 pArgs
->offsets
=offsets
;
2406 hasValidTrailBytes(const int32_t (*stateTable
)[256], uint8_t state
) {
2407 const int32_t *row
=stateTable
[state
];
2409 /* First test for final entries in this state for some commonly valid byte values. */
2411 if( !MBCS_ENTRY_IS_TRANSITION(entry
) &&
2412 MBCS_ENTRY_FINAL_ACTION(entry
)!=MBCS_STATE_ILLEGAL
2417 if( !MBCS_ENTRY_IS_TRANSITION(entry
) &&
2418 MBCS_ENTRY_FINAL_ACTION(entry
)!=MBCS_STATE_ILLEGAL
2422 /* Then test for final entries in this state. */
2423 for(b
=0; b
<=0xff; ++b
) {
2425 if( !MBCS_ENTRY_IS_TRANSITION(entry
) &&
2426 MBCS_ENTRY_FINAL_ACTION(entry
)!=MBCS_STATE_ILLEGAL
2431 /* Then recurse for transition entries. */
2432 for(b
=0; b
<=0xff; ++b
) {
2434 if( MBCS_ENTRY_IS_TRANSITION(entry
) &&
2435 hasValidTrailBytes(stateTable
, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
))
2444 * Is byte b a single/lead byte in this state?
2445 * Recurse for transition states, because here we don't want to say that
2446 * b is a lead byte if all byte sequences that start with b are illegal.
2449 isSingleOrLead(const int32_t (*stateTable
)[256], uint8_t state
, UBool isDBCSOnly
, uint8_t b
) {
2450 const int32_t *row
=stateTable
[state
];
2451 int32_t entry
=row
[b
];
2452 if(MBCS_ENTRY_IS_TRANSITION(entry
)) { /* lead byte */
2453 return hasValidTrailBytes(stateTable
, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
));
2455 uint8_t action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2456 if(action
==MBCS_STATE_CHANGE_ONLY
&& isDBCSOnly
) {
2457 return FALSE
; /* SI/SO are illegal for DBCS-only conversion */
2459 return action
!=MBCS_STATE_ILLEGAL
;
2465 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
2466 UErrorCode
*pErrorCode
) {
2468 const uint8_t *source
, *sourceLimit
;
2470 const UChar
*targetLimit
;
2473 const int32_t (*stateTable
)[256];
2474 const uint16_t *unicodeCodeUnits
;
2481 int32_t sourceIndex
, nextSourceIndex
;
2487 /* use optimized function if possible */
2488 cnv
=pArgs
->converter
;
2490 if(cnv
->preToULength
>0) {
2492 * pass sourceIndex=-1 because we continue from an earlier buffer
2493 * in the future, this may change with continuous offsets
2495 ucnv_extContinueMatchToU(cnv
, pArgs
, -1, pErrorCode
);
2497 if(U_FAILURE(*pErrorCode
) || cnv
->preToULength
<0) {
2502 if(cnv
->sharedData
->mbcs
.countStates
==1) {
2503 if(!(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
2504 ucnv_MBCSSingleToBMPWithOffsets(pArgs
, pErrorCode
);
2506 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs
, pErrorCode
);
2511 /* set up the local pointers */
2512 source
=(const uint8_t *)pArgs
->source
;
2513 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
2514 target
=pArgs
->target
;
2515 targetLimit
=pArgs
->targetLimit
;
2516 offsets
=pArgs
->offsets
;
2518 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2519 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
2521 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
2523 unicodeCodeUnits
=cnv
->sharedData
->mbcs
.unicodeCodeUnits
;
2525 /* get the converter state from UConverter */
2526 offset
=cnv
->toUnicodeStatus
;
2527 byteIndex
=cnv
->toULength
;
2528 bytes
=cnv
->toUBytes
;
2531 * if we are in the SBCS state for a DBCS-only converter,
2532 * then load the DBCS state from the MBCS data
2533 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2535 if((state
=(uint8_t)(cnv
->mode
))==0) {
2536 state
=cnv
->sharedData
->mbcs
.dbcsOnlyState
;
2539 /* sourceIndex=-1 if the current character began in the previous buffer */
2540 sourceIndex
=byteIndex
==0 ? 0 : -1;
2543 /* conversion loop */
2544 while(source
<sourceLimit
) {
2546 * This following test is to see if available input would overflow the output.
2547 * It does not catch output of more than one code unit that
2548 * overflows as a result of a surrogate pair or callback output
2549 * from the last source byte.
2550 * Therefore, those situations also test for overflows and will
2551 * then break the loop, too.
2553 if(target
>=targetLimit
) {
2554 /* target is full */
2555 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2560 /* optimized loop for 1/2-byte input and BMP output */
2563 entry
=stateTable
[state
][*source
];
2564 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
2565 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
2566 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2569 if( source
<sourceLimit
&&
2570 MBCS_ENTRY_IS_FINAL(entry
=stateTable
[state
][*source
]) &&
2571 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
&&
2572 (c
=unicodeCodeUnits
[offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
)])<0xfffe
2576 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2579 /* set the state and leave the optimized loop */
2580 bytes
[0]=*(source
-1);
2585 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2586 /* output BMP code point */
2588 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2589 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2591 /* leave the optimized loop */
2595 } while(source
<sourceLimit
&& target
<targetLimit
);
2596 } else /* offsets!=NULL */ {
2598 entry
=stateTable
[state
][*source
];
2599 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
2600 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
2601 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2604 if( source
<sourceLimit
&&
2605 MBCS_ENTRY_IS_FINAL(entry
=stateTable
[state
][*source
]) &&
2606 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
&&
2607 (c
=unicodeCodeUnits
[offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
)])<0xfffe
2612 *offsets
++=sourceIndex
;
2613 sourceIndex
=(nextSourceIndex
+=2);
2615 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2618 /* set the state and leave the optimized loop */
2620 bytes
[0]=*(source
-1);
2625 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2626 /* output BMP code point */
2628 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2630 *offsets
++=sourceIndex
;
2631 sourceIndex
=++nextSourceIndex
;
2633 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2635 /* leave the optimized loop */
2639 } while(source
<sourceLimit
&& target
<targetLimit
);
2643 * these tests and break statements could be put inside the loop
2644 * if C had "break outerLoop" like Java
2646 if(source
>=sourceLimit
) {
2649 if(target
>=targetLimit
) {
2650 /* target is full */
2651 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2656 bytes
[byteIndex
++]=*source
++;
2657 } else /* byteIndex>0 */ {
2659 entry
=stateTable
[state
][bytes
[byteIndex
++]=*source
++];
2662 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
2663 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
2664 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2668 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2671 /* set the next state early so that we can reuse the entry variable */
2672 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2675 * An if-else-if chain provides more reliable performance for
2676 * the most common cases compared to a switch.
2678 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2679 if(action
==MBCS_STATE_VALID_16
) {
2680 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2681 c
=unicodeCodeUnits
[offset
];
2683 /* output BMP code point */
2686 *offsets
++=sourceIndex
;
2689 } else if(c
==0xfffe) {
2690 if(UCNV_TO_U_USE_FALLBACK(cnv
) && (entry
=(int32_t)ucnv_MBCSGetFallback(&cnv
->sharedData
->mbcs
, offset
))!=0xfffe) {
2691 /* output fallback BMP code point */
2692 *target
++=(UChar
)entry
;
2694 *offsets
++=sourceIndex
;
2699 /* callback(illegal) */
2700 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2702 } else if(action
==MBCS_STATE_VALID_DIRECT_16
) {
2703 /* output BMP code point */
2704 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2706 *offsets
++=sourceIndex
;
2709 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
2710 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2711 c
=unicodeCodeUnits
[offset
++];
2713 /* output BMP code point below 0xd800 */
2716 *offsets
++=sourceIndex
;
2719 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
2720 /* output roundtrip or fallback surrogate pair */
2721 *target
++=(UChar
)(c
&0xdbff);
2723 *offsets
++=sourceIndex
;
2726 if(target
<targetLimit
) {
2727 *target
++=unicodeCodeUnits
[offset
];
2729 *offsets
++=sourceIndex
;
2732 /* target overflow */
2733 cnv
->UCharErrorBuffer
[0]=unicodeCodeUnits
[offset
];
2734 cnv
->UCharErrorBufferLength
=1;
2735 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2740 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
2741 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2742 *target
++=unicodeCodeUnits
[offset
];
2744 *offsets
++=sourceIndex
;
2747 } else if(c
==0xffff) {
2748 /* callback(illegal) */
2749 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2751 } else if(action
==MBCS_STATE_VALID_DIRECT_20
||
2752 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
2754 entry
=MBCS_ENTRY_FINAL_VALUE(entry
);
2755 /* output surrogate pair */
2756 *target
++=(UChar
)(0xd800|(UChar
)(entry
>>10));
2758 *offsets
++=sourceIndex
;
2761 c
=(UChar
)(0xdc00|(UChar
)(entry
&0x3ff));
2762 if(target
<targetLimit
) {
2765 *offsets
++=sourceIndex
;
2768 /* target overflow */
2769 cnv
->UCharErrorBuffer
[0]=c
;
2770 cnv
->UCharErrorBufferLength
=1;
2771 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2776 } else if(action
==MBCS_STATE_CHANGE_ONLY
) {
2778 * This serves as a state change without any output.
2779 * It is useful for reading simple stateful encodings,
2780 * for example using just Shift-In/Shift-Out codes.
2781 * The 21 unused bits may later be used for more sophisticated
2782 * state transitions.
2784 if(cnv
->sharedData
->mbcs
.dbcsOnlyState
==0) {
2787 /* SI/SO are illegal for DBCS-only conversion */
2788 state
=(uint8_t)(cnv
->mode
); /* restore the previous state */
2790 /* callback(illegal) */
2791 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2793 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2794 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2795 /* output BMP code point */
2796 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2798 *offsets
++=sourceIndex
;
2802 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2803 /* just fall through */
2804 } else if(action
==MBCS_STATE_ILLEGAL
) {
2805 /* callback(illegal) */
2806 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2808 /* reserved, must never occur */
2812 /* end of action codes: prepare for a new character */
2816 sourceIndex
=nextSourceIndex
;
2817 } else if(U_FAILURE(*pErrorCode
)) {
2818 /* callback(illegal) */
2821 * Ticket 5691: consistent illegal sequences:
2822 * - We include at least the first byte in the illegal sequence.
2823 * - If any of the non-initial bytes could be the start of a character,
2824 * we stop the illegal sequence before the first one of those.
2826 UBool isDBCSOnly
=(UBool
)(cnv
->sharedData
->mbcs
.dbcsOnlyState
!=0);
2829 i
<byteIndex
&& !isSingleOrLead(stateTable
, state
, isDBCSOnly
, bytes
[i
]);
2832 /* Back out some bytes. */
2833 int8_t backOutDistance
=byteIndex
-i
;
2834 int32_t bytesFromThisBuffer
=(int32_t)(source
-(const uint8_t *)pArgs
->source
);
2835 byteIndex
=i
; /* length of reported illegal byte sequence */
2836 if(backOutDistance
<=bytesFromThisBuffer
) {
2837 source
-=backOutDistance
;
2839 /* Back out bytes from the previous buffer: Need to replay them. */
2840 cnv
->preToULength
=(int8_t)(bytesFromThisBuffer
-backOutDistance
);
2841 /* preToULength is negative! */
2842 uprv_memcpy(cnv
->preToU
, bytes
+i
, -cnv
->preToULength
);
2843 source
=(const uint8_t *)pArgs
->source
;
2848 } else /* unassigned sequences indicated with byteIndex>0 */ {
2849 /* try an extension mapping */
2850 pArgs
->source
=(const char *)source
;
2851 byteIndex
=_extToU(cnv
, cnv
->sharedData
,
2852 byteIndex
, &source
, sourceLimit
,
2853 &target
, targetLimit
,
2854 &offsets
, sourceIndex
,
2857 sourceIndex
=nextSourceIndex
+=(int32_t)(source
-(const uint8_t *)pArgs
->source
);
2859 if(U_FAILURE(*pErrorCode
)) {
2860 /* not mappable or buffer overflow */
2866 /* set the converter state back into UConverter */
2867 cnv
->toUnicodeStatus
=offset
;
2869 cnv
->toULength
=byteIndex
;
2871 /* write back the updated pointers */
2872 pArgs
->source
=(const char *)source
;
2873 pArgs
->target
=target
;
2874 pArgs
->offsets
=offsets
;
2878 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2879 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2882 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
2883 UErrorCode
*pErrorCode
) {
2885 const int32_t (*stateTable
)[256];
2886 const uint8_t *source
, *sourceLimit
;
2891 /* set up the local pointers */
2892 cnv
=pArgs
->converter
;
2893 source
=(const uint8_t *)pArgs
->source
;
2894 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
2895 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2896 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
2898 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
2901 /* conversion loop */
2902 while(source
<sourceLimit
) {
2903 entry
=stateTable
[0][*source
++];
2904 /* MBCS_ENTRY_IS_FINAL(entry) */
2906 /* write back the updated pointer early so that we can return directly */
2907 pArgs
->source
=(const char *)source
;
2909 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2910 /* output BMP code point */
2911 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2915 * An if-else-if chain provides more reliable performance for
2916 * the most common cases compared to a switch.
2918 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2919 if( action
==MBCS_STATE_VALID_DIRECT_20
||
2920 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
2922 /* output supplementary code point */
2923 return (UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
2924 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2925 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2926 /* output BMP code point */
2927 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2929 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2930 /* just fall through */
2931 } else if(action
==MBCS_STATE_ILLEGAL
) {
2932 /* callback(illegal) */
2933 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2935 /* reserved, must never occur */
2939 if(U_FAILURE(*pErrorCode
)) {
2940 /* callback(illegal) */
2942 } else /* unassigned sequence */ {
2943 /* defer to the generic implementation */
2944 pArgs
->source
=(const char *)source
-1;
2945 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
2949 /* no output because of empty input or only state changes */
2950 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2955 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2956 * conversion without offset handling.
2958 * When a character does not have a mapping to Unicode, then we return to the
2959 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2961 * We also defer to the generic code in other complicated cases and have them
2962 * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2964 * All normal mappings and errors are handled here.
2966 static UChar32 U_CALLCONV
2967 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
2968 UErrorCode
*pErrorCode
) {
2970 const uint8_t *source
, *sourceLimit
, *lastSource
;
2972 const int32_t (*stateTable
)[256];
2973 const uint16_t *unicodeCodeUnits
;
2982 /* use optimized function if possible */
2983 cnv
=pArgs
->converter
;
2985 if(cnv
->preToULength
>0) {
2986 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2987 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
2990 if(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SURROGATES
) {
2992 * Using the generic ucnv_getNextUChar() code lets us deal correctly
2993 * with the rare case of a codepage that maps single surrogates
2994 * without adding the complexity to this already complicated function here.
2996 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
2997 } else if(cnv
->sharedData
->mbcs
.countStates
==1) {
2998 return ucnv_MBCSSingleGetNextUChar(pArgs
, pErrorCode
);
3001 /* set up the local pointers */
3002 source
=lastSource
=(const uint8_t *)pArgs
->source
;
3003 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
3005 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3006 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
3008 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
3010 unicodeCodeUnits
=cnv
->sharedData
->mbcs
.unicodeCodeUnits
;
3012 /* get the converter state from UConverter */
3013 offset
=cnv
->toUnicodeStatus
;
3016 * if we are in the SBCS state for a DBCS-only converter,
3017 * then load the DBCS state from the MBCS data
3018 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
3020 if((state
=(uint8_t)(cnv
->mode
))==0) {
3021 state
=cnv
->sharedData
->mbcs
.dbcsOnlyState
;
3024 /* conversion loop */
3026 while(source
<sourceLimit
) {
3027 entry
=stateTable
[state
][*source
++];
3028 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
3029 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
3030 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
3032 /* optimization for 1/2-byte input and BMP output */
3033 if( source
<sourceLimit
&&
3034 MBCS_ENTRY_IS_FINAL(entry
=stateTable
[state
][*source
]) &&
3035 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
&&
3036 (c
=unicodeCodeUnits
[offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
)])<0xfffe
3039 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
3040 /* output BMP code point */
3044 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
3047 /* set the next state early so that we can reuse the entry variable */
3048 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
3051 * An if-else-if chain provides more reliable performance for
3052 * the most common cases compared to a switch.
3054 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
3055 if(action
==MBCS_STATE_VALID_DIRECT_16
) {
3056 /* output BMP code point */
3057 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3059 } else if(action
==MBCS_STATE_VALID_16
) {
3060 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
3061 c
=unicodeCodeUnits
[offset
];
3063 /* output BMP code point */
3065 } else if(c
==0xfffe) {
3066 if(UCNV_TO_U_USE_FALLBACK(cnv
) && (c
=ucnv_MBCSGetFallback(&cnv
->sharedData
->mbcs
, offset
))!=0xfffe) {
3070 /* callback(illegal) */
3071 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3073 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
3074 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
3075 c
=unicodeCodeUnits
[offset
++];
3077 /* output BMP code point below 0xd800 */
3079 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
3080 /* output roundtrip or fallback supplementary code point */
3081 c
=((c
&0x3ff)<<10)+unicodeCodeUnits
[offset
]+(0x10000-0xdc00);
3083 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
3084 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3085 c
=unicodeCodeUnits
[offset
];
3087 } else if(c
==0xffff) {
3088 /* callback(illegal) */
3089 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3091 } else if(action
==MBCS_STATE_VALID_DIRECT_20
||
3092 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
3094 /* output supplementary code point */
3095 c
=(UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
3097 } else if(action
==MBCS_STATE_CHANGE_ONLY
) {
3099 * This serves as a state change without any output.
3100 * It is useful for reading simple stateful encodings,
3101 * for example using just Shift-In/Shift-Out codes.
3102 * The 21 unused bits may later be used for more sophisticated
3103 * state transitions.
3105 if(cnv
->sharedData
->mbcs
.dbcsOnlyState
!=0) {
3106 /* SI/SO are illegal for DBCS-only conversion */
3107 state
=(uint8_t)(cnv
->mode
); /* restore the previous state */
3109 /* callback(illegal) */
3110 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3112 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
3113 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
3114 /* output BMP code point */
3115 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3118 } else if(action
==MBCS_STATE_UNASSIGNED
) {
3119 /* just fall through */
3120 } else if(action
==MBCS_STATE_ILLEGAL
) {
3121 /* callback(illegal) */
3122 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3124 /* reserved (must never occur), or only state change */
3130 /* end of action codes: prepare for a new character */
3133 if(U_FAILURE(*pErrorCode
)) {
3134 /* callback(illegal) */
3136 } else /* unassigned sequence */ {
3137 /* defer to the generic implementation */
3138 cnv
->toUnicodeStatus
=0;
3140 pArgs
->source
=(const char *)lastSource
;
3141 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
3147 if(U_SUCCESS(*pErrorCode
) && source
==sourceLimit
&& lastSource
<source
) {
3148 /* incomplete character byte sequence */
3149 uint8_t *bytes
=cnv
->toUBytes
;
3150 cnv
->toULength
=(int8_t)(source
-lastSource
);
3152 *bytes
++=*lastSource
++;
3153 } while(lastSource
<source
);
3154 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
3155 } else if(U_FAILURE(*pErrorCode
)) {
3156 /* callback(illegal) */
3158 * Ticket 5691: consistent illegal sequences:
3159 * - We include at least the first byte in the illegal sequence.
3160 * - If any of the non-initial bytes could be the start of a character,
3161 * we stop the illegal sequence before the first one of those.
3163 UBool isDBCSOnly
=(UBool
)(cnv
->sharedData
->mbcs
.dbcsOnlyState
!=0);
3164 uint8_t *bytes
=cnv
->toUBytes
;
3165 *bytes
++=*lastSource
++; /* first byte */
3166 if(lastSource
==source
) {
3168 } else /* lastSource<source: multi-byte character */ {
3171 lastSource
<source
&& !isSingleOrLead(stateTable
, state
, isDBCSOnly
, *lastSource
);
3174 *bytes
++=*lastSource
++;
3180 /* no output because of empty input or only state changes */
3181 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
3186 /* set the converter state back into UConverter, ready for a new character */
3187 cnv
->toUnicodeStatus
=0;
3190 /* write back the updated pointer */
3191 pArgs
->source
=(const char *)source
;
3197 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3198 * Removal improves code coverage.
3201 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
3202 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3203 * It does not handle conversion extensions (_extToU()).
3206 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData
*sharedData
,
3207 uint8_t b
, UBool useFallback
) {
3211 entry
=sharedData
->mbcs
.stateTable
[0][b
];
3212 /* MBCS_ENTRY_IS_FINAL(entry) */
3214 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
3215 /* output BMP code point */
3216 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3220 * An if-else-if chain provides more reliable performance for
3221 * the most common cases compared to a switch.
3223 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
3224 if(action
==MBCS_STATE_VALID_DIRECT_20
) {
3225 /* output supplementary code point */
3226 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
3227 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
3228 if(!TO_U_USE_FALLBACK(useFallback
)) {
3231 /* output BMP code point */
3232 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3233 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
3234 if(!TO_U_USE_FALLBACK(useFallback
)) {
3237 /* output supplementary code point */
3238 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
3239 } else if(action
==MBCS_STATE_UNASSIGNED
) {
3241 } else if(action
==MBCS_STATE_ILLEGAL
) {
3244 /* reserved, must never occur */
3251 * This is a simple version of _MBCSGetNextUChar() that is used
3252 * by other converter implementations.
3253 * It only returns an "assigned" result if it consumes the entire input.
3254 * It does not use state from the converter, nor error codes.
3255 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3256 * It handles conversion extensions but not GB 18030.
3261 * otherwise the Unicode code point
/*
 * ucnv_MBCSSimpleGetNextUChar() - reviewer summary.
 *
 * Converts the bytes source[0..length) to one code point using only the
 * shared converter data (no UConverter object is passed in).
 *
 * Visible flow:
 *  - starts in state sharedData->mbcs.dbcsOnlyState and looks up each byte
 *    in sharedData->mbcs.stateTable;
 *  - transition entries update state and add to offset;
 *  - a final entry is decoded by its action code into c;
 *  - as a last resort, ucnv_extSimpleMatchToU() is called with
 *    sharedData->mbcs.extIndexes.
 *
 * Visible sentinel results: 0xffff is returned for a truncated character
 * (and in the disabled SBCS shortcut); 0xfffe is referred to as the
 * "done with 0xfffe" value when no fallback is taken.
 *
 * NOTE(review): the embedded original line numbers skip, so this listing
 * omits lines (some declarations, braces, else/break statements).
 * The comments added here describe only what is visible.
 */
3264 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData
*sharedData
,
3265 const char *source
, int32_t length
,
3266 UBool useFallback
) {
3267 const int32_t (*stateTable
)[256];
3268 const uint16_t *unicodeCodeUnits
;
3271 uint8_t state
, action
;
3277 /* no input at all: "illegal" */
3283 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3284 * TODO In future releases, verify that this function is never called for SBCS
3285 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3286 * Removal improves code coverage.
3288 /* use optimized function if possible */
/* NOTE(review): per the "Code disabled" remark above, this single-state shortcut is presumably compiled out; the directive that disables it is not visible in this listing. */
3289 if(sharedData
->mbcs
.countStates
==1) {
3291 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData
, (uint8_t)*source
, useFallback
);
3293 return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3298 /* set up the local pointers */
3299 stateTable
=sharedData
->mbcs
.stateTable
;
3300 unicodeCodeUnits
=sharedData
->mbcs
.unicodeCodeUnits
;
3302 /* converter state */
3304 state
=sharedData
->mbcs
.dbcsOnlyState
;
3306 /* conversion loop */
/* Fetch the state-table entry for the next input byte; i is advanced past that byte. */
3308 entry
=stateTable
[state
][(uint8_t)source
[i
++]];
/* A transition entry only moves to the next state and accumulates the offset used to index unicodeCodeUnits later. */
3309 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
3310 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
3311 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
3314 return 0xffff; /* truncated character */
3318 * An if-else-if chain provides more reliable performance for
3319 * the most common cases compared to a switch.
/* Final entry: decode according to its action code. */
3321 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
3322 if(action
==MBCS_STATE_VALID_16
) {
3323 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
3324 c
=unicodeCodeUnits
[offset
];
/*
 * NOTE(review): this branch tests UCNV_TO_U_USE_FALLBACK(cnv), while the
 * FALLBACK_DIRECT branches below test TO_U_USE_FALLBACK(useFallback).
 * No local named cnv is visible in this function - confirm that the macro
 * does not evaluate its argument, and whether useFallback was intended.
 */
3327 } else if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
3328 c
=ucnv_MBCSGetFallback(&sharedData
->mbcs
, offset
);
3329 /* else done with 0xfffe */
3332 } else if(action
==MBCS_STATE_VALID_DIRECT_16
) {
3333 /* output BMP code point */
3334 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3336 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
/* Pair entries: read the first unit and step offset to the second; the value of the first unit selects the case below. */
3337 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
3338 c
=unicodeCodeUnits
[offset
++];
3340 /* output BMP code point below 0xd800 */
3341 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
3342 /* output roundtrip or fallback supplementary code point */
/* Combine the low 10 bits of the first unit with the second unit into a supplementary code point. */
3343 c
=(UChar32
)(((c
&0x3ff)<<10)+unicodeCodeUnits
[offset
]+(0x10000-0xdc00));
3344 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
3345 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3346 c
=unicodeCodeUnits
[offset
];
3347 } else if(c
==0xffff) {
3353 } else if(action
==MBCS_STATE_VALID_DIRECT_20
) {
3354 /* output supplementary code point */
3355 c
=0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
3357 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
/* Fallback entries are gated on the caller-supplied useFallback flag. */
3358 if(!TO_U_USE_FALLBACK(useFallback
)) {
3362 /* output BMP code point */
3363 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3365 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
3366 if(!TO_U_USE_FALLBACK(useFallback
)) {
3370 /* output supplementary code point */
3371 c
=0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
3373 } else if(action
==MBCS_STATE_UNASSIGNED
) {
3379 * forbid MBCS_STATE_CHANGE_ONLY for this function,
3380 * and MBCS_STATE_ILLEGAL and reserved action codes
3387 /* illegal for this function: not all input consumed */
3392 /* try an extension mapping */
/* The extension lookup receives the complete input (source, length), not only the unconsumed remainder. */
3393 const int32_t *cx
=sharedData
->mbcs
.extIndexes
;
3395 return ucnv_extSimpleMatchToU(cx
, source
, length
, useFallback
);
3402 /* MBCS-from-Unicode conversion functions ----------------------------------- */
3404 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
/*
 * ucnv_MBCSDoubleFromUnicodeWithOffsets() - reviewer summary.
 *
 * From-Unicode conversion loop specialised for double-byte codepages.
 *
 * Inputs read from pArgs: converter, source, sourceLimit, target,
 * targetLimit, offsets. On exit source, target and offsets are written
 * back to pArgs.
 *
 * Visible error reporting through *pErrorCode:
 *  - U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate;
 *  - U_BUFFER_OVERFLOW_ERROR when the target is full, including the case
 *    where only the first of two output bytes fits (the second byte is
 *    then stored in cnv->charErrorBuffer).
 *
 * NOTE(review): the embedded original line numbers skip, so this listing
 * omits lines (some declarations, braces, else/goto/break statements and
 * any guards around the offsets writes). The comments added here describe
 * only what is visible.
 */
3406 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3407 UErrorCode
*pErrorCode
) {
3409 const UChar
*source
, *sourceLimit
;
3411 int32_t targetCapacity
;
3414 const uint16_t *table
;
3415 const uint16_t *mbcsIndex
;
3416 const uint8_t *bytes
;
3420 int32_t sourceIndex
, nextSourceIndex
;
3422 uint32_t stage2Entry
;
3423 uint32_t asciiRoundtrips
;
3425 uint8_t unicodeMask
;
3427 /* use optimized function if possible */
3428 cnv
=pArgs
->converter
;
3429 unicodeMask
=cnv
->sharedData
->mbcs
.unicodeMask
;
3431 /* set up the local pointers */
3432 source
=pArgs
->source
;
3433 sourceLimit
=pArgs
->sourceLimit
;
3434 target
=(uint8_t *)pArgs
->target
;
3435 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
3436 offsets
=pArgs
->offsets
;
3438 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
3439 mbcsIndex
=cnv
->sharedData
->mbcs
.mbcsIndex
;
/* Choose the result-bytes table: the swapLFNL variant when UCNV_OPTION_SWAP_LFNL is set, otherwise the regular one. */
3440 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3441 bytes
=cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
3443 bytes
=cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
3445 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
3447 /* get the converter state from UConverter */
3450 /* sourceIndex=-1 if the current character began in the previous buffer */
3451 sourceIndex
= c
==0 ? 0 : -1;
3454 /* conversion loop */
/* A non-zero c is pending state carried into this call; what this branch does with it is not visible in this listing. */
3455 if(c
!=0 && targetCapacity
>0) {
3459 while(source
<sourceLimit
) {
3461 * This following test is to see if available input would overflow the output.
3462 * It does not catch output of more than one byte that
3463 * overflows as a result of a multi-byte character or callback output
3464 * from the last source character.
3465 * Therefore, those situations also test for overflows and will
3466 * then break the loop, too.
3468 if(targetCapacity
>0) {
3470 * Get a correct Unicode code point:
3471 * a single UChar for a BMP code point or
3472 * a matched surrogate pair for a "supplementary code point".
/* Fast path: an ASCII character flagged in asciiRoundtrips is copied to the target as a single byte. */
3476 if(c
<=0x7f && IS_ASCII_ROUNDTRIP(c
, asciiRoundtrips
)) {
3477 *target
++=(uint8_t)c
;
3479 *offsets
++=sourceIndex
;
3480 sourceIndex
=nextSourceIndex
;
3487 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3488 * to avoid dealing with surrogates.
3489 * MBCS_FAST_MAX must be >=0xd7ff.
/* Direct lookup through mbcsIndex, with bytes read as 16-bit results. */
3492 value
=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex
, (const uint16_t *)bytes
, c
);
3493 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3497 /* output the value */
3500 * This also tests if the codepage maps single surrogates.
3501 * If it does, then surrogates are not paired but mapped separately.
3502 * Note that in this case unmatched surrogates are not detected.
/* Surrogate pairing is attempted only when unicodeMask lacks UCNV_HAS_SURROGATES. */
3504 if(U16_IS_SURROGATE(c
) && !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
3505 if(U16_IS_SURROGATE_LEAD(c
)) {
3507 if(source
<sourceLimit
) {
3508 /* test the following code unit */
3509 UChar trail
=*source
;
3510 if(U16_IS_TRAIL(trail
)) {
3513 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
3514 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
3515 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3516 /* callback(unassigned) */
3519 /* convert this supplementary code point */
3520 /* exit this condition tree */
3522 /* this is an unmatched lead code unit (1st surrogate) */
3523 /* callback(illegal) */
3524 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3532 /* this is an unmatched trail code unit (2nd surrogate) */
3533 /* callback(illegal) */
3534 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3539 /* convert the Unicode code point in c into codepage bytes */
3540 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
3542 /* get the bytes and the length for the output */
3544 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
3546 /* is this code point assigned, or do we use fallbacks? */
/* Not usable when the entry is not a roundtrip and it is not an enabled fallback with a non-zero value. */
3547 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
3548 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && value
!=0))
3551 * We allow a 0 byte output if the "assigned" bit is set for this entry.
3552 * There is no way with this data structure for fallback output
3553 * to be a zero byte.
3557 /* try an extension mapping */
/* pArgs->source is set to the current position first, so the code units consumed by _extFromU() can be added to nextSourceIndex afterwards. */
3558 pArgs
->source
=source
;
3559 c
=_extFromU(cnv
, cnv
->sharedData
,
3560 c
, &source
, sourceLimit
,
3561 &target
, target
+targetCapacity
,
3562 &offsets
, sourceIndex
,
3565 nextSourceIndex
+=(int32_t)(source
-pArgs
->source
);
3567 if(U_FAILURE(*pErrorCode
)) {
3568 /* not mappable or buffer overflow */
3571 /* a mapping was written to the target, continue */
3573 /* recalculate the targetCapacity after an extension mapping */
3574 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
3576 /* normal end of conversion: prepare for a new character */
3577 sourceIndex
=nextSourceIndex
;
3583 /* write the output character bytes from value and length */
3584 /* from the first if in the loop we know that targetCapacity>0 */
3586 /* this is easy because we know that there is enough space */
3587 *target
++=(uint8_t)value
;
3589 *offsets
++=sourceIndex
;
3592 } else /* length==2 */ {
/* Two-byte output: the high byte always fits (targetCapacity>0); the low byte is written only if a second slot is available. */
3593 *target
++=(uint8_t)(value
>>8);
3594 if(2<=targetCapacity
) {
3595 *target
++=(uint8_t)value
;
3597 *offsets
++=sourceIndex
;
3598 *offsets
++=sourceIndex
;
3603 *offsets
++=sourceIndex
;
/* Otherwise the low byte is stored in the converter's charErrorBuffer and overflow is reported. */
3605 cnv
->charErrorBuffer
[0]=(char)value
;
3606 cnv
->charErrorBufferLength
=1;
3608 /* target overflow */
3610 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3616 /* normal end of conversion: prepare for a new character */
3618 sourceIndex
=nextSourceIndex
;
3621 /* target is full */
3622 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3627 /* set the converter state back into UConverter */
3630 /* write back the updated pointers */
3631 pArgs
->source
=source
;
3632 pArgs
->target
=(char *)target
;
3633 pArgs
->offsets
=offsets
;
3636 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
/*
 * ucnv_MBCSSingleFromUnicodeWithOffsets() - reviewer summary.
 *
 * From-Unicode conversion loop specialised for single-byte codepages.
 *
 * Inputs read from pArgs: converter, source, sourceLimit, target,
 * targetLimit, offsets. On exit source, target and offsets are written
 * back to pArgs.
 *
 * A lookup result counts as assigned when value>=minValue; minValue is
 * chosen according to cnv->useFallback (the assignments themselves are
 * not visible in this listing). Unassigned code points are handed to
 * _extFromU().
 *
 * Visible error reporting through *pErrorCode:
 *  - U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate;
 *  - U_BUFFER_OVERFLOW_ERROR when the target is full.
 *
 * NOTE(review): the embedded original line numbers skip, so this listing
 * omits lines (some declarations, braces, else/goto/break statements and
 * any guards around the offsets writes). The comments added here describe
 * only what is visible.
 */
3638 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3639 UErrorCode
*pErrorCode
) {
3641 const UChar
*source
, *sourceLimit
;
3643 int32_t targetCapacity
;
3646 const uint16_t *table
;
3647 const uint16_t *results
;
3651 int32_t sourceIndex
, nextSourceIndex
;
3653 uint16_t value
, minValue
;
3654 UBool hasSupplementary
;
3656 /* set up the local pointers */
3657 cnv
=pArgs
->converter
;
3658 source
=pArgs
->source
;
3659 sourceLimit
=pArgs
->sourceLimit
;
3660 target
=(uint8_t *)pArgs
->target
;
3661 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
3662 offsets
=pArgs
->offsets
;
3664 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
/* Choose the results table (viewed as 16-bit entries): the swapLFNL variant when UCNV_OPTION_SWAP_LFNL is set, otherwise the regular one. */
3665 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3666 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
3668 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
/* The fallback setting selects the threshold (minValue) compared against lookup results below; the two assignments are not visible here. */
3671 if(cnv
->useFallback
) {
3672 /* use all roundtrip and fallback results */
3675 /* use only roundtrips and fallbacks from private-use characters */
3678 hasSupplementary
=(UBool
)(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
);
3680 /* get the converter state from UConverter */
3683 /* sourceIndex=-1 if the current character began in the previous buffer */
3684 sourceIndex
= c
==0 ? 0 : -1;
3687 /* conversion loop */
/* A non-zero c is pending state carried into this call; what this branch does with it is not visible in this listing. */
3688 if(c
!=0 && targetCapacity
>0) {
3692 while(source
<sourceLimit
) {
3694 * This following test is to see if available input would overflow the output.
3695 * It does not catch output of more than one byte that
3696 * overflows as a result of a multi-byte character or callback output
3697 * from the last source character.
3698 * Therefore, those situations also test for overflows and will
3699 * then break the loop, too.
3701 if(targetCapacity
>0) {
3703 * Get a correct Unicode code point:
3704 * a single UChar for a BMP code point or
3705 * a matched surrogate pair for a "supplementary code point".
/* Unlike the double-byte variant, surrogates are always checked here (no UCNV_HAS_SURROGATES test is visible). */
3709 if(U16_IS_SURROGATE(c
)) {
3710 if(U16_IS_SURROGATE_LEAD(c
)) {
3712 if(source
<sourceLimit
) {
3713 /* test the following code unit */
3714 UChar trail
=*source
;
3715 if(U16_IS_TRAIL(trail
)) {
3718 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
3719 if(!hasSupplementary
) {
3720 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3721 /* callback(unassigned) */
3724 /* convert this supplementary code point */
3725 /* exit this condition tree */
3727 /* this is an unmatched lead code unit (1st surrogate) */
3728 /* callback(illegal) */
3729 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3737 /* this is an unmatched trail code unit (2nd surrogate) */
3738 /* callback(illegal) */
3739 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3744 /* convert the Unicode code point in c into codepage bytes */
3745 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3747 /* is this code point assigned, or do we use fallbacks? */
3748 if(value
>=minValue
) {
3749 /* assigned, write the output character bytes from value and length */
3751 /* this is easy because we know that there is enough space */
/* Only the low 8 bits of the 16-bit result are emitted as the output byte. */
3752 *target
++=(uint8_t)value
;
3754 *offsets
++=sourceIndex
;
3758 /* normal end of conversion: prepare for a new character */
3760 sourceIndex
=nextSourceIndex
;
3761 } else { /* unassigned */
3763 /* try an extension mapping */
/* pArgs->source is set to the current position first, so the code units consumed by _extFromU() can be added to nextSourceIndex afterwards. */
3764 pArgs
->source
=source
;
3765 c
=_extFromU(cnv
, cnv
->sharedData
,
3766 c
, &source
, sourceLimit
,
3767 &target
, target
+targetCapacity
,
3768 &offsets
, sourceIndex
,
3771 nextSourceIndex
+=(int32_t)(source
-pArgs
->source
);
3773 if(U_FAILURE(*pErrorCode
)) {
3774 /* not mappable or buffer overflow */
3777 /* a mapping was written to the target, continue */
3779 /* recalculate the targetCapacity after an extension mapping */
3780 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
3782 /* normal end of conversion: prepare for a new character */
3783 sourceIndex
=nextSourceIndex
;
3787 /* target is full */
3788 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3793 /* set the converter state back into UConverter */
3796 /* write back the updated pointers */
3797 pArgs
->source
=source
;
3798 pArgs
->target
=(char *)target
;
3799 pArgs
->offsets
=offsets
;
3803 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3804 * that map only to and from the BMP.
3805 * In addition to single-byte/state optimizations, the offset calculations
3806 * become much easier.
3807 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3808 * but measurements have shown that this diminishes performance
3809 * in more cases than it improves it.
3810 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3811 * for various MBCS and SBCS optimizations.
/*
 * ucnv_MBCSSingleFromBMPWithOffsets() - reviewer summary.
 *
 * From-Unicode conversion loop for single-byte codepages that map only
 * to and from the BMP.
 *
 * Inputs read from pArgs: converter, source, sourceLimit, target,
 * targetLimit, offsets. On exit source, target and offsets are written
 * back to pArgs.
 *
 * Because each UChar yields one output byte, targetCapacity is clamped to
 * the remaining source length and used as the only loop counter. Offsets
 * are not written per character inside the main loop; they are filled in
 * from lastSource before an extension mapping and again after the loop.
 *
 * Visible error reporting through *pErrorCode:
 *  - U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate;
 *  - U_TRUNCATED_CHAR_FOUND in the lead-surrogate path (the exact
 *    condition is not visible in this listing);
 *  - U_BUFFER_OVERFLOW_ERROR when input remains and the target is full.
 *
 * NOTE(review): the embedded original line numbers skip, so this listing
 * omits lines (some declarations, braces, else/goto/break statements, loop
 * headers and any guards around the offsets writes). The comments added
 * here describe only what is visible.
 */
3814 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3815 UErrorCode
*pErrorCode
) {
3817 const UChar
*source
, *sourceLimit
, *lastSource
;
3819 int32_t targetCapacity
, length
;
3822 const uint16_t *table
;
3823 const uint16_t *results
;
3827 int32_t sourceIndex
;
3829 uint32_t asciiRoundtrips
;
3830 uint16_t value
, minValue
;
3832 /* set up the local pointers */
3833 cnv
=pArgs
->converter
;
3834 source
=pArgs
->source
;
3835 sourceLimit
=pArgs
->sourceLimit
;
3836 target
=(uint8_t *)pArgs
->target
;
3837 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
3838 offsets
=pArgs
->offsets
;
3840 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
/* Choose the results table (viewed as 16-bit entries): the swapLFNL variant when UCNV_OPTION_SWAP_LFNL is set, otherwise the regular one. */
3841 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3842 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
3844 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
3846 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
/* The fallback setting selects the threshold (minValue) compared against lookup results below; the two assignments are not visible here. */
3848 if(cnv
->useFallback
) {
3849 /* use all roundtrip and fallback results */
3852 /* use only roundtrips and fallbacks from private-use characters */
3856 /* get the converter state from UConverter */
3859 /* sourceIndex=-1 if the current character began in the previous buffer */
3860 sourceIndex
= c
==0 ? 0 : -1;
3864 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3865 * for the minimum of the sourceLength and targetCapacity
/* Clamp targetCapacity to the number of remaining source units. */
3867 length
=(int32_t)(sourceLimit
-source
);
3868 if(length
<targetCapacity
) {
3869 targetCapacity
=length
;
3872 /* conversion loop */
/* A non-zero c is pending state carried into this call; what this branch does with it is not visible in this listing. */
3873 if(c
!=0 && targetCapacity
>0) {
3877 #if MBCS_UNROLL_SINGLE_FROM_BMP
3878 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3879 /* unroll the loop with the most common case */
/* Optional unrolled path: converts four units per pass and ANDs the four lookup results so that one comparison against minValue covers the group. */
3881 if(targetCapacity
>=4) {
3882 int32_t count
, loops
;
3883 uint16_t andedValues
;
3885 loops
=count
=targetCapacity
>>2;
3888 andedValues
=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3889 *target
++=(uint8_t)value
;
3891 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3892 *target
++=(uint8_t)value
;
3894 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3895 *target
++=(uint8_t)value
;
3897 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3898 *target
++=(uint8_t)value
;
3900 /* were all 4 entries really valid? */
3901 if(andedValues
<minValue
) {
3902 /* no, return to the first of these 4 */
/* Account for the units handled by the unrolled passes: four per counted pass. */
3909 targetCapacity
-=4*count
;
3912 lastSource
+=4*count
;
3914 *offsets
++=sourceIndex
++;
3915 *offsets
++=sourceIndex
++;
3916 *offsets
++=sourceIndex
++;
3917 *offsets
++=sourceIndex
++;
3926 while(targetCapacity
>0) {
3928 * Get a correct Unicode code point:
3929 * a single UChar for a BMP code point or
3930 * a matched surrogate pair for a "supplementary code point".
3934 * Do not immediately check for single surrogates:
3935 * Assume that they are unassigned and check for them in that case.
3936 * This speeds up the conversion of assigned characters.
3938 /* convert the Unicode code point in c into codepage bytes */
/* Fast path: an ASCII character flagged in asciiRoundtrips is copied to the target as a single byte. */
3939 if(c
<=0x7f && IS_ASCII_ROUNDTRIP(c
, asciiRoundtrips
)) {
3940 *target
++=(uint8_t)c
;
3945 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3946 /* is this code point assigned, or do we use fallbacks? */
3947 if(value
>=minValue
) {
3948 /* assigned, write the output character bytes from value and length */
3950 /* this is easy because we know that there is enough space */
3951 *target
++=(uint8_t)value
;
3954 /* normal end of conversion: prepare for a new character */
/* Surrogates are examined only after the lookup has failed the minValue test. */
3957 } else if(!U16_IS_SURROGATE(c
)) {
3958 /* normal, unassigned BMP character */
3959 } else if(U16_IS_SURROGATE_LEAD(c
)) {
3961 if(source
<sourceLimit
) {
3962 /* test the following code unit */
3963 UChar trail
=*source
;
3964 if(U16_IS_TRAIL(trail
)) {
3966 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
3967 /* this codepage does not map supplementary code points */
3968 /* callback(unassigned) */
3970 /* this is an unmatched lead code unit (1st surrogate) */
3971 /* callback(illegal) */
3972 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3978 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
3983 /* this is an unmatched trail code unit (2nd surrogate) */
3984 /* callback(illegal) */
3985 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3989 /* c does not have a mapping */
3991 /* get the number of code units for c to correctly advance sourceIndex */
3992 length
=U16_LENGTH(c
);
3994 /* set offsets since the start or the last extension */
/* count is the number of source units consumed since lastSource. */
3996 int32_t count
=(int32_t)(source
-lastSource
);
3998 /* do not set the offset for this character */
4002 *offsets
++=sourceIndex
++;
4005 /* offsets and sourceIndex are now set for the current character */
4008 /* try an extension mapping */
/* The real pArgs->targetLimit is passed here (not target+targetCapacity) - presumably because targetCapacity was clamped to the source length above; confirm. */
4010 c
=_extFromU(cnv
, cnv
->sharedData
,
4011 c
, &source
, sourceLimit
,
4012 &target
, (const uint8_t *)(pArgs
->targetLimit
),
4013 &offsets
, sourceIndex
,
4016 sourceIndex
+=length
+(int32_t)(source
-lastSource
);
4019 if(U_FAILURE(*pErrorCode
)) {
4020 /* not mappable or buffer overflow */
4023 /* a mapping was written to the target, continue */
4025 /* recalculate the targetCapacity after an extension mapping */
/* Recompute the remaining capacity and clamp it to the remaining source length again. */
4026 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
4027 length
=(int32_t)(sourceLimit
-source
);
4028 if(length
<targetCapacity
) {
4029 targetCapacity
=length
;
4033 #if MBCS_UNROLL_SINGLE_FROM_BMP
4034 /* unrolling makes it slower on Pentium III/Windows 2000?! */
/* Overflow is reported after the loop, and only if no error is set, input remains and the target pointer has reached targetLimit. */
4039 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
4040 /* target is full */
4041 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4044 /* set offsets since the start or the last callback */
4046 size_t count
=source
-lastSource
;
4047 if (count
> 0 && *pErrorCode
== U_TRUNCATED_CHAR_FOUND
) {
4049 Caller gave us a partial supplementary character,
4050 which this function couldn't convert in any case.
4051 The callback will handle the offset.
4056 *offsets
++=sourceIndex
++;
4061 /* set the converter state back into UConverter */
4064 /* write back the updated pointers */
4065 pArgs
->source
=source
;
4066 pArgs
->target
=(char *)target
;
4067 pArgs
->offsets
=offsets
;
4071 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
4072 UErrorCode
*pErrorCode
) {
4074 const UChar
*source
, *sourceLimit
;
4076 int32_t targetCapacity
;
4079 const uint16_t *table
;
4080 const uint16_t *mbcsIndex
;
4081 const uint8_t *p
, *bytes
;
4086 int32_t prevSourceIndex
, sourceIndex
, nextSourceIndex
;
4088 uint32_t stage2Entry
;
4089 uint32_t asciiRoundtrips
;
4091 /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
4092 uint8_t siBytes
[2] = {0, 0};
4093 uint8_t soBytes
[2] = {0, 0};
4094 uint8_t siLength
, soLength
;
4095 int32_t length
= 0, prevLength
;
4096 uint8_t unicodeMask
;
4098 cnv
=pArgs
->converter
;
4100 if(cnv
->preFromUFirstCP
>=0) {
4102 * pass sourceIndex=-1 because we continue from an earlier buffer
4103 * in the future, this may change with continuous offsets
4105 ucnv_extContinueMatchFromU(cnv
, pArgs
, -1, pErrorCode
);
4107 if(U_FAILURE(*pErrorCode
) || cnv
->preFromULength
<0) {
4112 /* use optimized function if possible */
4113 outputType
=cnv
->sharedData
->mbcs
.outputType
;
4114 unicodeMask
=cnv
->sharedData
->mbcs
.unicodeMask
;
4115 if(outputType
==MBCS_OUTPUT_1
&& !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
4116 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
4117 ucnv_MBCSSingleFromBMPWithOffsets(pArgs
, pErrorCode
);
4119 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs
, pErrorCode
);
4122 } else if(outputType
==MBCS_OUTPUT_2
&& cnv
->sharedData
->mbcs
.utf8Friendly
) {
4123 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs
, pErrorCode
);
4127 /* set up the local pointers */
4128 source
=pArgs
->source
;
4129 sourceLimit
=pArgs
->sourceLimit
;
4130 target
=(uint8_t *)pArgs
->target
;
4131 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
4132 offsets
=pArgs
->offsets
;
4134 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
4135 if(cnv
->sharedData
->mbcs
.utf8Friendly
) {
4136 mbcsIndex
=cnv
->sharedData
->mbcs
.mbcsIndex
;
4140 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
4141 bytes
=cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
4143 bytes
=cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
4145 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
4147 /* get the converter state from UConverter */
4150 if(outputType
==MBCS_OUTPUT_2_SISO
) {
4151 prevLength
=cnv
->fromUnicodeStatus
;
4153 /* set the real value */
4157 /* prevent fromUnicodeStatus from being set to something non-0 */
4161 /* sourceIndex=-1 if the current character began in the previous buffer */
4163 sourceIndex
= c
==0 ? 0 : -1;
4166 /* Get the SI/SO character for the converter */
4167 siLength
= static_cast<uint8_t>(getSISOBytes(SI
, cnv
->options
, siBytes
));
4168 soLength
= static_cast<uint8_t>(getSISOBytes(SO
, cnv
->options
, soBytes
));
4170 /* conversion loop */
4172 * This is another piece of ugly code:
4173 * A goto into the loop if the converter state contains a first surrogate
4174 * from the previous function call.
4175 * It saves me from checking if(c==0) in each loop iteration
4176 * and duplicating the trail-surrogate-handling code in the else
4177 * branch of that check.
4178 * I could not find any other way to get around this other than
4179 * using a function call for the conversion and callback, which would
4180 * be even more inefficient.
4182 * Markus Scherer 2000-jul-19
4184 if(c
!=0 && targetCapacity
>0) {
4188 while(source
<sourceLimit
) {
4190 * This following test is to see if available input would overflow the output.
4191 * It does not catch output of more than one byte that
4192 * overflows as a result of a multi-byte character or callback output
4193 * from the last source character.
4194 * Therefore, those situations also test for overflows and will
4195 * then break the loop, too.
4197 if(targetCapacity
>0) {
4199 * Get a correct Unicode code point:
4200 * a single UChar for a BMP code point or
4201 * a matched surrogate pair for a "supplementary code point".
4205 if(c
<=0x7f && IS_ASCII_ROUNDTRIP(c
, asciiRoundtrips
)) {
4206 *target
++=(uint8_t)c
;
4208 *offsets
++=sourceIndex
;
4209 prevSourceIndex
=sourceIndex
;
4210 sourceIndex
=nextSourceIndex
;
4217 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
4218 * to avoid dealing with surrogates.
4219 * MBCS_FAST_MAX must be >=0xd7ff.
4221 if(c
<=0xd7ff && mbcsIndex
!=NULL
) {
4222 value
=mbcsIndex
[c
>>6];
4224 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
4225 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
4226 switch(outputType
) {
4228 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
4239 case MBCS_OUTPUT_2_SISO
:
4240 /* 1/2-byte stateful with Shift-In/Shift-Out */
4242 * Save the old state in the converter object
4243 * right here, then change the local prevLength state variable if necessary.
4244 * Then, if this character turns out to be unassigned or a fallback that
4245 * is not taken, the callback code must not save the new state in the converter
4246 * because the new state is for a character that is not output.
4247 * However, the callback must still restore the state from the converter
4248 * in case the callback function changed it for its output.
4250 cnv
->fromUnicodeStatus
=prevLength
; /* save the old state */
4251 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
4255 } else if(prevLength
<=1) {
4258 /* change from double-byte mode to single-byte */
4259 if (siLength
== 1) {
4260 value
|=(uint32_t)siBytes
[0]<<8;
4262 } else if (siLength
== 2) {
4263 value
|=(uint32_t)siBytes
[1]<<8;
4264 value
|=(uint32_t)siBytes
[0]<<16;
4273 /* change from single-byte mode to double-byte */
4274 if (soLength
== 1) {
4275 value
|=(uint32_t)soBytes
[0]<<16;
4277 } else if (soLength
== 2) {
4278 value
|=(uint32_t)soBytes
[1]<<16;
4279 value
|=(uint32_t)soBytes
[0]<<24;
4286 case MBCS_OUTPUT_DBCS_ONLY
:
4287 /* table with single-byte results, but only DBCS mappings used */
4288 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
4290 /* no mapping or SBCS result, not taken for DBCS-only */
4297 p
=bytes
+(value
+(c
&0x3f))*3;
4298 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4305 } else if(value
<=0xffff) {
4312 value
=((const uint32_t *)bytes
)[value
+(c
&0x3f)];
4319 } else if(value
<=0xffff) {
4321 } else if(value
<=0xffffff) {
4327 case MBCS_OUTPUT_3_EUC
:
4328 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
4329 /* EUC 16-bit fixed-length representation */
4336 } else if((value
&0x8000)==0) {
4339 } else if((value
&0x80)==0) {
4346 case MBCS_OUTPUT_4_EUC
:
4347 p
=bytes
+(value
+(c
&0x3f))*3;
4348 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4349 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4356 } else if(value
<=0xffff) {
4358 } else if((value
&0x800000)==0) {
4361 } else if((value
&0x8000)==0) {
4369 /* must not occur */
4371 * To avoid compiler warnings that value & length may be
4372 * used without having been initialized, we set them here.
4373 * In reality, this is unreachable code.
4374 * Not having a default branch also causes warnings with
4381 /* output the value */
4384 * This also tests if the codepage maps single surrogates.
4385 * If it does, then surrogates are not paired but mapped separately.
4386 * Note that in this case unmatched surrogates are not detected.
4388 if(U16_IS_SURROGATE(c
) && !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
4389 if(U16_IS_SURROGATE_LEAD(c
)) {
4391 if(source
<sourceLimit
) {
4392 /* test the following code unit */
4393 UChar trail
=*source
;
4394 if(U16_IS_TRAIL(trail
)) {
4397 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
4398 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
4399 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4400 cnv
->fromUnicodeStatus
=prevLength
; /* save the old state */
4401 /* callback(unassigned) */
4404 /* convert this supplementary code point */
4405 /* exit this condition tree */
4407 /* this is an unmatched lead code unit (1st surrogate) */
4408 /* callback(illegal) */
4409 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
4417 /* this is an unmatched trail code unit (2nd surrogate) */
4418 /* callback(illegal) */
4419 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
4424 /* convert the Unicode code point in c into codepage bytes */
4427 * The basic lookup is a triple-stage compact array (trie) lookup.
4428 * For details see the beginning of this file.
4430 * Single-byte codepages are handled with a different data structure
4431 * by _MBCSSingle... functions.
4433 * The result consists of a 32-bit value from stage 2 and
4434 * a pointer to as many bytes as are stored per character.
4435 * The pointer points to the character's bytes in stage 3.
4436 * Bits 15..0 of the stage 2 entry contain the stage 3 index
4437 * for that pointer, while bits 31..16 are flags for which of
4438 * the 16 characters in the block are roundtrip-assigned.
4440 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4441 * respectively as uint32_t, in the platform encoding.
4442 * For 3-byte codepages, the bytes are always stored in big-endian order.
4444 * For EUC encodings that use only either 0x8e or 0x8f as the first
4445 * byte of their longest byte sequences, the first two bytes in
4446 * this third stage indicate with their 7th bits whether these bytes
4447 * are to be written directly or actually need to be preceded by
4448 * one of the two Single-Shift codes. With this, the third stage
4449 * stores one byte fewer per character than the actual maximum length of
4450 * EUC byte sequences.
4452 * Other than that, leading zero bytes are removed and the other
4453 * bytes output. A single zero byte may be output if the "assigned"
4454 * bit in stage 2 was on.
4455 * The data structure does not support zero byte output as a fallback,
4456 * and also does not allow output of leading zeros.
4458 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
4460 /* get the bytes and the length for the output */
4461 switch(outputType
) {
4463 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4470 case MBCS_OUTPUT_2_SISO
:
4471 /* 1/2-byte stateful with Shift-In/Shift-Out */
4473 * Save the old state in the converter object
4474 * right here, then change the local prevLength state variable if necessary.
4475 * Then, if this character turns out to be unassigned or a fallback that
4476 * is not taken, the callback code must not save the new state in the converter
4477 * because the new state is for a character that is not output.
4478 * However, the callback must still restore the state from the converter
4479 * in case the callback function changed it for its output.
4481 cnv
->fromUnicodeStatus
=prevLength
; /* save the old state */
4482 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4484 if(value
==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
)==0) {
4485 /* no mapping, leave value==0 */
4487 } else if(prevLength
<=1) {
4490 /* change from double-byte mode to single-byte */
4491 if (siLength
== 1) {
4492 value
|=(uint32_t)siBytes
[0]<<8;
4494 } else if (siLength
== 2) {
4495 value
|=(uint32_t)siBytes
[1]<<8;
4496 value
|=(uint32_t)siBytes
[0]<<16;
4505 /* change from single-byte mode to double-byte */
4506 if (soLength
== 1) {
4507 value
|=(uint32_t)soBytes
[0]<<16;
4509 } else if (soLength
== 2) {
4510 value
|=(uint32_t)soBytes
[1]<<16;
4511 value
|=(uint32_t)soBytes
[0]<<24;
4518 case MBCS_OUTPUT_DBCS_ONLY
:
4519 /* table with single-byte results, but only DBCS mappings used */
4520 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4522 /* no mapping or SBCS result, not taken for DBCS-only */
4523 value
=stage2Entry
=0; /* stage2Entry=0 to reset roundtrip flags */
4530 p
=MBCS_POINTER_3_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4531 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4534 } else if(value
<=0xffff) {
4541 value
=MBCS_VALUE_4_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4544 } else if(value
<=0xffff) {
4546 } else if(value
<=0xffffff) {
4552 case MBCS_OUTPUT_3_EUC
:
4553 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4554 /* EUC 16-bit fixed-length representation */
4557 } else if((value
&0x8000)==0) {
4560 } else if((value
&0x80)==0) {
4567 case MBCS_OUTPUT_4_EUC
:
4568 p
=MBCS_POINTER_3_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4569 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4570 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4573 } else if(value
<=0xffff) {
4575 } else if((value
&0x800000)==0) {
4578 } else if((value
&0x8000)==0) {
4586 /* must not occur */
4588 * To avoid compiler warnings that value & length may be
4589 * used without having been initialized, we set them here.
4590 * In reality, this is unreachable code.
4591 * Not having a default branch also causes warnings with
4594 value
=stage2Entry
=0; /* stage2Entry=0 to reset roundtrip flags */
4599 /* is this code point assigned, or do we use fallbacks? */
4600 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
)!=0 ||
4601 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && value
!=0))
4604 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4605 * There is no way with this data structure for fallback output
4606 * to be a zero byte.
4610 /* try an extension mapping */
4611 pArgs
->source
=source
;
4612 c
=_extFromU(cnv
, cnv
->sharedData
,
4613 c
, &source
, sourceLimit
,
4614 &target
, target
+targetCapacity
,
4615 &offsets
, sourceIndex
,
4618 nextSourceIndex
+=(int32_t)(source
-pArgs
->source
);
4619 prevLength
=cnv
->fromUnicodeStatus
; /* restore SISO state */
4621 if(U_FAILURE(*pErrorCode
)) {
4622 /* not mappable or buffer overflow */
4625 /* a mapping was written to the target, continue */
4627 /* recalculate the targetCapacity after an extension mapping */
4628 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
4630 /* normal end of conversion: prepare for a new character */
4632 prevSourceIndex
=sourceIndex
;
4633 sourceIndex
=nextSourceIndex
;
4640 /* write the output character bytes from value and length */
4641 /* from the first if in the loop we know that targetCapacity>0 */
4642 if(length
<=targetCapacity
) {
4645 /* each branch falls through to the next one */
4647 *target
++=(uint8_t)(value
>>24);
4650 *target
++=(uint8_t)(value
>>16);
4653 *target
++=(uint8_t)(value
>>8);
4656 *target
++=(uint8_t)value
;
4659 /* will never occur */
4664 /* each branch falls through to the next one */
4666 *target
++=(uint8_t)(value
>>24);
4667 *offsets
++=sourceIndex
;
4670 *target
++=(uint8_t)(value
>>16);
4671 *offsets
++=sourceIndex
;
4674 *target
++=(uint8_t)(value
>>8);
4675 *offsets
++=sourceIndex
;
4678 *target
++=(uint8_t)value
;
4679 *offsets
++=sourceIndex
;
4682 /* will never occur */
4686 targetCapacity
-=length
;
4688 uint8_t *charErrorBuffer
;
4691 * We actually do this backwards here:
4692 * In order to save an intermediate variable, we output
4693 * first to the overflow buffer what does not fit into the
4696 /* we know that 1<=targetCapacity<length<=4 */
4697 length
-=targetCapacity
;
4698 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
4700 /* each branch falls through to the next one */
4702 *charErrorBuffer
++=(uint8_t)(value
>>16);
4705 *charErrorBuffer
++=(uint8_t)(value
>>8);
4708 *charErrorBuffer
=(uint8_t)value
;
4711 /* will never occur */
4714 cnv
->charErrorBufferLength
=(int8_t)length
;
4716 /* now output what fits into the regular target */
4717 value
>>=8*length
; /* length was reduced by targetCapacity */
4718 switch(targetCapacity
) {
4719 /* each branch falls through to the next one */
4721 *target
++=(uint8_t)(value
>>16);
4723 *offsets
++=sourceIndex
;
4727 *target
++=(uint8_t)(value
>>8);
4729 *offsets
++=sourceIndex
;
4733 *target
++=(uint8_t)value
;
4735 *offsets
++=sourceIndex
;
4739 /* will never occur */
4743 /* target overflow */
4745 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4750 /* normal end of conversion: prepare for a new character */
4753 prevSourceIndex
=sourceIndex
;
4754 sourceIndex
=nextSourceIndex
;
4758 /* target is full */
4759 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4765 * the end of the input stream and detection of truncated input
4766 * are handled by the framework, but for EBCDIC_STATEFUL conversion
4767 * we need to emit an SI at the very end
4771 * EBCDIC_STATEFUL in DBCS mode
4772 * end of input and no truncated input
4774 if( U_SUCCESS(*pErrorCode
) &&
4775 outputType
==MBCS_OUTPUT_2_SISO
&& prevLength
==2 &&
4776 pArgs
->flush
&& source
>=sourceLimit
&& c
==0
4778 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4779 if(targetCapacity
>0) {
4780 *target
++=(uint8_t)siBytes
[0];
4781 if (siLength
== 2) {
4782 if (targetCapacity
<2) {
4783 cnv
->charErrorBuffer
[0]=(uint8_t)siBytes
[1];
4784 cnv
->charErrorBufferLength
=1;
4785 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4787 *target
++=(uint8_t)siBytes
[1];
4791 /* set the last source character's index (sourceIndex points at sourceLimit now) */
4792 *offsets
++=prevSourceIndex
;
4795 /* target is full */
4796 cnv
->charErrorBuffer
[0]=(uint8_t)siBytes
[0];
4797 if (siLength
== 2) {
4798 cnv
->charErrorBuffer
[1]=(uint8_t)siBytes
[1];
4800 cnv
->charErrorBufferLength
=siLength
;
4801 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4803 prevLength
=1; /* we switched into SBCS */
4806 /* set the converter state back into UConverter */
4808 cnv
->fromUnicodeStatus
=prevLength
;
4810 /* write back the updated pointers */
4811 pArgs
->source
=source
;
4812 pArgs
->target
=(char *)target
;
4813 pArgs
->offsets
=offsets
;
4817 * This is another simple conversion function for internal use by other
4818 * conversion implementations.
4819 * It does not use the converter state nor call callbacks.
4820 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4821 * It handles conversion extensions but not GB 18030.
4823 * It converts one single Unicode code point into codepage bytes, encoded
4824 * as one 32-bit value. The function returns the number of bytes in *pValue:
4825 * 1..4 the number of bytes in *pValue
4826 * 0 unassigned (*pValue undefined)
4827 * -1 illegal (currently not used, *pValue undefined)
4829 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4830 * the second to last byte in bits 15..8, etc.
4831 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
/*
 * ucnv_MBCSFromUChar32: look up the codepage bytes for the single code point c.
 *
 * What the visible code does:
 * - The base table (mbcs.fromUnicodeTable) is consulted only when c<=0xffff
 *   or when mbcs.unicodeMask has UCNV_HAS_SUPPLEMENTARY set.
 * - For MBCS_OUTPUT_1 the result comes from MBCS_SINGLE_RESULT_FROM_U() and is
 *   accepted when value>=0x800 (useFallback) or value>=0xc00 (otherwise).
 * - For every other output type, stage2Entry is fetched with
 *   MBCS_STAGE_2_FROM_U() and the bytes are read with the 2-, 3- or 4-byte
 *   stage 2 accessor chosen by mbcs.outputType. The result is used when the
 *   roundtrip flag is set for c, or when fallbacks apply and value!=0.
 * - The extension data (mbcs.extIndexes) is consulted through
 *   ucnv_extSimpleMatchFromU(); the absolute value of its result is returned.
 *
 * NOTE(review): the statements that store *pValue and compute the byte length
 * are not visible in this listing; confirm them against the full source.
 */
4834 ucnv_MBCSFromUChar32(UConverterSharedData
*sharedData
,
4835 UChar32 c
, uint32_t *pValue
,
4836 UBool useFallback
) {
4838 const uint16_t *table
;
4840 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4843 uint32_t stage2Entry
;
4847 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4848 if(c
<=0xffff || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
4849 table
=sharedData
->mbcs
.fromUnicodeTable
;
4851 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4852 if(sharedData
->mbcs
.outputType
==MBCS_OUTPUT_1
) {
4853 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
4854 /* is this code point assigned, or do we use fallbacks? */
4855 if(useFallback
? value
>=0x800 : value
>=0xc00) {
4859 } else /* outputType!=MBCS_OUTPUT_1 */ {
4860 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
4862 /* get the bytes and the length for the output */
4863 switch(sharedData
->mbcs
.outputType
) {
4865 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4873 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4874 case MBCS_OUTPUT_DBCS_ONLY
:
4875 /* table with single-byte results, but only DBCS mappings used */
4876 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4878 /* no mapping or SBCS result, not taken for DBCS-only */
4879 value
=stage2Entry
=0; /* stage2Entry=0 to reset roundtrip flags */
4886 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4887 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4890 } else if(value
<=0xffff) {
4897 value
=MBCS_VALUE_4_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4900 } else if(value
<=0xffff) {
4902 } else if(value
<=0xffffff) {
4908 case MBCS_OUTPUT_3_EUC
:
4909 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4910 /* EUC 16-bit fixed-length representation */
4913 } else if((value
&0x8000)==0) {
4916 } else if((value
&0x80)==0) {
4923 case MBCS_OUTPUT_4_EUC
:
4924 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4925 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4926 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4929 } else if(value
<=0xffff) {
4931 } else if((value
&0x800000)==0) {
4934 } else if((value
&0x8000)==0) {
4943 /* must not occur */
4947 /* is this code point assigned, or do we use fallbacks? */
4948 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
4949 (FROM_U_USE_FALLBACK(useFallback
, c
) && value
!=0)
4952 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4953 * There is no way with this data structure for fallback output
4954 * to be a zero byte.
4963 cx
=sharedData
->mbcs
.extIndexes
;
4965 length
=ucnv_extSimpleMatchFromU(cx
, c
, pValue
, useFallback
);
4966 return length
>=0 ? length
: -length
; /* return abs(length); */
4976 * This function has been moved to ucnv2022.c for inlining.
4977 * This implementation is here only for documentation purposes
4981 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
4982 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4983 * It does not handle conversion extensions (_extFromU()).
4985 * It returns the codepage byte for the code point, or -1 if it is unassigned.
/*
 * ucnv_MBCSSingleFromUChar32: single-byte lookup for one code point c.
 *
 * What the visible code does:
 * - A code point c>=0x10000 is handled separately when mbcs.unicodeMask does
 *   not have UCNV_HAS_SUPPLEMENTARY set, because such tables have no stage 1
 *   entries for supplementary code points.
 * - Otherwise the 16-bit result is read with MBCS_SINGLE_RESULT_FROM_U() from
 *   mbcs.fromUnicodeTable and mbcs.fromUnicodeBytes.
 * - The result counts as usable when value>=0x800 (useFallback) or
 *   value>=0xc00 (no fallback).
 *
 * NOTE(review): the return statements are not visible in this listing; the
 * surrounding commentary says the result is the codepage byte, or -1 if
 * unassigned. Confirm against the full source.
 */
4988 ucnv_MBCSSingleFromUChar32(UConverterSharedData
*sharedData
,
4990 UBool useFallback
) {
4991 const uint16_t *table
;
4994 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4995 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
4999 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
5000 table
=sharedData
->mbcs
.fromUnicodeTable
;
5002 /* get the byte for the output */
5003 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
5004 /* is this code point assigned, or do we use fallbacks? */
5005 if(useFallback
? value
>=0x800 : value
>=0xc00) {
5013 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
5015 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
5016 static const UChar32
5017 utf8_offsets
[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
/*
 * ucnv_SBCSFromUTF8: convert UTF-8 input directly to single-byte codepage
 * output, without going through UTF-16.
 *
 * pFromUArgs supplies the target buffer and the codepage converter (cnv);
 * pToUArgs supplies the UTF-8 source buffer and the UTF-8 converter (utf8),
 * whose toUBytes/toULength/mode/toUnicodeStatus fields carry a partial
 * sequence across buffer boundaries.
 *
 * What the visible code does:
 * - Picks the result table: mbcs.swapLFNLFromUnicodeBytes when
 *   UCNV_OPTION_SWAP_LFNL is set in cnv->options, else mbcs.fromUnicodeBytes.
 * - Restores a pending partial sequence from the utf8 converter, if any.
 * - If the buffer ends in a truncated 2- or 3-byte sequence, sourceLimit is
 *   lowered so that the main loop stops before it.
 * - Main loop, one character per iteration while targetCapacity>0:
 *     ASCII bytes flagged in asciiRoundtrips are copied through;
 *     other short sequences are looked up with SBCS_RESULT_FROM_UTF8();
 *     everything else collects trail bytes checked by
 *     icu::UTF8::isValidTrail(), removes the marker bits with utf8_offsets[],
 *     and uses MBCS_SINGLE_RESULT_FROM_U().
 * - A result with value>=minValue is written as one byte; otherwise an
 *   extension mapping is tried with _extFromU().
 * - Error codes set through pErrorCode:
 *     U_ILLEGAL_CHAR_FOUND     for an illegal UTF-8 byte sequence,
 *     U_USING_DEFAULT_WARNING  for a partial extension match
 *                              (cnv->preFromUFirstCP>=0),
 *     U_BUFFER_OVERFLOW_ERROR  when the target is full.
 * - After the loop, a truncated sequence at the real source limit is stored
 *   in the utf8 converter, and the source/target pointers are written back.
 */
5019 static void U_CALLCONV
5020 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
5021 UConverterToUnicodeArgs
*pToUArgs
,
5022 UErrorCode
*pErrorCode
) {
5023 UConverter
*utf8
, *cnv
;
5024 const uint8_t *source
, *sourceLimit
;
5026 int32_t targetCapacity
;
5028 const uint16_t *table
, *sbcsIndex
;
5029 const uint16_t *results
;
5031 int8_t oldToULength
, toULength
, toULimit
;
5036 uint32_t asciiRoundtrips
;
5037 uint16_t value
, minValue
= 0;
5038 UBool hasSupplementary
;
5040 /* set up the local pointers */
5041 utf8
=pToUArgs
->converter
;
5042 cnv
=pFromUArgs
->converter
;
5043 source
=(uint8_t *)pToUArgs
->source
;
5044 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
5045 target
=(uint8_t *)pFromUArgs
->target
;
5046 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
5048 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
5049 sbcsIndex
=cnv
->sharedData
->mbcs
.sbcsIndex
;
5050 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
5051 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
5053 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
5055 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
5057 if(cnv
->useFallback
) {
5058 /* use all roundtrip and fallback results */
5061 /* use only roundtrips and fallbacks from private-use characters */
5064 hasSupplementary
=(UBool
)(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
);
5066 /* get the converter state from the UTF-8 UConverter */
5067 if(utf8
->toULength
> 0) {
5068 toULength
=oldToULength
=utf8
->toULength
;
5069 toULimit
=(int8_t)utf8
->mode
;
5070 c
=(UChar32
)utf8
->toUnicodeStatus
;
5072 toULength
=oldToULength
=toULimit
=0;
5076 // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
5077 // If the buffer ends with a truncated 2- or 3-byte sequence,
5078 // then we reduce the sourceLimit to before that,
5079 // and collect the remaining bytes after the conversion loop.
5081 // Do not go back into the bytes that will be read for finishing a partial
5082 // sequence from the previous buffer.
5083 int32_t length
=(int32_t)(sourceLimit
-source
) - (toULimit
-oldToULength
);
5085 uint8_t b1
=*(sourceLimit
-1);
5086 if(U8_IS_SINGLE(b1
)) {
5087 // common ASCII character
5088 } else if(U8_IS_TRAIL(b1
) && length
>=2) {
5089 uint8_t b2
=*(sourceLimit
-2);
5090 if(0xe0<=b2
&& b2
<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2
, b1
)) {
5091 // truncated 3-byte sequence
5094 } else if(0xc2<=b1
&& b1
<0xf0) {
5095 // truncated 2- or 3-byte sequence
5101 if(c
!=0 && targetCapacity
>0) {
5102 utf8
->toUnicodeStatus
=0;
5106 * Note: We could avoid the goto by duplicating some of the moreBytes
5107 * code, but only up to the point of collecting a complete UTF-8
5108 * sequence; then recurse for the toUBytes[toULength]
5109 * and then continue with normal conversion.
5111 * If so, move this code to just after initializing the minimum
5112 * set of local variables for reading the UTF-8 input
5113 * (utf8, source, target, limits but not cnv, table, minValue, etc.).
5115 * Potential advantages:
5117 * - oldToULength could become a local variable in just those code blocks
5118 * that deal with buffer boundaries
5119 * - possibly faster if the goto prevents some compiler optimizations
5120 * (this would need measuring to confirm)
5122 * - code duplication
5126 /* conversion loop */
5127 while(source
<sourceLimit
) {
5128 if(targetCapacity
>0) {
5130 if(U8_IS_SINGLE(b
)) {
5132 if(IS_ASCII_ROUNDTRIP(b
, asciiRoundtrips
)) {
5133 *target
++=(uint8_t)b
;
5138 value
=SBCS_RESULT_FROM_UTF8(sbcsIndex
, results
, 0, c
);
5142 if( /* handle U+0080..U+07FF inline */
5144 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
5148 value
=SBCS_RESULT_FROM_UTF8(sbcsIndex
, results
, c
, t1
);
5149 if(value
>=minValue
) {
5150 *target
++=(uint8_t)value
;
5159 } else if(b
==0xe0) {
5160 if( /* handle U+0800..U+0FFF inline */
5161 (t1
=(uint8_t)(source
[0]-0x80)) <= 0x3f && t1
>= 0x20 &&
5162 (t2
=(uint8_t)(source
[1]-0x80)) <= 0x3f
5166 value
=SBCS_RESULT_FROM_UTF8(sbcsIndex
, results
, c
, t2
);
5167 if(value
>=minValue
) {
5168 *target
++=(uint8_t)value
;
5182 /* handle "complicated" and error cases, and continuing partial characters */
5185 toULimit
=U8_COUNT_BYTES_NON_ASCII(b
);
5188 while(toULength
<toULimit
) {
5190 * The sourceLimit may have been adjusted before the conversion loop
5191 * to stop before a truncated sequence.
5192 * Here we need to use the real limit in case we have two truncated
5193 * sequences at the end.
5196 if(source
<(uint8_t *)pToUArgs
->sourceLimit
) {
5198 if(icu::UTF8::isValidTrail(c
, b
, toULength
, toULimit
)) {
5203 break; /* sequence too short, stop with toULength<toULimit */
5206 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5207 source
-=(toULength
-oldToULength
);
5208 while(oldToULength
<toULength
) {
5209 utf8
->toUBytes
[oldToULength
++]=*source
++;
5211 utf8
->toUnicodeStatus
=c
;
5212 utf8
->toULength
=toULength
;
5213 utf8
->mode
=toULimit
;
5214 pToUArgs
->source
=(char *)source
;
5215 pFromUArgs
->target
=(char *)target
;
5220 if(toULength
==toULimit
) {
5221 c
-=utf8_offsets
[toULength
];
5222 if(toULength
<=3) { /* BMP */
5223 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
5225 /* supplementary code point */
5226 if(!hasSupplementary
) {
5227 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5230 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
5234 /* error handling: illegal UTF-8 byte sequence */
5235 source
-=(toULength
-oldToULength
);
5236 while(oldToULength
<toULength
) {
5237 utf8
->toUBytes
[oldToULength
++]=*source
++;
5239 utf8
->toULength
=toULength
;
5240 pToUArgs
->source
=(char *)source
;
5241 pFromUArgs
->target
=(char *)target
;
5242 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
5248 if(value
>=minValue
) {
5249 /* output the mapping for c */
5250 *target
++=(uint8_t)value
;
5253 /* value<minValue means c is unassigned (unmappable) */
5255 * Try an extension mapping.
5256 * Pass in no source because we don't have UTF-16 input.
5257 * If we have a partial match on c, we will return and revert
5258 * to UTF-8->UTF-16->charset conversion.
5260 static const UChar nul
=0;
5261 const UChar
*noSource
=&nul
;
5262 c
=_extFromU(cnv
, cnv
->sharedData
,
5263 c
, &noSource
, noSource
,
5264 &target
, target
+targetCapacity
,
5269 if(U_FAILURE(*pErrorCode
)) {
5270 /* not mappable or buffer overflow */
5273 } else if(cnv
->preFromUFirstCP
>=0) {
5275 * Partial match, return and revert to pivoting.
5276 * In normal from-UTF-16 conversion, we would just continue
5277 * but then exit the loop because the extension match would
5278 * have consumed the source.
5280 *pErrorCode
=U_USING_DEFAULT_WARNING
;
5283 /* a mapping was written to the target, continue */
5285 /* recalculate the targetCapacity after an extension mapping */
5286 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-(char *)target
);
5290 /* target is full */
5291 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
5297 * The sourceLimit may have been adjusted before the conversion loop
5298 * to stop before a truncated sequence.
5299 * If so, then collect the truncated sequence now.
5301 if(U_SUCCESS(*pErrorCode
) &&
5302 cnv
->preFromUFirstCP
<0 &&
5303 source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
5304 c
=utf8
->toUBytes
[0]=b
=*source
++;
5306 toULimit
=U8_COUNT_BYTES(b
);
5307 while(source
<sourceLimit
) {
5308 utf8
->toUBytes
[toULength
++]=b
=*source
++;
5311 utf8
->toUnicodeStatus
=c
;
5312 utf8
->toULength
=toULength
;
5313 utf8
->mode
=toULimit
;
5316 /* write back the updated pointers */
5317 pToUArgs
->source
=(char *)source
;
5318 pFromUArgs
->target
=(char *)target
;
/*
 * ucnv_DBCSFromUTF8: convert UTF-8 input directly to codepage output of one
 * or two bytes per character, without going through UTF-16.
 *
 * pFromUArgs supplies the target buffer and the codepage converter (cnv);
 * pToUArgs supplies the UTF-8 source buffer and the UTF-8 converter (utf8),
 * whose toUBytes/toULength/mode/toUnicodeStatus fields carry a partial
 * sequence across buffer boundaries.
 *
 * What the visible code does:
 * - Picks the result table: mbcs.swapLFNLFromUnicodeBytes when
 *   UCNV_OPTION_SWAP_LFNL is set in cnv->options, else mbcs.fromUnicodeBytes.
 * - Restores a pending partial sequence from the utf8 converter, if any.
 * - If the buffer ends in a truncated 2- or 3-byte sequence, sourceLimit is
 *   lowered so that the main loop stops before it.
 * - Main loop, one character per iteration while targetCapacity>0:
 *     ASCII bytes are tested against asciiRoundtrips;
 *     other short sequences are looked up with DBCS_RESULT_FROM_UTF8();
 *     everything else collects trail bytes checked by
 *     icu::UTF8::isValidTrail(), removes the marker bits with utf8_offsets[],
 *     fetches stage2Entry with MBCS_STAGE_2_FROM_U() and reads the value with
 *     MBCS_VALUE_2_FROM_STAGE_2(); that value is used when the roundtrip
 *     flag is set, or when fallbacks apply and value!=0.
 * - Output is one byte, or two bytes high byte first. When only the first of
 *   two bytes fits, the second goes to cnv->charErrorBuffer and
 *   U_BUFFER_OVERFLOW_ERROR is set.
 * - Unassigned code points are passed to _extFromU().
 * - Error codes set through pErrorCode:
 *     U_ILLEGAL_CHAR_FOUND     for an illegal UTF-8 byte sequence,
 *     U_USING_DEFAULT_WARNING  for a partial extension match
 *                              (cnv->preFromUFirstCP>=0),
 *     U_BUFFER_OVERFLOW_ERROR  when the target is full.
 * - After the loop, a truncated sequence at the real source limit is stored
 *   in the utf8 converter, and the source/target pointers are written back.
 */
5321 static void U_CALLCONV
5322 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
5323 UConverterToUnicodeArgs
*pToUArgs
,
5324 UErrorCode
*pErrorCode
) {
5325 UConverter
*utf8
, *cnv
;
5326 const uint8_t *source
, *sourceLimit
;
5328 int32_t targetCapacity
;
5330 const uint16_t *table
, *mbcsIndex
;
5331 const uint16_t *results
;
5333 int8_t oldToULength
, toULength
, toULimit
;
5338 uint32_t stage2Entry
;
5339 uint32_t asciiRoundtrips
;
5341 UBool hasSupplementary
;
5343 /* set up the local pointers */
5344 utf8
=pToUArgs
->converter
;
5345 cnv
=pFromUArgs
->converter
;
5346 source
=(uint8_t *)pToUArgs
->source
;
5347 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
5348 target
=(uint8_t *)pFromUArgs
->target
;
5349 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
5351 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
5352 mbcsIndex
=cnv
->sharedData
->mbcs
.mbcsIndex
;
5353 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
5354 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
5356 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
5358 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
5360 hasSupplementary
=(UBool
)(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
);
5362 /* get the converter state from the UTF-8 UConverter */
5363 if(utf8
->toULength
> 0) {
5364 toULength
=oldToULength
=utf8
->toULength
;
5365 toULimit
=(int8_t)utf8
->mode
;
5366 c
=(UChar32
)utf8
->toUnicodeStatus
;
5368 toULength
=oldToULength
=toULimit
=0;
5372 // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
5373 // If the buffer ends with a truncated 2- or 3-byte sequence,
5374 // then we reduce the sourceLimit to before that,
5375 // and collect the remaining bytes after the conversion loop.
5377 // Do not go back into the bytes that will be read for finishing a partial
5378 // sequence from the previous buffer.
5379 int32_t length
=(int32_t)(sourceLimit
-source
) - (toULimit
-oldToULength
);
5381 uint8_t b1
=*(sourceLimit
-1);
5382 if(U8_IS_SINGLE(b1
)) {
5383 // common ASCII character
5384 } else if(U8_IS_TRAIL(b1
) && length
>=2) {
5385 uint8_t b2
=*(sourceLimit
-2);
5386 if(0xe0<=b2
&& b2
<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2
, b1
)) {
5387 // truncated 3-byte sequence
5390 } else if(0xc2<=b1
&& b1
<0xf0) {
5391 // truncated 2- or 3-byte sequence
5397 if(c
!=0 && targetCapacity
>0) {
5398 utf8
->toUnicodeStatus
=0;
5401 /* See note in ucnv_SBCSFromUTF8() about this goto. */
5404 /* conversion loop */
5405 while(source
<sourceLimit
) {
5406 if(targetCapacity
>0) {
5408 if(U8_IS_SINGLE(b
)) {
5410 if(IS_ASCII_ROUNDTRIP(b
, asciiRoundtrips
)) {
5415 value
=DBCS_RESULT_FROM_UTF8(mbcsIndex
, results
, 0, b
);
5423 if( /* handle U+0800..U+D7FF inline */
5424 b
<=0xed && // do not assume maxFastUChar>0xd7ff
5425 U8_IS_VALID_LEAD3_AND_T1(b
, t1
=source
[0]) &&
5426 (t2
=(uint8_t)(source
[1]-0x80)) <= 0x3f
5428 c
=((b
&0xf)<<6)|(t1
&0x3f);
5430 value
=DBCS_RESULT_FROM_UTF8(mbcsIndex
, results
, c
, t2
);
5439 if( /* handle U+0080..U+07FF inline */
5441 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
5445 value
=DBCS_RESULT_FROM_UTF8(mbcsIndex
, results
, c
, t1
);
5456 /* handle "complicated" and error cases, and continuing partial characters */
5459 toULimit
=U8_COUNT_BYTES_NON_ASCII(b
);
5462 while(toULength
<toULimit
) {
5464 * The sourceLimit may have been adjusted before the conversion loop
5465 * to stop before a truncated sequence.
5466 * Here we need to use the real limit in case we have two truncated
5467 * sequences at the end.
5470 if(source
<(uint8_t *)pToUArgs
->sourceLimit
) {
5472 if(icu::UTF8::isValidTrail(c
, b
, toULength
, toULimit
)) {
5477 break; /* sequence too short, stop with toULength<toULimit */
5480 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5481 source
-=(toULength
-oldToULength
);
5482 while(oldToULength
<toULength
) {
5483 utf8
->toUBytes
[oldToULength
++]=*source
++;
5485 utf8
->toUnicodeStatus
=c
;
5486 utf8
->toULength
=toULength
;
5487 utf8
->mode
=toULimit
;
5488 pToUArgs
->source
=(char *)source
;
5489 pFromUArgs
->target
=(char *)target
;
5494 if(toULength
==toULimit
) {
5495 c
-=utf8_offsets
[toULength
];
5496 if(toULength
<=3) { /* BMP */
5497 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
5499 /* supplementary code point */
5500 if(!hasSupplementary
) {
5501 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5504 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
5508 /* error handling: illegal UTF-8 byte sequence */
5509 source
-=(toULength
-oldToULength
);
5510 while(oldToULength
<toULength
) {
5511 utf8
->toUBytes
[oldToULength
++]=*source
++;
5513 utf8
->toULength
=toULength
;
5514 pToUArgs
->source
=(char *)source
;
5515 pFromUArgs
->target
=(char *)target
;
5516 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
5520 /* get the bytes and the length for the output */
5522 value
=MBCS_VALUE_2_FROM_STAGE_2(results
, stage2Entry
, c
);
5524 /* is this code point assigned, or do we use fallbacks? */
5525 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
5526 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && value
!=0))
5533 /* write the output character bytes from value and length */
5534 /* from the first if in the loop we know that targetCapacity>0 */
5536 /* this is easy because we know that there is enough space */
5537 *target
++=(uint8_t)value
;
5539 } else /* length==2 */ {
5540 *target
++=(uint8_t)(value
>>8);
5541 if(2<=targetCapacity
) {
5542 *target
++=(uint8_t)value
;
5545 cnv
->charErrorBuffer
[0]=(char)value
;
5546 cnv
->charErrorBufferLength
=1;
5548 /* target overflow */
5549 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
5558 * Try an extension mapping.
5559 * Pass in no source because we don't have UTF-16 input.
5560 * If we have a partial match on c, we will return and revert
5561 * to UTF-8->UTF-16->charset conversion.
5563 static const UChar nul
=0;
5564 const UChar
*noSource
=&nul
;
5565 c
=_extFromU(cnv
, cnv
->sharedData
,
5566 c
, &noSource
, noSource
,
5567 &target
, target
+targetCapacity
,
5572 if(U_FAILURE(*pErrorCode
)) {
5573 /* not mappable or buffer overflow */
5576 } else if(cnv
->preFromUFirstCP
>=0) {
5578 * Partial match, return and revert to pivoting.
5579 * In normal from-UTF-16 conversion, we would just continue
5580 * but then exit the loop because the extension match would
5581 * have consumed the source.
5583 *pErrorCode
=U_USING_DEFAULT_WARNING
;
5586 /* a mapping was written to the target, continue */
5588 /* recalculate the targetCapacity after an extension mapping */
5589 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-(char *)target
);
5594 /* target is full */
5595 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
5601 * The sourceLimit may have been adjusted before the conversion loop
5602 * to stop before a truncated sequence.
5603 * If so, then collect the truncated sequence now.
5605 if(U_SUCCESS(*pErrorCode
) &&
5606 cnv
->preFromUFirstCP
<0 &&
5607 source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
5608 c
=utf8
->toUBytes
[0]=b
=*source
++;
5610 toULimit
=U8_COUNT_BYTES(b
);
5611 while(source
<sourceLimit
) {
5612 utf8
->toUBytes
[toULength
++]=b
=*source
++;
5615 utf8
->toUnicodeStatus
=c
;
5616 utf8
->toULength
=toULength
;
5617 utf8
->mode
=toULimit
;
5620 /* write back the updated pointers */
5621 pToUArgs
->source
=(char *)source
;
5622 pFromUArgs
->target
=(char *)target
;
5625 /* miscellaneous ------------------------------------------------------------ */
/*
 * ucnv_MBCSGetStarters: fill starters[0..255] with lead-byte flags.
 *
 * The state table row used is the one indexed by mbcs.dbcsOnlyState.
 * For every byte value i, starters[i] is set to the result of
 * MBCS_ENTRY_IS_TRANSITION() on that row's entry, i.e. whether the byte
 * causes a transition to another state.
 */
5627 static void U_CALLCONV
5628 ucnv_MBCSGetStarters(const UConverter
* cnv
,
5629 UBool starters
[256],
5631 const int32_t *state0
;
5634 state0
=cnv
->sharedData
->mbcs
.stateTable
[cnv
->sharedData
->mbcs
.dbcsOnlyState
];
5635 for(i
=0; i
<256; ++i
) {
5636 /* all bytes that cause a state transition from state 0 are lead bytes */
5637 starters
[i
]= (UBool
)MBCS_ENTRY_IS_TRANSITION(state0
[i
]);
5642 * This is an internal function that allows other converter implementations
5643 * to check whether a byte is a lead byte.
/*
 * ucnv_MBCSIsLeadByte: report whether byte is a lead byte in state 0.
 *
 * The byte is cast to uint8_t before indexing so that a negative char value
 * cannot produce a negative index. The result is MBCS_ENTRY_IS_TRANSITION()
 * applied to mbcs.stateTable[0][byte], converted to UBool.
 */
5646 ucnv_MBCSIsLeadByte(UConverterSharedData
*sharedData
, char byte
) {
5647 return (UBool
)MBCS_ENTRY_IS_TRANSITION(sharedData
->mbcs
.stateTable
[0][(uint8_t)byte
]);
/*
 * ucnv_MBCSWriteSub: write the substitution character for an unmappable
 * code point.
 *
 * What the visible code does:
 * - Chooses between cnv->subChar1 and cnv->subChars (length cnv->subCharLen).
 *   subChar1 is a candidate only when it is non-zero; the rest of the test
 *   involves mbcs.extIndexes and cnv->invalidUCharBuffer[0]<=0xff.
 * - Clears cnv->useSubChar1 for the next code point.
 * - For MBCS_OUTPUT_2_SISO converters, cnv->fromUnicodeStatus holds the
 *   previous output length: it is set to 1 when a single-byte sub char is
 *   written in DBCS mode, and to 2 when a double-byte sub char is written in
 *   SBCS mode. U_ILLEGAL_ARGUMENT_ERROR is set on one path of this branch.
 * - Emits the bytes with ucnv_cbFromUWriteBytes(), passing offsetIndex through.
 *
 * NOTE(review): the local declarations and the statements that build the
 * temporary buffer (p, buffer) are not visible in this listing; confirm the
 * exact SI/SO handling against the full source.
 */
5650 static void U_CALLCONV
5651 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs
*pArgs
,
5652 int32_t offsetIndex
,
5653 UErrorCode
*pErrorCode
) {
5654 UConverter
*cnv
=pArgs
->converter
;
5659 /* first, select between subChar and subChar1 */
5660 if( cnv
->subChar1
!=0 &&
5661 (cnv
->sharedData
->mbcs
.extIndexes
!=NULL
?
5663 (cnv
->invalidUCharBuffer
[0]<=0xff))
5665 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
5666 subchar
=(char *)&cnv
->subChar1
;
5669 /* select subChar in all other cases */
5670 subchar
=(char *)cnv
->subChars
;
5671 length
=cnv
->subCharLen
;
5674 /* reset the selector for the next code point */
5675 cnv
->useSubChar1
=FALSE
;
5677 if (cnv
->sharedData
->mbcs
.outputType
== MBCS_OUTPUT_2_SISO
) {
5680 /* fromUnicodeStatus contains prevLength */
5683 if(cnv
->fromUnicodeStatus
==2) {
5684 /* DBCS mode and SBCS sub char: change to SBCS */
5685 cnv
->fromUnicodeStatus
=1;
5691 if(cnv
->fromUnicodeStatus
<=1) {
5692 /* SBCS mode and DBCS sub char: change to DBCS */
5693 cnv
->fromUnicodeStatus
=2;
5700 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
5704 length
=(int32_t)(p
-buffer
);
5707 ucnv_cbFromUWriteBytes(pArgs
, subchar
, length
, offsetIndex
, pErrorCode
);
5710 U_CFUNC UConverterType
5711 ucnv_MBCSGetType(const UConverter
* converter
) {
5712 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
5713 if(converter
->sharedData
->mbcs
.countStates
==1) {
5714 return (UConverterType
)UCNV_SBCS
;
5715 } else if((converter
->sharedData
->mbcs
.outputType
&0xff)==MBCS_OUTPUT_2_SISO
) {
5716 return (UConverterType
)UCNV_EBCDIC_STATEFUL
;
5717 } else if(converter
->sharedData
->staticData
->minBytesPerChar
==2 && converter
->sharedData
->staticData
->maxBytesPerChar
==2) {
5718 return (UConverterType
)UCNV_DBCS
;
5720 return (UConverterType
)UCNV_MBCS
;
5723 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */