2 ******************************************************************************
4 * Copyright (C) 2000-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvmbcs.c
10 * tab size: 8 (not used)
13 * created on: 2000jul03
14 * created by: Markus W. Scherer
16 * The current code in this file replaces the previous implementation
17 * of conversion code from multi-byte codepages to Unicode and back.
18 * This implementation supports the following:
19 * - legacy variable-length codepages with up to 4 bytes per character
20 * - all Unicode code points (up to 0x10ffff)
21 * - efficient distinction of unassigned vs. illegal byte sequences
22 * - it is possible in fromUnicode() to directly deal with simple
23 * stateful encodings (used for EBCDIC_STATEFUL)
24 * - it is possible to convert Unicode code points
25 * to a single zero byte (but not as a fallback except for SBCS)
27 * Remaining limitations in fromUnicode:
28 * - byte sequences must not have leading zero bytes
29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30 * - limitation to up to 4 bytes per character
32 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
33 * limitations and adds m:n character mappings and other features.
34 * See ucnv_ext.h for details.
38 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
39 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
40 * macros to ucnvmbcs.h file
43 #include "unicode/utypes.h"
45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
47 #include "unicode/ucnv.h"
48 #include "unicode/ucnv_cb.h"
49 #include "unicode/udata.h"
50 #include "unicode/uset.h"
59 /* control optimizations according to the platform */
60 #define MBCS_UNROLL_SINGLE_TO_BMP 1
61 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
64 * _MBCSHeader versions 5.3 & 4.3
65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
67 * This version is optional. Version 5 is used for incompatible data format changes.
68 * makeconv will continue to generate version 4 files if possible.
70 * Changes from version 4:
72 * The main difference is an additional _MBCSHeader field with
73 * - the length (number of uint32_t) of the _MBCSHeader
74 * - flags for further incompatible data format changes
75 * - flags for further, backward compatible data format changes
77 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
78 * the file and needs to be reconstituted at load time.
79 * This requires a utf8Friendly format with an additional mbcsIndex table for fast
80 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
81 * (For details about these structures see below, and see ucnvmbcs.h.)
83 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
84 * of the Unicode code points. (This requires that the .ucm file has the |0 etc.
85 * precision markers for all mappings.)
87 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
88 * omitted data that can be reconstituted from the toUnicode data.
90 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
91 * With only roundtrip mappings in the base fromUnicode data, this part is fully
92 * redundant with the mbcsIndex and will be reconstituted from that (also using the
93 * stage 1 table which contains the information about how stage 2 was compacted).
95 * The rest of the stage 2 table, the part for code points above maxFastUChar,
96 * is stored in the file and will be appended to the reconstituted part.
98 * The entire fromUBytes array is omitted from the file and will be reconstituted.
99 * This is done by enumerating all toUnicode roundtrip mappings, performing
100 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
101 * writing instead of reading the byte values.
103 * _MBCSHeader version 4.3
105 * Change from version 4.2:
106 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
107 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
108 * files which can be used instead of stages 1 & 2.
109 * Faster lookups for roundtrips from most commonly used characters,
110 * and lookups from UTF-8 byte sequences with a natural bit distribution.
111 * See ucnvmbcs.h for more details.
113 * Change from version 4.1:
114 * - Added an optional extension table structure at the end of the .cnv file.
115 * It is present if the upper bits of the header flags field contain a non-zero
117 * Files that contain only a conversion table and no base table
118 * use the special outputType MBCS_OUTPUT_EXT_ONLY.
119 * These contain the base table name between the MBCS header and the extension
122 * Change from version 4.0:
123 * - Replace header.reserved with header.fromUBytesLength so that all
124 * fields in the data have length.
126 * Changes from version 3 (for performance improvements):
127 * - new bit distribution for state table entries
128 * - reordered action codes
129 * - new data structure for single-byte fromUnicode
130 * + stage 2 only contains indexes
131 * + stage 3 stores 16 bits per character with classification bits 15..8
132 * - no multiplier for stage 1 entries
133 * - stage 2 for non-single-byte codepages contains the index and the flags in
135 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
137 * For more details about old versions of the MBCS data structure, see
138 * the corresponding versions of this file.
140 * Converting stateless codepage data ---------------------------------------***
141 * (or codepage data with simple states) to Unicode.
143 * Data structure and algorithm for converting from complex legacy codepages
144 * to Unicode. (Designed before 2000-may-22.)
146 * The basic idea is that the structure of legacy codepages can be described
148 * When reading a byte stream, each input byte causes a state transition.
149 * Some transitions result in the output of a code point, some result in
150 * "unassigned" or "illegal" output.
151 * This is used here for character conversion.
153 * The data structure begins with a state table consisting of a row
154 * per state, with 256 entries (columns) per row for each possible input
156 * Each entry is 32 bits wide, with two formats distinguished by
157 * the sign bit (bit 31):
159 * One format for transitional entries (bit 31 not set) for non-final bytes, and
160 * one format for final entries (bit 31 set).
161 * Both formats contain the number of the next state in the same bit
163 * State 0 is the initial state.
165 * Most of the time, the offset values of subsequent states are added
166 * up to a scalar value. This value will eventually be the index of
167 * the Unicode code point in a table that follows the state table.
168 * The effect is that the code points for final state table rows
169 * are contiguous. The code points of final state rows follow each other
170 * in the order of the references to those final states by previous
173 * For some terminal states, the offset is itself the output Unicode
174 * code point: 16 bits for a BMP code point, or 20 bits for a supplementary
175 * code point (stored as code point minus 0x10000 so that 20 bits are enough).
176 * For others, the code point in the Unicode table is stored with either
177 * one or two code units: one for BMP code points, two for a pair of
179 * All code points for a final state entry take up the same number of code
180 * units, regardless of whether they all actually _use_ the same number
181 * of code units. This is necessary for simple array access.
183 * An additional feature comes in with what in ICU is called "fallback"
186 * In addition to round-trippable, precise, 1:1 mappings, there are often
187 * mappings defined between similar, though not the same, characters.
188 * Typically, such mappings occur only in fromUnicode mapping tables because
189 * Unicode has a superset repertoire of most other codepages. However, it
190 * is possible to provide such mappings in the toUnicode tables, too.
191 * In this case, the fallback mappings are partly integrated into the
192 * general state tables because the structure of the encoding includes their
194 * For final entries in an initial state, fallback mappings are stored in
195 * the entry itself like with roundtrip mappings.
196 * For other final entries, they are stored in the code units table if
197 * the entry is for a pair of code units.
198 * For single-unit results in the code units table, there is no space to
199 * alternatively hold a fallback mapping; in this case, the code unit
200 * is stored as U+fffe (unassigned), and the fallback mapping needs to
201 * be looked up by the scalar offset value in a separate table.
203 * "Unassigned" state entries really mean "structurally unassigned",
204 * i.e., such a byte sequence will never have a mapping result.
206 * The interpretation of the bits in each entry is as follows:
208 * Bit 31 not set, not a terminal entry ("transitional"):
210 * 23..0 offset delta, to be added up
212 * Bit 31 set, terminal ("final") entry:
213 * 30..24 next state (regardless of action code)
214 * 23..20 action code:
215 * action codes 0 and 1 result in precise-mapping Unicode code points
216 * 0 valid byte sequence
218 * 15..0 16-bit Unicode BMP code point
219 * never U+fffe or U+ffff
220 * 1 valid byte sequence
221 * 19..0 20-bit Unicode supplementary code point
222 * never U+fffe or U+ffff
224 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
225 * 2 valid byte sequence (fallback)
227 * 15..0 16-bit Unicode BMP code point as fallback result
228 * 3 valid byte sequence (fallback)
229 * 19..0 20-bit Unicode supplementary code point as fallback result
231 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
232 * depending on the code units they result in
233 * 4 valid byte sequence
235 * 8..0 final offset delta
236 * pointing to one 16-bit code unit which may be
237 * fffe unassigned -- look for a fallback for this offset
239 * 5 valid byte sequence
241 * 8..0 final offset delta
242 * pointing to two 16-bit code units
243 * (typically UTF-16 surrogates)
244 * the result depends on the first code unit as follows:
245 * 0000..d7ff roundtrip BMP code point (1st alone)
246 * d800..dbff roundtrip surrogate pair (1st, 2nd)
247 * dc00..dfff fallback surrogate pair (1st-400, 2nd)
248 * e000 roundtrip BMP code point (2nd alone)
249 * e001 fallback BMP code point (2nd alone)
252 * (the final offset deltas are at most 255 * 2,
253 * times 2 because of storing code unit pairs)
255 * 6 unassigned byte sequence
257 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
258 * this does not contain a final offset delta because the main
259 * purpose of this action code is to save scalar offset values;
260 * therefore, fallback values cannot be assigned to byte
261 * sequences that result in this action code
262 * 7 illegal byte sequence
264 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
265 * 8 state change only
267 * useful for state changes in simple stateful encodings,
268 * at Shift-In/Shift-Out codes
271 * 9..15 reserved for future use
272 * current implementations will only perform a state change
273 * and ignore bits 19..0
275 * An encoding with contiguous ranges of unassigned byte sequences, like
276 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
277 * at least two states for the trail bytes:
278 * One trail byte state that results in code points, and one that only
279 * has "unassigned" and "illegal" terminal states.
281 * Note: partly by accident, this data structure supports simple stateful
282 * encodings without any additional logic.
283 * Currently, only simple Shift-In/Shift-Out schemes are handled with
284 * appropriate state tables (especially EBCDIC_STATEFUL!).
286 * MBCS version 2 added:
287 * unassigned and illegal action codes have U+fffe and U+ffff
288 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
290 * Converting from Unicode to codepage bytes --------------------------------***
292 * The conversion data structure for fromUnicode is designed for the known
293 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
294 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
295 * a roundtrip mapping.
297 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
298 * like in the character properties table.
299 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
300 * with the resulting bytes is at offsetFromUBytes.
302 * Beginning with version 4, single-byte codepages have a significantly different
303 * trie compared to other codepages.
304 * In all cases, the entry in stage 1 is directly the index of the block of
305 * 64 entries in stage 2.
307 * Single-byte lookup:
309 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
310 * Stage 3 contains one 16-bit word per result:
311 * Bits 15..8 indicate the kind of result:
313 * c fallback result from private-use code point
314 * 8 fallback result from other code points
316 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
318 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
319 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
320 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
321 * ASCII code points can be looked up with a linear array access into stage 3.
322 * See maxFastUChar and other details in ucnvmbcs.h.
326 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
327 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
328 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
329 * If this test is false, then a non-zero result will be interpreted as
330 * a fallback mapping.
331 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
333 * Stage 3 contains 2, 3, or 4 bytes per result.
334 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
335 * while 3 bytes are stored as bytes in big-endian order.
336 * Leading zero bytes are ignored, and the number of bytes is counted.
337 * A zero byte mapping result is possible as a roundtrip result.
338 * For some output types, the actual result is processed from this;
339 * see ucnv_MBCSFromUnicodeWithOffsets().
341 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
342 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
344 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
345 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
346 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
347 * ASCII code points can be looked up with a linear array access into stage 3.
348 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
350 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
352 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
353 * may overlap by any number of entries.
355 * MBCS version 2 added:
356 * the converter checks for known output types, which allows
357 * adding new ones without crashing an unaware converter
360 static const UConverterImpl _SBCSUTF8Impl
;
361 static const UConverterImpl _DBCSUTF8Impl
;
363 /* GB 18030 data ------------------------------------------------------------ */
/*
 * Helper macros for linear values of GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) folds the four byte values into a single linear
 * value, using the radices 10, 126 and 10 for the 2nd, 3rd and 4th byte.
 * LINEAR_18030_BASE is the linear value of the byte sequence 81 30 81 30.
 * LINEAR(x) takes a four-byte sequence packed into one 32-bit value with the
 * first byte in bits 31..24 (e.g. 0x81308130) and yields its linear value.
 *
 * Every macro parameter is parenthesized so that an expression argument
 * (e.g. LINEAR(hi|lo)) is evaluated as a whole before shifting and masking.
 * LINEAR() evaluates x four times, so x must not have side effects.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
373 * Some ranges of GB 18030 where both the Unicode code points and the
374 * GB four-byte sequences are contiguous and are handled algorithmically by
375 * the special callback functions below.
376 * The values are start & end of Unicode & GB codes.
378 * Note that single surrogates are not mapped by GB 18030
379 * as of the re-released mapping tables from 2000-nov-30.
381 static const uint32_t
382 gb18030Ranges
[13][4]={
383 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
384 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
385 {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
386 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
387 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
388 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
389 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
390 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
391 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
392 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
393 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
394 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
395 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
398 /* bit flag for UConverter.options indicating GB 18030 special handling */
399 #define _MBCS_OPTION_GB18030 0x8000
401 /* Miscellaneous ------------------------------------------------------------ */
404 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
405 * consecutive sequences of bytes, starting from the one encoded in value,
406 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
407 * Does not currently support m:n mappings or reverse fallbacks.
408 * This function will not be called for sequences of bytes with leading zeros.
410 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
411 * @param value contains 1..4 bytes of the first byte sequence, right-aligned
412 * @param codePoints resulting Unicode code points, or negative if a byte sequence does
413 * not map to anything
414 * @return TRUE to continue enumeration, FALSE to stop
416 typedef UBool U_CALLCONV
417 UConverterEnumToUCallback(const void *context
, uint32_t value
, UChar32 codePoints
[32]);
419 /* similar to ucnv_MBCSGetNextUChar() but recursive */
421 enumToU(UConverterMBCSTable
*mbcsTable
, int8_t stateProps
[],
422 int32_t state
, uint32_t offset
,
424 UConverterEnumToUCallback
*callback
, const void *context
,
425 UErrorCode
*pErrorCode
) {
426 UChar32 codePoints
[32];
428 const uint16_t *unicodeCodeUnits
;
429 UChar32 anyCodePoints
;
432 row
=mbcsTable
->stateTable
[state
];
433 unicodeCodeUnits
=mbcsTable
->unicodeCodeUnits
;
436 anyCodePoints
=-1; /* becomes non-negative if there is a mapping */
438 b
=(stateProps
[state
]&0x38)<<2;
439 if(b
==0 && stateProps
[state
]>=0x40) {
440 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
441 codePoints
[0]=U_SENTINEL
;
444 limit
=((stateProps
[state
]&7)+1)<<5;
446 int32_t entry
=row
[b
];
447 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
448 int32_t nextState
=MBCS_ENTRY_TRANSITION_STATE(entry
);
449 if(stateProps
[nextState
]>=0) {
450 /* recurse to a state with non-ignorable actions */
452 mbcsTable
, stateProps
, nextState
,
453 offset
+MBCS_ENTRY_TRANSITION_OFFSET(entry
),
460 codePoints
[b
&0x1f]=U_SENTINEL
;
466 * An if-else-if chain provides more reliable performance for
467 * the most common cases compared to a switch.
469 action
=MBCS_ENTRY_FINAL_ACTION(entry
);
470 if(action
==MBCS_STATE_VALID_DIRECT_16
) {
471 /* output BMP code point */
472 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
473 } else if(action
==MBCS_STATE_VALID_16
) {
474 int32_t finalOffset
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
475 c
=unicodeCodeUnits
[finalOffset
];
477 /* output BMP code point */
481 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
482 int32_t finalOffset
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
483 c
=unicodeCodeUnits
[finalOffset
++];
485 /* output BMP code point below 0xd800 */
486 } else if(c
<=0xdbff) {
487 /* output roundtrip or fallback supplementary code point */
488 c
=((c
&0x3ff)<<10)+unicodeCodeUnits
[finalOffset
]+(0x10000-0xdc00);
489 } else if(c
==0xe000) {
490 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
491 c
=unicodeCodeUnits
[finalOffset
];
495 } else if(action
==MBCS_STATE_VALID_DIRECT_20
) {
496 /* output supplementary code point */
497 c
=(UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
502 codePoints
[b
&0x1f]=c
;
505 if(((++b
)&0x1f)==0) {
506 if(anyCodePoints
>=0) {
507 if(!callback(context
, value
|(uint32_t)(b
-0x20), codePoints
)) {
518 * Only called if stateProps[state]==-1.
519 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
520 * MBCS_STATE_CHANGE_ONLY.
523 getStateProp(const int32_t (*stateTable
)[256], int8_t stateProps
[], int state
) {
525 int32_t min
, max
, entry
, nextState
;
527 row
=stateTable
[state
];
530 /* find first non-ignorable state */
533 nextState
=MBCS_ENTRY_STATE(entry
);
534 if(stateProps
[nextState
]==-1) {
535 getStateProp(stateTable
, stateProps
, nextState
);
537 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
538 if(stateProps
[nextState
]>=0) {
541 } else if(MBCS_ENTRY_FINAL_ACTION(entry
)<MBCS_STATE_UNASSIGNED
) {
545 stateProps
[state
]=-0x40; /* (int8_t)0xc0 */
546 return stateProps
[state
];
549 stateProps
[state
]|=(int8_t)((min
>>5)<<3);
551 /* find last non-ignorable state */
552 for(max
=0xff; min
<max
; --max
) {
554 nextState
=MBCS_ENTRY_STATE(entry
);
555 if(stateProps
[nextState
]==-1) {
556 getStateProp(stateTable
, stateProps
, nextState
);
558 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
559 if(stateProps
[nextState
]>=0) {
562 } else if(MBCS_ENTRY_FINAL_ACTION(entry
)<MBCS_STATE_UNASSIGNED
) {
566 stateProps
[state
]|=(int8_t)(max
>>5);
568 /* recurse further and collect direct-state information */
571 nextState
=MBCS_ENTRY_STATE(entry
);
572 if(stateProps
[nextState
]==-1) {
573 getStateProp(stateTable
, stateProps
, nextState
);
575 if(MBCS_ENTRY_IS_FINAL(entry
)) {
576 stateProps
[nextState
]|=0x40;
577 if(MBCS_ENTRY_FINAL_ACTION(entry
)<=MBCS_STATE_FALLBACK_DIRECT_20
) {
578 stateProps
[state
]|=0x40;
583 return stateProps
[state
];
587 * Internal function enumerating the toUnicode data of an MBCS converter.
588 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
589 * table, but could also be used for a future ucnv_getUnicodeSet() option
590 * that includes reverse fallbacks (after updating this function's implementation).
591 * Currently only handles roundtrip mappings.
592 * Does not currently handle extensions.
595 ucnv_MBCSEnumToUnicode(UConverterMBCSTable
*mbcsTable
,
596 UConverterEnumToUCallback
*callback
, const void *context
,
597 UErrorCode
*pErrorCode
) {
599 * Properties for each state, to speed up the enumeration.
600 * Ignorable actions are unassigned/illegal/state-change-only:
601 * They do not lead to mappings.
604 * 1 direct/initial state (stateful converters have multiple)
605 * 0 non-initial state with transitions or with non-ignorable result actions
606 * -1 final state with only ignorable actions
609 * The lowest byte value with non-ignorable actions is
610 * value<<5 (rounded down).
613 * The highest byte value with non-ignorable actions is
614 * (value<<5)|0x1f (rounded up).
616 int8_t stateProps
[MBCS_MAX_STATE_COUNT
];
619 uprv_memset(stateProps
, -1, sizeof(stateProps
));
621 /* recurse from state 0 and set all stateProps */
622 getStateProp(mbcsTable
->stateTable
, stateProps
, 0);
624 for(state
=0; state
<mbcsTable
->countStates
; ++state
) {
625 /*if(stateProps[state]==-1) {
626 printf("unused/unreachable <icu:state> %d\n", state);
628 if(stateProps
[state
]>=0x40) {
629 /* start from each direct state */
631 mbcsTable
, stateProps
, state
, 0, 0,
639 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData
*sharedData
,
641 UConverterUnicodeSet which
,
642 UConverterSetFilter filter
,
643 UErrorCode
*pErrorCode
) {
644 const UConverterMBCSTable
*mbcsTable
;
645 const uint16_t *table
;
648 uint16_t st1
, maxStage1
, st2
;
652 /* enumerate the from-Unicode trie table */
653 mbcsTable
=&sharedData
->mbcs
;
654 table
=mbcsTable
->fromUnicodeTable
;
655 if(mbcsTable
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
661 c
=0; /* keep track of the current code point while enumerating */
663 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
664 const uint16_t *stage2
, *stage3
, *results
;
667 results
=(const uint16_t *)mbcsTable
->fromUnicodeBytes
;
670 * Set a threshold variable for selecting which mappings to use.
671 * See ucnv_MBCSSingleFromBMPWithOffsets() and
672 * MBCS_SINGLE_RESULT_FROM_U() for details.
674 if(which
==UCNV_ROUNDTRIP_SET
) {
675 /* use only roundtrips */
677 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
678 /* use all roundtrip and fallback results */
682 for(st1
=0; st1
<maxStage1
; ++st1
) {
686 for(st2
=0; st2
<64; ++st2
) {
687 if((st3
=stage2
[st2
])!=0) {
688 /* read the stage 3 block */
692 if(*stage3
++>=minValue
) {
695 } while((++c
&0xf)!=0);
697 c
+=16; /* empty stage 3 block */
701 c
+=1024; /* empty stage 2 block */
705 const uint32_t *stage2
;
706 const uint8_t *stage3
, *bytes
;
707 uint32_t st3Multiplier
;
711 bytes
=mbcsTable
->fromUnicodeBytes
;
713 useFallback
=(UBool
)(which
==UCNV_ROUNDTRIP_AND_FALLBACK_SET
);
715 switch(mbcsTable
->outputType
) {
717 case MBCS_OUTPUT_4_EUC
:
728 for(st1
=0; st1
<maxStage1
; ++st1
) {
730 if(st2
>(maxStage1
>>1)) {
731 stage2
=(const uint32_t *)table
+st2
;
732 for(st2
=0; st2
<64; ++st2
) {
733 if((st3
=stage2
[st2
])!=0) {
734 /* read the stage 3 block */
735 stage3
=bytes
+st3Multiplier
*16*(uint32_t)(uint16_t)st3
;
737 /* get the roundtrip flags for the stage 3 block */
741 * Add code points for which the roundtrip flag is set,
742 * or which map to non-zero bytes if we use fallbacks.
743 * See ucnv_MBCSFromUnicodeWithOffsets() for details.
746 case UCNV_SET_FILTER_NONE
:
750 stage3
+=st3Multiplier
;
751 } else if(useFallback
) {
753 switch(st3Multiplier
) {
759 b
|=stage3
[0]|stage3
[1];
769 } while((++c
&0xf)!=0);
771 case UCNV_SET_FILTER_DBCS_ONLY
:
772 /* Ignore single-byte results (<0x100). */
774 if(((st3
&1)!=0 || useFallback
) && *((const uint16_t *)stage3
)>=0x100) {
778 stage3
+=2; /* +=st3Multiplier */
779 } while((++c
&0xf)!=0);
781 case UCNV_SET_FILTER_2022_CN
:
782 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
784 if(((st3
&1)!=0 || useFallback
) && ((value
=*stage3
)==0x81 || value
==0x82)) {
788 stage3
+=3; /* +=st3Multiplier */
789 } while((++c
&0xf)!=0);
791 case UCNV_SET_FILTER_SJIS
:
792 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
794 if(((st3
&1)!=0 || useFallback
) && (value
=*((const uint16_t *)stage3
))>=0x8140 && value
<=0xeffc) {
798 stage3
+=2; /* +=st3Multiplier */
799 } while((++c
&0xf)!=0);
801 case UCNV_SET_FILTER_GR94DBCS
:
802 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
804 if( ((st3
&1)!=0 || useFallback
) &&
805 (uint16_t)((value
=*((const uint16_t *)stage3
)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
806 (uint8_t)(value
-0xa1)<=(0xfe - 0xa1)
811 stage3
+=2; /* +=st3Multiplier */
812 } while((++c
&0xf)!=0);
814 case UCNV_SET_FILTER_HZ
:
815 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
817 if( ((st3
&1)!=0 || useFallback
) &&
818 (uint16_t)((value
=*((const uint16_t *)stage3
))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
819 (uint8_t)(value
-0xa1)<=(0xfe - 0xa1)
824 stage3
+=2; /* +=st3Multiplier */
825 } while((++c
&0xf)!=0);
828 *pErrorCode
=U_INTERNAL_PROGRAM_ERROR
;
832 c
+=16; /* empty stage 3 block */
836 c
+=1024; /* empty stage 2 block */
841 ucnv_extGetUnicodeSet(sharedData
, sa
, which
, filter
, pErrorCode
);
845 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData
*sharedData
,
847 UConverterUnicodeSet which
,
848 UErrorCode
*pErrorCode
) {
849 ucnv_MBCSGetFilteredUnicodeSetForUnicode(
850 sharedData
, sa
, which
,
851 sharedData
->mbcs
.outputType
==MBCS_OUTPUT_DBCS_ONLY
?
852 UCNV_SET_FILTER_DBCS_ONLY
:
853 UCNV_SET_FILTER_NONE
,
858 ucnv_MBCSGetUnicodeSet(const UConverter
*cnv
,
860 UConverterUnicodeSet which
,
861 UErrorCode
*pErrorCode
) {
862 if(cnv
->options
&_MBCS_OPTION_GB18030
) {
863 sa
->addRange(sa
->set
, 0, 0xd7ff);
864 sa
->addRange(sa
->set
, 0xe000, 0x10ffff);
866 ucnv_MBCSGetUnicodeSetForUnicode(cnv
->sharedData
, sa
, which
, pErrorCode
);
870 /* conversion extensions for input not in the main table -------------------- */
873 * Hardcoded extension handling for GB 18030.
874 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
876 * In the future, conversion extensions may handle m:n mappings and delta tables,
877 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
879 * If an input character cannot be mapped, then these functions set an error
880 * code. The framework will then call the callback function.
884 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
885 * else return 0 after output has been written to the target
888 _extFromU(UConverter
*cnv
, const UConverterSharedData
*sharedData
,
890 const UChar
**source
, const UChar
*sourceLimit
,
891 uint8_t **target
, const uint8_t *targetLimit
,
892 int32_t **offsets
, int32_t sourceIndex
,
894 UErrorCode
*pErrorCode
) {
897 cnv
->useSubChar1
=FALSE
;
899 if( (cx
=sharedData
->mbcs
.extIndexes
)!=NULL
&&
900 ucnv_extInitialMatchFromU(
902 cp
, source
, sourceLimit
,
903 (char **)target
, (char *)targetLimit
,
904 offsets
, sourceIndex
,
908 return 0; /* an extension mapping handled the input */
912 if((cnv
->options
&_MBCS_OPTION_GB18030
)!=0) {
913 const uint32_t *range
;
916 range
=gb18030Ranges
[0];
917 for(i
=0; i
<sizeof(gb18030Ranges
)/sizeof(gb18030Ranges
[0]); range
+=4, ++i
) {
918 if(range
[0]<=(uint32_t)cp
&& (uint32_t)cp
<=range
[1]) {
919 /* found the Unicode code point, output the four-byte sequence for it */
923 /* get the linear value of the first GB 18030 code in this range */
924 linear
=range
[2]-LINEAR_18030_BASE
;
926 /* add the offset from the beginning of the range */
927 linear
+=((uint32_t)cp
-range
[0]);
929 /* turn this into a four-byte sequence */
930 bytes
[3]=(char)(0x30+linear%10
); linear
/=10;
931 bytes
[2]=(char)(0x81+linear%126
); linear
/=126;
932 bytes
[1]=(char)(0x30+linear%10
); linear
/=10;
933 bytes
[0]=(char)(0x81+linear
);
935 /* output this sequence */
936 ucnv_fromUWriteBytes(cnv
,
937 bytes
, 4, (char **)target
, (char *)targetLimit
,
938 offsets
, sourceIndex
, pErrorCode
);
945 *pErrorCode
=U_INVALID_CHAR_FOUND
;
950 * Input sequence: cnv->toUBytes[0..length[
951 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
952 * else return 0 after output has been written to the target
955 _extToU(UConverter
*cnv
, const UConverterSharedData
*sharedData
,
957 const uint8_t **source
, const uint8_t *sourceLimit
,
958 UChar
**target
, const UChar
*targetLimit
,
959 int32_t **offsets
, int32_t sourceIndex
,
961 UErrorCode
*pErrorCode
) {
964 if( (cx
=sharedData
->mbcs
.extIndexes
)!=NULL
&&
965 ucnv_extInitialMatchToU(
967 length
, (const char **)source
, (const char *)sourceLimit
,
969 offsets
, sourceIndex
,
973 return 0; /* an extension mapping handled the input */
977 if(length
==4 && (cnv
->options
&_MBCS_OPTION_GB18030
)!=0) {
978 const uint32_t *range
;
982 linear
=LINEAR_18030(cnv
->toUBytes
[0], cnv
->toUBytes
[1], cnv
->toUBytes
[2], cnv
->toUBytes
[3]);
983 range
=gb18030Ranges
[0];
984 for(i
=0; i
<sizeof(gb18030Ranges
)/sizeof(gb18030Ranges
[0]); range
+=4, ++i
) {
985 if(range
[2]<=linear
&& linear
<=range
[3]) {
986 /* found the sequence, output the Unicode code point for it */
987 *pErrorCode
=U_ZERO_ERROR
;
989 /* add the linear difference between the input and start sequences to the start code point */
990 linear
=range
[0]+(linear
-range
[2]);
992 /* output this code point */
993 ucnv_toUWriteCodePoint(cnv
, linear
, target
, targetLimit
, offsets
, sourceIndex
, pErrorCode
);
1001 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1005 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
1008 * This code modifies a standard EBCDIC<->Unicode mapping table for
1009 * OS/390 (z/OS) Unix System Services (Open Edition).
1010 * The difference is in the mapping of Line Feed and New Line control codes:
1011 * Standard EBCDIC maps
1016 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
1022 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
1023 * by copying it into allocated memory and swapping the LF and NL values.
1024 * This makes it possible to support the same EBCDIC charset in both versions without
1025 * duplicating the entire installed table.
1028 /* standard EBCDIC codes */
1029 #define EBCDIC_LF 0x25
1030 #define EBCDIC_NL 0x15
1032 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
1033 #define EBCDIC_RT_LF 0xf25
1034 #define EBCDIC_RT_NL 0xf15
1036 /* Unicode code points */
1041 _EBCDICSwapLFNL(UConverterSharedData
*sharedData
, UErrorCode
*pErrorCode
) {
1042 UConverterMBCSTable
*mbcsTable
;
1044 const uint16_t *table
, *results
;
1045 const uint8_t *bytes
;
1047 int32_t (*newStateTable
)[256];
1048 uint16_t *newResults
;
1052 uint32_t stage2Entry
;
1053 uint32_t size
, sizeofFromUBytes
;
1055 mbcsTable
=&sharedData
->mbcs
;
1057 table
=mbcsTable
->fromUnicodeTable
;
1058 bytes
=mbcsTable
->fromUnicodeBytes
;
1059 results
=(const uint16_t *)bytes
;
1062 * Check that this is an EBCDIC table with SBCS portion -
1063 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1065 * If not, ignore the option. Options are always ignored if they do not apply.
1068 (mbcsTable
->outputType
==MBCS_OUTPUT_1
|| mbcsTable
->outputType
==MBCS_OUTPUT_2_SISO
) &&
1069 mbcsTable
->stateTable
[0][EBCDIC_LF
]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_LF
) &&
1070 mbcsTable
->stateTable
[0][EBCDIC_NL
]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_NL
)
1075 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
1077 EBCDIC_RT_LF
==MBCS_SINGLE_RESULT_FROM_U(table
, results
, U_LF
) &&
1078 EBCDIC_RT_NL
==MBCS_SINGLE_RESULT_FROM_U(table
, results
, U_NL
)
1082 } else /* MBCS_OUTPUT_2_SISO */ {
1083 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_LF
);
1085 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, U_LF
)!=0 &&
1086 EBCDIC_LF
==MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, U_LF
)
1091 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_NL
);
1093 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, U_NL
)!=0 &&
1094 EBCDIC_NL
==MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, U_NL
)
1100 if(mbcsTable
->fromUBytesLength
>0) {
1102 * We _know_ the number of bytes in the fromUnicodeBytes array
1103 * starting with header.version 4.1.
1105 sizeofFromUBytes
=mbcsTable
->fromUBytesLength
;
1109 * There used to be code to enumerate the fromUnicode
1110 * trie and find the highest entry, but it was removed in ICU 3.2
1111 * because it was not tested and caused a low code coverage number.
1112 * See Jitterbug 3674.
1113 * This affects only some .cnv file formats with a header.version
1114 * below 4.1, and only when swaplfnl is requested.
1116 * ucnvmbcs.c revision 1.99 is the last one with the
1117 * ucnv_MBCSSizeofFromUBytes() function.
1119 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
1124 * The table has an appropriate format.
1125 * Allocate and build
1126 * - a modified to-Unicode state table
1127 * - a modified from-Unicode output array
1128 * - a converter name string with the swap option appended
1131 mbcsTable
->countStates
*1024+
1133 UCNV_MAX_CONVERTER_NAME_LENGTH
+20;
1134 p
=(uint8_t *)uprv_malloc(size
);
1136 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1140 /* copy and modify the to-Unicode state table */
1141 newStateTable
=(int32_t (*)[256])p
;
1142 uprv_memcpy(newStateTable
, mbcsTable
->stateTable
, mbcsTable
->countStates
*1024);
1144 newStateTable
[0][EBCDIC_LF
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_NL
);
1145 newStateTable
[0][EBCDIC_NL
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_LF
);
1147 /* copy and modify the from-Unicode result table */
1148 newResults
=(uint16_t *)newStateTable
[mbcsTable
->countStates
];
1149 uprv_memcpy(newResults
, bytes
, sizeofFromUBytes
);
1151 /* conveniently, the table access macros work on the left side of expressions */
1152 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
1153 MBCS_SINGLE_RESULT_FROM_U(table
, newResults
, U_LF
)=EBCDIC_RT_NL
;
1154 MBCS_SINGLE_RESULT_FROM_U(table
, newResults
, U_NL
)=EBCDIC_RT_LF
;
1155 } else /* MBCS_OUTPUT_2_SISO */ {
1156 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_LF
);
1157 MBCS_VALUE_2_FROM_STAGE_2(newResults
, stage2Entry
, U_LF
)=EBCDIC_NL
;
1159 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_NL
);
1160 MBCS_VALUE_2_FROM_STAGE_2(newResults
, stage2Entry
, U_NL
)=EBCDIC_LF
;
1163 /* set the canonical converter name */
1164 name
=(char *)newResults
+sizeofFromUBytes
;
1165 uprv_strcpy(name
, sharedData
->staticData
->name
);
1166 uprv_strcat(name
, UCNV_SWAP_LFNL_OPTION_STRING
);
1168 /* set the pointers */
1170 if(mbcsTable
->swapLFNLStateTable
==NULL
) {
1171 mbcsTable
->swapLFNLStateTable
=newStateTable
;
1172 mbcsTable
->swapLFNLFromUnicodeBytes
=(uint8_t *)newResults
;
1173 mbcsTable
->swapLFNLName
=name
;
1179 /* release the allocated memory if another thread beat us to it */
1180 if(newStateTable
!=NULL
) {
1181 uprv_free(newStateTable
);
1186 /* reconstitute omitted fromUnicode data ------------------------------------ */
1188 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
1189 static UBool U_CALLCONV
1190 writeStage3Roundtrip(const void *context
, uint32_t value
, UChar32 codePoints
[32]) {
1191 UConverterMBCSTable
*mbcsTable
=(UConverterMBCSTable
*)context
;
1192 const uint16_t *table
;
1198 table
=mbcsTable
->fromUnicodeTable
;
1199 bytes
=(uint8_t *)mbcsTable
->fromUnicodeBytes
;
1201 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1202 switch(mbcsTable
->outputType
) {
1203 case MBCS_OUTPUT_3_EUC
:
1205 /* short sequences are stored directly */
1206 /* code set 0 or 1 */
1207 } else if(value
<=0x8effff) {
1210 } else /* first byte is 0x8f */ {
1215 case MBCS_OUTPUT_4_EUC
:
1216 if(value
<=0xffffff) {
1217 /* short sequences are stored directly */
1218 /* code set 0 or 1 */
1219 } else if(value
<=0x8effffff) {
1222 } else /* first byte is 0x8f */ {
1231 for(i
=0; i
<=0x1f; ++value
, ++i
) {
1237 /* locate the stage 2 & 3 data */
1238 stage2
=((uint32_t *)table
)+table
[c
>>10]+((c
>>4)&0x3f);
1240 st3
=(int32_t)(uint16_t)*stage2
*16+(c
&0xf);
1242 /* write the codepage bytes into stage 3 */
1243 switch(mbcsTable
->outputType
) {
1245 case MBCS_OUTPUT_4_EUC
:
1247 p
[0]=(uint8_t)(value
>>16);
1248 p
[1]=(uint8_t)(value
>>8);
1249 p
[2]=(uint8_t)value
;
1252 ((uint32_t *)p
)[st3
]=value
;
1255 /* 2 bytes per character */
1256 ((uint16_t *)p
)[st3
]=(uint16_t)value
;
1260 /* set the roundtrip flag */
1261 *stage2
|=(1UL<<(16+(c
&0xf)));
1267 reconstituteData(UConverterMBCSTable
*mbcsTable
,
1268 uint32_t stage1Length
, uint32_t stage2Length
,
1269 uint32_t fullStage2Length
, /* lengths are numbers of units, not bytes */
1270 UErrorCode
*pErrorCode
) {
1274 uint32_t dataLength
=stage1Length
*2+fullStage2Length
*4+mbcsTable
->fromUBytesLength
;
1275 mbcsTable
->reconstitutedData
=(uint8_t *)uprv_malloc(dataLength
);
1276 if(mbcsTable
->reconstitutedData
==NULL
) {
1277 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1280 uprv_memset(mbcsTable
->reconstitutedData
, 0, dataLength
);
1282 /* copy existing data and reroute the pointers */
1283 stage1
=(uint16_t *)mbcsTable
->reconstitutedData
;
1284 uprv_memcpy(stage1
, mbcsTable
->fromUnicodeTable
, stage1Length
*2);
1286 stage2
=(uint32_t *)(stage1
+stage1Length
);
1287 uprv_memcpy(stage2
+(fullStage2Length
-stage2Length
),
1288 mbcsTable
->fromUnicodeTable
+stage1Length
,
1291 mbcsTable
->fromUnicodeTable
=stage1
;
1292 mbcsTable
->fromUnicodeBytes
=bytes
=(uint8_t *)(stage2
+fullStage2Length
);
1294 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1295 stage2
=(uint32_t *)stage1
;
1297 /* reconstitute the initial part of stage 2 from the mbcsIndex */
1299 int32_t stageUTF8Length
=((int32_t)mbcsTable
->maxFastUChar
+1)>>6;
1300 int32_t stageUTF8Index
=0;
1301 int32_t st1
, st2
, st3
, i
;
1303 for(st1
=0; stageUTF8Index
<stageUTF8Length
; ++st1
) {
1305 if(st2
!=stage1Length
/2) {
1306 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1307 for(i
=0; i
<16; ++i
) {
1308 st3
=mbcsTable
->mbcsIndex
[stageUTF8Index
++];
1310 /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1313 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1314 * allocated together as a single 64-block for access from the mbcsIndex
1316 stage2
[st2
++]=st3
++;
1317 stage2
[st2
++]=st3
++;
1318 stage2
[st2
++]=st3
++;
1321 /* no stage 3 block, skip */
1326 /* no stage 2 block, skip */
1332 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1333 ucnv_MBCSEnumToUnicode(mbcsTable
, writeStage3Roundtrip
, mbcsTable
, pErrorCode
);
1336 /* MBCS setup functions ----------------------------------------------------- */
1339 ucnv_MBCSLoad(UConverterSharedData
*sharedData
,
1340 UConverterLoadArgs
*pArgs
,
1342 UErrorCode
*pErrorCode
) {
1344 UConverterMBCSTable
*mbcsTable
=&sharedData
->mbcs
;
1345 _MBCSHeader
*header
=(_MBCSHeader
*)raw
;
1347 uint32_t headerLength
;
1348 UBool noFromU
=FALSE
;
1350 if(header
->version
[0]==4) {
1351 headerLength
=MBCS_HEADER_V4_LENGTH
;
1352 } else if(header
->version
[0]==5 && header
->version
[1]>=3 &&
1353 (header
->options
&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK
)==0) {
1354 headerLength
=header
->options
&MBCS_OPT_LENGTH_MASK
;
1355 noFromU
=(UBool
)((header
->options
&MBCS_OPT_NO_FROM_U
)!=0);
1357 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1361 mbcsTable
->outputType
=(uint8_t)header
->flags
;
1362 if(noFromU
&& mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
1363 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1367 /* extension data, header version 4.2 and higher */
1368 offset
=header
->flags
>>8;
1370 mbcsTable
->extIndexes
=(const int32_t *)(raw
+offset
);
1373 if(mbcsTable
->outputType
==MBCS_OUTPUT_EXT_ONLY
) {
1374 UConverterLoadArgs args
={ 0 };
1375 UConverterSharedData
*baseSharedData
;
1376 const int32_t *extIndexes
;
1377 const char *baseName
;
1379 /* extension-only file, load the base table and set values appropriately */
1380 if((extIndexes
=mbcsTable
->extIndexes
)==NULL
) {
1381 /* extension-only file without extension */
1382 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1386 if(pArgs
->nestedLoads
!=1) {
1387 /* an extension table must not be loaded as a base table */
1388 *pErrorCode
=U_INVALID_TABLE_FILE
;
1392 /* load the base table */
1393 baseName
=(const char *)header
+headerLength
*4;
1394 if(0==uprv_strcmp(baseName
, sharedData
->staticData
->name
)) {
1395 /* forbid loading this same extension-only file */
1396 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1400 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1401 args
.size
=sizeof(UConverterLoadArgs
);
1403 args
.reserved
=pArgs
->reserved
;
1404 args
.options
=pArgs
->options
;
1405 args
.pkg
=pArgs
->pkg
;
1407 baseSharedData
=ucnv_load(&args
, pErrorCode
);
1408 if(U_FAILURE(*pErrorCode
)) {
1411 if( baseSharedData
->staticData
->conversionType
!=UCNV_MBCS
||
1412 baseSharedData
->mbcs
.baseSharedData
!=NULL
1414 ucnv_unload(baseSharedData
);
1415 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1419 /* copy the base table data */
1420 uprv_memcpy(mbcsTable
, &baseSharedData
->mbcs
, sizeof(UConverterMBCSTable
));
1422 /* overwrite values with relevant ones for the extension converter */
1423 mbcsTable
->baseSharedData
=baseSharedData
;
1424 mbcsTable
->extIndexes
=extIndexes
;
1427 * It would be possible to share the swapLFNL data with a base converter,
1428 * but the generated name would have to be different, and the memory
1429 * would have to be free'd only once.
1430 * It is easier to just create the data for the extension converter
1431 * separately when it is requested.
1433 mbcsTable
->swapLFNLStateTable
=NULL
;
1434 mbcsTable
->swapLFNLFromUnicodeBytes
=NULL
;
1435 mbcsTable
->swapLFNLName
=NULL
;
1438 * The reconstitutedData must be deleted only when the base converter
1441 mbcsTable
->reconstitutedData
=NULL
;
1444 * Set a special, runtime-only outputType if the extension converter
1445 * is a DBCS version of a base converter that also maps single bytes.
1447 if( sharedData
->staticData
->conversionType
==UCNV_DBCS
||
1448 (sharedData
->staticData
->conversionType
==UCNV_MBCS
&&
1449 sharedData
->staticData
->minBytesPerChar
>=2)
1451 if(baseSharedData
->mbcs
.outputType
==MBCS_OUTPUT_2_SISO
) {
1452 /* the base converter is SI/SO-stateful */
1455 /* get the dbcs state from the state table entry for SO=0x0e */
1456 entry
=mbcsTable
->stateTable
[0][0xe];
1457 if( MBCS_ENTRY_IS_FINAL(entry
) &&
1458 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_CHANGE_ONLY
&&
1459 MBCS_ENTRY_FINAL_STATE(entry
)!=0
1461 mbcsTable
->dbcsOnlyState
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
);
1463 mbcsTable
->outputType
=MBCS_OUTPUT_DBCS_ONLY
;
1466 baseSharedData
->staticData
->conversionType
==UCNV_MBCS
&&
1467 baseSharedData
->staticData
->minBytesPerChar
==1 &&
1468 baseSharedData
->staticData
->maxBytesPerChar
==2 &&
1469 mbcsTable
->countStates
<=127
1471 /* non-stateful base converter, need to modify the state table */
1472 int32_t (*newStateTable
)[256];
1476 /* allocate a new state table and copy the base state table contents */
1477 count
=mbcsTable
->countStates
;
1478 newStateTable
=(int32_t (*)[256])uprv_malloc((count
+1)*1024);
1479 if(newStateTable
==NULL
) {
1480 ucnv_unload(baseSharedData
);
1481 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1485 uprv_memcpy(newStateTable
, mbcsTable
->stateTable
, count
*1024);
1487 /* change all final single-byte entries to go to a new all-illegal state */
1488 state
=newStateTable
[0];
1489 for(i
=0; i
<256; ++i
) {
1490 if(MBCS_ENTRY_IS_FINAL(state
[i
])) {
1491 state
[i
]=MBCS_ENTRY_TRANSITION(count
, 0);
1495 /* build the new all-illegal state */
1496 state
=newStateTable
[count
];
1497 for(i
=0; i
<256; ++i
) {
1498 state
[i
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL
, 0);
1500 mbcsTable
->stateTable
=(const int32_t (*)[256])newStateTable
;
1501 mbcsTable
->countStates
=(uint8_t)(count
+1);
1502 mbcsTable
->stateTableOwned
=TRUE
;
1504 mbcsTable
->outputType
=MBCS_OUTPUT_DBCS_ONLY
;
1509 * unlike below for files with base tables, do not get the unicodeMask
1510 * from the sharedData; instead, use the base table's unicodeMask,
1511 * which we copied in the memcpy above;
1512 * this is necessary because the static data unicodeMask, especially
1513 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1516 /* conversion file with a base table; an additional extension table is optional */
1517 /* make sure that the output type is known */
1518 switch(mbcsTable
->outputType
) {
1523 case MBCS_OUTPUT_3_EUC
:
1524 case MBCS_OUTPUT_4_EUC
:
1525 case MBCS_OUTPUT_2_SISO
:
1529 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
1533 mbcsTable
->countStates
=(uint8_t)header
->countStates
;
1534 mbcsTable
->countToUFallbacks
=header
->countToUFallbacks
;
1535 mbcsTable
->stateTable
=(const int32_t (*)[256])(raw
+headerLength
*4);
1536 mbcsTable
->toUFallbacks
=(const _MBCSToUFallback
*)(mbcsTable
->stateTable
+header
->countStates
);
1537 mbcsTable
->unicodeCodeUnits
=(const uint16_t *)(raw
+header
->offsetToUCodeUnits
);
1539 mbcsTable
->fromUnicodeTable
=(const uint16_t *)(raw
+header
->offsetFromUTable
);
1540 mbcsTable
->fromUnicodeBytes
=(const uint8_t *)(raw
+header
->offsetFromUBytes
);
1541 mbcsTable
->fromUBytesLength
=header
->fromUBytesLength
;
1544 * converter versions 6.1 and up contain a unicodeMask that is
1545 * used here to select the most efficient function implementations
1547 info
.size
=sizeof(UDataInfo
);
1548 udata_getInfo((UDataMemory
*)sharedData
->dataMemory
, &info
);
1549 if(info
.formatVersion
[0]>6 || (info
.formatVersion
[0]==6 && info
.formatVersion
[1]>=1)) {
1550 /* mask off possible future extensions to be safe */
1551 mbcsTable
->unicodeMask
=(uint8_t)(sharedData
->staticData
->unicodeMask
&3);
1553 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1554 mbcsTable
->unicodeMask
=UCNV_HAS_SUPPLEMENTARY
|UCNV_HAS_SURROGATES
;
1558 * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1559 * Check for the header version, SBCS vs. MBCS, and for whether the
1560 * data structures are optimized for code points as high as what the
1561 * runtime code is designed for.
1562 * The implementation does not handle mapping tables with entries for
1563 * unpaired surrogates.
1565 if( header
->version
[1]>=3 &&
1566 (mbcsTable
->unicodeMask
&UCNV_HAS_SURROGATES
)==0 &&
1567 (mbcsTable
->countStates
==1 ?
1568 (header
->version
[2]>=(SBCS_FAST_MAX
>>8)) :
1569 (header
->version
[2]>=(MBCS_FAST_MAX
>>8))
1572 mbcsTable
->utf8Friendly
=TRUE
;
1574 if(mbcsTable
->countStates
==1) {
1576 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1577 * Build a table with indexes to each block, to be used instead of
1578 * the regular stage 1/2 table.
1581 for(i
=0; i
<(SBCS_FAST_LIMIT
>>6); ++i
) {
1582 mbcsTable
->sbcsIndex
[i
]=mbcsTable
->fromUnicodeTable
[mbcsTable
->fromUnicodeTable
[i
>>4]+((i
<<2)&0x3c)];
1584 /* set maxFastUChar to SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1585 mbcsTable
->maxFastUChar
=SBCS_FAST_MAX
;
1588 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1589 * The .cnv file is prebuilt with an additional stage table with indexes
1592 mbcsTable
->mbcsIndex
=(const uint16_t *)
1593 (mbcsTable
->fromUnicodeBytes
+
1594 (noFromU
? 0 : mbcsTable
->fromUBytesLength
));
1595 mbcsTable
->maxFastUChar
=(((UChar
)header
->version
[2])<<8)|0xff;
1599 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1601 uint32_t asciiRoundtrips
=0xffffffff;
1604 for(i
=0; i
<0x80; ++i
) {
1605 if(mbcsTable
->stateTable
[0][i
]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, i
)) {
1606 asciiRoundtrips
&=~((uint32_t)1<<(i
>>2));
1609 mbcsTable
->asciiRoundtrips
=asciiRoundtrips
;
1613 uint32_t stage1Length
=
1614 mbcsTable
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
?
1616 uint32_t stage2Length
=
1617 (header
->offsetFromUBytes
-header
->offsetFromUTable
)/4-
1619 reconstituteData(mbcsTable
, stage1Length
, stage2Length
, header
->fullStage2Length
, pErrorCode
);
1623 /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1624 if(mbcsTable
->utf8Friendly
) {
1625 if(mbcsTable
->countStates
==1) {
1626 sharedData
->impl
=&_SBCSUTF8Impl
;
1628 if(mbcsTable
->outputType
==MBCS_OUTPUT_2
) {
1629 sharedData
->impl
=&_DBCSUTF8Impl
;
1634 if(mbcsTable
->outputType
==MBCS_OUTPUT_DBCS_ONLY
|| mbcsTable
->outputType
==MBCS_OUTPUT_2_SISO
) {
1636 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1637 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1639 mbcsTable
->asciiRoundtrips
=0;
1644 ucnv_MBCSUnload(UConverterSharedData
*sharedData
) {
1645 UConverterMBCSTable
*mbcsTable
=&sharedData
->mbcs
;
1647 if(mbcsTable
->swapLFNLStateTable
!=NULL
) {
1648 uprv_free(mbcsTable
->swapLFNLStateTable
);
1650 if(mbcsTable
->stateTableOwned
) {
1651 uprv_free((void *)mbcsTable
->stateTable
);
1653 if(mbcsTable
->baseSharedData
!=NULL
) {
1654 ucnv_unload(mbcsTable
->baseSharedData
);
1656 if(mbcsTable
->reconstitutedData
!=NULL
) {
1657 uprv_free(mbcsTable
->reconstitutedData
);
1662 ucnv_MBCSOpen(UConverter
*cnv
,
1666 UErrorCode
*pErrorCode
) {
1667 UConverterMBCSTable
*mbcsTable
;
1668 const int32_t *extIndexes
;
1670 int8_t maxBytesPerUChar
;
1672 mbcsTable
=&cnv
->sharedData
->mbcs
;
1673 outputType
=mbcsTable
->outputType
;
1675 if(outputType
==MBCS_OUTPUT_DBCS_ONLY
) {
1676 /* the swaplfnl option does not apply, remove it */
1677 cnv
->options
=options
&=~UCNV_OPTION_SWAP_LFNL
;
1680 if((options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1681 /* do this because double-checked locking is broken */
1685 isCached
=mbcsTable
->swapLFNLStateTable
!=NULL
;
1689 if(!_EBCDICSwapLFNL(cnv
->sharedData
, pErrorCode
)) {
1690 if(U_FAILURE(*pErrorCode
)) {
1691 return; /* something went wrong */
1694 /* the option does not apply, remove it */
1695 cnv
->options
=options
&=~UCNV_OPTION_SWAP_LFNL
;
1700 if(uprv_strstr(name
, "18030")!=NULL
) {
1701 if(uprv_strstr(name
, "gb18030")!=NULL
|| uprv_strstr(name
, "GB18030")!=NULL
) {
1702 /* set a flag for GB 18030 mode, which changes the callback behavior */
1703 cnv
->options
|=_MBCS_OPTION_GB18030
;
1707 /* fix maxBytesPerUChar depending on outputType and options etc. */
1708 if(outputType
==MBCS_OUTPUT_2_SISO
) {
1709 cnv
->maxBytesPerUChar
=3; /* SO+DBCS */
1712 extIndexes
=mbcsTable
->extIndexes
;
1713 if(extIndexes
!=NULL
) {
1714 maxBytesPerUChar
=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes
);
1715 if(outputType
==MBCS_OUTPUT_2_SISO
) {
1716 ++maxBytesPerUChar
; /* SO + multiple DBCS */
1719 if(maxBytesPerUChar
>cnv
->maxBytesPerUChar
) {
1720 cnv
->maxBytesPerUChar
=maxBytesPerUChar
;
1726 * documentation of UConverter fields used for status
1727 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1731 cnv
->toUnicodeStatus
=0; /* offset */
1732 cnv
->mode
=0; /* state */
1733 cnv
->toULength
=0; /* byteIndex */
1737 cnv
->fromUnicodeStatus
=1; /* prevLength */
1742 ucnv_MBCSGetName(const UConverter
*cnv
) {
1743 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0 && cnv
->sharedData
->mbcs
.swapLFNLName
!=NULL
) {
1744 return cnv
->sharedData
->mbcs
.swapLFNLName
;
1746 return cnv
->sharedData
->staticData
->name
;
1750 /* MBCS-to-Unicode conversion functions ------------------------------------- */
1753 ucnv_MBCSGetFallback(UConverterMBCSTable
*mbcsTable
, uint32_t offset
) {
1754 const _MBCSToUFallback
*toUFallbacks
;
1755 uint32_t i
, start
, limit
;
1757 limit
=mbcsTable
->countToUFallbacks
;
1759 /* do a binary search for the fallback mapping */
1760 toUFallbacks
=mbcsTable
->toUFallbacks
;
1762 while(start
<limit
-1) {
1764 if(offset
<toUFallbacks
[i
].offset
) {
1771 /* did we really find it? */
1772 if(offset
==toUFallbacks
[start
].offset
) {
1773 return toUFallbacks
[start
].codePoint
;
1780 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1782 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1783 UErrorCode
*pErrorCode
) {
1785 const uint8_t *source
, *sourceLimit
;
1787 const UChar
*targetLimit
;
1790 const int32_t (*stateTable
)[256];
1792 int32_t sourceIndex
;
1798 /* set up the local pointers */
1799 cnv
=pArgs
->converter
;
1800 source
=(const uint8_t *)pArgs
->source
;
1801 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1802 target
=pArgs
->target
;
1803 targetLimit
=pArgs
->targetLimit
;
1804 offsets
=pArgs
->offsets
;
1806 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1807 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
1809 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
1812 /* sourceIndex=-1 if the current character began in the previous buffer */
1815 /* conversion loop */
1816 while(source
<sourceLimit
) {
1818 * The following test checks whether the available input would overflow the output.
1819 * It does not catch output of more than one code unit that
1820 * overflows as a result of a surrogate pair or callback output
1821 * from the last source byte.
1822 * Therefore, those situations also test for overflows and will
1823 * then break the loop, too.
1825 if(target
>=targetLimit
) {
1826 /* target is full */
1827 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1831 entry
=stateTable
[0][*source
++];
1832 /* MBCS_ENTRY_IS_FINAL(entry) */
1834 /* test the most common case first */
1835 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
1836 /* output BMP code point */
1837 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1839 *offsets
++=sourceIndex
;
1842 /* normal end of action codes: prepare for a new character */
1848 * An if-else-if chain provides more reliable performance for
1849 * the most common cases compared to a switch.
1851 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
1852 if(action
==MBCS_STATE_VALID_DIRECT_20
||
1853 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
1855 entry
=MBCS_ENTRY_FINAL_VALUE(entry
);
1856 /* output surrogate pair */
1857 *target
++=(UChar
)(0xd800|(UChar
)(entry
>>10));
1859 *offsets
++=sourceIndex
;
1861 c
=(UChar
)(0xdc00|(UChar
)(entry
&0x3ff));
1862 if(target
<targetLimit
) {
1865 *offsets
++=sourceIndex
;
1868 /* target overflow */
1869 cnv
->UCharErrorBuffer
[0]=c
;
1870 cnv
->UCharErrorBufferLength
=1;
1871 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1877 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
1878 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
1879 /* output BMP code point */
1880 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1882 *offsets
++=sourceIndex
;
1888 } else if(action
==MBCS_STATE_UNASSIGNED
) {
1889 /* just fall through */
1890 } else if(action
==MBCS_STATE_ILLEGAL
) {
1891 /* callback(illegal) */
1892 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1894 /* reserved, must never occur */
1899 if(U_FAILURE(*pErrorCode
)) {
1900 /* callback(illegal) */
1902 } else /* unassigned sequences indicated with byteIndex>0 */ {
1903 /* try an extension mapping */
1904 pArgs
->source
=(const char *)source
;
1905 cnv
->toUBytes
[0]=*(source
-1);
1906 cnv
->toULength
=_extToU(cnv
, cnv
->sharedData
,
1907 1, &source
, sourceLimit
,
1908 &target
, targetLimit
,
1909 &offsets
, sourceIndex
,
1912 sourceIndex
+=1+(int32_t)(source
-(const uint8_t *)pArgs
->source
);
1914 if(U_FAILURE(*pErrorCode
)) {
1915 /* not mappable or buffer overflow */
1921 /* write back the updated pointers */
1922 pArgs
->source
=(const char *)source
;
1923 pArgs
->target
=target
;
1924 pArgs
->offsets
=offsets
;
1928 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1929 * that only map to and from the BMP.
1930 * In addition to single-byte optimizations, the offset calculations
1931 * become much easier.
1934 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1935 UErrorCode
*pErrorCode
) {
1937 const uint8_t *source
, *sourceLimit
, *lastSource
;
1939 int32_t targetCapacity
, length
;
1942 const int32_t (*stateTable
)[256];
1944 int32_t sourceIndex
;
1949 /* set up the local pointers */
1950 cnv
=pArgs
->converter
;
1951 source
=(const uint8_t *)pArgs
->source
;
1952 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1953 target
=pArgs
->target
;
1954 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1955 offsets
=pArgs
->offsets
;
1957 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1958 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
1960 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
1963 /* sourceIndex=-1 if the current character began in the previous buffer */
1968 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1969 * for the minimum of the sourceLength and targetCapacity
1971 length
=(int32_t)(sourceLimit
-source
);
1972 if(length
<targetCapacity
) {
1973 targetCapacity
=length
;
1976 #if MBCS_UNROLL_SINGLE_TO_BMP
1977 /* unrolling makes it faster on Pentium III/Windows 2000 */
1978 /* unroll the loop with the most common case */
1980 if(targetCapacity
>=16) {
1981 int32_t count
, loops
, oredEntries
;
1983 loops
=count
=targetCapacity
>>4;
1985 oredEntries
=entry
=stateTable
[0][*source
++];
1986 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1987 oredEntries
|=entry
=stateTable
[0][*source
++];
1988 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1989 oredEntries
|=entry
=stateTable
[0][*source
++];
1990 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1991 oredEntries
|=entry
=stateTable
[0][*source
++];
1992 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1993 oredEntries
|=entry
=stateTable
[0][*source
++];
1994 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1995 oredEntries
|=entry
=stateTable
[0][*source
++];
1996 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1997 oredEntries
|=entry
=stateTable
[0][*source
++];
1998 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1999 oredEntries
|=entry
=stateTable
[0][*source
++];
2000 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2001 oredEntries
|=entry
=stateTable
[0][*source
++];
2002 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2003 oredEntries
|=entry
=stateTable
[0][*source
++];
2004 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2005 oredEntries
|=entry
=stateTable
[0][*source
++];
2006 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2007 oredEntries
|=entry
=stateTable
[0][*source
++];
2008 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2009 oredEntries
|=entry
=stateTable
[0][*source
++];
2010 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2011 oredEntries
|=entry
=stateTable
[0][*source
++];
2012 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2013 oredEntries
|=entry
=stateTable
[0][*source
++];
2014 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2015 oredEntries
|=entry
=stateTable
[0][*source
++];
2016 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2018 /* were all 16 entries really valid? */
2019 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries
)) {
2020 /* no, return to the first of these 16 */
2027 targetCapacity
-=16*count
;
2030 lastSource
+=16*count
;
2032 *offsets
++=sourceIndex
++;
2033 *offsets
++=sourceIndex
++;
2034 *offsets
++=sourceIndex
++;
2035 *offsets
++=sourceIndex
++;
2036 *offsets
++=sourceIndex
++;
2037 *offsets
++=sourceIndex
++;
2038 *offsets
++=sourceIndex
++;
2039 *offsets
++=sourceIndex
++;
2040 *offsets
++=sourceIndex
++;
2041 *offsets
++=sourceIndex
++;
2042 *offsets
++=sourceIndex
++;
2043 *offsets
++=sourceIndex
++;
2044 *offsets
++=sourceIndex
++;
2045 *offsets
++=sourceIndex
++;
2046 *offsets
++=sourceIndex
++;
2047 *offsets
++=sourceIndex
++;
2054 /* conversion loop */
2055 while(targetCapacity
>0) {
2056 entry
=stateTable
[0][*source
++];
2057 /* MBCS_ENTRY_IS_FINAL(entry) */
2059 /* test the most common case first */
2060 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2061 /* output BMP code point */
2062 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2068 * An if-else-if chain provides more reliable performance for
2069 * the most common cases compared to a switch.
2071 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2072 if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2073 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2074 /* output BMP code point */
2075 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2079 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2080 /* just fall through */
2081 } else if(action
==MBCS_STATE_ILLEGAL
) {
2082 /* callback(illegal) */
2083 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2085 /* reserved, must never occur */
2089 /* set offsets since the start or the last extension */
2091 int32_t count
=(int32_t)(source
-lastSource
);
2093 /* predecrement: do not set the offset for the callback-causing character */
2095 *offsets
++=sourceIndex
++;
2097 /* offset and sourceIndex are now set for the current character */
2100 if(U_FAILURE(*pErrorCode
)) {
2101 /* callback(illegal) */
2103 } else /* unassigned sequences indicated with byteIndex>0 */ {
2104 /* try an extension mapping */
2106 cnv
->toUBytes
[0]=*(source
-1);
2107 cnv
->toULength
=_extToU(cnv
, cnv
->sharedData
,
2108 1, &source
, sourceLimit
,
2109 &target
, pArgs
->targetLimit
,
2110 &offsets
, sourceIndex
,
2113 sourceIndex
+=1+(int32_t)(source
-lastSource
);
2115 if(U_FAILURE(*pErrorCode
)) {
2116 /* not mappable or buffer overflow */
2120 /* recalculate the targetCapacity after an extension mapping */
2121 targetCapacity
=(int32_t)(pArgs
->targetLimit
-target
);
2122 length
=(int32_t)(sourceLimit
-source
);
2123 if(length
<targetCapacity
) {
2124 targetCapacity
=length
;
2128 #if MBCS_UNROLL_SINGLE_TO_BMP
2129 /* unrolling makes it faster on Pentium III/Windows 2000 */
2134 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
2135 /* target is full */
2136 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2139 /* set offsets since the start or the last callback */
2141 size_t count
=source
-lastSource
;
2143 *offsets
++=sourceIndex
++;
2148 /* write back the updated pointers */
2149 pArgs
->source
=(const char *)source
;
2150 pArgs
->target
=target
;
2151 pArgs
->offsets
=offsets
;
2155 hasValidTrailBytes(const int32_t (*stateTable
)[256], uint8_t state
) {
2156 const int32_t *row
=stateTable
[state
];
2158 /* First test for final entries in this state for some commonly valid byte values. */
2160 if( !MBCS_ENTRY_IS_TRANSITION(entry
) &&
2161 MBCS_ENTRY_FINAL_ACTION(entry
)!=MBCS_STATE_ILLEGAL
2166 if( !MBCS_ENTRY_IS_TRANSITION(entry
) &&
2167 MBCS_ENTRY_FINAL_ACTION(entry
)!=MBCS_STATE_ILLEGAL
2171 /* Then test for final entries in this state. */
2172 for(b
=0; b
<=0xff; ++b
) {
2174 if( !MBCS_ENTRY_IS_TRANSITION(entry
) &&
2175 MBCS_ENTRY_FINAL_ACTION(entry
)!=MBCS_STATE_ILLEGAL
2180 /* Then recurse for transition entries. */
2181 for(b
=0; b
<=0xff; ++b
) {
2183 if( MBCS_ENTRY_IS_TRANSITION(entry
) &&
2184 hasValidTrailBytes(stateTable
, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
))
2193 * Is byte b a single/lead byte in this state?
2194 * Recurse for transition states, because here we don't want to say that
2195 * b is a lead byte if all byte sequences that start with b are illegal.
2198 isSingleOrLead(const int32_t (*stateTable
)[256], uint8_t state
, UBool isDBCSOnly
, uint8_t b
) {
2199 const int32_t *row
=stateTable
[state
];
2200 int32_t entry
=row
[b
];
2201 if(MBCS_ENTRY_IS_TRANSITION(entry
)) { /* lead byte */
2202 return hasValidTrailBytes(stateTable
, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
));
2204 uint8_t action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2205 if(action
==MBCS_STATE_CHANGE_ONLY
&& isDBCSOnly
) {
2206 return FALSE
; /* SI/SO are illegal for DBCS-only conversion */
2208 return action
!=MBCS_STATE_ILLEGAL
;
2214 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
2215 UErrorCode
*pErrorCode
) {
2217 const uint8_t *source
, *sourceLimit
;
2219 const UChar
*targetLimit
;
2222 const int32_t (*stateTable
)[256];
2223 const uint16_t *unicodeCodeUnits
;
2230 int32_t sourceIndex
, nextSourceIndex
;
2236 /* use optimized function if possible */
2237 cnv
=pArgs
->converter
;
2239 if(cnv
->preToULength
>0) {
2241 * pass sourceIndex=-1 because we continue from an earlier buffer
2242 * in the future, this may change with continuous offsets
2244 ucnv_extContinueMatchToU(cnv
, pArgs
, -1, pErrorCode
);
2246 if(U_FAILURE(*pErrorCode
) || cnv
->preToULength
<0) {
2251 if(cnv
->sharedData
->mbcs
.countStates
==1) {
2252 if(!(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
2253 ucnv_MBCSSingleToBMPWithOffsets(pArgs
, pErrorCode
);
2255 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs
, pErrorCode
);
2260 /* set up the local pointers */
2261 source
=(const uint8_t *)pArgs
->source
;
2262 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
2263 target
=pArgs
->target
;
2264 targetLimit
=pArgs
->targetLimit
;
2265 offsets
=pArgs
->offsets
;
2267 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2268 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
2270 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
2272 unicodeCodeUnits
=cnv
->sharedData
->mbcs
.unicodeCodeUnits
;
2274 /* get the converter state from UConverter */
2275 offset
=cnv
->toUnicodeStatus
;
2276 byteIndex
=cnv
->toULength
;
2277 bytes
=cnv
->toUBytes
;
2280 * if we are in the SBCS state for a DBCS-only converter,
2281 * then load the DBCS state from the MBCS data
2282 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2284 if((state
=(uint8_t)(cnv
->mode
))==0) {
2285 state
=cnv
->sharedData
->mbcs
.dbcsOnlyState
;
2288 /* sourceIndex=-1 if the current character began in the previous buffer */
2289 sourceIndex
=byteIndex
==0 ? 0 : -1;
2292 /* conversion loop */
2293 while(source
<sourceLimit
) {
2295 * This following test is to see if available input would overflow the output.
2296 * It does not catch output of more than one code unit that
2297 * overflows as a result of a surrogate pair or callback output
2298 * from the last source byte.
2299 * Therefore, those situations also test for overflows and will
2300 * then break the loop, too.
2302 if(target
>=targetLimit
) {
2303 /* target is full */
2304 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2309 /* optimized loop for 1/2-byte input and BMP output */
2312 entry
=stateTable
[state
][*source
];
2313 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
2314 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
2315 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2318 if( source
<sourceLimit
&&
2319 MBCS_ENTRY_IS_FINAL(entry
=stateTable
[state
][*source
]) &&
2320 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
&&
2321 (c
=unicodeCodeUnits
[offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
)])<0xfffe
2325 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2328 /* set the state and leave the optimized loop */
2329 bytes
[0]=*(source
-1);
2334 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2335 /* output BMP code point */
2337 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2338 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2340 /* leave the optimized loop */
2344 } while(source
<sourceLimit
&& target
<targetLimit
);
2345 } else /* offsets!=NULL */ {
2347 entry
=stateTable
[state
][*source
];
2348 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
2349 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
2350 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2353 if( source
<sourceLimit
&&
2354 MBCS_ENTRY_IS_FINAL(entry
=stateTable
[state
][*source
]) &&
2355 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
&&
2356 (c
=unicodeCodeUnits
[offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
)])<0xfffe
2361 *offsets
++=sourceIndex
;
2362 sourceIndex
=(nextSourceIndex
+=2);
2364 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2367 /* set the state and leave the optimized loop */
2369 bytes
[0]=*(source
-1);
2374 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2375 /* output BMP code point */
2377 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2379 *offsets
++=sourceIndex
;
2380 sourceIndex
=++nextSourceIndex
;
2382 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2384 /* leave the optimized loop */
2388 } while(source
<sourceLimit
&& target
<targetLimit
);
2392 * these tests and break statements could be put inside the loop
2393 * if C had "break outerLoop" like Java
2395 if(source
>=sourceLimit
) {
2398 if(target
>=targetLimit
) {
2399 /* target is full */
2400 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2405 bytes
[byteIndex
++]=*source
++;
2406 } else /* byteIndex>0 */ {
2408 entry
=stateTable
[state
][bytes
[byteIndex
++]=*source
++];
2411 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
2412 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
2413 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2417 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2420 /* set the next state early so that we can reuse the entry variable */
2421 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2424 * An if-else-if chain provides more reliable performance for
2425 * the most common cases compared to a switch.
2427 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2428 if(action
==MBCS_STATE_VALID_16
) {
2429 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2430 c
=unicodeCodeUnits
[offset
];
2432 /* output BMP code point */
2435 *offsets
++=sourceIndex
;
2438 } else if(c
==0xfffe) {
2439 if(UCNV_TO_U_USE_FALLBACK(cnv
) && (entry
=(int32_t)ucnv_MBCSGetFallback(&cnv
->sharedData
->mbcs
, offset
))!=0xfffe) {
2440 /* output fallback BMP code point */
2441 *target
++=(UChar
)entry
;
2443 *offsets
++=sourceIndex
;
2448 /* callback(illegal) */
2449 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2451 } else if(action
==MBCS_STATE_VALID_DIRECT_16
) {
2452 /* output BMP code point */
2453 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2455 *offsets
++=sourceIndex
;
2458 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
2459 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2460 c
=unicodeCodeUnits
[offset
++];
2462 /* output BMP code point below 0xd800 */
2465 *offsets
++=sourceIndex
;
2468 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
2469 /* output roundtrip or fallback surrogate pair */
2470 *target
++=(UChar
)(c
&0xdbff);
2472 *offsets
++=sourceIndex
;
2475 if(target
<targetLimit
) {
2476 *target
++=unicodeCodeUnits
[offset
];
2478 *offsets
++=sourceIndex
;
2481 /* target overflow */
2482 cnv
->UCharErrorBuffer
[0]=unicodeCodeUnits
[offset
];
2483 cnv
->UCharErrorBufferLength
=1;
2484 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2489 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
2490 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2491 *target
++=unicodeCodeUnits
[offset
];
2493 *offsets
++=sourceIndex
;
2496 } else if(c
==0xffff) {
2497 /* callback(illegal) */
2498 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2500 } else if(action
==MBCS_STATE_VALID_DIRECT_20
||
2501 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
2503 entry
=MBCS_ENTRY_FINAL_VALUE(entry
);
2504 /* output surrogate pair */
2505 *target
++=(UChar
)(0xd800|(UChar
)(entry
>>10));
2507 *offsets
++=sourceIndex
;
2510 c
=(UChar
)(0xdc00|(UChar
)(entry
&0x3ff));
2511 if(target
<targetLimit
) {
2514 *offsets
++=sourceIndex
;
2517 /* target overflow */
2518 cnv
->UCharErrorBuffer
[0]=c
;
2519 cnv
->UCharErrorBufferLength
=1;
2520 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2525 } else if(action
==MBCS_STATE_CHANGE_ONLY
) {
2527 * This serves as a state change without any output.
2528 * It is useful for reading simple stateful encodings,
2529 * for example using just Shift-In/Shift-Out codes.
2530 * The 21 unused bits may later be used for more sophisticated
2531 * state transitions.
2533 if(cnv
->sharedData
->mbcs
.dbcsOnlyState
==0) {
2536 /* SI/SO are illegal for DBCS-only conversion */
2537 state
=(uint8_t)(cnv
->mode
); /* restore the previous state */
2539 /* callback(illegal) */
2540 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2542 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2543 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2544 /* output BMP code point */
2545 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2547 *offsets
++=sourceIndex
;
2551 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2552 /* just fall through */
2553 } else if(action
==MBCS_STATE_ILLEGAL
) {
2554 /* callback(illegal) */
2555 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2557 /* reserved, must never occur */
2561 /* end of action codes: prepare for a new character */
2565 sourceIndex
=nextSourceIndex
;
2566 } else if(U_FAILURE(*pErrorCode
)) {
2567 /* callback(illegal) */
2570 * Ticket 5691: consistent illegal sequences:
2571 * - We include at least the first byte in the illegal sequence.
2572 * - If any of the non-initial bytes could be the start of a character,
2573 * we stop the illegal sequence before the first one of those.
2575 UBool isDBCSOnly
=(UBool
)(cnv
->sharedData
->mbcs
.dbcsOnlyState
!=0);
2578 i
<byteIndex
&& !isSingleOrLead(stateTable
, state
, isDBCSOnly
, bytes
[i
]);
2581 /* Back out some bytes. */
2582 int8_t backOutDistance
=byteIndex
-i
;
2583 int32_t bytesFromThisBuffer
=(int32_t)(source
-(const uint8_t *)pArgs
->source
);
2584 byteIndex
=i
; /* length of reported illegal byte sequence */
2585 if(backOutDistance
<=bytesFromThisBuffer
) {
2586 source
-=backOutDistance
;
2588 /* Back out bytes from the previous buffer: Need to replay them. */
2589 cnv
->preToULength
=(int8_t)(bytesFromThisBuffer
-backOutDistance
);
2590 /* preToULength is negative! */
2591 uprv_memcpy(cnv
->preToU
, bytes
+i
, -cnv
->preToULength
);
2592 source
=(const uint8_t *)pArgs
->source
;
2597 } else /* unassigned sequences indicated with byteIndex>0 */ {
2598 /* try an extension mapping */
2599 pArgs
->source
=(const char *)source
;
2600 byteIndex
=_extToU(cnv
, cnv
->sharedData
,
2601 byteIndex
, &source
, sourceLimit
,
2602 &target
, targetLimit
,
2603 &offsets
, sourceIndex
,
2606 sourceIndex
=nextSourceIndex
+=(int32_t)(source
-(const uint8_t *)pArgs
->source
);
2608 if(U_FAILURE(*pErrorCode
)) {
2609 /* not mappable or buffer overflow */
2615 /* set the converter state back into UConverter */
2616 cnv
->toUnicodeStatus
=offset
;
2618 cnv
->toULength
=byteIndex
;
2620 /* write back the updated pointers */
2621 pArgs
->source
=(const char *)source
;
2622 pArgs
->target
=target
;
2623 pArgs
->offsets
=offsets
;
2627 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2628 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2631 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
2632 UErrorCode
*pErrorCode
) {
2634 const int32_t (*stateTable
)[256];
2635 const uint8_t *source
, *sourceLimit
;
2640 /* set up the local pointers */
2641 cnv
=pArgs
->converter
;
2642 source
=(const uint8_t *)pArgs
->source
;
2643 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
2644 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2645 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
2647 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
2650 /* conversion loop */
2651 while(source
<sourceLimit
) {
2652 entry
=stateTable
[0][*source
++];
2653 /* MBCS_ENTRY_IS_FINAL(entry) */
2655 /* write back the updated pointer early so that we can return directly */
2656 pArgs
->source
=(const char *)source
;
2658 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2659 /* output BMP code point */
2660 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2664 * An if-else-if chain provides more reliable performance for
2665 * the most common cases compared to a switch.
2667 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2668 if( action
==MBCS_STATE_VALID_DIRECT_20
||
2669 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
2671 /* output supplementary code point */
2672 return (UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
2673 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2674 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2675 /* output BMP code point */
2676 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2678 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2679 /* just fall through */
2680 } else if(action
==MBCS_STATE_ILLEGAL
) {
2681 /* callback(illegal) */
2682 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2684 /* reserved, must never occur */
2688 if(U_FAILURE(*pErrorCode
)) {
2689 /* callback(illegal) */
2691 } else /* unassigned sequence */ {
2692 /* defer to the generic implementation */
2693 pArgs
->source
=(const char *)source
-1;
2694 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
2698 /* no output because of empty input or only state changes */
2699 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2704 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2705 * conversion without offset handling.
2707 * When a character does not have a mapping to Unicode, then we return to the
2708 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2710 * We also defer to the generic code in other complicated cases and have them
2711 * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2713 * All normal mappings and errors are handled here.
2716 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
2717 UErrorCode
*pErrorCode
) {
2719 const uint8_t *source
, *sourceLimit
, *lastSource
;
2721 const int32_t (*stateTable
)[256];
2722 const uint16_t *unicodeCodeUnits
;
2731 /* use optimized function if possible */
2732 cnv
=pArgs
->converter
;
2734 if(cnv
->preToULength
>0) {
2735 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2736 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
2739 if(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SURROGATES
) {
2741 * Using the generic ucnv_getNextUChar() code lets us deal correctly
2742 * with the rare case of a codepage that maps single surrogates
2743 * without adding the complexity to this already complicated function here.
2745 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
2746 } else if(cnv
->sharedData
->mbcs
.countStates
==1) {
2747 return ucnv_MBCSSingleGetNextUChar(pArgs
, pErrorCode
);
2750 /* set up the local pointers */
2751 source
=lastSource
=(const uint8_t *)pArgs
->source
;
2752 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
2754 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2755 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->mbcs
.swapLFNLStateTable
;
2757 stateTable
=cnv
->sharedData
->mbcs
.stateTable
;
2759 unicodeCodeUnits
=cnv
->sharedData
->mbcs
.unicodeCodeUnits
;
2761 /* get the converter state from UConverter */
2762 offset
=cnv
->toUnicodeStatus
;
2765 * if we are in the SBCS state for a DBCS-only converter,
2766 * then load the DBCS state from the MBCS data
2767 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2769 if((state
=(uint8_t)(cnv
->mode
))==0) {
2770 state
=cnv
->sharedData
->mbcs
.dbcsOnlyState
;
2773 /* conversion loop */
2775 while(source
<sourceLimit
) {
2776 entry
=stateTable
[state
][*source
++];
2777 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
2778 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
2779 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2781 /* optimization for 1/2-byte input and BMP output */
2782 if( source
<sourceLimit
&&
2783 MBCS_ENTRY_IS_FINAL(entry
=stateTable
[state
][*source
]) &&
2784 MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
&&
2785 (c
=unicodeCodeUnits
[offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
)])<0xfffe
2788 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2789 /* output BMP code point */
2793 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2796 /* set the next state early so that we can reuse the entry variable */
2797 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2800 * An if-else-if chain provides more reliable performance for
2801 * the most common cases compared to a switch.
2803 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2804 if(action
==MBCS_STATE_VALID_DIRECT_16
) {
2805 /* output BMP code point */
2806 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2808 } else if(action
==MBCS_STATE_VALID_16
) {
2809 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2810 c
=unicodeCodeUnits
[offset
];
2812 /* output BMP code point */
2814 } else if(c
==0xfffe) {
2815 if(UCNV_TO_U_USE_FALLBACK(cnv
) && (c
=ucnv_MBCSGetFallback(&cnv
->sharedData
->mbcs
, offset
))!=0xfffe) {
2819 /* callback(illegal) */
2820 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2822 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
2823 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2824 c
=unicodeCodeUnits
[offset
++];
2826 /* output BMP code point below 0xd800 */
2828 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
2829 /* output roundtrip or fallback supplementary code point */
2830 c
=((c
&0x3ff)<<10)+unicodeCodeUnits
[offset
]+(0x10000-0xdc00);
2832 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
2833 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2834 c
=unicodeCodeUnits
[offset
];
2836 } else if(c
==0xffff) {
2837 /* callback(illegal) */
2838 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2840 } else if(action
==MBCS_STATE_VALID_DIRECT_20
||
2841 (action
==MBCS_STATE_FALLBACK_DIRECT_20
&& UCNV_TO_U_USE_FALLBACK(cnv
))
2843 /* output supplementary code point */
2844 c
=(UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
2846 } else if(action
==MBCS_STATE_CHANGE_ONLY
) {
2848 * This serves as a state change without any output.
2849 * It is useful for reading simple stateful encodings,
2850 * for example using just Shift-In/Shift-Out codes.
2851 * The 21 unused bits may later be used for more sophisticated
2852 * state transitions.
2854 if(cnv
->sharedData
->mbcs
.dbcsOnlyState
!=0) {
2855 /* SI/SO are illegal for DBCS-only conversion */
2856 state
=(uint8_t)(cnv
->mode
); /* restore the previous state */
2858 /* callback(illegal) */
2859 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2861 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2862 if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2863 /* output BMP code point */
2864 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2867 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2868 /* just fall through */
2869 } else if(action
==MBCS_STATE_ILLEGAL
) {
2870 /* callback(illegal) */
2871 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2873 /* reserved (must never occur), or only state change */
2879 /* end of action codes: prepare for a new character */
2882 if(U_FAILURE(*pErrorCode
)) {
2883 /* callback(illegal) */
2885 } else /* unassigned sequence */ {
2886 /* defer to the generic implementation */
2887 cnv
->toUnicodeStatus
=0;
2889 pArgs
->source
=(const char *)lastSource
;
2890 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
2896 if(U_SUCCESS(*pErrorCode
) && source
==sourceLimit
&& lastSource
<source
) {
2897 /* incomplete character byte sequence */
2898 uint8_t *bytes
=cnv
->toUBytes
;
2899 cnv
->toULength
=(int8_t)(source
-lastSource
);
2901 *bytes
++=*lastSource
++;
2902 } while(lastSource
<source
);
2903 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
2904 } else if(U_FAILURE(*pErrorCode
)) {
2905 /* callback(illegal) */
2907 * Ticket 5691: consistent illegal sequences:
2908 * - We include at least the first byte in the illegal sequence.
2909 * - If any of the non-initial bytes could be the start of a character,
2910 * we stop the illegal sequence before the first one of those.
2912 UBool isDBCSOnly
=(UBool
)(cnv
->sharedData
->mbcs
.dbcsOnlyState
!=0);
2913 uint8_t *bytes
=cnv
->toUBytes
;
2914 *bytes
++=*lastSource
++; /* first byte */
2915 if(lastSource
==source
) {
2917 } else /* lastSource<source: multi-byte character */ {
2920 lastSource
<source
&& !isSingleOrLead(stateTable
, state
, isDBCSOnly
, *lastSource
);
2923 *bytes
++=*lastSource
++;
2929 /* no output because of empty input or only state changes */
2930 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2935 /* set the converter state back into UConverter, ready for a new character */
2936 cnv
->toUnicodeStatus
=0;
2939 /* write back the updated pointer */
2940 pArgs
->source
=(const char *)source
;
#if 0
/*
 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
 * Removal improves code coverage.
 */
/**
 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extToU()).
 *
 * @param sharedData  shared converter data providing the state table
 * @param b           the single input byte
 * @param useFallback whether fallback mappings may be used
 * @return the code point, 0xfffe if the byte is unassigned
 *         (or only has a fallback that may not be used),
 *         0xffff if it is illegal or has a reserved action code
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                              uint8_t b, UBool useFallback) {
    int32_t entry;
    uint8_t action;

    entry=sharedData->mbcs.stateTable[0][b];
    /* MBCS_ENTRY_IS_FINAL(entry) */

    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
        /* output BMP code point */
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    }

    /*
     * An if-else-if chain provides more reliable performance for
     * the most common cases compared to a switch.
     */
    action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
    if(action==MBCS_STATE_VALID_DIRECT_20) {
        /* output supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
    } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
        if(!TO_U_USE_FALLBACK(useFallback)) {
            return 0xfffe;
        }
        /* output BMP code point */
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
        if(!TO_U_USE_FALLBACK(useFallback)) {
            return 0xfffe;
        }
        /* output supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
    } else if(action==MBCS_STATE_UNASSIGNED) {
        return 0xfffe;
    } else if(action==MBCS_STATE_ILLEGAL) {
        return 0xffff;
    } else {
        /* reserved, must never occur */
        return 0xffff;
    }
}
#endif

3000 * This is a simple version of _MBCSGetNextUChar() that is used
3001 * by other converter implementations.
3002 * It only returns an "assigned" result if it consumes the entire input.
3003 * It does not use state from the converter, nor error codes.
3004 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3005 * It handles conversion extensions but not GB 18030.
3010 * otherwise the Unicode code point
3013 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData
*sharedData
,
3014 const char *source
, int32_t length
,
3015 UBool useFallback
) {
3016 const int32_t (*stateTable
)[256];
3017 const uint16_t *unicodeCodeUnits
;
3020 uint8_t state
, action
;
3026 /* no input at all: "illegal" */
3032 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3033 * TODO In future releases, verify that this function is never called for SBCS
3034 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3035 * Removal improves code coverage.
3037 /* use optimized function if possible */
3038 if(sharedData
->mbcs
.countStates
==1) {
3040 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData
, (uint8_t)*source
, useFallback
);
3042 return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3047 /* set up the local pointers */
3048 stateTable
=sharedData
->mbcs
.stateTable
;
3049 unicodeCodeUnits
=sharedData
->mbcs
.unicodeCodeUnits
;
3051 /* converter state */
3053 state
=sharedData
->mbcs
.dbcsOnlyState
;
3055 /* conversion loop */
3057 entry
=stateTable
[state
][(uint8_t)source
[i
++]];
3058 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
3059 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
3060 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
3063 return 0xffff; /* truncated character */
3067 * An if-else-if chain provides more reliable performance for
3068 * the most common cases compared to a switch.
3070 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
3071 if(action
==MBCS_STATE_VALID_16
) {
3072 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
3073 c
=unicodeCodeUnits
[offset
];
3076 } else if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
3077 c
=ucnv_MBCSGetFallback(&sharedData
->mbcs
, offset
);
3078 /* else done with 0xfffe */
3081 } else if(action
==MBCS_STATE_VALID_DIRECT_16
) {
3082 /* output BMP code point */
3083 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3085 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
3086 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
3087 c
=unicodeCodeUnits
[offset
++];
3089 /* output BMP code point below 0xd800 */
3090 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
3091 /* output roundtrip or fallback supplementary code point */
3092 c
=(UChar32
)(((c
&0x3ff)<<10)+unicodeCodeUnits
[offset
]+(0x10000-0xdc00));
3093 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
3094 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3095 c
=unicodeCodeUnits
[offset
];
3096 } else if(c
==0xffff) {
3102 } else if(action
==MBCS_STATE_VALID_DIRECT_20
) {
3103 /* output supplementary code point */
3104 c
=0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
3106 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
3107 if(!TO_U_USE_FALLBACK(useFallback
)) {
3111 /* output BMP code point */
3112 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
3114 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
3115 if(!TO_U_USE_FALLBACK(useFallback
)) {
3119 /* output supplementary code point */
3120 c
=0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
3122 } else if(action
==MBCS_STATE_UNASSIGNED
) {
3128 * forbid MBCS_STATE_CHANGE_ONLY for this function,
3129 * and MBCS_STATE_ILLEGAL and reserved action codes
3136 /* illegal for this function: not all input consumed */
3141 /* try an extension mapping */
3142 const int32_t *cx
=sharedData
->mbcs
.extIndexes
;
3144 return ucnv_extSimpleMatchToU(cx
, source
, length
, useFallback
);
3151 /* MBCS-from-Unicode conversion functions ----------------------------------- */
3153 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
3155 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3156 UErrorCode
*pErrorCode
) {
3158 const UChar
*source
, *sourceLimit
;
3160 int32_t targetCapacity
;
3163 const uint16_t *table
;
3164 const uint16_t *mbcsIndex
;
3165 const uint8_t *bytes
;
3169 int32_t sourceIndex
, nextSourceIndex
;
3171 uint32_t stage2Entry
;
3172 uint32_t asciiRoundtrips
;
3174 uint8_t unicodeMask
;
3176 /* use optimized function if possible */
3177 cnv
=pArgs
->converter
;
3178 unicodeMask
=cnv
->sharedData
->mbcs
.unicodeMask
;
3180 /* set up the local pointers */
3181 source
=pArgs
->source
;
3182 sourceLimit
=pArgs
->sourceLimit
;
3183 target
=(uint8_t *)pArgs
->target
;
3184 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
3185 offsets
=pArgs
->offsets
;
3187 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
3188 mbcsIndex
=cnv
->sharedData
->mbcs
.mbcsIndex
;
3189 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3190 bytes
=cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
3192 bytes
=cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
3194 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
3196 /* get the converter state from UConverter */
3199 /* sourceIndex=-1 if the current character began in the previous buffer */
3200 sourceIndex
= c
==0 ? 0 : -1;
3203 /* conversion loop */
3204 if(c
!=0 && targetCapacity
>0) {
3208 while(source
<sourceLimit
) {
3210 * This following test is to see if available input would overflow the output.
3211 * It does not catch output of more than one byte that
3212 * overflows as a result of a multi-byte character or callback output
3213 * from the last source character.
3214 * Therefore, those situations also test for overflows and will
3215 * then break the loop, too.
3217 if(targetCapacity
>0) {
3219 * Get a correct Unicode code point:
3220 * a single UChar for a BMP code point or
3221 * a matched surrogate pair for a "supplementary code point".
3225 if(c
<=0x7f && IS_ASCII_ROUNDTRIP(c
, asciiRoundtrips
)) {
3226 *target
++=(uint8_t)c
;
3228 *offsets
++=sourceIndex
;
3229 sourceIndex
=nextSourceIndex
;
3236 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3237 * to avoid dealing with surrogates.
3238 * MBCS_FAST_MAX must be >=0xd7ff.
3241 value
=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex
, (const uint16_t *)bytes
, c
);
3242 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3246 /* output the value */
3249 * This also tests if the codepage maps single surrogates.
3250 * If it does, then surrogates are not paired but mapped separately.
3251 * Note that in this case unmatched surrogates are not detected.
3253 if(UTF_IS_SURROGATE(c
) && !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
3254 if(UTF_IS_SURROGATE_FIRST(c
)) {
3256 if(source
<sourceLimit
) {
3257 /* test the following code unit */
3258 UChar trail
=*source
;
3259 if(UTF_IS_SECOND_SURROGATE(trail
)) {
3262 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
3263 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
3264 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3265 /* callback(unassigned) */
3268 /* convert this supplementary code point */
3269 /* exit this condition tree */
3271 /* this is an unmatched lead code unit (1st surrogate) */
3272 /* callback(illegal) */
3273 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3281 /* this is an unmatched trail code unit (2nd surrogate) */
3282 /* callback(illegal) */
3283 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3288 /* convert the Unicode code point in c into codepage bytes */
3289 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
3291 /* get the bytes and the length for the output */
3293 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
3295 /* is this code point assigned, or do we use fallbacks? */
3296 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
3297 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && value
!=0))
3300 * We allow a 0 byte output if the "assigned" bit is set for this entry.
3301 * There is no way with this data structure for fallback output
3302 * to be a zero byte.
3306 /* try an extension mapping */
3307 pArgs
->source
=source
;
3308 c
=_extFromU(cnv
, cnv
->sharedData
,
3309 c
, &source
, sourceLimit
,
3310 &target
, target
+targetCapacity
,
3311 &offsets
, sourceIndex
,
3314 nextSourceIndex
+=(int32_t)(source
-pArgs
->source
);
3316 if(U_FAILURE(*pErrorCode
)) {
3317 /* not mappable or buffer overflow */
3320 /* a mapping was written to the target, continue */
3322 /* recalculate the targetCapacity after an extension mapping */
3323 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
3325 /* normal end of conversion: prepare for a new character */
3326 sourceIndex
=nextSourceIndex
;
3332 /* write the output character bytes from value and length */
3333 /* from the first if in the loop we know that targetCapacity>0 */
3335 /* this is easy because we know that there is enough space */
3336 *target
++=(uint8_t)value
;
3338 *offsets
++=sourceIndex
;
3341 } else /* length==2 */ {
3342 *target
++=(uint8_t)(value
>>8);
3343 if(2<=targetCapacity
) {
3344 *target
++=(uint8_t)value
;
3346 *offsets
++=sourceIndex
;
3347 *offsets
++=sourceIndex
;
3352 *offsets
++=sourceIndex
;
3354 cnv
->charErrorBuffer
[0]=(char)value
;
3355 cnv
->charErrorBufferLength
=1;
3357 /* target overflow */
3359 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3365 /* normal end of conversion: prepare for a new character */
3367 sourceIndex
=nextSourceIndex
;
3370 /* target is full */
3371 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3376 /* set the converter state back into UConverter */
3379 /* write back the updated pointers */
3380 pArgs
->source
=source
;
3381 pArgs
->target
=(char *)target
;
3382 pArgs
->offsets
=offsets
;
3385 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
/*
 * Converts UTF-16 input to single-byte output using the lookups visible below.
 *
 * pArgs      - supplies the converter (pArgs->converter), the input range
 *              pArgs->source..pArgs->sourceLimit, the output range
 *              pArgs->target..pArgs->targetLimit and pArgs->offsets.
 *              pArgs->source, pArgs->target and pArgs->offsets are written
 *              back before the function ends.
 * pErrorCode - set to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate and to
 *              U_BUFFER_OVERFLOW_ERROR when the target is full; it is also
 *              tested with U_FAILURE() right after the _extFromU() call.
 *
 * Each code point is looked up with MBCS_SINGLE_RESULT_FROM_U(); a result
 * of at least minValue is written as one byte, anything else is handed to
 * _extFromU().
 *
 * NOTE(review): several statements (for example the minValue assignments)
 * are not visible in this listing - confirm details against the full file.
 */
3387 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3388 UErrorCode
*pErrorCode
) {
3390 const UChar
*source
, *sourceLimit
;
3392 int32_t targetCapacity
;
3395 const uint16_t *table
;
3396 const uint16_t *results
;
3400 int32_t sourceIndex
, nextSourceIndex
;
3402 uint16_t value
, minValue
;
3403 UBool hasSupplementary
;
3405 /* set up the local pointers */
3406 cnv
=pArgs
->converter
;
3407 source
=pArgs
->source
;
3408 sourceLimit
=pArgs
->sourceLimit
;
3409 target
=(uint8_t *)pArgs
->target
;
3410 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
3411 offsets
=pArgs
->offsets
;
3413 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
/* the UCNV_OPTION_SWAP_LFNL option selects the swapLFNLFromUnicodeBytes results instead of fromUnicodeBytes */
3414 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3415 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
3417 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
3420 if(cnv
->useFallback
) {
3421 /* use all roundtrip and fallback results */
3424 /* use only roundtrips and fallbacks from private-use characters */
/* remember whether the unicodeMask of this codepage has UCNV_HAS_SUPPLEMENTARY set; tested after a surrogate pair is combined */
3427 hasSupplementary
=(UBool
)(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
);
3429 /* get the converter state from UConverter */
3432 /* sourceIndex=-1 if the current character began in the previous buffer */
3433 sourceIndex
= c
==0 ? 0 : -1;
3436 /* conversion loop */
3437 if(c
!=0 && targetCapacity
>0) {
3441 while(source
<sourceLimit
) {
3443 * This following test is to see if available input would overflow the output.
3444 * It does not catch output of more than one byte that
3445 * overflows as a result of a multi-byte character or callback output
3446 * from the last source character.
3447 * Therefore, those situations also test for overflows and will
3448 * then break the loop, too.
3450 if(targetCapacity
>0) {
3452 * Get a correct Unicode code point:
3453 * a single UChar for a BMP code point or
3454 * a matched surrogate pair for a "supplementary code point".
3458 if(UTF_IS_SURROGATE(c
)) {
3459 if(UTF_IS_SURROGATE_FIRST(c
)) {
3461 if(source
<sourceLimit
) {
3462 /* test the following code unit */
3463 UChar trail
=*source
;
3464 if(UTF_IS_SECOND_SURROGATE(trail
)) {
3467 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
3468 if(!hasSupplementary
) {
3469 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3470 /* callback(unassigned) */
3473 /* convert this supplementary code point */
3474 /* exit this condition tree */
3476 /* this is an unmatched lead code unit (1st surrogate) */
3477 /* callback(illegal) */
3478 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3486 /* this is an unmatched trail code unit (2nd surrogate) */
3487 /* callback(illegal) */
3488 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3493 /* convert the Unicode code point in c into codepage bytes */
3494 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3496 /* is this code point assigned, or do we use fallbacks? */
/* NOTE(review): minValue depends on cnv->useFallback (see the test above); its assignments are not visible in this listing */
3497 if(value
>=minValue
) {
3498 /* assigned, write the output character bytes from value and length */
3500 /* this is easy because we know that there is enough space */
3501 *target
++=(uint8_t)value
;
3503 *offsets
++=sourceIndex
;
3507 /* normal end of conversion: prepare for a new character */
3509 sourceIndex
=nextSourceIndex
;
3510 } else { /* unassigned */
/*
 * pArgs->source records where source stood before the _extFromU() call so
 * that nextSourceIndex can afterwards be advanced by the distance that
 * source moved during the call (source-pArgs->source).
 */
3512 /* try an extension mapping */
3513 pArgs
->source
=source
;
3514 c
=_extFromU(cnv
, cnv
->sharedData
,
3515 c
, &source
, sourceLimit
,
3516 &target
, target
+targetCapacity
,
3517 &offsets
, sourceIndex
,
3520 nextSourceIndex
+=(int32_t)(source
-pArgs
->source
);
3522 if(U_FAILURE(*pErrorCode
)) {
3523 /* not mappable or buffer overflow */
3526 /* a mapping was written to the target, continue */
3528 /* recalculate the targetCapacity after an extension mapping */
3529 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
3531 /* normal end of conversion: prepare for a new character */
3532 sourceIndex
=nextSourceIndex
;
3536 /* target is full */
3537 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3542 /* set the converter state back into UConverter */
3545 /* write back the updated pointers */
3546 pArgs
->source
=source
;
3547 pArgs
->target
=(char *)target
;
3548 pArgs
->offsets
=offsets
;
3552 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3553 * that map only to and from the BMP.
3554 * In addition to single-byte/state optimizations, the offset calculations
3555 * become much easier.
3556 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3557 * but measurements have shown that this diminishes performance
3558 * in more cases than it improves it.
3559 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3560 * for various MBCS and SBCS optimizations.
/*
 * pArgs      - supplies the converter (pArgs->converter), the input range
 *              pArgs->source..pArgs->sourceLimit, the output range
 *              pArgs->target..pArgs->targetLimit and pArgs->offsets.
 *              pArgs->source, pArgs->target and pArgs->offsets are written
 *              back before the function ends.
 * pErrorCode - set to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate, to
 *              U_TRUNCATED_CHAR_FOUND in the branch where no trail unit is
 *              available, and to U_BUFFER_OVERFLOW_ERROR when input remains
 *              but target has reached pArgs->targetLimit.
 *
 * Code points up to 0x7f that pass IS_ASCII_ROUNDTRIP() are copied directly;
 * other code points are looked up with MBCS_SINGLE_RESULT_FROM_U() and are
 * written as one byte when the result is at least minValue. Offsets are not
 * written per character in the main loop; they are filled in afterwards from
 * the distance between source and lastSource.
 *
 * NOTE(review): several statements (for example the minValue and lastSource
 * assignments) are not visible in this listing - confirm details against
 * the full file.
 */
3563 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3564 UErrorCode
*pErrorCode
) {
3566 const UChar
*source
, *sourceLimit
, *lastSource
;
3568 int32_t targetCapacity
, length
;
3571 const uint16_t *table
;
3572 const uint16_t *results
;
3576 int32_t sourceIndex
;
3578 uint32_t asciiRoundtrips
;
3579 uint16_t value
, minValue
;
3581 /* set up the local pointers */
3582 cnv
=pArgs
->converter
;
3583 source
=pArgs
->source
;
3584 sourceLimit
=pArgs
->sourceLimit
;
3585 target
=(uint8_t *)pArgs
->target
;
3586 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
3587 offsets
=pArgs
->offsets
;
3589 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
3590 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3591 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
3593 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
3595 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
3597 if(cnv
->useFallback
) {
3598 /* use all roundtrip and fallback results */
3601 /* use only roundtrips and fallbacks from private-use characters */
3605 /* get the converter state from UConverter */
3608 /* sourceIndex=-1 if the current character began in the previous buffer */
3609 sourceIndex
= c
==0 ? 0 : -1;
3613 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3614 * for the minimum of the sourceLength and targetCapacity
3616 length
=(int32_t)(sourceLimit
-source
);
3617 if(length
<targetCapacity
) {
3618 targetCapacity
=length
;
3621 /* conversion loop */
3622 if(c
!=0 && targetCapacity
>0) {
/* MBCS_UNROLL_SINGLE_FROM_BMP is defined as 0 near the top of this file, so the unrolled section below is compiled out */
3626 #if MBCS_UNROLL_SINGLE_FROM_BMP
3627 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3628 /* unroll the loop with the most common case */
3630 if(targetCapacity
>=4) {
3631 int32_t count
, loops
;
3632 uint16_t andedValues
;
3634 loops
=count
=targetCapacity
>>2;
/* the four lookup results are ANDed into andedValues, which is what the validity test further down compares against minValue */
3637 andedValues
=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3638 *target
++=(uint8_t)value
;
3640 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3641 *target
++=(uint8_t)value
;
3643 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3644 *target
++=(uint8_t)value
;
3646 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3647 *target
++=(uint8_t)value
;
3649 /* were all 4 entries really valid? */
3650 if(andedValues
<minValue
) {
3651 /* no, return to the first of these 4 */
3658 targetCapacity
-=4*count
;
3661 lastSource
+=4*count
;
3663 *offsets
++=sourceIndex
++;
3664 *offsets
++=sourceIndex
++;
3665 *offsets
++=sourceIndex
++;
3666 *offsets
++=sourceIndex
++;
3675 while(targetCapacity
>0) {
3677 * Get a correct Unicode code point:
3678 * a single UChar for a BMP code point or
3679 * a matched surrogate pair for a "supplementary code point".
3683 * Do not immediately check for single surrogates:
3684 * Assume that they are unassigned and check for them in that case.
3685 * This speeds up the conversion of assigned characters.
3687 /* convert the Unicode code point in c into codepage bytes */
3688 if(c
<=0x7f && IS_ASCII_ROUNDTRIP(c
, asciiRoundtrips
)) {
3689 *target
++=(uint8_t)c
;
3694 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3695 /* is this code point assigned, or do we use fallbacks? */
3696 if(value
>=minValue
) {
3697 /* assigned, write the output character bytes from value and length */
3699 /* this is easy because we know that there is enough space */
3700 *target
++=(uint8_t)value
;
3703 /* normal end of conversion: prepare for a new character */
3706 } else if(!UTF_IS_SURROGATE(c
)) {
3707 /* normal, unassigned BMP character */
3708 } else if(UTF_IS_SURROGATE_FIRST(c
)) {
3710 if(source
<sourceLimit
) {
3711 /* test the following code unit */
3712 UChar trail
=*source
;
3713 if(UTF_IS_SECOND_SURROGATE(trail
)) {
3715 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
3716 /* this codepage does not map supplementary code points */
3717 /* callback(unassigned) */
3719 /* this is an unmatched lead code unit (1st surrogate) */
3720 /* callback(illegal) */
3721 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3727 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
3732 /* this is an unmatched trail code unit (2nd surrogate) */
3733 /* callback(illegal) */
3734 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3738 /* c does not have a mapping */
3740 /* get the number of code units for c to correctly advance sourceIndex */
3741 length
=U16_LENGTH(c
);
3743 /* set offsets since the start or the last extension */
3745 int32_t count
=(int32_t)(source
-lastSource
);
3747 /* do not set the offset for this character */
3751 *offsets
++=sourceIndex
++;
3754 /* offsets and sourceIndex are now set for the current character */
/*
 * The extension call receives the real pArgs->targetLimit rather than
 * target+targetCapacity. After the call sourceIndex is advanced by length
 * (the code units of c) plus the distance between source and lastSource.
 */
3757 /* try an extension mapping */
3759 c
=_extFromU(cnv
, cnv
->sharedData
,
3760 c
, &source
, sourceLimit
,
3761 &target
, (const uint8_t *)(pArgs
->targetLimit
),
3762 &offsets
, sourceIndex
,
3765 sourceIndex
+=length
+(int32_t)(source
-lastSource
);
3768 if(U_FAILURE(*pErrorCode
)) {
3769 /* not mappable or buffer overflow */
3772 /* a mapping was written to the target, continue */
3774 /* recalculate the targetCapacity after an extension mapping */
3775 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
3776 length
=(int32_t)(sourceLimit
-source
);
3777 if(length
<targetCapacity
) {
3778 targetCapacity
=length
;
3782 #if MBCS_UNROLL_SINGLE_FROM_BMP
3783 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3788 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
3789 /* target is full */
3790 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3793 /* set offsets since the start or the last callback */
3795 size_t count
=source
-lastSource
;
3796 if (count
> 0 && *pErrorCode
== U_TRUNCATED_CHAR_FOUND
) {
3798 Caller gave us a partial supplementary character,
3799 which this function couldn't convert in any case.
3800 The callback will handle the offset.
3805 *offsets
++=sourceIndex
++;
3810 /* set the converter state back into UConverter */
3813 /* write back the updated pointers */
3814 pArgs
->source
=source
;
3815 pArgs
->target
=(char *)target
;
3816 pArgs
->offsets
=offsets
;
/*
 * General fromUnicode conversion for MBCS codepages, with offsets.
 *
 * pArgs      - supplies the converter (pArgs->converter), the input range
 *              pArgs->source..pArgs->sourceLimit, the output range
 *              pArgs->target..pArgs->targetLimit, pArgs->offsets and
 *              pArgs->flush. pArgs->source, pArgs->target and pArgs->offsets
 *              are written back before the function ends.
 * pErrorCode - set to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate and to
 *              U_BUFFER_OVERFLOW_ERROR when output does not fit; it is also
 *              tested with U_FAILURE() after the extension calls.
 *
 * Steps visible below:
 * - a pending extension match (cnv->preFromUFirstCP>=0) is continued with
 *   ucnv_extContinueMatchFromU();
 * - MBCS_OUTPUT_1 codepages without UCNV_HAS_SURROGATES are handed to one of
 *   the single-byte functions, and utf8Friendly MBCS_OUTPUT_2 codepages to
 *   ucnv_MBCSDoubleFromUnicodeWithOffsets();
 * - otherwise each code point is looked up according to outputType and the
 *   resulting bytes of value are written to the target; bytes that do not
 *   fit are stored in cnv->charErrorBuffer;
 * - for MBCS_OUTPUT_2_SISO a final UCNV_SI is emitted when flushing at the
 *   end of input while prevLength==2.
 *
 * NOTE(review): many statements (labels, goto, break, return, length
 * assignments) are not visible in this listing - confirm details against
 * the full file.
 */
3820 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3821 UErrorCode
*pErrorCode
) {
3823 const UChar
*source
, *sourceLimit
;
3825 int32_t targetCapacity
;
3828 const uint16_t *table
;
3829 const uint16_t *mbcsIndex
;
3830 const uint8_t *p
, *bytes
;
3835 int32_t prevSourceIndex
, sourceIndex
, nextSourceIndex
;
3837 uint32_t stage2Entry
;
3838 uint32_t asciiRoundtrips
;
3840 int32_t length
, prevLength
;
3841 uint8_t unicodeMask
;
3843 cnv
=pArgs
->converter
;
/* preFromUFirstCP>=0 means an extension match is still pending; it is continued before anything else */
3845 if(cnv
->preFromUFirstCP
>=0) {
3847 * pass sourceIndex=-1 because we continue from an earlier buffer
3848 * in the future, this may change with continuous offsets
3850 ucnv_extContinueMatchFromU(cnv
, pArgs
, -1, pErrorCode
);
3852 if(U_FAILURE(*pErrorCode
) || cnv
->preFromULength
<0) {
3857 /* use optimized function if possible */
/*
 * MBCS_OUTPUT_1 without UCNV_HAS_SURROGATES: the BMP-only single-byte
 * function is called when UCNV_HAS_SUPPLEMENTARY is not set, the general
 * single-byte function otherwise. utf8Friendly MBCS_OUTPUT_2 tables go to
 * the double-byte function.
 */
3858 outputType
=cnv
->sharedData
->mbcs
.outputType
;
3859 unicodeMask
=cnv
->sharedData
->mbcs
.unicodeMask
;
3860 if(outputType
==MBCS_OUTPUT_1
&& !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
3861 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
3862 ucnv_MBCSSingleFromBMPWithOffsets(pArgs
, pErrorCode
);
3864 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs
, pErrorCode
);
3867 } else if(outputType
==MBCS_OUTPUT_2
&& cnv
->sharedData
->mbcs
.utf8Friendly
) {
3868 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs
, pErrorCode
);
3872 /* set up the local pointers */
3873 source
=pArgs
->source
;
3874 sourceLimit
=pArgs
->sourceLimit
;
3875 target
=(uint8_t *)pArgs
->target
;
3876 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
3877 offsets
=pArgs
->offsets
;
3879 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
/* mbcsIndex is taken from the shared data for utf8Friendly tables; the fast path further down also requires mbcsIndex!=NULL */
3880 if(cnv
->sharedData
->mbcs
.utf8Friendly
) {
3881 mbcsIndex
=cnv
->sharedData
->mbcs
.mbcsIndex
;
3885 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3886 bytes
=cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
3888 bytes
=cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
3890 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
3892 /* get the converter state from UConverter */
3895 if(outputType
==MBCS_OUTPUT_2_SISO
) {
3896 prevLength
=cnv
->fromUnicodeStatus
;
3898 /* set the real value */
3902 /* prevent fromUnicodeStatus from being set to something non-0 */
3906 /* sourceIndex=-1 if the current character began in the previous buffer */
3908 sourceIndex
= c
==0 ? 0 : -1;
3911 /* conversion loop */
3913 * This is another piece of ugly code:
3914 * A goto into the loop if the converter state contains a first surrogate
3915 * from the previous function call.
3916 * It saves me to check in each loop iteration a check of if(c==0)
3917 * and duplicating the trail-surrogate-handling code in the else
3918 * branch of that check.
3919 * I could not find any other way to get around this other than
3920 * using a function call for the conversion and callback, which would
3921 * be even more inefficient.
3923 * Markus Scherer 2000-jul-19
3925 if(c
!=0 && targetCapacity
>0) {
3929 while(source
<sourceLimit
) {
3931 * This following test is to see if available input would overflow the output.
3932 * It does not catch output of more than one byte that
3933 * overflows as a result of a multi-byte character or callback output
3934 * from the last source character.
3935 * Therefore, those situations also test for overflows and will
3936 * then break the loop, too.
3938 if(targetCapacity
>0) {
3940 * Get a correct Unicode code point:
3941 * a single UChar for a BMP code point or
3942 * a matched surrogate pair for a "supplementary code point".
3946 if(c
<=0x7f && IS_ASCII_ROUNDTRIP(c
, asciiRoundtrips
)) {
3947 *target
++=(uint8_t)c
;
3949 *offsets
++=sourceIndex
;
3950 prevSourceIndex
=sourceIndex
;
3951 sourceIndex
=nextSourceIndex
;
3958 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3959 * to avoid dealing with surrogates.
3960 * MBCS_FAST_MAX must be >=0xd7ff.
/* fast path: mbcsIndex is indexed with c>>6 and the low six bits of c (c&0x3f) are added when reading the result from bytes */
3962 if(c
<=0xd7ff && mbcsIndex
!=NULL
) {
3963 value
=mbcsIndex
[c
>>6];
3965 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
3966 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3967 switch(outputType
) {
3969 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
3980 case MBCS_OUTPUT_2_SISO
:
3981 /* 1/2-byte stateful with Shift-In/Shift-Out */
3983 * Save the old state in the converter object
3984 * right here, then change the local prevLength state variable if necessary.
3985 * Then, if this character turns out to be unassigned or a fallback that
3986 * is not taken, the callback code must not save the new state in the converter
3987 * because the new state is for a character that is not output.
3988 * However, the callback must still restore the state from the converter
3989 * in case the callback function changed it for its output.
3991 cnv
->fromUnicodeStatus
=prevLength
; /* save the old state */
3992 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
3996 } else if(prevLength
<=1) {
3999 /* change from double-byte mode to single-byte */
4000 value
|=(uint32_t)UCNV_SI
<<8;
4008 /* change from single-byte mode to double-byte */
4009 value
|=(uint32_t)UCNV_SO
<<16;
4015 case MBCS_OUTPUT_DBCS_ONLY
:
4016 /* table with single-byte results, but only DBCS mappings used */
4017 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
4019 /* no mapping or SBCS result, not taken for DBCS-only */
4026 p
=bytes
+(value
+(c
&0x3f))*3;
4027 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4034 } else if(value
<=0xffff) {
4041 value
=((const uint32_t *)bytes
)[value
+(c
&0x3f)];
4048 } else if(value
<=0xffff) {
4050 } else if(value
<=0xffffff) {
4056 case MBCS_OUTPUT_3_EUC
:
4057 value
=((const uint16_t *)bytes
)[value
+(c
&0x3f)];
4058 /* EUC 16-bit fixed-length representation */
4065 } else if((value
&0x8000)==0) {
4068 } else if((value
&0x80)==0) {
4075 case MBCS_OUTPUT_4_EUC
:
4076 p
=bytes
+(value
+(c
&0x3f))*3;
4077 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4078 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4085 } else if(value
<=0xffff) {
4087 } else if((value
&0x800000)==0) {
4090 } else if((value
&0x8000)==0) {
4098 /* must not occur */
4100 * To avoid compiler warnings that value & length may be
4101 * used without having been initialized, we set them here.
4102 * In reality, this is unreachable code.
4103 * Not having a default branch also causes warnings with
4110 /* output the value */
4113 * This also tests if the codepage maps single surrogates.
4114 * If it does, then surrogates are not paired but mapped separately.
4115 * Note that in this case unmatched surrogates are not detected.
4117 if(UTF_IS_SURROGATE(c
) && !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
4118 if(UTF_IS_SURROGATE_FIRST(c
)) {
4120 if(source
<sourceLimit
) {
4121 /* test the following code unit */
4122 UChar trail
=*source
;
4123 if(UTF_IS_SECOND_SURROGATE(trail
)) {
4126 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
4127 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
4128 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4129 cnv
->fromUnicodeStatus
=prevLength
; /* save the old state */
4130 /* callback(unassigned) */
4133 /* convert this supplementary code point */
4134 /* exit this condition tree */
4136 /* this is an unmatched lead code unit (1st surrogate) */
4137 /* callback(illegal) */
4138 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
4146 /* this is an unmatched trail code unit (2nd surrogate) */
4147 /* callback(illegal) */
4148 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
4153 /* convert the Unicode code point in c into codepage bytes */
4156 * The basic lookup is a triple-stage compact array (trie) lookup.
4157 * For details see the beginning of this file.
4159 * Single-byte codepages are handled with a different data structure
4160 * by _MBCSSingle... functions.
4162 * The result consists of a 32-bit value from stage 2 and
4163 * a pointer to as many bytes as are stored per character.
4164 * The pointer points to the character's bytes in stage 3.
4165 * Bits 15..0 of the stage 2 entry contain the stage 3 index
4166 * for that pointer, while bits 31..16 are flags for which of
4167 * the 16 characters in the block are roundtrip-assigned.
4169 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4170 * respectively as uint32_t, in the platform encoding.
4171 * For 3-byte codepages, the bytes are always stored in big-endian order.
4173 * For EUC encodings that use only either 0x8e or 0x8f as the first
4174 * byte of their longest byte sequences, the first two bytes in
4175 * this third stage indicate with their 7th bits whether these bytes
4176 * are to be written directly or actually need to be preceeded by
4177 * one of the two Single-Shift codes. With this, the third stage
4178 * stores one byte fewer per character than the actual maximum length of
4179 * EUC byte sequences.
4181 * Other than that, leading zero bytes are removed and the other
4182 * bytes output. A single zero byte may be output if the "assigned"
4183 * bit in stage 2 was on.
4184 * The data structure does not support zero byte output as a fallback,
4185 * and also does not allow output of leading zeros.
4187 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
4189 /* get the bytes and the length for the output */
4190 switch(outputType
) {
4192 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4199 case MBCS_OUTPUT_2_SISO
:
4200 /* 1/2-byte stateful with Shift-In/Shift-Out */
4202 * Save the old state in the converter object
4203 * right here, then change the local prevLength state variable if necessary.
4204 * Then, if this character turns out to be unassigned or a fallback that
4205 * is not taken, the callback code must not save the new state in the converter
4206 * because the new state is for a character that is not output.
4207 * However, the callback must still restore the state from the converter
4208 * in case the callback function changed it for its output.
4210 cnv
->fromUnicodeStatus
=prevLength
; /* save the old state */
4211 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4213 if(value
==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
)==0) {
4214 /* no mapping, leave value==0 */
4216 } else if(prevLength
<=1) {
4219 /* change from double-byte mode to single-byte */
4220 value
|=(uint32_t)UCNV_SI
<<8;
4228 /* change from single-byte mode to double-byte */
4229 value
|=(uint32_t)UCNV_SO
<<16;
4235 case MBCS_OUTPUT_DBCS_ONLY
:
4236 /* table with single-byte results, but only DBCS mappings used */
4237 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4239 /* no mapping or SBCS result, not taken for DBCS-only */
4240 value
=stage2Entry
=0; /* stage2Entry=0 to reset roundtrip flags */
4247 p
=MBCS_POINTER_3_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4248 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4251 } else if(value
<=0xffff) {
4258 value
=MBCS_VALUE_4_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4261 } else if(value
<=0xffff) {
4263 } else if(value
<=0xffffff) {
4269 case MBCS_OUTPUT_3_EUC
:
4270 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4271 /* EUC 16-bit fixed-length representation */
4274 } else if((value
&0x8000)==0) {
4277 } else if((value
&0x80)==0) {
4284 case MBCS_OUTPUT_4_EUC
:
4285 p
=MBCS_POINTER_3_FROM_STAGE_2(bytes
, stage2Entry
, c
);
4286 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4287 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4290 } else if(value
<=0xffff) {
4292 } else if((value
&0x800000)==0) {
4295 } else if((value
&0x8000)==0) {
4303 /* must not occur */
4305 * To avoid compiler warnings that value & length may be
4306 * used without having been initialized, we set them here.
4307 * In reality, this is unreachable code.
4308 * Not having a default branch also causes warnings with
4311 value
=stage2Entry
=0; /* stage2Entry=0 to reset roundtrip flags */
4316 /* is this code point assigned, or do we use fallbacks? */
4317 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
)!=0 ||
4318 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && value
!=0))
4321 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4322 * There is no way with this data structure for fallback output
4323 * to be a zero byte.
4327 /* try an extension mapping */
4328 pArgs
->source
=source
;
4329 c
=_extFromU(cnv
, cnv
->sharedData
,
4330 c
, &source
, sourceLimit
,
4331 &target
, target
+targetCapacity
,
4332 &offsets
, sourceIndex
,
4335 nextSourceIndex
+=(int32_t)(source
-pArgs
->source
);
4336 prevLength
=cnv
->fromUnicodeStatus
; /* restore SISO state */
4338 if(U_FAILURE(*pErrorCode
)) {
4339 /* not mappable or buffer overflow */
4342 /* a mapping was written to the target, continue */
4344 /* recalculate the targetCapacity after an extension mapping */
4345 targetCapacity
=(int32_t)(pArgs
->targetLimit
-(char *)target
);
4347 /* normal end of conversion: prepare for a new character */
4349 prevSourceIndex
=sourceIndex
;
4350 sourceIndex
=nextSourceIndex
;
4357 /* write the output character bytes from value and length */
4358 /* from the first if in the loop we know that targetCapacity>0 */
4359 if(length
<=targetCapacity
) {
4362 /* each branch falls through to the next one */
4364 *target
++=(uint8_t)(value
>>24);
4366 *target
++=(uint8_t)(value
>>16);
4368 *target
++=(uint8_t)(value
>>8);
4370 *target
++=(uint8_t)value
;
4372 /* will never occur */
4377 /* each branch falls through to the next one */
4379 *target
++=(uint8_t)(value
>>24);
4380 *offsets
++=sourceIndex
;
4382 *target
++=(uint8_t)(value
>>16);
4383 *offsets
++=sourceIndex
;
4385 *target
++=(uint8_t)(value
>>8);
4386 *offsets
++=sourceIndex
;
4388 *target
++=(uint8_t)value
;
4389 *offsets
++=sourceIndex
;
4391 /* will never occur */
4395 targetCapacity
-=length
;
4397 uint8_t *charErrorBuffer
;
4400 * We actually do this backwards here:
4401 * In order to save an intermediate variable, we output
4402 * first to the overflow buffer what does not fit into the
4405 /* we know that 1<=targetCapacity<length<=4 */
4406 length
-=targetCapacity
;
4407 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
4409 /* each branch falls through to the next one */
4411 *charErrorBuffer
++=(uint8_t)(value
>>16);
4413 *charErrorBuffer
++=(uint8_t)(value
>>8);
4415 *charErrorBuffer
=(uint8_t)value
;
4417 /* will never occur */
4420 cnv
->charErrorBufferLength
=(int8_t)length
;
4422 /* now output what fits into the regular target */
4423 value
>>=8*length
; /* length was reduced by targetCapacity */
4424 switch(targetCapacity
) {
4425 /* each branch falls through to the next one */
4427 *target
++=(uint8_t)(value
>>16);
4429 *offsets
++=sourceIndex
;
4432 *target
++=(uint8_t)(value
>>8);
4434 *offsets
++=sourceIndex
;
4437 *target
++=(uint8_t)value
;
4439 *offsets
++=sourceIndex
;
4442 /* will never occur */
4446 /* target overflow */
4448 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4453 /* normal end of conversion: prepare for a new character */
4456 prevSourceIndex
=sourceIndex
;
4457 sourceIndex
=nextSourceIndex
;
4461 /* target is full */
4462 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4468 * the end of the input stream and detection of truncated input
4469 * are handled by the framework, but for EBCDIC_STATEFUL conversion
4470 * we need to emit an SI at the very end
4474 * EBCDIC_STATEFUL in DBCS mode
4475 * end of input and no truncated input
4477 if( U_SUCCESS(*pErrorCode
) &&
4478 outputType
==MBCS_OUTPUT_2_SISO
&& prevLength
==2 &&
4479 pArgs
->flush
&& source
>=sourceLimit
&& c
==0
4481 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4482 if(targetCapacity
>0) {
4483 *target
++=(uint8_t)UCNV_SI
;
4485 /* set the last source character's index (sourceIndex points at sourceLimit now) */
4486 *offsets
++=prevSourceIndex
;
/* no room left: the SI byte is stored in cnv->charErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set */
4489 /* target is full */
4490 cnv
->charErrorBuffer
[0]=(char)UCNV_SI
;
4491 cnv
->charErrorBufferLength
=1;
4492 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4494 prevLength
=1; /* we switched into SBCS */
4497 /* set the converter state back into UConverter */
4499 cnv
->fromUnicodeStatus
=prevLength
;
4501 /* write back the updated pointers */
4502 pArgs
->source
=source
;
4503 pArgs
->target
=(char *)target
;
4504 pArgs
->offsets
=offsets
;
4508 * This is another simple conversion function for internal use by other
4509 * conversion implementations.
4510 * It does not use the converter state nor call callbacks.
4511 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4512 * It handles conversion extensions but not GB 18030.
4514 * It converts one single Unicode code point into codepage bytes, encoded
4515 * as one 32-bit value. The function returns the number of bytes in *pValue:
4516 * 1..4 the number of bytes in *pValue
4517 * 0 unassigned (*pValue undefined)
4518 * -1 illegal (currently not used, *pValue undefined)
4520 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4521 * the second to last byte in bits 15..8, etc.
4522 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
/*
 * sharedData  - supplies mbcs.fromUnicodeTable, mbcs.fromUnicodeBytes,
 *               mbcs.outputType, mbcs.unicodeMask and mbcs.extIndexes.
 * c           - the code point to convert; supplementary code points are
 *               only looked up in the table when mbcs.unicodeMask has
 *               UCNV_HAS_SUPPLEMENTARY set.
 * pValue      - output location for the result bytes; in the statements
 *               visible here it is passed to ucnv_extSimpleMatchFromU().
 * useFallback - for MBCS_OUTPUT_1 it lowers the acceptance threshold from
 *               0xc00 to 0x800; otherwise it is passed to
 *               FROM_U_USE_FALLBACK() and to ucnv_extSimpleMatchFromU().
 *
 * When the extension table is consulted, the absolute value of the length
 * from ucnv_extSimpleMatchFromU() is returned.
 *
 * NOTE(review): the length assignments and the other return statements are
 * not visible in this listing - confirm details against the full file.
 */
4525 ucnv_MBCSFromUChar32(UConverterSharedData
*sharedData
,
4526 UChar32 c
, uint32_t *pValue
,
4527 UBool useFallback
) {
4529 const uint16_t *table
;
4531 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4534 uint32_t stage2Entry
;
4538 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4539 if(c
<=0xffff || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
4540 table
=sharedData
->mbcs
.fromUnicodeTable
;
4542 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4543 if(sharedData
->mbcs
.outputType
==MBCS_OUTPUT_1
) {
4544 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
/* with useFallback a result of at least 0x800 is accepted, otherwise it must be at least 0xc00 */
4545 /* is this code point assigned, or do we use fallbacks? */
4546 if(useFallback
? value
>=0x800 : value
>=0xc00) {
4550 } else /* outputType!=MBCS_OUTPUT_1 */ {
4551 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
4553 /* get the bytes and the length for the output */
4554 switch(sharedData
->mbcs
.outputType
) {
4556 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4564 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4565 case MBCS_OUTPUT_DBCS_ONLY
:
4566 /* table with single-byte results, but only DBCS mappings used */
4567 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4569 /* no mapping or SBCS result, not taken for DBCS-only */
4570 value
=stage2Entry
=0; /* stage2Entry=0 to reset roundtrip flags */
4577 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4578 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4581 } else if(value
<=0xffff) {
4588 value
=MBCS_VALUE_4_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4591 } else if(value
<=0xffff) {
4593 } else if(value
<=0xffffff) {
4599 case MBCS_OUTPUT_3_EUC
:
4600 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4601 /* EUC 16-bit fixed-length representation */
4604 } else if((value
&0x8000)==0) {
4607 } else if((value
&0x80)==0) {
4614 case MBCS_OUTPUT_4_EUC
:
4615 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
4616 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
4617 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4620 } else if(value
<=0xffff) {
4622 } else if((value
&0x800000)==0) {
4625 } else if((value
&0x8000)==0) {
4634 /* must not occur */
4638 /* is this code point assigned, or do we use fallbacks? */
4639 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
4640 (FROM_U_USE_FALLBACK(useFallback
, c
) && value
!=0)
4643 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4644 * There is no way with this data structure for fallback output
4645 * to be a zero byte.
4654 cx
=sharedData
->mbcs
.extIndexes
;
/* the extension table (mbcs.extIndexes) is consulted through ucnv_extSimpleMatchFromU(); the sign of its result is dropped on return */
4656 length
=ucnv_extSimpleMatchFromU(cx
, c
, pValue
, useFallback
);
4657 return length
>=0 ? length
: -length
; /* return abs(length); */
4667 * This function has been moved to ucnv2022.c for inlining.
4668 * This implementation is here only for documentation purposes
4672 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
4673 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4674 * It does not handle conversion extensions (_extFromU()).
4676 * It returns the codepage byte for the code point, or -1 if it is unassigned.
/*
 * Single-byte lookup of a code point in sharedData's from-Unicode table.
 * Supplementary code points are special-cased before the lookup when the
 * codepage has no supplementary mappings; useFallback lowers the acceptance
 * threshold for the 16-bit table result from 0xc00 to 0x800.
 */
4679 ucnv_MBCSSingleFromUChar32(UConverterSharedData
*sharedData
,
4681 UBool useFallback
) {
4682 const uint16_t *table
;
4685 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4686 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
4690 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4691 table
=sharedData
->mbcs
.fromUnicodeTable
;
4693 /* get the byte for the output */
4694 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
4695 /* is this code point assigned, or do we use fallbacks? */
/* same thresholds as the MBCS_OUTPUT_1 branch of ucnv_MBCSFromUChar32() */
4696 if(useFallback
? value
>=0x800 : value
>=0xc00) {
4704 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
4706 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
4707 static const UChar32
4708 utf8_minLegal
[5]={ 0, 0, 0x80, 0x800, 0x10000 };
4710 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
4711 static const UChar32
4712 utf8_offsets
[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
/*
 * Convert UTF-8 input directly into single-byte codepage output.
 *
 * pFromUArgs  target side: converter (cnv), target and targetLimit.
 * pToUArgs    source side: the UTF-8 converter (utf8), source and sourceLimit.
 * pErrorCode  receives U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 sequence and
 *             U_BUFFER_OVERFLOW_ERROR when the target is full.
 *
 * The state of a partial UTF-8 sequence is read from, and written back to,
 * the UTF-8 converter (toUnicodeStatus, toULength, mode, toUBytes).
 * pToUArgs->source and pFromUArgs->target are updated before returning.
 */
4715 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
4716 UConverterToUnicodeArgs
*pToUArgs
,
4717 UErrorCode
*pErrorCode
) {
4718 UConverter
*utf8
, *cnv
;
4719 const uint8_t *source
, *sourceLimit
;
4721 int32_t targetCapacity
;
4723 const uint16_t *table
, *sbcsIndex
;
4724 const uint16_t *results
;
4726 int8_t oldToULength
, toULength
, toULimit
;
4731 uint32_t asciiRoundtrips
;
4732 uint16_t value
, minValue
;
4733 UBool hasSupplementary
;
4735 /* set up the local pointers */
4736 utf8
=pToUArgs
->converter
;
4737 cnv
=pFromUArgs
->converter
;
4738 source
=(uint8_t *)pToUArgs
->source
;
4739 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
4740 target
=(uint8_t *)pFromUArgs
->target
;
4741 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
4743 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
4744 sbcsIndex
=cnv
->sharedData
->mbcs
.sbcsIndex
;
/* pick the result table: the LF/NL-swapped copy when UCNV_OPTION_SWAP_LFNL is set, the regular one otherwise */
4745 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
4746 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
4748 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
4750 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
/* minValue is the smallest table result that counts as a usable mapping; it depends on cnv->useFallback */
4752 if(cnv
->useFallback
) {
4753 /* use all roundtrip and fallback results */
4756 /* use only roundtrips and fallbacks from private-use characters */
4759 hasSupplementary
=(UBool
)(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
);
4761 /* get the converter state from the UTF-8 UConverter */
4762 c
=(UChar32
)utf8
->toUnicodeStatus
;
4764 toULength
=oldToULength
=utf8
->toULength
;
4765 toULimit
=(int8_t)utf8
->mode
;
4767 toULength
=oldToULength
=toULimit
=0;
4771 * Make sure that the last byte sequence before sourceLimit is complete
4772 * or runs into a lead byte.
4773 * Do not go back into the bytes that will be read for finishing a partial
4774 * sequence from the previous buffer.
4775 * In the conversion loop compare source with sourceLimit only once
4776 * per multi-byte character.
/* length: input bytes available beyond those still owed to a pending partial sequence */
4781 length
=(int32_t)(sourceLimit
-source
) - (toULimit
-oldToULength
);
/* look back over at most three bytes from the end of the input */
4782 for(i
=0; i
<3 && i
<length
;) {
4783 b
=*(sourceLimit
-i
-1);
4784 if(U8_IS_TRAIL(b
)) {
4787 if(i
<utf8_countTrailBytes
[b
]) {
4788 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
/* a non-zero c is pending state taken from the UTF-8 converter above */
4796 if(c
!=0 && targetCapacity
>0) {
4797 utf8
->toUnicodeStatus
=0;
4801 * Note: We could avoid the goto by duplicating some of the moreBytes
4802 * code, but only up to the point of collecting a complete UTF-8
4803 * sequence; then recurse for the toUBytes[toULength]
4804 * and then continue with normal conversion.
4806 * If so, move this code to just after initializing the minimum
4807 * set of local variables for reading the UTF-8 input
4808 * (utf8, source, target, limits but not cnv, table, minValue, etc.).
4810 * Potential advantages:
4812 * - oldToULength could become a local variable in just those code blocks
4813 * that deal with buffer boundaries
4814 * - possibly faster if the goto prevents some compiler optimizations
4815 * (this would need measuring to confirm)
4817 * - code duplication
4821 /* conversion loop */
4822 while(source
<sourceLimit
) {
4823 if(targetCapacity
>0) {
/* bytes flagged in asciiRoundtrips are copied to the target unchanged */
4827 if(IS_ASCII_ROUNDTRIP(b
, asciiRoundtrips
)) {
4828 *target
++=(uint8_t)b
;
4833 value
=SBCS_RESULT_FROM_UTF8(sbcsIndex
, results
, 0, c
);
4837 if( /* handle U+0080..U+07FF inline */
4839 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
4843 value
=SBCS_RESULT_FROM_UTF8(sbcsIndex
, results
, c
, t1
);
4844 if(value
>=minValue
) {
4845 *target
++=(uint8_t)value
;
4854 } else if(b
==0xe0) {
4855 if( /* handle U+0800..U+0FFF inline */
4856 (t1
=(uint8_t)(source
[0]-0x80)) <= 0x3f && t1
>= 0x20 &&
4857 (t2
=(uint8_t)(source
[1]-0x80)) <= 0x3f
4861 value
=SBCS_RESULT_FROM_UTF8(sbcsIndex
, results
, c
, t2
);
4862 if(value
>=minValue
) {
4863 *target
++=(uint8_t)value
;
4877 /* handle "complicated" and error cases, and continuing partial characters */
/* total sequence length announced by the lead byte b */
4880 toULimit
=utf8_countTrailBytes
[b
]+1;
/* gather trail bytes until that length is reached or the input runs out */
4883 while(toULength
<toULimit
) {
4884 if(source
<sourceLimit
) {
4886 if(U8_IS_TRAIL(b
)) {
4891 break; /* sequence too short, stop with toULength<toULimit */
4894 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/* rewind to the first byte consumed in this call, then copy those bytes into toUBytes */
4895 source
-=(toULength
-oldToULength
);
4896 while(oldToULength
<toULength
) {
4897 utf8
->toUBytes
[oldToULength
++]=*source
++;
4899 utf8
->toUnicodeStatus
=c
;
4900 utf8
->toULength
=toULength
;
4901 utf8
->mode
=toULimit
;
4902 pToUArgs
->source
=(char *)source
;
4903 pFromUArgs
->target
=(char *)target
;
/* validate a completed 2- or 3-byte sequence: not below the minimum for its length, and not a surrogate */
4908 if( toULength
==toULimit
&& /* consumed all trail bytes */
4909 (toULength
==3 || toULength
==2) && /* BMP */
4910 (c
-=utf8_offsets
[toULength
])>=utf8_minLegal
[toULength
] &&
4911 (c
<=0xd7ff || 0xe000<=c
) /* not a surrogate */
4913 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
/* a completed 4-byte sequence is accepted only for U+10000..U+10FFFF */
4915 toULength
==toULimit
&& toULength
==4 &&
4916 (0x10000<=(c
-=utf8_offsets
[4]) && c
<=0x10ffff)
4918 /* supplementary code point */
4919 if(!hasSupplementary
) {
4920 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4923 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
4926 /* error handling: illegal UTF-8 byte sequence */
/* same rewind-and-copy into toUBytes as for a partial sequence, but reported as an error */
4927 source
-=(toULength
-oldToULength
);
4928 while(oldToULength
<toULength
) {
4929 utf8
->toUBytes
[oldToULength
++]=*source
++;
4931 utf8
->toULength
=toULength
;
4932 pToUArgs
->source
=(char *)source
;
4933 pFromUArgs
->target
=(char *)target
;
4934 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
4940 if(value
>=minValue
) {
4941 /* output the mapping for c */
4942 *target
++=(uint8_t)value
;
4945 /* value<minValue means c is unassigned (unmappable) */
4947 * Try an extension mapping.
4948 * Pass in no source because we don't have UTF-16 input.
4949 * If we have a partial match on c, we will return and revert
4950 * to UTF-8->UTF-16->charset conversion.
/* noSource is an empty UTF-16 range: start and limit both point at nul */
4952 static const UChar nul
=0;
4953 const UChar
*noSource
=&nul
;
4954 c
=_extFromU(cnv
, cnv
->sharedData
,
4955 c
, &noSource
, noSource
,
4956 &target
, target
+targetCapacity
,
4961 if(U_FAILURE(*pErrorCode
)) {
4962 /* not mappable or buffer overflow */
4965 } else if(cnv
->preFromUFirstCP
>=0) {
4967 * Partial match, return and revert to pivoting.
4968 * In normal from-UTF-16 conversion, we would just continue
4969 * but then exit the loop because the extension match would
4970 * have consumed the source.
4974 /* a mapping was written to the target, continue */
4976 /* recalculate the targetCapacity after an extension mapping */
4977 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-(char *)target
);
4981 /* target is full */
4982 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
4988 * The sourceLimit may have been adjusted before the conversion loop
4989 * to stop before a truncated sequence.
4990 * If so, then collect the truncated sequence now.
/* sourceLimit is reset to the caller's original limit inside this condition */
4992 if(U_SUCCESS(*pErrorCode
) && source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
4993 c
=utf8
->toUBytes
[0]=b
=*source
++;
4995 toULimit
=utf8_countTrailBytes
[b
]+1;
/* copy the remaining bytes of the truncated sequence into toUBytes */
4996 while(source
<sourceLimit
) {
4997 utf8
->toUBytes
[toULength
++]=b
=*source
++;
5000 utf8
->toUnicodeStatus
=c
;
5001 utf8
->toULength
=toULength
;
5002 utf8
->mode
=toULimit
;
5005 /* write back the updated pointers */
5006 pToUArgs
->source
=(char *)source
;
5007 pFromUArgs
->target
=(char *)target
;
/*
 * Convert UTF-8 input directly into codepage output of one or two bytes
 * per character.
 *
 * pFromUArgs  target side: converter (cnv), target and targetLimit.
 * pToUArgs    source side: the UTF-8 converter (utf8), source and sourceLimit.
 * pErrorCode  receives U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 sequence and
 *             U_BUFFER_OVERFLOW_ERROR when the target is full.
 *
 * The state of a partial UTF-8 sequence is read from, and written back to,
 * the UTF-8 converter (toUnicodeStatus, toULength, mode, toUBytes).
 * pToUArgs->source and pFromUArgs->target are updated before returning.
 */
5011 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
5012 UConverterToUnicodeArgs
*pToUArgs
,
5013 UErrorCode
*pErrorCode
) {
5014 UConverter
*utf8
, *cnv
;
5015 const uint8_t *source
, *sourceLimit
;
5017 int32_t targetCapacity
;
5019 const uint16_t *table
, *mbcsIndex
;
5020 const uint16_t *results
;
5022 int8_t oldToULength
, toULength
, toULimit
;
5027 uint32_t stage2Entry
;
5028 uint32_t asciiRoundtrips
;
5029 uint16_t value
, minValue
;
5030 UBool hasSupplementary
;
5032 /* set up the local pointers */
5033 utf8
=pToUArgs
->converter
;
5034 cnv
=pFromUArgs
->converter
;
5035 source
=(uint8_t *)pToUArgs
->source
;
5036 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
5037 target
=(uint8_t *)pFromUArgs
->target
;
5038 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
5040 table
=cnv
->sharedData
->mbcs
.fromUnicodeTable
;
5041 mbcsIndex
=cnv
->sharedData
->mbcs
.mbcsIndex
;
/* pick the result table: the LF/NL-swapped copy when UCNV_OPTION_SWAP_LFNL is set, the regular one otherwise */
5042 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
5043 results
=(uint16_t *)cnv
->sharedData
->mbcs
.swapLFNLFromUnicodeBytes
;
5045 results
=(uint16_t *)cnv
->sharedData
->mbcs
.fromUnicodeBytes
;
5047 asciiRoundtrips
=cnv
->sharedData
->mbcs
.asciiRoundtrips
;
5049 if(cnv
->useFallback
) {
5050 /* use all roundtrip and fallback results */
5053 /* use only roundtrips and fallbacks from private-use characters */
5056 hasSupplementary
=(UBool
)(cnv
->sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
);
5058 /* get the converter state from the UTF-8 UConverter */
5059 c
=(UChar32
)utf8
->toUnicodeStatus
;
5061 toULength
=oldToULength
=utf8
->toULength
;
5062 toULimit
=(int8_t)utf8
->mode
;
5064 toULength
=oldToULength
=toULimit
=0;
5068 * Make sure that the last byte sequence before sourceLimit is complete
5069 * or runs into a lead byte.
5070 * Do not go back into the bytes that will be read for finishing a partial
5071 * sequence from the previous buffer.
5072 * In the conversion loop compare source with sourceLimit only once
5073 * per multi-byte character.
/* length: input bytes available beyond those still owed to a pending partial sequence */
5078 length
=(int32_t)(sourceLimit
-source
) - (toULimit
-oldToULength
);
/* look back over at most three bytes from the end of the input */
5079 for(i
=0; i
<3 && i
<length
;) {
5080 b
=*(sourceLimit
-i
-1);
5081 if(U8_IS_TRAIL(b
)) {
5084 if(i
<utf8_countTrailBytes
[b
]) {
5085 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
/* a non-zero c is pending state taken from the UTF-8 converter above */
5093 if(c
!=0 && targetCapacity
>0) {
5094 utf8
->toUnicodeStatus
=0;
5097 /* See note in ucnv_SBCSFromUTF8() about this goto. */
5100 /* conversion loop */
5101 while(source
<sourceLimit
) {
5102 if(targetCapacity
>0) {
5106 if(IS_ASCII_ROUNDTRIP(b
, asciiRoundtrips
)) {
5111 value
=DBCS_RESULT_FROM_UTF8(mbcsIndex
, results
, 0, b
);
5119 if( /* handle U+1000..U+D7FF inline */
/* lead bytes below 0xed take any trail value up to 0x3f; lead byte 0xed is limited to 0x1f */
5120 (((t1
=(uint8_t)(source
[0]-0x80), b
<0xed) && (t1
<= 0x3f)) ||
5121 (b
==0xed && (t1
<= 0x1f))) &&
5122 (t2
=(uint8_t)(source
[1]-0x80)) <= 0x3f
5126 value
=DBCS_RESULT_FROM_UTF8(mbcsIndex
, results
, c
, t2
);
5135 if( /* handle U+0080..U+07FF inline */
5137 (t1
=(uint8_t)(*source
-0x80)) <= 0x3f
5141 value
=DBCS_RESULT_FROM_UTF8(mbcsIndex
, results
, c
, t1
);
5154 /* handle "complicated" and error cases, and continuing partial characters */
/* total sequence length announced by the lead byte b */
5157 toULimit
=utf8_countTrailBytes
[b
]+1;
/* gather trail bytes until that length is reached or the input runs out */
5160 while(toULength
<toULimit
) {
5161 if(source
<sourceLimit
) {
5163 if(U8_IS_TRAIL(b
)) {
5168 break; /* sequence too short, stop with toULength<toULimit */
5171 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/* rewind to the first byte consumed in this call, then copy those bytes into toUBytes */
5172 source
-=(toULength
-oldToULength
);
5173 while(oldToULength
<toULength
) {
5174 utf8
->toUBytes
[oldToULength
++]=*source
++;
5176 utf8
->toUnicodeStatus
=c
;
5177 utf8
->toULength
=toULength
;
5178 utf8
->mode
=toULimit
;
5179 pToUArgs
->source
=(char *)source
;
5180 pFromUArgs
->target
=(char *)target
;
/* validate a completed 2- or 3-byte sequence: not below the minimum for its length, and not a surrogate */
5185 if( toULength
==toULimit
&& /* consumed all trail bytes */
5186 (toULength
==3 || toULength
==2) && /* BMP */
5187 (c
-=utf8_offsets
[toULength
])>=utf8_minLegal
[toULength
] &&
5188 (c
<=0xd7ff || 0xe000<=c
) /* not a surrogate */
5190 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
/* a completed 4-byte sequence is accepted only for U+10000..U+10FFFF */
5192 toULength
==toULimit
&& toULength
==4 &&
5193 (0x10000<=(c
-=utf8_offsets
[4]) && c
<=0x10ffff)
5195 /* supplementary code point */
5196 if(!hasSupplementary
) {
5197 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5200 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
5203 /* error handling: illegal UTF-8 byte sequence */
/* same rewind-and-copy into toUBytes as for a partial sequence, but reported as an error */
5204 source
-=(toULength
-oldToULength
);
5205 while(oldToULength
<toULength
) {
5206 utf8
->toUBytes
[oldToULength
++]=*source
++;
5208 utf8
->toULength
=toULength
;
5209 pToUArgs
->source
=(char *)source
;
5210 pFromUArgs
->target
=(char *)target
;
5211 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
5215 /* get the bytes and the length for the output */
5217 value
=MBCS_VALUE_2_FROM_STAGE_2(results
, stage2Entry
, c
);
5219 /* is this code point assigned, or do we use fallbacks? */
/* the negated test selects the unassigned path: neither a roundtrip nor a usable non-zero fallback */
5220 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
5221 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && value
!=0))
5228 /* write the output character bytes from value and length */
5229 /* from the first if in the loop we know that targetCapacity>0 */
5231 /* this is easy because we know that there is enough space */
5232 *target
++=(uint8_t)value
;
5234 } else /* length==2 */ {
/* the high byte is written first; the low byte follows if two bytes fit in the target */
5235 *target
++=(uint8_t)(value
>>8);
5236 if(2<=targetCapacity
) {
5237 *target
++=(uint8_t)value
;
/* the low byte is kept in the converter's charErrorBuffer */
5240 cnv
->charErrorBuffer
[0]=(char)value
;
5241 cnv
->charErrorBufferLength
=1;
5243 /* target overflow */
5244 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
5253 * Try an extension mapping.
5254 * Pass in no source because we don't have UTF-16 input.
5255 * If we have a partial match on c, we will return and revert
5256 * to UTF-8->UTF-16->charset conversion.
/* noSource is an empty UTF-16 range: start and limit both point at nul */
5258 static const UChar nul
=0;
5259 const UChar
*noSource
=&nul
;
5260 c
=_extFromU(cnv
, cnv
->sharedData
,
5261 c
, &noSource
, noSource
,
5262 &target
, target
+targetCapacity
,
5267 if(U_FAILURE(*pErrorCode
)) {
5268 /* not mappable or buffer overflow */
5271 } else if(cnv
->preFromUFirstCP
>=0) {
5273 * Partial match, return and revert to pivoting.
5274 * In normal from-UTF-16 conversion, we would just continue
5275 * but then exit the loop because the extension match would
5276 * have consumed the source.
5280 /* a mapping was written to the target, continue */
5282 /* recalculate the targetCapacity after an extension mapping */
5283 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-(char *)target
);
5288 /* target is full */
5289 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
5295 * The sourceLimit may have been adjusted before the conversion loop
5296 * to stop before a truncated sequence.
5297 * If so, then collect the truncated sequence now.
/* sourceLimit is reset to the caller's original limit inside this condition */
5299 if(U_SUCCESS(*pErrorCode
) && source
<(sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
)) {
5300 c
=utf8
->toUBytes
[0]=b
=*source
++;
5302 toULimit
=utf8_countTrailBytes
[b
]+1;
/* copy the remaining bytes of the truncated sequence into toUBytes */
5303 while(source
<sourceLimit
) {
5304 utf8
->toUBytes
[toULength
++]=b
=*source
++;
5307 utf8
->toUnicodeStatus
=c
;
5308 utf8
->toULength
=toULength
;
5309 utf8
->mode
=toULimit
;
5312 /* write back the updated pointers */
5313 pToUArgs
->source
=(char *)source
;
5314 pFromUArgs
->target
=(char *)target
;
5317 /* miscellaneous ------------------------------------------------------------ */
5320 ucnv_MBCSGetStarters(const UConverter
* cnv
,
5321 UBool starters
[256],
5322 UErrorCode
*pErrorCode
) {
5323 const int32_t *state0
;
5326 state0
=cnv
->sharedData
->mbcs
.stateTable
[cnv
->sharedData
->mbcs
.dbcsOnlyState
];
5327 for(i
=0; i
<256; ++i
) {
5328 /* all bytes that cause a state transition from state 0 are lead bytes */
5329 starters
[i
]= (UBool
)MBCS_ENTRY_IS_TRANSITION(state0
[i
]);
5334 * This is an internal function that allows other converter implementations
5335 * to check whether a byte is a lead byte.
5338 ucnv_MBCSIsLeadByte(UConverterSharedData
*sharedData
, char byte
) {
5339 return (UBool
)MBCS_ENTRY_IS_TRANSITION(sharedData
->mbcs
.stateTable
[0][(uint8_t)byte
]);
/*
 * Write the substitution character for the current unmappable code point.
 *
 * pArgs        from-Unicode arguments; pArgs->converter is the converter used.
 * offsetIndex  passed through to ucnv_cbFromUWriteBytes().
 * pErrorCode   set to U_ILLEGAL_ARGUMENT_ERROR in the stateful branch below;
 *              also passed on to ucnv_cbFromUWriteBytes().
 *
 * Chooses between subChar1 and subChars, clears cnv->useSubChar1, and for
 * MBCS_OUTPUT_2_SISO converters updates fromUnicodeStatus when the
 * substitution requires a change between SBCS and DBCS mode.
 */
5343 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs
*pArgs
,
5344 int32_t offsetIndex
,
5345 UErrorCode
*pErrorCode
) {
5346 UConverter
*cnv
=pArgs
->converter
;
5351 /* first, select between subChar and subChar1 */
/* the selection criterion differs depending on whether the converter has extension data (extIndexes) */
5352 if( cnv
->subChar1
!=0 &&
5353 (cnv
->sharedData
->mbcs
.extIndexes
!=NULL
?
5355 (cnv
->invalidUCharBuffer
[0]<=0xff))
5357 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
5358 subchar
=(char *)&cnv
->subChar1
;
5361 /* select subChar in all other cases */
5362 subchar
=(char *)cnv
->subChars
;
5363 length
=cnv
->subCharLen
;
5366 /* reset the selector for the next code point */
5367 cnv
->useSubChar1
=FALSE
;
/* stateful (SI/SO) output: the mode recorded in fromUnicodeStatus may have to change for the sub char */
5369 if (cnv
->sharedData
->mbcs
.outputType
== MBCS_OUTPUT_2_SISO
) {
5372 /* fromUnicodeStatus contains prevLength */
5375 if(cnv
->fromUnicodeStatus
==2) {
5376 /* DBCS mode and SBCS sub char: change to SBCS */
5377 cnv
->fromUnicodeStatus
=1;
5383 if(cnv
->fromUnicodeStatus
<=1) {
5384 /* SBCS mode and DBCS sub char: change to DBCS */
5385 cnv
->fromUnicodeStatus
=2;
5392 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/* number of bytes assembled in the local buffer */
5396 length
=(int32_t)(p
-buffer
);
5399 ucnv_cbFromUWriteBytes(pArgs
, subchar
, length
, offsetIndex
, pErrorCode
);
5402 U_CFUNC UConverterType
5403 ucnv_MBCSGetType(const UConverter
* converter
) {
5404 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
5405 if(converter
->sharedData
->mbcs
.countStates
==1) {
5406 return (UConverterType
)UCNV_SBCS
;
5407 } else if((converter
->sharedData
->mbcs
.outputType
&0xff)==MBCS_OUTPUT_2_SISO
) {
5408 return (UConverterType
)UCNV_EBCDIC_STATEFUL
;
5409 } else if(converter
->sharedData
->staticData
->minBytesPerChar
==2 && converter
->sharedData
->staticData
->maxBytesPerChar
==2) {
5410 return (UConverterType
)UCNV_DBCS
;
5412 return (UConverterType
)UCNV_MBCS
;
/*
 * Converter implementation table named for SBCS converters with UTF-8 support.
 * ucnv_MBCSToUnicodeWithOffsets and ucnv_MBCSFromUnicodeWithOffsets each fill
 * two consecutive entries.
 */
5415 static const UConverterImpl _SBCSUTF8Impl
={
5425 ucnv_MBCSToUnicodeWithOffsets
,
5426 ucnv_MBCSToUnicodeWithOffsets
,
5427 ucnv_MBCSFromUnicodeWithOffsets
,
5428 ucnv_MBCSFromUnicodeWithOffsets
,
5429 ucnv_MBCSGetNextUChar
,
5431 ucnv_MBCSGetStarters
,
5435 ucnv_MBCSGetUnicodeSet
,
/*
 * Converter implementation table named for DBCS converters with UTF-8 support.
 * ucnv_MBCSToUnicodeWithOffsets and ucnv_MBCSFromUnicodeWithOffsets each fill
 * two consecutive entries.
 */
5441 static const UConverterImpl _DBCSUTF8Impl
={
5451 ucnv_MBCSToUnicodeWithOffsets
,
5452 ucnv_MBCSToUnicodeWithOffsets
,
5453 ucnv_MBCSFromUnicodeWithOffsets
,
5454 ucnv_MBCSFromUnicodeWithOffsets
,
5455 ucnv_MBCSGetNextUChar
,
5457 ucnv_MBCSGetStarters
,
5461 ucnv_MBCSGetUnicodeSet
,
/*
 * Converter implementation table for MBCS converters; this is the table
 * that _MBCSData points to.
 * ucnv_MBCSToUnicodeWithOffsets and ucnv_MBCSFromUnicodeWithOffsets each fill
 * two consecutive entries.
 */
5467 static const UConverterImpl _MBCSImpl
={
5477 ucnv_MBCSToUnicodeWithOffsets
,
5478 ucnv_MBCSToUnicodeWithOffsets
,
5479 ucnv_MBCSFromUnicodeWithOffsets
,
5480 ucnv_MBCSFromUnicodeWithOffsets
,
5481 ucnv_MBCSGetNextUChar
,
5483 ucnv_MBCSGetStarters
,
5487 ucnv_MBCSGetUnicodeSet
5491 /* Static data is in tools/makeconv/ucnvstat.c for data-based
5492 * converters. Be sure to update it as well.
/*
 * Shared-data object for the MBCS converter type.
 * The visible initializers are the structure size, the value 1, three NULL
 * pointers, FALSE, and the address of _MBCSImpl.
 */
5495 const UConverterSharedData _MBCSData
={
5496 sizeof(UConverterSharedData
), 1,
5497 NULL
, NULL
, NULL
, FALSE
, &_MBCSImpl
,
5501 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */