2 ******************************************************************************
4 * Copyright (C) 2000-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvmbcs.c
10 * tab size: 8 (not used)
13 * created on: 2000jul03
14 * created by: Markus W. Scherer
16 * The current code in this file replaces the previous implementation
17 * of conversion code from multi-byte codepages to Unicode and back.
18 * This implementation supports the following:
19 * - legacy variable-length codepages with up to 4 bytes per character
20 * - all Unicode code points (up to 0x10ffff)
21 * - efficient distinction of unassigned vs. illegal byte sequences
22 * - it is possible in fromUnicode() to directly deal with simple
23 * stateful encodings (used for EBCDIC_STATEFUL)
24 * - it is possible to convert Unicode code points other than U+0000
25 * to a single zero byte (but not as a fallback except for SBCS)
27 * Remaining limitations in fromUnicode:
28 * - byte sequences must not have leading zero bytes
29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30 * - limitation to up to 4 bytes per character
34 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
35 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
36 * macros to ucnvmbcs.h file
39 #include "unicode/utypes.h"
41 #if !UCONFIG_NO_LEGACY_CONVERSION
43 #include "unicode/ucnv.h"
44 #include "unicode/ucnv_cb.h"
45 #include "unicode/udata.h"
46 #include "unicode/uset.h"
54 /* control optimizations according to the platform */
55 #define MBCS_UNROLL_SINGLE_TO_BMP 1
56 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
59 * _MBCSHeader version 4.1
60 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
62 * Change from version 4.0:
63 * - Replace header.reserved with header.fromUBytesLength so that all
64 * fields in the data have length.
66 * Changes from version 3 (for performance improvements):
67 * - new bit distribution for state table entries
68 * - reordered action codes
69 * - new data structure for single-byte fromUnicode
70 * + stage 2 only contains indexes
71 * + stage 3 stores 16 bits per character with classification bits 15..8
72 * - no multiplier for stage 1 entries
73 * - stage 2 for non-single-byte codepages contains the index and the flags in one 32-bit value
75 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
77 * For more details about old versions of the MBCS data structure, see
78 * the corresponding versions of this file.
80 * Converting stateless codepage data ---------------------------------------***
81 * (or codepage data with simple states) to Unicode.
83 * Data structure and algorithm for converting from complex legacy codepages
84 * to Unicode. (Designed before 2000-may-22.)
86 * The basic idea is that the structure of legacy codepages can be described with state tables.
88 * When reading a byte stream, each input byte causes a state transition.
89 * Some transitions result in the output of a code point, some result in
90 * "unassigned" or "illegal" output.
91 * This is used here for character conversion.
93 * The data structure begins with a state table consisting of a row
94 * per state, with 256 entries (columns) per row for each possible input byte value.
96 * Each entry is 32 bits wide, with two formats distinguished by
97 * the sign bit (bit 31):
99 * One format for transitional entries (bit 31 not set) for non-final bytes, and
100 * one format for final entries (bit 31 set).
101 * Both formats contain the number of the next state in the same bit positions.
103 * State 0 is the initial state.
105 * Most of the time, the offset values of subsequent states are added
106 * up to a scalar value. This value will eventually be the index of
107 * the Unicode code point in a table that follows the state table.
108 * The effect is that the code points for final state table rows
109 * are contiguous. The code points of final state rows follow each other
110 * in the order of the references to those final states by previous states.
113 * For some terminal states, the offset is itself the output Unicode
114 * code point (16 bits for a BMP code point or 20 bits for a supplementary
115 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
116 * For others, the code point in the Unicode table is stored with either
117 * one or two code units: one for BMP code points, two for a pair of surrogates.
119 * All code points for a final state entry take up the same number of code
120 * units, regardless of whether they all actually _use_ the same number
121 * of code units. This is necessary for simple array access.
123 * An additional feature comes in with what in ICU is called "fallback" mappings:
126 * In addition to round-trippable, precise, 1:1 mappings, there are often
127 * mappings defined between similar, though not the same, characters.
128 * Typically, such mappings occur only in fromUnicode mapping tables because
129 * Unicode has a superset repertoire of most other codepages. However, it
130 * is possible to provide such mappings in the toUnicode tables, too.
131 * In this case, the fallback mappings are partly integrated into the
132 * general state tables because the structure of the encoding includes their byte sequences.
134 * For final entries in an initial state, fallback mappings are stored in
135 * the entry itself like with roundtrip mappings.
136 * For other final entries, they are stored in the code units table if
137 * the entry is for a pair of code units.
138 * For single-unit results in the code units table, there is no space to
139 * alternatively hold a fallback mapping; in this case, the code unit
140 * is stored as U+fffe (unassigned), and the fallback mapping needs to
141 * be looked up by the scalar offset value in a separate table.
143 * "Unassigned" state entries really mean "structurally unassigned",
144 * i.e., such a byte sequence will never have a mapping result.
146 * The interpretation of the bits in each entry is as follows:
148 * Bit 31 not set, not a terminal entry ("transitional"):
150 * 23..0 offset delta, to be added up
152 * Bit 31 set, terminal ("final") entry:
153 * 30..24 next state (regardless of action code)
154 * 23..20 action code:
155 * action codes 0 and 1 result in precise-mapping Unicode code points
156 * 0 valid byte sequence
158 * 15..0 16-bit Unicode BMP code point
159 * never U+fffe or U+ffff
160 * 1 valid byte sequence
161 * 19..0 20-bit Unicode supplementary code point
162 * never U+fffe or U+ffff
164 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
165 * 2 valid byte sequence (fallback)
167 * 15..0 16-bit Unicode BMP code point as fallback result
168 * 3 valid byte sequence (fallback)
169 * 19..0 20-bit Unicode supplementary code point as fallback result
171 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
172 * depending on the code units they result in
173 * 4 valid byte sequence
175 * 8..0 final offset delta
176 * pointing to one 16-bit code unit which may be
177 * fffe unassigned -- look for a fallback for this offset
179 * 5 valid byte sequence
181 * 8..0 final offset delta
182 * pointing to two 16-bit code units
183 * (typically UTF-16 surrogates)
184 * the result depends on the first code unit as follows:
185 * 0000..d7ff roundtrip BMP code point (1st alone)
186 * d800..dbff roundtrip surrogate pair (1st, 2nd)
187 * dc00..dfff fallback surrogate pair (1st-400, 2nd)
188 * e000 roundtrip BMP code point (2nd alone)
189 * e001 fallback BMP code point (2nd alone)
192 * (the final offset deltas are at most 255 * 2,
193 * times 2 because of storing code unit pairs)
195 * 6 unassigned byte sequence
197 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
198 * this does not contain a final offset delta because the main
199 * purpose of this action code is to save scalar offset values;
200 * therefore, fallback values cannot be assigned to byte
201 * sequences that result in this action code
202 * 7 illegal byte sequence
204 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
205 * 8 state change only
207 * useful for state changes in simple stateful encodings,
208 * at Shift-In/Shift-Out codes
211 * 9..15 reserved for future use
212 * current implementations will only perform a state change
213 * and ignore bits 19..0
215 * An encoding with contiguous ranges of unassigned byte sequences, like
216 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
217 * at least two states for the trail bytes:
218 * One trail byte state that results in code points, and one that only
219 * has "unassigned" and "illegal" terminal states.
221 * Note: partly by accident, this data structure supports simple stateless
222 * encodings without any additional logic.
223 * Currently, only simple Shift-In/Shift-Out schemes are handled with
224 * appropriate state tables (especially EBCDIC_STATEFUL!).
226 * MBCS version 2 added:
227 * unassigned and illegal action codes have U+fffe and U+ffff
228 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
230 * Converting from Unicode to codepage bytes --------------------------------***
232 * The conversion data structure for fromUnicode is designed for the known
233 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
234 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
235 * a roundtrip mapping.
237 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
238 * like in the character properties table.
239 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
240 * with the resulting bytes is at offsetFromUBytes.
242 * Beginning with version 4, single-byte codepages have a significantly different
243 * trie compared to other codepages.
244 * In all cases, the entry in stage 1 is directly the index of the block of
245 * 64 entries in stage 2.
247 * Single-byte lookup:
249 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
250 * Stage 3 contains one 16-bit word per result:
251 * Bits 15..8 indicate the kind of result:
253 * c fallback result from private-use code point
254 * 8 fallback result from other code points
256 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
260 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
261 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
262 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
263 * If this test is false, then a non-zero result will be interpreted as
264 * a fallback mapping.
265 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
267 * Stage 3 contains 2, 3, or 4 bytes per result.
268 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
269 * while 3 bytes are stored as bytes in big-endian order.
270 * Leading zero bytes are ignored, and the number of bytes is counted.
271 * A zero byte mapping result is possible as a roundtrip result.
272 * For some output types, the actual result is processed from this;
273 * see _MBCSFromUnicodeWithOffsets().
275 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
276 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
278 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
280 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
281 * may overlap by any number of entries.
283 * MBCS version 2 added:
284 * the converter checks for known output types, which allows
285 * adding new ones without crashing an unaware converter
288 /* prototypes --------------------------------------------------------------- */
/*
 * Forward declarations for functions that are referenced before they are defined.
 *
 * _MBCSSingleToBMPWithOffsets() and _MBCSSingleToUnicodeWithOffsets() are the
 * variants that _MBCSToUnicodeWithOffsets() dispatches to when the table has
 * exactly one state (without / with supplementary code points, respectively).
 */
291 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
292 UErrorCode
*pErrorCode
);
295 _MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs
*pArgs
,
296 UErrorCode
*pErrorCode
);
/* to-Unicode "get next character" entry points; both take the to-Unicode arguments structure */
299 _MBCSGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
300 UErrorCode
*pErrorCode
);
303 _MBCSSingleGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
304 UErrorCode
*pErrorCode
);
/* from-Unicode variants; all take the from-Unicode arguments structure */
307 _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
308 UErrorCode
*pErrorCode
);
311 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
312 UErrorCode
*pErrorCode
);
315 _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
316 UErrorCode
*pErrorCode
);
/*
 * Callback wrappers. Both receive the converter, an opaque context pointer,
 * the arguments structure, the reason for the callback and the error code.
 */
319 fromUCallback(UConverter
*cnv
,
320 const void *context
, UConverterFromUnicodeArgs
*pArgs
,
322 UConverterCallbackReason reason
, UErrorCode
*pErrorCode
);
/*
 * toUCallback() additionally receives the offending input bytes and their count;
 * _MBCSToUnicodeWithOffsets() passes the bytes of the current character here.
 */
325 toUCallback(UConverter
*cnv
,
326 const void *context
, UConverterToUnicodeArgs
*pArgs
,
327 const char *codeUnits
, int32_t length
,
328 UConverterCallbackReason reason
, UErrorCode
*pErrorCode
);
330 /* GB 18030 data ------------------------------------------------------------ */
/*
 * Helper macros for linear values of GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) maps the four bytes of a sequence to one linear
 * number using the radices 10 (second and fourth byte) and 126 (third byte).
 * LINEAR_18030_BASE is the linear value of the sequence 81 30 81 30.
 * LINEAR(x) takes the four bytes packed big-endian into one 32-bit value,
 * e.g. LINEAR(0x81308130)==LINEAR_18030_BASE.
 *
 * Every use of the macro argument is parenthesized so that an expression
 * argument (for example a|b) is evaluated as a whole before shifting/masking.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
340 * Some ranges of GB 18030 where both the Unicode code points and the
341 * GB four-byte sequences are contiguous and are handled algorithmically by
342 * the special callback functions below.
343 * The values are start & end of Unicode & GB codes.
345 * Note that single surrogates are not mapped by GB 18030
346 * as of the re-released mapping tables from 2000-nov-30.
/*
 * Each row is { first Unicode code point, last Unicode code point,
 *               linear value of the first GB 18030 four-byte sequence,
 *               linear value of the last GB 18030 four-byte sequence }.
 * The GB values are written as packed four-byte constants and converted with LINEAR().
 * The rows are listed in order of decreasing range size
 * (presumably so that a linear search hits the largest ranges first -- TODO confirm).
 */
348 static const uint32_t
349 gb18030Ranges
[13][4]={
350 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
351 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
352 {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
353 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
354 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
355 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
356 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
357 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
358 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
359 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
360 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
361 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
362 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
365 /* bit flag for UConverter.options indicating GB 18030 special handling */
/*
 * _MBCSOpen() sets this bit when the name it checks contains "gb18030" or "GB18030";
 * _MBCSGetUnicodeSet() tests it before adding 0..0xd7ff and 0xe000..0x10ffff to the set.
 */
366 #define _MBCS_OPTION_GB18030 0x8000
368 /* Miscellaneous ------------------------------------------------------------ */
/*
 * Determine the number of bytes in the from-Unicode result array (fromUnicodeBytes).
 *
 * If the table carries an explicit fromUBytesLength (>0), that value is returned.
 * Otherwise the from-Unicode trie is enumerated to find the highest stage 3 index,
 * and the byte count is derived from it depending on the table's outputType.
 *
 * mbcsTable  table whose fromUBytesLength, fromUnicodeTable, unicodeMask and
 *            outputType fields are read
 */
371 _MBCSSizeofFromUBytes(UConverterMBCSTable
*mbcsTable
) {
372 const uint16_t *table
;
374 uint32_t st3
, maxStage3
;
375 uint16_t st1
, maxStage1
, st2
;
/* fast path: the length is stored with the table */
377 if(mbcsTable
->fromUBytesLength
>0) {
379 * We _know_ the number of bytes in the fromUnicodeBytes array
380 * starting with header.version 4.1.
381 * Otherwise, below, we need to enumerate the fromUnicode
382 * trie and find the highest entry.
384 return mbcsTable
->fromUBytesLength
;
387 /* Enumerate the from-Unicode trie table to find the highest stage 3 index. */
388 table
=mbcsTable
->fromUnicodeTable
;
/*
 * NOTE(review): this test presumably selects maxStage1, the number of stage 1
 * entries to walk (more entries when supplementary code points are covered);
 * the assignments themselves are not visible here -- confirm.
 */
390 if(mbcsTable
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
/* single-byte output: stage 2 is read as 16-bit entries */
397 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
398 const uint16_t *stage2
;
400 for(st1
=0; st1
<maxStage1
; ++st1
) {
/* each stage 2 block has 64 entries */
404 for(st2
=0; st2
<64; ++st2
) {
414 * add 16 to get the limit not start index of the last stage 3 block,
415 * times 2 for number of bytes
417 return (maxStage3
+16)*2;
/* all other output types: stage 2 is read as 32-bit words */
419 const uint32_t *stage2
;
421 for(st1
=0; st1
<maxStage1
; ++st1
) {
/*
 * NOTE(review): st2 is presumably loaded from the stage 1 entry on a line not
 * visible here; the comparison with maxStage1>>1 appears to skip entries that
 * do not point past stage 1 (measured in 32-bit units) -- confirm.
 */
423 if(st2
>(maxStage1
>>1)) {
424 stage2
=(const uint32_t *)table
+st2
;
425 for(st2
=0; st2
<64; ++st2
) {
/* keep only the low 16 bits of the stage 2 word: the stage 3 index */
426 st3
=stage2
[st2
]&0xffff;
435 * add 16 to get the limit not start index of the last stage 3 block,
436 * times 2..4 for number of bytes
/* turn the highest stage 3 block index into an entry count: 16 entries per block, plus the last block itself */
438 maxStage3
=16*maxStage3
+16;
/* the bytes-per-entry factor depends on the output type */
439 switch(mbcsTable
->outputType
) {
441 case MBCS_OUTPUT_4_EUC
:
448 /* MBCS_OUTPUT_2... and MBCS_OUTPUT_3_EUC */
/*
 * Add to a USet the Unicode code points that this converter has roundtrip
 * mappings for, by enumerating the from-Unicode trie.
 *
 * For converters flagged with _MBCS_OPTION_GB18030 the ranges 0..0xd7ff and
 * 0xe000..0x10ffff are added directly.
 *
 * cnv         converter whose options and shared MBCS table are read
 * which       selector for the kind of set (not referenced in the code visible here)
 * pErrorCode  ICU error code (not referenced in the code visible here)
 */
457 _MBCSGetUnicodeSet(const UConverter
*cnv
,
459 UConverterUnicodeSet which
,
460 UErrorCode
*pErrorCode
) {
461 UConverterMBCSTable
*mbcsTable
;
462 const uint16_t *table
;
465 uint16_t st1
, maxStage1
, st2
;
/* GB 18030: everything except the surrogate code points d800..dfff */
469 if(cnv
->options
&_MBCS_OPTION_GB18030
) {
470 uset_addRange(set
, 0, 0xd7ff);
471 uset_addRange(set
, 0xe000, 0x10ffff);
475 /* enumerate the from-Unicode trie table */
476 mbcsTable
=&cnv
->sharedData
->table
->mbcs
;
477 table
=mbcsTable
->fromUnicodeTable
;
/* NOTE(review): presumably selects maxStage1 depending on supplementary coverage; assignments not visible here -- confirm */
478 if(mbcsTable
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
484 c
=0; /* keep track of the current code point while enumerating */
/* single-byte output: 16-bit stage 2 entries and 16-bit stage 3 results */
486 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
487 const uint16_t *stage2
, *stage3
, *results
;
489 results
=(const uint16_t *)mbcsTable
->fromUnicodeBytes
;
491 for(st1
=0; st1
<maxStage1
; ++st1
) {
495 for(st2
=0; st2
<64; ++st2
) {
/* a zero stage 2 entry is treated as an empty stage 3 block */
496 if((st3
=stage2
[st2
])!=0) {
497 /* read the stage 3 block */
501 * Add code points for which the roundtrip flag is set.
502 * Once we get a set for fallback mappings, we have to use
503 * a threshold variable with a value of 0x800.
504 * See _MBCSSingleFromBMPWithOffsets() and
505 * MBCS_SINGLE_RESULT_FROM_U() for details.
/* results of 0xf00 and above carry the roundtrip marker in bits 15..8 (compare EBCDIC_RT_LF/EBCDIC_RT_NL) */
508 if(*stage3
++>=0xf00) {
/* a stage 3 block covers 16 consecutive code points */
511 } while((++c
&0xf)!=0);
513 c
+=16; /* empty stage 3 block */
/* 64 stage 2 entries times 16 code points each */
517 c
+=1024; /* empty stage 2 block */
/* all other output types: 32-bit stage 2 words */
521 const uint32_t *stage2
;
523 for(st1
=0; st1
<maxStage1
; ++st1
) {
/* NOTE(review): st2 is presumably loaded from the stage 1 entry on a line not visible here -- confirm */
525 if(st2
>(maxStage1
>>1)) {
526 stage2
=(const uint32_t *)table
+st2
;
527 for(st2
=0; st2
<64; ++st2
) {
528 if((st3
=stage2
[st2
])!=0) {
529 /* get the roundtrip flags for the stage 3 block */
533 * Add code points for which the roundtrip flag is set.
534 * Once we get a set for fallback mappings, we have to check
535 * non-roundtrip stage 3 results for whether they are 0.
536 * See _MBCSFromUnicodeWithOffsets() for details.
543 } while((++c
&0xf)!=0);
545 c
+=16; /* empty stage 3 block */
549 c
+=1024; /* empty stage 2 block */
555 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
558 * This code modifies a standard EBCDIC<->Unicode mapping table for
559 * OS/390 (z/OS) Unix System Services (Open Edition).
560 * The difference is in the mapping of Line Feed and New Line control codes:
561 * Standard EBCDIC maps
566 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
572 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
573 * by copying it into allocated memory and swapping the LF and NL values.
574 * This makes it possible to support the same EBCDIC charset in both versions without
575 * duplicating the entire installed table.
578 /* standard EBCDIC codes */
579 #define EBCDIC_LF 0x25
580 #define EBCDIC_NL 0x15
582 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
/* i.e. 0xf00 combined with the byte value: the result kind sits in bits 15..8, the codepage byte in bits 7..0 */
583 #define EBCDIC_RT_LF 0xf25
584 #define EBCDIC_RT_NL 0xf15
586 /* Unicode code points */
/*
 * Build LF<->NL-swapped copies of an EBCDIC converter's tables.
 *
 * The function first checks that the table is single-byte (MBCS_OUTPUT_1) or
 * MBCS_OUTPUT_2_SISO and that LF and NL have the standard EBCDIC roundtrip
 * mappings in both directions. It then allocates one block with uprv_malloc()
 * that holds, in this order:
 *   - a copy of the to-Unicode state table with the LF and NL entries exchanged,
 *   - a copy of the from-Unicode result array with the LF and NL results exchanged,
 *   - the converter name with UCNV_SWAP_LFNL_OPTION_STRING appended.
 * The pointers are stored in the shared table only if swapLFNLStateTable is
 * still NULL; otherwise the newly allocated block is freed again.
 *
 * sharedData  shared converter data whose MBCS table is inspected and updated
 * pErrorCode  set to U_MEMORY_ALLOCATION_ERROR on the allocation-failure path
 *
 * The caller (_MBCSOpen) treats a false return value as "option does not apply".
 */
591 _EBCDICSwapLFNL(UConverterSharedData
*sharedData
, UErrorCode
*pErrorCode
) {
592 UConverterMBCSTable
*mbcsTable
;
594 const uint16_t *table
, *results
;
595 const uint8_t *bytes
;
597 int32_t (*newStateTable
)[256];
598 uint16_t *newResults
;
602 uint32_t stage2Entry
;
603 uint32_t size
, sizeofFromUBytes
;
605 mbcsTable
=&sharedData
->table
->mbcs
;
607 table
=mbcsTable
->fromUnicodeTable
;
608 bytes
=mbcsTable
->fromUnicodeBytes
;
/* the same result array viewed as 16-bit units, as used for single-byte tables */
609 results
=(const uint16_t *)bytes
;
612 * Check that this is an EBCDIC table with SBCS portion -
613 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
615 * If not, ignore the option. Options are always ignored if they do not apply.
/* to-Unicode check: in state 0, bytes 0x25 and 0x15 must map directly to U_LF and U_NL */
618 (mbcsTable
->outputType
==MBCS_OUTPUT_1
|| mbcsTable
->outputType
==MBCS_OUTPUT_2_SISO
) &&
619 mbcsTable
->stateTable
[0][EBCDIC_LF
]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_LF
) &&
620 mbcsTable
->stateTable
[0][EBCDIC_NL
]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_NL
)
/* from-Unicode check, single-byte layout: results must be the roundtrip-flagged LF/NL bytes */
625 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
627 EBCDIC_RT_LF
==MBCS_SINGLE_RESULT_FROM_U(table
, results
, U_LF
) &&
628 EBCDIC_RT_NL
==MBCS_SINGLE_RESULT_FROM_U(table
, results
, U_NL
)
/* from-Unicode check, SI/SO layout: roundtrip flag in the stage 2 entry plus the plain byte value */
632 } else /* MBCS_OUTPUT_2_SISO */ {
633 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_LF
);
635 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, U_LF
)!=0 &&
636 EBCDIC_LF
==MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, U_LF
)
641 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_NL
);
643 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, U_NL
)!=0 &&
644 EBCDIC_NL
==MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, U_NL
)
651 * The table has an appropriate format.
653 * - a modified to-Unicode state table
654 * - a modified from-Unicode output array
655 * - a converter name string with the swap option appended
657 sizeofFromUBytes
=_MBCSSizeofFromUBytes(mbcsTable
);
/*
 * Size of the single allocation: each state row is 256 int32_t entries = 1024 bytes.
 * NOTE(review): the term for the from-Unicode bytes is presumably added on a
 * line not visible here -- confirm.
 */
659 mbcsTable
->countStates
*1024+
661 UCNV_MAX_CONVERTER_NAME_LENGTH
+20;
662 p
=(uint8_t *)uprv_malloc(size
);
664 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
668 /* copy and modify the to-Unicode state table */
669 newStateTable
=(int32_t (*)[256])p
;
670 uprv_memcpy(newStateTable
, mbcsTable
->stateTable
, mbcsTable
->countStates
*1024);
/* exchange the two state 0 entries: byte 0x25 now yields U_NL, byte 0x15 yields U_LF */
672 newStateTable
[0][EBCDIC_LF
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_NL
);
673 newStateTable
[0][EBCDIC_NL
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, U_LF
);
675 /* copy and modify the from-Unicode result table */
/* the results copy starts right after the last state row inside the same allocation */
676 newResults
=(uint16_t *)newStateTable
[mbcsTable
->countStates
];
677 uprv_memcpy(newResults
, bytes
, sizeofFromUBytes
);
679 /* conveniently, the table access macros work on the left side of expressions */
/* the trie (table) is shared with the original; only the result array copy is modified */
680 if(mbcsTable
->outputType
==MBCS_OUTPUT_1
) {
681 MBCS_SINGLE_RESULT_FROM_U(table
, newResults
, U_LF
)=EBCDIC_RT_NL
;
682 MBCS_SINGLE_RESULT_FROM_U(table
, newResults
, U_NL
)=EBCDIC_RT_LF
;
683 } else /* MBCS_OUTPUT_2_SISO */ {
684 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_LF
);
685 MBCS_VALUE_2_FROM_STAGE_2(newResults
, stage2Entry
, U_LF
)=EBCDIC_NL
;
687 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, U_NL
);
688 MBCS_VALUE_2_FROM_STAGE_2(newResults
, stage2Entry
, U_NL
)=EBCDIC_LF
;
691 /* set the canonical converter name */
/*
 * The name is stored directly behind the results copy.
 * NOTE(review): unbounded uprv_strcpy/uprv_strcat; this relies on the static
 * name plus the option string fitting into UCNV_MAX_CONVERTER_NAME_LENGTH+20 bytes -- confirm.
 */
692 name
=(char *)newResults
+sizeofFromUBytes
;
693 uprv_strcpy(name
, sharedData
->staticData
->name
);
694 uprv_strcat(name
, UCNV_SWAP_LFNL_OPTION_STRING
);
696 /* set the pointers */
/* publish only if no swapped tables have been installed yet */
698 if(mbcsTable
->swapLFNLStateTable
==NULL
) {
699 mbcsTable
->swapLFNLStateTable
=newStateTable
;
700 mbcsTable
->swapLFNLFromUnicodeBytes
=(uint8_t *)newResults
;
701 mbcsTable
->swapLFNLName
=name
;
707 /* release the allocated memory if another thread beat us to it */
/* NOTE(review): newStateTable is presumably reset to NULL after a successful publish, on a line not visible here -- confirm */
708 if(newStateTable
!=NULL
) {
709 uprv_free(newStateTable
);
714 /* MBCS setup functions ----------------------------------------------------- */
/*
 * Initialize the MBCS table structure of a shared converter from its raw data.
 *
 * The raw data starts with an _MBCSHeader. The function rejects header
 * versions other than 4.x and unknown output types with U_INVALID_TABLE_FORMAT,
 * points the table fields into the raw data, and sets unicodeMask according to
 * the data format version reported by udata_getInfo().
 *
 * sharedData  shared converter data; sharedData->table->mbcs is filled in
 * pErrorCode  set to U_INVALID_TABLE_FORMAT for unsupported data
 */
717 _MBCSLoad(UConverterSharedData
*sharedData
,
719 UErrorCode
*pErrorCode
) {
721 UConverterMBCSTable
*mbcsTable
=&sharedData
->table
->mbcs
;
722 _MBCSHeader
*header
=(_MBCSHeader
*)raw
;
/* only major version 4 of the _MBCSHeader is supported */
724 if(header
->version
[0]!=4) {
725 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
729 mbcsTable
->countStates
=(uint8_t)header
->countStates
;
730 mbcsTable
->countToUFallbacks
=header
->countToUFallbacks
;
/* the state table follows the header directly, one row of 256 entries per state */
731 mbcsTable
->stateTable
=(const int32_t (*)[256])(raw
+sizeof(_MBCSHeader
));
/* the to-Unicode fallback array follows the last state table row */
732 mbcsTable
->toUFallbacks
=(const _MBCSToUFallback
*)(mbcsTable
->stateTable
+header
->countStates
);
/* the remaining arrays are located via offsets (relative to raw) stored in the header */
733 mbcsTable
->unicodeCodeUnits
=(const uint16_t *)(raw
+header
->offsetToUCodeUnits
);
735 mbcsTable
->fromUnicodeTable
=(const uint16_t *)(raw
+header
->offsetFromUTable
);
736 mbcsTable
->fromUnicodeBytes
=(const uint8_t *)(raw
+header
->offsetFromUBytes
);
737 mbcsTable
->fromUBytesLength
=header
->fromUBytesLength
;
/* the output type is taken from the low 8 bits of header->flags */
738 mbcsTable
->outputType
=(uint8_t)header
->flags
;
740 /* make sure that the output type is known */
741 switch(mbcsTable
->outputType
) {
746 case MBCS_OUTPUT_3_EUC
:
747 case MBCS_OUTPUT_4_EUC
:
748 case MBCS_OUTPUT_2_SISO
:
752 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
757 * converter versions 6.1 and up contain a unicodeMask that is
758 * used here to select the most efficient function implementations
760 info
.size
=sizeof(UDataInfo
);
761 udata_getInfo((UDataMemory
*)sharedData
->dataMemory
, &info
);
/* formatVersion >= 6.1 */
762 if(info
.formatVersion
[0]>6 || (info
.formatVersion
[0]==6 && info
.formatVersion
[1]>=1)) {
763 /* mask off possible future extensions to be safe */
764 mbcsTable
->unicodeMask
=(uint8_t)(sharedData
->staticData
->unicodeMask
&3);
766 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
767 mbcsTable
->unicodeMask
=UCNV_HAS_SUPPLEMENTARY
|UCNV_HAS_SURROGATES
;
/*
 * Release the memory that _EBCDICSwapLFNL() attached to the shared table.
 * The swapped state table, the swapped from-Unicode results and the swapped
 * name live in one allocation that begins at swapLFNLStateTable, so freeing
 * that pointer releases all three.
 *
 * sharedData  shared converter data whose MBCS table is inspected
 */
772 _MBCSUnload(UConverterSharedData
*sharedData
) {
773 UConverterMBCSTable
*mbcsTable
=&sharedData
->table
->mbcs
;
/* nothing was allocated unless the swap-LF/NL tables were built */
775 if(mbcsTable
->swapLFNLStateTable
!=NULL
) {
776 uprv_free(mbcsTable
->swapLFNLStateTable
);
/*
 * Reset the per-converter MBCS conversion state.
 *
 * cnv     converter whose state fields are reset
 * choice  which direction(s) to reset: values <=UCNV_RESET_TO_UNICODE reset the
 *         to-Unicode state, values !=UCNV_RESET_TO_UNICODE reset the
 *         from-Unicode state (_MBCSOpen() passes UCNV_RESET_BOTH)
 */
781 _MBCSReset(UConverter
*cnv
, UConverterResetChoice choice
) {
/* to-Unicode state: the fields are reused as offset, state and byteIndex of the state machine */
782 if(choice
<=UCNV_RESET_TO_UNICODE
) {
784 cnv
->toUnicodeStatus
=0; /* offset */
785 cnv
->mode
=0; /* state */
786 cnv
->toULength
=0; /* byteIndex */
/* from-Unicode state: no pending lead surrogate; the status field is reused as prevLength and starts at 1 */
788 if(choice
!=UCNV_RESET_TO_UNICODE
) {
790 cnv
->fromUSurrogateLead
=0;
791 cnv
->fromUnicodeStatus
=1; /* prevLength */
/*
 * Per-converter setup for an MBCS converter.
 *
 * Handles the swap-LF/NL option (building the swapped tables via
 * _EBCDICSwapLFNL() and dropping the option if it does not apply), turns on
 * GB 18030 mode when the name indicates it, and resets the conversion state
 * for both directions.
 *
 * cnv         converter being opened
 * pErrorCode  passed on to _EBCDICSwapLFNL()
 */
796 _MBCSOpen(UConverter
*cnv
,
800 UErrorCode
*pErrorCode
) {
801 if((options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
802 /* do this because double-checked locking is broken */
/* NOTE(review): presumably read while holding a lock, and used to skip the rebuild when already cached; those lines are not visible here -- confirm */
806 isCached
=cnv
->sharedData
->table
->mbcs
.swapLFNLStateTable
!=NULL
;
810 if(!_EBCDICSwapLFNL(cnv
->sharedData
, pErrorCode
)) {
811 /* the option does not apply, remove it */
812 cnv
->options
&=~UCNV_OPTION_SWAP_LFNL
;
/* cheap pre-test on the digits before the two case-specific searches */
818 if(uprv_strstr(name
, "18030")!=NULL
) {
819 if(uprv_strstr(name
, "gb18030")!=NULL
|| uprv_strstr(name
, "GB18030")!=NULL
) {
820 /* set a flag for GB 18030 mode, which changes the callback behavior */
821 cnv
->options
|=_MBCS_OPTION_GB18030
;
/* start with clean to-Unicode and from-Unicode state */
825 _MBCSReset(cnv
, UCNV_RESET_BOTH
);
/*
 * Return the converter's name.
 *
 * If the swap-LF/NL option is active for this converter and a swapped name has
 * been stored in the shared table, that name is returned; otherwise the name
 * from the converter's static data is returned.
 *
 * cnv  converter whose options and shared data are read
 */
829 _MBCSGetName(const UConverter
*cnv
) {
/* both conditions are needed: the option bit alone does not guarantee that the swapped name exists */
830 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0 && cnv
->sharedData
->table
->mbcs
.swapLFNLName
!=NULL
) {
831 return cnv
->sharedData
->table
->mbcs
.swapLFNLName
;
833 return cnv
->sharedData
->staticData
->name
;
837 /* MBCS-to-Unicode conversion functions ------------------------------------- */
/*
 * Look up a to-Unicode fallback mapping by scalar offset.
 *
 * Performs a binary search over mbcsTable->toUFallbacks (countToUFallbacks
 * entries, searched by their offset field) and returns the codePoint of the
 * entry whose offset matches.
 *
 * mbcsTable  table providing toUFallbacks and countToUFallbacks
 * offset     scalar offset accumulated from the state table entries
 *
 * The caller in _MBCSToUnicodeWithOffsets() treats a result of 0xfffe as
 * "no fallback found".
 */
840 _MBCSGetFallback(UConverterMBCSTable
*mbcsTable
, uint32_t offset
) {
841 const _MBCSToUFallback
*toUFallbacks
;
842 uint32_t i
, start
, limit
;
844 limit
=mbcsTable
->countToUFallbacks
;
846 /* do a binary search for the fallback mapping */
847 toUFallbacks
=mbcsTable
->toUFallbacks
;
/*
 * Narrow [start, limit) until a single candidate remains.
 * NOTE(review): limit is unsigned, so limit-1 wraps around if the count is 0;
 * a guard for the empty case is presumably on a line not visible here -- confirm.
 */
849 while(start
<limit
-1) {
851 if(offset
<toUFallbacks
[i
].offset
) {
858 /* did we really find it? */
859 if(offset
==toUFallbacks
[start
].offset
) {
860 return toUFallbacks
[start
].codePoint
;
868 _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
869 UErrorCode
*pErrorCode
) {
871 const uint8_t *source
, *sourceLimit
;
873 const UChar
*targetLimit
;
876 const int32_t (*stateTable
)[256];
877 const uint16_t *unicodeCodeUnits
;
884 int32_t sourceIndex
, nextSourceIndex
;
889 UConverterCallbackReason reason
;
891 /* use optimized function if possible */
892 cnv
=pArgs
->converter
;
893 if(cnv
->sharedData
->table
->mbcs
.countStates
==1) {
894 if(!(cnv
->sharedData
->table
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
895 _MBCSSingleToBMPWithOffsets(pArgs
, pErrorCode
);
897 _MBCSSingleToUnicodeWithOffsets(pArgs
, pErrorCode
);
902 /* set up the local pointers */
903 source
=(const uint8_t *)pArgs
->source
;
904 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
905 target
=pArgs
->target
;
906 targetLimit
=pArgs
->targetLimit
;
907 offsets
=pArgs
->offsets
;
909 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
910 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->table
->mbcs
.swapLFNLStateTable
;
912 stateTable
=cnv
->sharedData
->table
->mbcs
.stateTable
;
914 unicodeCodeUnits
=cnv
->sharedData
->table
->mbcs
.unicodeCodeUnits
;
916 /* get the converter state from UConverter */
917 offset
=cnv
->toUnicodeStatus
;
918 state
=(uint8_t)(cnv
->mode
);
919 byteIndex
=cnv
->toULength
;
922 /* sourceIndex=-1 if the current character began in the previous buffer */
923 sourceIndex
=byteIndex
==0 ? 0 : -1;
926 /* conversion loop */
927 while(source
<sourceLimit
) {
929 * This following test is to see if available input would overflow the output.
930 * It does not catch output of more than one code unit that
931 * overflows as a result of a surrogate pair or callback output
932 * from the last source byte.
933 * Therefore, those situations also test for overflows and will
934 * then break the loop, too.
936 if(target
<targetLimit
) {
938 entry
=stateTable
[state
][bytes
[byteIndex
++]=*source
++];
939 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
940 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
941 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
943 /* set the next state early so that we can reuse the entry variable */
944 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
947 * An if-else-if chain provides more reliable performance for
948 * the most common cases compared to a switch.
950 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
951 if(action
==MBCS_STATE_VALID_16
) {
952 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
953 c
=unicodeCodeUnits
[offset
];
955 /* output BMP code point */
958 *offsets
++=sourceIndex
;
960 } else if(c
==0xfffe) {
961 if(UCNV_TO_U_USE_FALLBACK(cnv
) && (entry
=(int32_t)_MBCSGetFallback(&cnv
->sharedData
->table
->mbcs
, offset
))!=0xfffe) {
962 /* output fallback BMP code point */
963 *target
++=(UChar
)entry
;
965 *offsets
++=sourceIndex
;
968 /* callback(unassigned) */
972 /* callback(illegal) */
975 } else if(action
==MBCS_STATE_VALID_DIRECT_16
) {
976 /* output BMP code point */
977 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
979 *offsets
++=sourceIndex
;
981 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
982 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
983 c
=unicodeCodeUnits
[offset
++];
985 /* output BMP code point below 0xd800 */
988 *offsets
++=sourceIndex
;
990 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
991 /* output roundtrip or fallback surrogate pair */
992 *target
++=(UChar
)(c
&0xdbff);
994 *offsets
++=sourceIndex
;
996 if(target
<targetLimit
) {
997 *target
++=unicodeCodeUnits
[offset
];
999 *offsets
++=sourceIndex
;
1002 /* target overflow */
1003 cnv
->UCharErrorBuffer
[0]=unicodeCodeUnits
[offset
];
1004 cnv
->UCharErrorBufferLength
=1;
1005 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1011 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
1012 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1013 *target
++=unicodeCodeUnits
[offset
];
1015 *offsets
++=sourceIndex
;
1017 } else if(c
==0xffff) {
1018 /* callback(illegal) */
1021 /* callback(unassigned) */
1024 } else if(action
==MBCS_STATE_VALID_DIRECT_20
) {
1026 entry
=MBCS_ENTRY_FINAL_VALUE(entry
);
1027 /* output surrogate pair */
1028 *target
++=(UChar
)(0xd800|(UChar
)(entry
>>10));
1030 *offsets
++=sourceIndex
;
1032 c
=(UChar
)(0xdc00|(UChar
)(entry
&0x3ff));
1033 if(target
<targetLimit
) {
1036 *offsets
++=sourceIndex
;
1039 /* target overflow */
1040 cnv
->UCharErrorBuffer
[0]=c
;
1041 cnv
->UCharErrorBufferLength
=1;
1042 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1048 } else if(action
==MBCS_STATE_CHANGE_ONLY
) {
1050 * This serves as a state change without any output.
1051 * It is useful for reading simple stateful encodings,
1052 * for example using just Shift-In/Shift-Out codes.
1053 * The 21 unused bits may later be used for more sophisticated
1054 * state transitions.
1056 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
1057 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1058 /* callback(unassigned) */
1061 /* output BMP code point */
1062 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1064 *offsets
++=sourceIndex
;
1066 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
1067 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1068 /* callback(unassigned) */
1072 } else if(action
==MBCS_STATE_UNASSIGNED
) {
1073 /* callback(unassigned) */
1075 } else if(action
==MBCS_STATE_ILLEGAL
) {
1076 /* callback(illegal) */
1079 /* reserved, must never occur */
1082 /* normal end of action codes: prepare for a new character */
1085 sourceIndex
=nextSourceIndex
;
1089 reason
=UCNV_ILLEGAL
;
1090 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1093 reason
=UCNV_UNASSIGNED
;
1094 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1096 /* call the callback function with all the preparations and post-processing */
1097 /* update the arguments structure */
1098 pArgs
->source
=(const char *)source
;
1099 pArgs
->target
=target
;
1100 pArgs
->offsets
=offsets
;
1102 /* set the converter state in UConverter to deal with the next character */
1103 cnv
->toUnicodeStatus
=0;
1107 /* call the callback function */
1108 toUCallback(cnv
, cnv
->toUContext
, pArgs
, (const char *)bytes
, byteIndex
, reason
, pErrorCode
);
1110 /* get the converter state from UConverter */
1111 offset
=cnv
->toUnicodeStatus
;
1112 state
=(uint8_t)cnv
->mode
;
1113 byteIndex
=cnv
->toULength
;
1115 /* update target and deal with offsets if necessary */
1116 offsets
=ucnv_updateCallbackOffsets(offsets
, pArgs
->target
-target
, sourceIndex
);
1117 target
=pArgs
->target
;
1119 /* update the source pointer and index */
1120 sourceIndex
=nextSourceIndex
+((const uint8_t *)pArgs
->source
-source
);
1121 source
=(const uint8_t *)pArgs
->source
;
1124 * If the callback overflowed the target, then we need to
1125 * stop here with an overflow indication.
1127 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
1129 } else if(U_FAILURE(*pErrorCode
)) {
1130 /* break on error */
1135 } else if(cnv
->UCharErrorBufferLength
>0) {
1136 /* target is full */
1137 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1142 * We do not need to repeat the statements from the normal
1143 * end of the action codes because we already updated all the
1144 * necessary variables.
1148 /* target is full */
1149 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1154 if(pArgs
->flush
&& source
>=sourceLimit
) {
1155 /* reset the state for the next conversion */
1156 if(byteIndex
>0 && U_SUCCESS(*pErrorCode
)) {
1157 /* a character byte sequence remains incomplete */
1158 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
1160 cnv
->toUnicodeStatus
=0;
1164 /* set the converter state back into UConverter */
1165 cnv
->toUnicodeStatus
=offset
;
1167 cnv
->toULength
=byteIndex
;
1170 /* write back the updated pointers */
1171 pArgs
->source
=(const char *)source
;
1172 pArgs
->target
=target
;
1173 pArgs
->offsets
=offsets
;
1176 /* This version of _MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1178 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1179 UErrorCode
*pErrorCode
) {
1181 const uint8_t *source
, *sourceLimit
;
1183 const UChar
*targetLimit
;
1186 const int32_t (*stateTable
)[256];
1188 int32_t sourceIndex
, nextSourceIndex
;
1193 UConverterCallbackReason reason
;
1195 /* set up the local pointers */
1196 cnv
=pArgs
->converter
;
1197 source
=(const uint8_t *)pArgs
->source
;
1198 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1199 target
=pArgs
->target
;
1200 targetLimit
=pArgs
->targetLimit
;
1201 offsets
=pArgs
->offsets
;
1203 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1204 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->table
->mbcs
.swapLFNLStateTable
;
1206 stateTable
=cnv
->sharedData
->table
->mbcs
.stateTable
;
1209 /* sourceIndex=-1 if the current character began in the previous buffer */
1213 /* conversion loop */
1214 while(source
<sourceLimit
) {
1216 * This following test is to see if available input would overflow the output.
1217 * It does not catch output of more than one code unit that
1218 * overflows as a result of a surrogate pair or callback output
1219 * from the last source byte.
1220 * Therefore, those situations also test for overflows and will
1221 * then break the loop, too.
1223 if(target
<targetLimit
) {
1225 entry
=stateTable
[0][*source
++];
1226 /* MBCS_ENTRY_IS_FINAL(entry) */
1228 /* test the most common case first */
1229 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
1230 /* output BMP code point */
1231 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1233 *offsets
++=sourceIndex
;
1236 /* normal end of action codes: prepare for a new character */
1237 sourceIndex
=nextSourceIndex
;
1242 * An if-else-if chain provides more reliable performance for
1243 * the most common cases compared to a switch.
1245 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
1246 if(action
==MBCS_STATE_VALID_DIRECT_20
) {
1248 entry
=MBCS_ENTRY_FINAL_VALUE(entry
);
1249 /* output surrogate pair */
1250 *target
++=(UChar
)(0xd800|(UChar
)(entry
>>10));
1252 *offsets
++=sourceIndex
;
1254 c
=(UChar
)(0xdc00|(UChar
)(entry
&0x3ff));
1255 if(target
<targetLimit
) {
1258 *offsets
++=sourceIndex
;
1261 /* target overflow */
1262 cnv
->UCharErrorBuffer
[0]=c
;
1263 cnv
->UCharErrorBufferLength
=1;
1264 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1267 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
1268 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1269 /* callback(unassigned) */
1272 /* output BMP code point */
1273 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1275 *offsets
++=sourceIndex
;
1277 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
1278 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1279 /* callback(unassigned) */
1283 } else if(action
==MBCS_STATE_UNASSIGNED
) {
1284 /* callback(unassigned) */
1286 } else if(action
==MBCS_STATE_ILLEGAL
) {
1287 /* callback(illegal) */
1288 reason
=UCNV_ILLEGAL
;
1289 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1292 /* reserved, must never occur */
1295 /* normal end of action codes: prepare for a new character */
1296 sourceIndex
=nextSourceIndex
;
1300 reason
=UCNV_UNASSIGNED
;
1301 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1303 /* call the callback function with all the preparations and post-processing */
1304 /* update the arguments structure */
1305 pArgs
->source
=(const char *)source
;
1306 pArgs
->target
=target
;
1307 pArgs
->offsets
=offsets
;
1309 /* call the callback function */
1310 toUCallback(cnv
, cnv
->toUContext
, pArgs
, (const char *)(source
-1), 1, reason
, pErrorCode
);
1312 /* update target and deal with offsets if necessary */
1313 offsets
=ucnv_updateCallbackOffsets(offsets
, pArgs
->target
-target
, sourceIndex
);
1314 target
=pArgs
->target
;
1316 /* update the source pointer and index */
1317 sourceIndex
=nextSourceIndex
+((const uint8_t *)pArgs
->source
-source
);
1318 source
=(const uint8_t *)pArgs
->source
;
1321 * If the callback overflowed the target, then we need to
1322 * stop here with an overflow indication.
1324 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
1326 } else if(U_FAILURE(*pErrorCode
)) {
1327 /* break on error */
1329 } else if(cnv
->UCharErrorBufferLength
>0) {
1330 /* target is full */
1331 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1336 * We do not need to repeat the statements from the normal
1337 * end of the action codes because we already updated all the
1338 * necessary variables.
1341 /* target is full */
1342 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1347 /* write back the updated pointers */
1348 pArgs
->source
=(const char *)source
;
1349 pArgs
->target
=target
;
1350 pArgs
->offsets
=offsets
;
1354 * This version of _MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1355 * that only map to and from the BMP.
1356 * In addition to single-byte optimizations, the offset calculations
1357 * become much easier.
1360 _MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1361 UErrorCode
*pErrorCode
) {
1363 const uint8_t *source
, *sourceLimit
, *lastSource
;
1365 int32_t targetCapacity
, length
;
1368 const int32_t (*stateTable
)[256];
1370 int32_t sourceIndex
;
1374 UConverterCallbackReason reason
;
1376 /* set up the local pointers */
1377 cnv
=pArgs
->converter
;
1378 source
=(const uint8_t *)pArgs
->source
;
1379 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1380 target
=pArgs
->target
;
1381 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
1382 offsets
=pArgs
->offsets
;
1384 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1385 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->table
->mbcs
.swapLFNLStateTable
;
1387 stateTable
=cnv
->sharedData
->table
->mbcs
.stateTable
;
1390 /* sourceIndex=-1 if the current character began in the previous buffer */
1395 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1396 * for the minimum of the sourceLength and targetCapacity
1398 length
=sourceLimit
-source
;
1399 if(length
<targetCapacity
) {
1400 targetCapacity
=length
;
1403 #if MBCS_UNROLL_SINGLE_TO_BMP
1404 /* unrolling makes it faster on Pentium III/Windows 2000 */
1405 /* unroll the loop with the most common case */
1407 if(targetCapacity
>=16) {
1408 int32_t count
, loops
, oredEntries
;
1410 loops
=count
=targetCapacity
>>4;
1412 oredEntries
=entry
=stateTable
[0][*source
++];
1413 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1414 oredEntries
|=entry
=stateTable
[0][*source
++];
1415 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1416 oredEntries
|=entry
=stateTable
[0][*source
++];
1417 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1418 oredEntries
|=entry
=stateTable
[0][*source
++];
1419 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1420 oredEntries
|=entry
=stateTable
[0][*source
++];
1421 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1422 oredEntries
|=entry
=stateTable
[0][*source
++];
1423 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1424 oredEntries
|=entry
=stateTable
[0][*source
++];
1425 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1426 oredEntries
|=entry
=stateTable
[0][*source
++];
1427 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1428 oredEntries
|=entry
=stateTable
[0][*source
++];
1429 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1430 oredEntries
|=entry
=stateTable
[0][*source
++];
1431 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1432 oredEntries
|=entry
=stateTable
[0][*source
++];
1433 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1434 oredEntries
|=entry
=stateTable
[0][*source
++];
1435 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1436 oredEntries
|=entry
=stateTable
[0][*source
++];
1437 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1438 oredEntries
|=entry
=stateTable
[0][*source
++];
1439 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1440 oredEntries
|=entry
=stateTable
[0][*source
++];
1441 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1442 oredEntries
|=entry
=stateTable
[0][*source
++];
1443 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1445 /* were all 16 entries really valid? */
1446 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries
)) {
1447 /* no, return to the first of these 16 */
1454 targetCapacity
-=16*count
;
1457 lastSource
+=16*count
;
1459 *offsets
++=sourceIndex
++;
1460 *offsets
++=sourceIndex
++;
1461 *offsets
++=sourceIndex
++;
1462 *offsets
++=sourceIndex
++;
1463 *offsets
++=sourceIndex
++;
1464 *offsets
++=sourceIndex
++;
1465 *offsets
++=sourceIndex
++;
1466 *offsets
++=sourceIndex
++;
1467 *offsets
++=sourceIndex
++;
1468 *offsets
++=sourceIndex
++;
1469 *offsets
++=sourceIndex
++;
1470 *offsets
++=sourceIndex
++;
1471 *offsets
++=sourceIndex
++;
1472 *offsets
++=sourceIndex
++;
1473 *offsets
++=sourceIndex
++;
1474 *offsets
++=sourceIndex
++;
1481 /* conversion loop */
1482 while(targetCapacity
>0) {
1483 entry
=stateTable
[0][*source
++];
1484 /* MBCS_ENTRY_IS_FINAL(entry) */
1486 /* test the most common case first */
1487 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
1488 /* output BMP code point */
1489 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1495 * An if-else-if chain provides more reliable performance for
1496 * the most common cases compared to a switch.
1498 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
1499 if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
1500 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1501 /* callback(unassigned) */
1502 reason
=UCNV_UNASSIGNED
;
1503 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1505 /* output BMP code point */
1506 *target
++=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1509 } else if(action
==MBCS_STATE_UNASSIGNED
) {
1510 /* callback(unassigned) */
1511 reason
=UCNV_UNASSIGNED
;
1512 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1513 } else if(action
==MBCS_STATE_ILLEGAL
) {
1514 /* callback(illegal) */
1515 reason
=UCNV_ILLEGAL
;
1516 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1518 /* reserved, must never occur */
1522 /* call the callback function with all the preparations and post-processing */
1523 /* set offsets since the start or the last callback */
1525 int32_t count
=(int32_t)(source
-lastSource
);
1527 /* predecrement: do not set the offset for the callback-causing character */
1529 *offsets
++=sourceIndex
++;
1531 /* offset and sourceIndex are now set for the current character */
1534 /* update the arguments structure */
1535 pArgs
->source
=(const char *)source
;
1536 pArgs
->target
=target
;
1537 pArgs
->offsets
=offsets
;
1539 /* call the callback function */
1540 toUCallback(cnv
, cnv
->toUContext
, pArgs
, (const char *)(source
-1), 1, reason
, pErrorCode
);
1542 /* update target and deal with offsets if necessary */
1543 offsets
=ucnv_updateCallbackOffsets(offsets
, pArgs
->target
-target
, sourceIndex
);
1544 target
=pArgs
->target
;
1546 /* update the source pointer and index */
1547 sourceIndex
+=1+((const uint8_t *)pArgs
->source
-source
);
1548 source
=lastSource
=(const uint8_t *)pArgs
->source
;
1549 targetCapacity
=pArgs
->targetLimit
-target
;
1550 length
=sourceLimit
-source
;
1551 if(length
<targetCapacity
) {
1552 targetCapacity
=length
;
1556 * If the callback overflowed the target, then we need to
1557 * stop here with an overflow indication.
1559 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
1561 } else if(U_FAILURE(*pErrorCode
)) {
1562 /* break on error */
1564 } else if(cnv
->UCharErrorBufferLength
>0) {
1565 /* target is full */
1566 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1570 #if MBCS_UNROLL_SINGLE_TO_BMP
1571 /* unrolling makes it faster on Pentium III/Windows 2000 */
1576 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=pArgs
->targetLimit
) {
1577 /* target is full */
1578 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1581 /* set offsets since the start or the last callback */
1583 size_t count
=source
-lastSource
;
1585 *offsets
++=sourceIndex
++;
1590 /* write back the updated pointers */
1591 pArgs
->source
=(const char *)source
;
1592 pArgs
->target
=target
;
1593 pArgs
->offsets
=offsets
;
1597 _MBCSGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1598 UErrorCode
*pErrorCode
) {
1599 UChar buffer
[UTF_MAX_CHAR_LENGTH
];
1602 const uint8_t *source
, *sourceLimit
;
1604 const int32_t (*stateTable
)[256];
1605 const uint16_t *unicodeCodeUnits
;
1615 UConverterCallbackReason reason
;
1617 /* use optimized function if possible */
1618 cnv
=pArgs
->converter
;
1619 if(cnv
->sharedData
->table
->mbcs
.unicodeMask
&UCNV_HAS_SURROGATES
) {
1621 * Calling the inefficient, generic getNextUChar() lets us deal correctly
1622 * with the rare case of a codepage that maps single surrogates
1623 * without adding the complexity to this already complicated function here.
1625 return ucnv_getNextUCharFromToUImpl(pArgs
, _MBCSToUnicodeWithOffsets
, TRUE
, pErrorCode
);
1626 } else if(cnv
->sharedData
->table
->mbcs
.countStates
==1) {
1627 return _MBCSSingleGetNextUChar(pArgs
, pErrorCode
);
1630 /* set up the local pointers */
1631 source
=(const uint8_t *)pArgs
->source
;
1632 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1634 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1635 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->table
->mbcs
.swapLFNLStateTable
;
1637 stateTable
=cnv
->sharedData
->table
->mbcs
.stateTable
;
1639 unicodeCodeUnits
=cnv
->sharedData
->table
->mbcs
.unicodeCodeUnits
;
1641 /* get the converter state from UConverter */
1642 offset
=cnv
->toUnicodeStatus
;
1643 state
=(uint8_t)(cnv
->mode
);
1644 byteIndex
=cnv
->toULength
;
1645 bytes
=cnv
->toUBytes
;
1647 /* conversion loop */
1648 while(source
<sourceLimit
) {
1649 entry
=stateTable
[state
][bytes
[byteIndex
++]=*source
++];
1650 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
1651 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
1652 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
1654 /* set the next state early so that we can reuse the entry variable */
1655 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
1658 * An if-else-if chain provides more reliable performance for
1659 * the most common cases compared to a switch.
1661 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
1662 if(action
==MBCS_STATE_VALID_16
) {
1663 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
1664 c
=unicodeCodeUnits
[offset
];
1666 /* output BMP code point */
1668 } else if(c
==0xfffe) {
1669 if(UCNV_TO_U_USE_FALLBACK(cnv
) && (c
=_MBCSGetFallback(&cnv
->sharedData
->table
->mbcs
, offset
))!=0xfffe) {
1672 /* callback(unassigned) */
1675 /* callback(illegal) */
1678 } else if(action
==MBCS_STATE_VALID_DIRECT_16
) {
1679 /* output BMP code point */
1680 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1682 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
1683 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
1684 c
=unicodeCodeUnits
[offset
++];
1686 /* output BMP code point below 0xd800 */
1688 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? c
<=0xdfff : c
<=0xdbff) {
1689 /* output roundtrip or fallback supplementary code point */
1690 c
=((c
&0x3ff)<<10)+unicodeCodeUnits
[offset
]+(0x10000-0xdc00);
1692 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (c
&0xfffe)==0xe000 : c
==0xe000) {
1693 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1694 c
=unicodeCodeUnits
[offset
];
1696 } else if(c
==0xffff) {
1697 /* callback(illegal) */
1700 /* callback(unassigned) */
1703 } else if(action
==MBCS_STATE_VALID_DIRECT_20
) {
1704 /* output supplementary code point */
1705 c
=(UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
1707 } else if(action
==MBCS_STATE_CHANGE_ONLY
) {
1709 * This serves as a state change without any output.
1710 * It is useful for reading simple stateful encodings,
1711 * for example using just Shift-In/Shift-Out codes.
1712 * The 21 unused bits may later be used for more sophisticated
1713 * state transitions.
1715 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
1716 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1717 /* callback(unassigned) */
1720 /* output BMP code point */
1721 c
=(UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1723 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
1724 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1725 /* callback(unassigned) */
1728 /* output supplementary code point */
1729 c
=(UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
1731 } else if(action
==MBCS_STATE_UNASSIGNED
) {
1732 /* callback(unassigned) */
1734 } else if(action
==MBCS_STATE_ILLEGAL
) {
1735 /* callback(illegal) */
1738 /* reserved, must never occur */
1741 /* normal end of action codes: prepare for a new character */
1747 reason
=UCNV_ILLEGAL
;
1748 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1751 reason
=UCNV_UNASSIGNED
;
1752 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1754 /* call the callback function with all the preparations and post-processing */
1755 /* update the arguments structure */
1756 pArgs
->source
=(const char *)source
;
1757 pArgs
->target
=buffer
;
1758 pArgs
->targetLimit
=buffer
+UTF_MAX_CHAR_LENGTH
;
1760 /* set the converter state in UConverter to deal with the next character */
1761 cnv
->toUnicodeStatus
=0;
1765 /* call the callback function */
1766 toUCallback(cnv
, cnv
->toUContext
, pArgs
, (const char *)bytes
, byteIndex
, reason
, pErrorCode
);
1768 /* get the converter state from UConverter */
1769 offset
=cnv
->toUnicodeStatus
;
1770 state
=(uint8_t)cnv
->mode
;
1771 byteIndex
=cnv
->toULength
;
1773 /* update the source pointer */
1774 source
=(const uint8_t *)pArgs
->source
;
1777 * return the first character if the callback wrote some
1778 * we do not need to goto finish because the converter state is already set
1780 if(U_SUCCESS(*pErrorCode
)) {
1781 entry
=pArgs
->target
-buffer
;
1783 return ucnv_getUChar32KeepOverflow(cnv
, buffer
, entry
);
1785 /* else (callback did not write anything) continue */
1786 } else if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
1787 *pErrorCode
=U_ZERO_ERROR
;
1788 return ucnv_getUChar32KeepOverflow(cnv
, buffer
, UTF_MAX_CHAR_LENGTH
);
1790 /* break on error */
1791 /* ### what if a callback set an error but _also_ generated output?! */
1798 * We do not need to repeat the statements from the normal
1799 * end of the action codes because we already updated all the
1800 * necessary variables.
1806 /* incomplete character byte sequence */
1807 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
1810 /* no output because of empty input or only state changes and skipping callbacks */
1811 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1816 /* set the converter state back into UConverter, ready for a new character */
1817 cnv
->toUnicodeStatus
=0;
1821 /* write back the updated pointer */
1822 pArgs
->source
=(const char *)source
;
1827 * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
1828 * We still need a conversion loop in case a skipping callback is called.
1831 _MBCSSingleGetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1832 UErrorCode
*pErrorCode
) {
1833 UChar buffer
[UTF_MAX_CHAR_LENGTH
];
1836 const int32_t (*stateTable
)[256];
1837 const uint8_t *source
, *sourceLimit
;
1841 UConverterCallbackReason reason
;
1843 /* set up the local pointers */
1844 cnv
=pArgs
->converter
;
1845 source
=(const uint8_t *)pArgs
->source
;
1846 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1847 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
1848 stateTable
=(const int32_t (*)[256])cnv
->sharedData
->table
->mbcs
.swapLFNLStateTable
;
1850 stateTable
=cnv
->sharedData
->table
->mbcs
.stateTable
;
1853 /* conversion loop */
1854 while(source
<sourceLimit
) {
1855 entry
=stateTable
[0][*source
++];
1856 /* MBCS_ENTRY_IS_FINAL(entry) */
1858 /* write back the updated pointer early so that we can return directly */
1859 pArgs
->source
=(const char *)source
;
1861 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
1862 /* output BMP code point */
1863 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1867 * An if-else-if chain provides more reliable performance for
1868 * the most common cases compared to a switch.
1870 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
1871 if(action
==MBCS_STATE_VALID_DIRECT_20
) {
1872 /* output supplementary code point */
1873 return (UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
1874 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
1875 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1876 /* callback(unassigned) */
1877 reason
=UCNV_UNASSIGNED
;
1878 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1880 /* output BMP code point */
1881 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
1883 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
1884 if(!UCNV_TO_U_USE_FALLBACK(cnv
)) {
1885 /* callback(unassigned) */
1886 reason
=UCNV_UNASSIGNED
;
1887 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1889 /* output supplementary code point */
1890 return (UChar32
)(MBCS_ENTRY_FINAL_VALUE(entry
)+0x10000);
1892 } else if(action
==MBCS_STATE_UNASSIGNED
) {
1893 /* callback(unassigned) */
1894 reason
=UCNV_UNASSIGNED
;
1895 *pErrorCode
=U_INVALID_CHAR_FOUND
;
1896 } else if(action
==MBCS_STATE_ILLEGAL
) {
1897 /* callback(illegal) */
1898 reason
=UCNV_ILLEGAL
;
1899 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1901 /* reserved, must never occur */
1902 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1906 /* call the callback function with all the preparations and post-processing */
1907 /* update the arguments structure */
1908 pArgs
->target
=buffer
;
1909 pArgs
->targetLimit
=buffer
+UTF_MAX_CHAR_LENGTH
;
1911 /* call the callback function */
1912 toUCallback(cnv
, cnv
->toUContext
, pArgs
, (const char *)(source
-1), 1, reason
, pErrorCode
);
1914 /* update the source pointer */
1915 source
=(const uint8_t *)pArgs
->source
;
1918 * return the first character if the callback wrote some
1919 * we do not need to goto finish because the converter state is already set
1921 if(U_SUCCESS(*pErrorCode
)) {
1922 entry
=pArgs
->target
-buffer
;
1924 return ucnv_getUChar32KeepOverflow(cnv
, buffer
, entry
);
1926 /* else (callback did not write anything) continue */
1927 } else if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
1928 *pErrorCode
=U_ZERO_ERROR
;
1929 return ucnv_getUChar32KeepOverflow(cnv
, buffer
, UTF_MAX_CHAR_LENGTH
);
1931 /* break on error */
1932 /* ### what if a callback set an error but _also_ generated output?! */
1937 /* no output because of empty input or only state changes and skipping callbacks */
1938 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1943 * This is a simple version of getNextUChar() that is used
1944 * by other converter implementations.
1945 * It does not use state from the converter, nor error codes.
1946 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
1951 * otherwise the Unicode code point
1954 _MBCSSimpleGetNextUChar(UConverterSharedData
*sharedData
,
1955 const char **pSource
, const char *sourceLimit
,
1956 UBool useFallback
) {
1957 const uint8_t *source
;
1959 const int32_t (*stateTable
)[256];
1960 const uint16_t *unicodeCodeUnits
;
1963 uint8_t state
, action
;
1967 /* set up the local pointers */
1968 source
=(const uint8_t *)*pSource
;
1969 if(source
>=(const uint8_t *)sourceLimit
) {
1970 /* no input at all: "illegal" */
1976 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
1977 * TODO In future releases, verify that this function is never called for SBCS
1978 * conversions, i.e., that sharedData->table->mbcs.countStates==1 is still true.
1979 * Removal improves code coverage.
1981 /* use optimized function if possible */
1982 if(sharedData
->table
->mbcs
.countStates
==1) {
1983 return _MBCSSingleSimpleGetNextUChar(sharedData
, (uint8_t)(*(*pSource
)++), useFallback
);
1987 stateTable
=sharedData
->table
->mbcs
.stateTable
;
1988 unicodeCodeUnits
=sharedData
->table
->mbcs
.unicodeCodeUnits
;
1990 /* converter state */
1994 /* conversion loop */
1996 entry
=stateTable
[state
][*source
++];
1997 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
1998 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
1999 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
2001 *pSource
=(const char *)source
;
2004 * An if-else-if chain provides more reliable performance for
2005 * the most common cases compared to a switch.
2007 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2008 if(action
==MBCS_STATE_VALID_16
) {
2009 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2010 entry
=unicodeCodeUnits
[offset
];
2012 return (UChar32
)entry
;
2013 } else if(UCNV_TO_U_USE_FALLBACK(cnv
)) {
2014 return _MBCSGetFallback(&sharedData
->table
->mbcs
, offset
);
2018 } else if(action
==MBCS_STATE_VALID_DIRECT_16
) {
2019 /* output BMP code point */
2020 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2021 } else if(action
==MBCS_STATE_VALID_16_PAIR
) {
2022 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
2023 entry
=unicodeCodeUnits
[offset
++];
2025 /* output BMP code point below 0xd800 */
2026 return (UChar32
)entry
;
2027 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? entry
<=0xdfff : entry
<=0xdbff) {
2028 /* output roundtrip or fallback supplementary code point */
2029 return (UChar32
)(((entry
&0x3ff)<<10)+unicodeCodeUnits
[offset
]+(0x10000-0xdc00));
2030 } else if(UCNV_TO_U_USE_FALLBACK(cnv
) ? (entry
&0xfffe)==0xe000 : entry
==0xe000) {
2031 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2032 return unicodeCodeUnits
[offset
];
2033 } else if(entry
==0xffff) {
2038 } else if(action
==MBCS_STATE_VALID_DIRECT_20
) {
2039 /* output supplementary code point */
2040 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
2041 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2042 if(!TO_U_USE_FALLBACK(useFallback
)) {
2045 /* output BMP code point */
2046 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2047 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
2048 if(!TO_U_USE_FALLBACK(useFallback
)) {
2051 /* output supplementary code point */
2052 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
2053 } else if(action
==MBCS_STATE_CHANGE_ONLY
) {
2055 * This serves as a state change without any output.
2056 * It is useful for reading simple stateful encodings,
2057 * for example using just Shift-In/Shift-Out codes.
2058 * The 21 unused bits may later be used for more sophisticated
2059 * state transitions.
2061 if(source
==(const uint8_t *)sourceLimit
) {
2062 /* if there are only state changes, then return "unassigned" */
2065 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2067 } else if(action
==MBCS_STATE_ILLEGAL
) {
2070 /* reserved, must never occur */
2073 /* state change only - prepare for a new character */
2074 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
); /* typically 0 */
2077 } while(source
<(const uint8_t *)sourceLimit
);
2079 *pSource
=(const char *)source
;
2085 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2086 * Removal improves code coverage.
2089 * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
2090 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2093 _MBCSSingleSimpleGetNextUChar(UConverterSharedData
*sharedData
,
2094 uint8_t b
, UBool useFallback
) {
2098 entry
=sharedData
->table
->mbcs
.stateTable
[0][b
];
2099 /* MBCS_ENTRY_IS_FINAL(entry) */
2101 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry
)) {
2102 /* output BMP code point */
2103 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2107 * An if-else-if chain provides more reliable performance for
2108 * the most common cases compared to a switch.
2110 action
=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry
));
2111 if(action
==MBCS_STATE_VALID_DIRECT_20
) {
2112 /* output supplementary code point */
2113 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
2114 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_16
) {
2115 if(!TO_U_USE_FALLBACK(useFallback
)) {
2118 /* output BMP code point */
2119 return (UChar
)MBCS_ENTRY_FINAL_VALUE_16(entry
);
2120 } else if(action
==MBCS_STATE_FALLBACK_DIRECT_20
) {
2121 if(!TO_U_USE_FALLBACK(useFallback
)) {
2124 /* output supplementary code point */
2125 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
2126 } else if(action
==MBCS_STATE_UNASSIGNED
) {
2128 } else if(action
==MBCS_STATE_ILLEGAL
) {
2131 /* reserved, must never occur */
2137 /* MBCS-from-Unicode conversion functions ----------------------------------- */
2140 _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
2141 UErrorCode
*pErrorCode
) {
2143 const UChar
*source
, *sourceLimit
;
2145 int32_t targetCapacity
;
2148 const uint16_t *table
;
2149 const uint8_t *p
, *bytes
;
2154 int32_t prevSourceIndex
, sourceIndex
, nextSourceIndex
;
2156 UConverterCallbackReason reason
;
2157 uint32_t stage2Entry
;
2159 int32_t length
, prevLength
;
2160 uint8_t unicodeMask
;
2162 /* use optimized function if possible */
2163 cnv
=pArgs
->converter
;
2164 outputType
=cnv
->sharedData
->table
->mbcs
.outputType
;
2165 unicodeMask
=cnv
->sharedData
->table
->mbcs
.unicodeMask
;
2166 if(outputType
==MBCS_OUTPUT_1
&& !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
2167 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
2168 _MBCSSingleFromBMPWithOffsets(pArgs
, pErrorCode
);
2170 _MBCSSingleFromUnicodeWithOffsets(pArgs
, pErrorCode
);
2173 } else if(outputType
==MBCS_OUTPUT_2
) {
2174 _MBCSDoubleFromUnicodeWithOffsets(pArgs
, pErrorCode
);
2178 /* set up the local pointers */
2179 source
=pArgs
->source
;
2180 sourceLimit
=pArgs
->sourceLimit
;
2181 target
=(uint8_t *)pArgs
->target
;
2182 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
2183 offsets
=pArgs
->offsets
;
2185 table
=cnv
->sharedData
->table
->mbcs
.fromUnicodeTable
;
2186 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2187 bytes
=cnv
->sharedData
->table
->mbcs
.swapLFNLFromUnicodeBytes
;
2189 bytes
=cnv
->sharedData
->table
->mbcs
.fromUnicodeBytes
;
2192 /* get the converter state from UConverter */
2193 c
=cnv
->fromUSurrogateLead
;
2194 prevLength
=cnv
->fromUnicodeStatus
;
2196 /* sourceIndex=-1 if the current character began in the previous buffer */
2198 sourceIndex
= c
==0 ? 0 : -1;
2201 /* conversion loop */
2203 * This is another piece of ugly code:
2204 * A goto into the loop if the converter state contains a first surrogate
2205 * from the previous function call.
2206 * It saves me from checking if(c==0) in each loop iteration
2207 * and from duplicating the trail-surrogate-handling code in the else
2208 * branch of that check.
2209 * I could not find any other way to get around this other than
2210 * using a function call for the conversion and callback, which would
2211 * be even more inefficient.
2213 * Markus Scherer 2000-jul-19
2215 if(c
!=0 && targetCapacity
>0) {
2219 while(source
<sourceLimit
) {
2221 * The following test checks whether the available input would overflow the output.
2222 * It does not catch output of more than one byte that
2223 * overflows as a result of a multi-byte character or callback output
2224 * from the last source character.
2225 * Therefore, those situations also test for overflows and will
2226 * then break the loop, too.
2228 if(targetCapacity
>0) {
2230 * Get a correct Unicode code point:
2231 * a single UChar for a BMP code point or
2232 * a matched surrogate pair for a "supplementary code point".
2237 * This also tests if the codepage maps single surrogates.
2238 * If it does, then surrogates are not paired but mapped separately.
2239 * Note that in this case unmatched surrogates are not detected.
2241 if(UTF_IS_SURROGATE(c
) && !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
2242 if(UTF_IS_SURROGATE_FIRST(c
)) {
2244 if(source
<sourceLimit
) {
2245 /* test the following code unit */
2246 UChar trail
=*source
;
2247 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2250 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
2251 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
2252 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2253 /* callback(unassigned) */
2256 /* convert this supplementary code point */
2257 /* exit this condition tree */
2259 /* this is an unmatched lead code unit (1st surrogate) */
2260 /* callback(illegal) */
2261 reason
=UCNV_ILLEGAL
;
2262 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2270 /* this is an unmatched trail code unit (2nd surrogate) */
2271 /* callback(illegal) */
2272 reason
=UCNV_ILLEGAL
;
2273 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2278 /* convert the Unicode code point in c into codepage bytes */
2281 * The basic lookup is a triple-stage compact array (trie) lookup.
2282 * For details see the beginning of this file.
2284 * Single-byte codepages are handled with a different data structure
2285 * by _MBCSSingle... functions.
2287 * The result consists of a 32-bit value from stage 2 and
2288 * a pointer to as many bytes as are stored per character.
2289 * The pointer points to the character's bytes in stage 3.
2290 * Bits 15..0 of the stage 2 entry contain the stage 3 index
2291 * for that pointer, while bits 31..16 are flags for which of
2292 * the 16 characters in the block are roundtrip-assigned.
2294 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
2295 * and uint32_t, respectively, in the platform encoding.
2296 * For 3-byte codepages, the bytes are always stored in big-endian order.
2298 * For EUC encodings that use only either 0x8e or 0x8f as the first
2299 * byte of their longest byte sequences, the first two bytes in
2300 * this third stage indicate with their 7th bits whether these bytes
2301 * are to be written directly or actually need to be preceded by
2302 * one of the two Single-Shift codes. With this, the third stage
2303 * stores one byte fewer per character than the actual maximum length of
2304 * EUC byte sequences.
2306 * Other than that, leading zero bytes are removed and the other
2307 * bytes output. A single zero byte may be output if the "assigned"
2308 * bit in stage 2 was on or also if the Unicode code point is U+0000.
2309 * The data structure does not support zero byte output as a fallback
2310 * for other code points, and also does not allow output of leading zeros.
2312 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
2314 /* get the bytes and the length for the output */
2315 switch(outputType
) {
2317 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
2324 case MBCS_OUTPUT_2_SISO
:
2325 /* 1/2-byte stateful with Shift-In/Shift-Out */
2327 * Save the old state in the converter object
2328 * right here, then change the local prevLength state variable if necessary.
2329 * Then, if this character turns out to be unassigned or a fallback that
2330 * is not taken, the callback code must not save the new state in the converter
2331 * because the new state is for a character that is not output.
2332 * However, the callback must still restore the state from the converter
2333 * in case the callback function changed it for its output.
2335 cnv
->fromUnicodeStatus
=prevLength
; /* save the old state */
2336 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
2341 /* change from double-byte mode to single-byte */
2342 value
|=(uint32_t)UCNV_SI
<<8;
2350 /* change from single-byte mode to double-byte */
2351 value
|=(uint32_t)UCNV_SO
<<16;
2358 p
=MBCS_POINTER_3_FROM_STAGE_2(bytes
, stage2Entry
, c
);
2359 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
2362 } else if(value
<=0xffff) {
2369 value
=MBCS_VALUE_4_FROM_STAGE_2(bytes
, stage2Entry
, c
);
2372 } else if(value
<=0xffff) {
2374 } else if(value
<=0xffffff) {
2380 case MBCS_OUTPUT_3_EUC
:
2381 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
2382 /* EUC 16-bit fixed-length representation */
2385 } else if((value
&0x8000)==0) {
2388 } else if((value
&0x80)==0) {
2395 case MBCS_OUTPUT_4_EUC
:
2396 p
=MBCS_POINTER_3_FROM_STAGE_2(bytes
, stage2Entry
, c
);
2397 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
2398 /* EUC 16-bit fixed-length representation applied to the first two bytes */
2401 } else if(value
<=0xffff) {
2403 } else if((value
&0x800000)==0) {
2406 } else if((value
&0x8000)==0) {
2414 /* must not occur */
2416 * To avoid compiler warnings that value & length may be
2417 * used without having been initialized, we set them here.
2418 * In reality, this is unreachable code.
2419 * Not having a default branch also causes warnings with
2427 /* is this code point assigned, or do we use fallbacks? */
2428 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
)!=0 ||
2429 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && (value
!=0 || c
==0)))
2432 * We allow a 0 byte output if the Unicode code point is
2433 * U+0000 and also if the "assigned" bit is set for this entry.
2434 * There is no way with this data structure for fallback output
2435 * for other than U+0000 to be a zero byte.
2437 /* callback(unassigned) */
2441 /* write the output character bytes from value and length */
2442 /* from the first if in the loop we know that targetCapacity>0 */
2443 if(length
<=targetCapacity
) {
2446 /* each branch falls through to the next one */
2448 *target
++=(uint8_t)(value
>>24);
2450 *target
++=(uint8_t)(value
>>16);
2452 *target
++=(uint8_t)(value
>>8);
2454 *target
++=(uint8_t)value
;
2456 /* will never occur */
2461 /* each branch falls through to the next one */
2463 *target
++=(uint8_t)(value
>>24);
2464 *offsets
++=sourceIndex
;
2466 *target
++=(uint8_t)(value
>>16);
2467 *offsets
++=sourceIndex
;
2469 *target
++=(uint8_t)(value
>>8);
2470 *offsets
++=sourceIndex
;
2472 *target
++=(uint8_t)value
;
2473 *offsets
++=sourceIndex
;
2475 /* will never occur */
2479 targetCapacity
-=length
;
2481 uint8_t *charErrorBuffer
;
2484 * We actually do this backwards here:
2485 * In order to save an intermediate variable, we output
2486 * first to the overflow buffer what does not fit into the
2489 /* we know that 1<=targetCapacity<length<=4 */
2490 length
-=targetCapacity
;
2491 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
2493 /* each branch falls through to the next one */
2495 *charErrorBuffer
++=(uint8_t)(value
>>16);
2497 *charErrorBuffer
++=(uint8_t)(value
>>8);
2499 *charErrorBuffer
=(uint8_t)value
;
2501 /* will never occur */
2504 cnv
->charErrorBufferLength
=(int8_t)length
;
2506 /* now output what fits into the regular target */
2507 value
>>=8*length
; /* length was reduced by targetCapacity */
2508 switch(targetCapacity
) {
2509 /* each branch falls through to the next one */
2511 *target
++=(uint8_t)(value
>>16);
2513 *offsets
++=sourceIndex
;
2516 *target
++=(uint8_t)(value
>>8);
2518 *offsets
++=sourceIndex
;
2521 *target
++=(uint8_t)value
;
2523 *offsets
++=sourceIndex
;
2526 /* will never occur */
2530 /* target overflow */
2532 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2537 /* normal end of conversion: prepare for a new character */
2540 prevSourceIndex
=sourceIndex
;
2541 sourceIndex
=nextSourceIndex
;
2546 * This is the same ugly trick as in ToUnicode(), for the
2550 reason
=UCNV_UNASSIGNED
;
2551 *pErrorCode
=U_INVALID_CHAR_FOUND
;
2553 /* call the callback function with all the preparations and post-processing */
2554 /* update the arguments structure */
2555 pArgs
->source
=source
;
2556 pArgs
->target
=(char *)target
;
2557 pArgs
->offsets
=offsets
;
2559 /* set the converter state in UConverter to deal with the next character */
2560 cnv
->fromUSurrogateLead
=0;
2562 * Do not save the prevLength SISO state because prevLength is set for
2563 * the character that is now not output because it is unassigned or it is
2564 * a fallback that is not taken.
2565 * The above branch for MBCS_OUTPUT_2_SISO has saved the previous state already.
2566 * See comments there.
2568 prevSourceIndex
=sourceIndex
;
2570 /* call the callback function */
2571 fromUCallback(cnv
, cnv
->fromUContext
, pArgs
, c
, reason
, pErrorCode
);
2573 /* get the converter state from UConverter */
2574 c
=cnv
->fromUSurrogateLead
;
2575 prevLength
=cnv
->fromUnicodeStatus
;
2577 /* update target and deal with offsets if necessary */
2578 offsets
=ucnv_updateCallbackOffsets(offsets
, ((uint8_t *)pArgs
->target
)-target
, sourceIndex
);
2579 target
=(uint8_t *)pArgs
->target
;
2581 /* update the source pointer and index */
2582 sourceIndex
=nextSourceIndex
+(pArgs
->source
-source
);
2583 source
=pArgs
->source
;
2584 targetCapacity
=(uint8_t *)pArgs
->targetLimit
-target
;
2587 * If the callback overflowed the target, then we need to
2588 * stop here with an overflow indication.
2590 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
2592 } else if(U_FAILURE(*pErrorCode
)) {
2593 /* break on error */
2596 } else if(cnv
->charErrorBufferLength
>0) {
2597 /* target is full */
2598 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2603 * We do not need to repeat the statements from the normal
2604 * end of the conversion because we already updated all the
2605 * necessary variables.
2608 /* target is full */
2609 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2614 if(pArgs
->flush
&& source
>=sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
2615 /* end of input stream */
2617 /* a Unicode code point remains incomplete (only a first surrogate) */
2618 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
2619 /* the following may change with Jitterbug 2449: would prepare for callback instead of resetting */
2622 } else if(outputType
==MBCS_OUTPUT_2_SISO
&& prevLength
==2) {
2623 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
2624 if(targetCapacity
>0) {
2625 *target
++=(uint8_t)UCNV_SI
;
2627 /* set the last source character's index (sourceIndex points at sourceLimit now) */
2628 *offsets
++=prevSourceIndex
;
2631 /* target is full */
2632 cnv
->charErrorBuffer
[0]=(char)UCNV_SI
;
2633 cnv
->charErrorBufferLength
=1;
2634 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2636 prevLength
=1; /* we switched into SBCS */
2639 /* reset the state for the next conversion */
2640 if(U_SUCCESS(*pErrorCode
)) {
2646 /* set the converter state back into UConverter */
2647 cnv
->fromUSurrogateLead
=(UChar
)c
;
2648 cnv
->fromUnicodeStatus
=prevLength
;
2650 /* write back the updated pointers */
2651 pArgs
->source
=source
;
2652 pArgs
->target
=(char *)target
;
2653 pArgs
->offsets
=offsets
;
2656 /* This version of _MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
2658 _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
2659 UErrorCode
*pErrorCode
) {
2661 const UChar
*source
, *sourceLimit
;
2663 int32_t targetCapacity
;
2666 const uint16_t *table
;
2667 const uint8_t *bytes
;
2671 int32_t sourceIndex
, nextSourceIndex
;
2673 UConverterCallbackReason reason
;
2674 uint32_t stage2Entry
;
2676 int32_t length
, prevLength
;
2677 uint8_t unicodeMask
;
2679 /* use optimized function if possible */
2680 cnv
=pArgs
->converter
;
2681 unicodeMask
=cnv
->sharedData
->table
->mbcs
.unicodeMask
;
2683 /* set up the local pointers */
2684 source
=pArgs
->source
;
2685 sourceLimit
=pArgs
->sourceLimit
;
2686 target
=(uint8_t *)pArgs
->target
;
2687 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
2688 offsets
=pArgs
->offsets
;
2690 table
=cnv
->sharedData
->table
->mbcs
.fromUnicodeTable
;
2691 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2692 bytes
=cnv
->sharedData
->table
->mbcs
.swapLFNLFromUnicodeBytes
;
2694 bytes
=cnv
->sharedData
->table
->mbcs
.fromUnicodeBytes
;
2697 /* get the converter state from UConverter */
2698 c
=cnv
->fromUSurrogateLead
;
2699 prevLength
=cnv
->fromUnicodeStatus
;
2701 /* sourceIndex=-1 if the current character began in the previous buffer */
2702 sourceIndex
= c
==0 ? 0 : -1;
2705 /* conversion loop */
2706 if(c
!=0 && targetCapacity
>0) {
2710 while(source
<sourceLimit
) {
2712 * The following test checks whether the available input would overflow the output.
2713 * It does not catch output of more than one byte that
2714 * overflows as a result of a multi-byte character or callback output
2715 * from the last source character.
2716 * Therefore, those situations also test for overflows and will
2717 * then break the loop, too.
2719 if(targetCapacity
>0) {
2721 * Get a correct Unicode code point:
2722 * a single UChar for a BMP code point or
2723 * a matched surrogate pair for a "supplementary code point".
2728 * This also tests if the codepage maps single surrogates.
2729 * If it does, then surrogates are not paired but mapped separately.
2730 * Note that in this case unmatched surrogates are not detected.
2732 if(UTF_IS_SURROGATE(c
) && !(unicodeMask
&UCNV_HAS_SURROGATES
)) {
2733 if(UTF_IS_SURROGATE_FIRST(c
)) {
2735 if(source
<sourceLimit
) {
2736 /* test the following code unit */
2737 UChar trail
=*source
;
2738 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2741 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
2742 if(!(unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
2743 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2744 /* callback(unassigned) */
2747 /* convert this supplementary code point */
2748 /* exit this condition tree */
2750 /* this is an unmatched lead code unit (1st surrogate) */
2751 /* callback(illegal) */
2752 reason
=UCNV_ILLEGAL
;
2753 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2761 /* this is an unmatched trail code unit (2nd surrogate) */
2762 /* callback(illegal) */
2763 reason
=UCNV_ILLEGAL
;
2764 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
2769 /* convert the Unicode code point in c into codepage bytes */
2770 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
2772 /* get the bytes and the length for the output */
2774 value
=MBCS_VALUE_2_FROM_STAGE_2(bytes
, stage2Entry
, c
);
2781 /* is this code point assigned, or do we use fallbacks? */
2782 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
2783 (UCNV_FROM_U_USE_FALLBACK(cnv
, c
) && (value
!=0 || c
==0)))
2786 * We allow a 0 byte output if the Unicode code point is
2787 * U+0000 and also if the "assigned" bit is set for this entry.
2788 * There is no way with this data structure for fallback output
2789 * for other than U+0000 to be a zero byte.
2791 /* callback(unassigned) */
2795 /* write the output character bytes from value and length */
2796 /* from the first if in the loop we know that targetCapacity>0 */
2798 /* this is easy because we know that there is enough space */
2799 *target
++=(uint8_t)value
;
2801 *offsets
++=sourceIndex
;
2804 } else /* length==2 */ {
2805 *target
++=(uint8_t)(value
>>8);
2806 if(2<=targetCapacity
) {
2807 *target
++=(uint8_t)value
;
2809 *offsets
++=sourceIndex
;
2810 *offsets
++=sourceIndex
;
2815 *offsets
++=sourceIndex
;
2817 cnv
->charErrorBuffer
[0]=(char)value
;
2818 cnv
->charErrorBufferLength
=1;
2820 /* target overflow */
2822 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2828 /* normal end of conversion: prepare for a new character */
2830 sourceIndex
=nextSourceIndex
;
2834 * This is the same ugly trick as in ToUnicode(), for the
2838 reason
=UCNV_UNASSIGNED
;
2839 *pErrorCode
=U_INVALID_CHAR_FOUND
;
2841 /* call the callback function with all the preparations and post-processing */
2842 /* update the arguments structure */
2843 pArgs
->source
=source
;
2844 pArgs
->target
=(char *)target
;
2845 pArgs
->offsets
=offsets
;
2847 /* set the converter state in UConverter to deal with the next character */
2848 cnv
->fromUSurrogateLead
=0;
2849 cnv
->fromUnicodeStatus
=prevLength
;
2851 /* call the callback function */
2852 fromUCallback(cnv
, cnv
->fromUContext
, pArgs
, c
, reason
, pErrorCode
);
2854 /* get the converter state from UConverter */
2855 c
=cnv
->fromUSurrogateLead
;
2856 prevLength
=cnv
->fromUnicodeStatus
;
2858 /* update target and deal with offsets if necessary */
2859 offsets
=ucnv_updateCallbackOffsets(offsets
, ((uint8_t *)pArgs
->target
)-target
, sourceIndex
);
2860 target
=(uint8_t *)pArgs
->target
;
2862 /* update the source pointer and index */
2863 sourceIndex
=nextSourceIndex
+(pArgs
->source
-source
);
2864 source
=pArgs
->source
;
2865 targetCapacity
=(uint8_t *)pArgs
->targetLimit
-target
;
2868 * If the callback overflowed the target, then we need to
2869 * stop here with an overflow indication.
2871 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
2873 } else if(U_FAILURE(*pErrorCode
)) {
2874 /* break on error */
2877 } else if(cnv
->charErrorBufferLength
>0) {
2878 /* target is full */
2879 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2884 * We do not need to repeat the statements from the normal
2885 * end of the conversion because we already updated all the
2886 * necessary variables.
2889 /* target is full */
2890 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
2895 if(pArgs
->flush
&& source
>=sourceLimit
) {
2896 /* reset the state for the next conversion */
2897 if(c
!=0 && U_SUCCESS(*pErrorCode
)) {
2898 /* a Unicode code point remains incomplete (only a first surrogate) */
2899 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
2901 cnv
->fromUSurrogateLead
=0;
2902 cnv
->fromUnicodeStatus
=1;
2904 /* set the converter state back into UConverter */
2905 cnv
->fromUSurrogateLead
=(UChar
)c
;
2906 cnv
->fromUnicodeStatus
=prevLength
;
2909 /* write back the updated pointers */
2910 pArgs
->source
=source
;
2911 pArgs
->target
=(char *)target
;
2912 pArgs
->offsets
=offsets
;
2915 /* This version of _MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
2917 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
2918 UErrorCode
*pErrorCode
) {
2920 const UChar
*source
, *sourceLimit
;
2922 int32_t targetCapacity
;
2925 const uint16_t *table
;
2926 const uint16_t *results
;
2930 int32_t sourceIndex
, nextSourceIndex
;
2932 UConverterCallbackReason reason
;
2933 uint16_t value
, minValue
;
2934 UBool hasSupplementary
;
2936 /* set up the local pointers */
2937 cnv
=pArgs
->converter
;
2938 source
=pArgs
->source
;
2939 sourceLimit
=pArgs
->sourceLimit
;
2940 target
=(uint8_t *)pArgs
->target
;
2941 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
2942 offsets
=pArgs
->offsets
;
2944 table
=cnv
->sharedData
->table
->mbcs
.fromUnicodeTable
;
2945 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
2946 results
=(uint16_t *)cnv
->sharedData
->table
->mbcs
.swapLFNLFromUnicodeBytes
;
2948 results
=(uint16_t *)cnv
->sharedData
->table
->mbcs
.fromUnicodeBytes
;
2951 if(cnv
->useFallback
) {
2952 /* use all roundtrip and fallback results */
2955 /* use only roundtrips and fallbacks from private-use characters */
2958 hasSupplementary
=(UBool
)(cnv
->sharedData
->table
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
);
2960 /* get the converter state from UConverter */
2961 c
=cnv
->fromUSurrogateLead
;
2963 /* sourceIndex=-1 if the current character began in the previous buffer */
2964 sourceIndex
= c
==0 ? 0 : -1;
2967 /* conversion loop */
2968 if(c
!=0 && targetCapacity
>0) {
2972 while(source
<sourceLimit
) {
2974 * The following test checks whether the available input would overflow the output.
2975 * It does not catch output of more than one byte that
2976 * overflows as a result of a multi-byte character or callback output
2977 * from the last source character.
2978 * Therefore, those situations also test for overflows and will
2979 * then break the loop, too.
2981 if(targetCapacity
>0) {
2983 * Get a correct Unicode code point:
2984 * a single UChar for a BMP code point or
2985 * a matched surrogate pair for a "supplementary code point".
2989 if(UTF_IS_SURROGATE(c
)) {
2990 if(UTF_IS_SURROGATE_FIRST(c
)) {
2992 if(source
<sourceLimit
) {
2993 /* test the following code unit */
2994 UChar trail
=*source
;
2995 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2998 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
2999 if(!hasSupplementary
) {
3000 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3001 /* callback(unassigned) */
3004 /* convert this supplementary code point */
3005 /* exit this condition tree */
3007 /* this is an unmatched lead code unit (1st surrogate) */
3008 /* callback(illegal) */
3009 reason
=UCNV_ILLEGAL
;
3010 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3018 /* this is an unmatched trail code unit (2nd surrogate) */
3019 /* callback(illegal) */
3020 reason
=UCNV_ILLEGAL
;
3021 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3026 /* convert the Unicode code point in c into codepage bytes */
3027 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3029 /* is this code point assigned, or do we use fallbacks? */
3030 if(value
>=minValue
) {
3031 /* assigned, write the output character bytes from value and length */
3033 /* this is easy because we know that there is enough space */
3034 *target
++=(uint8_t)value
;
3036 *offsets
++=sourceIndex
;
3040 /* normal end of conversion: prepare for a new character */
3042 sourceIndex
=nextSourceIndex
;
3044 } else { /* unassigned */
3046 * We allow a 0 byte output if the Unicode code point is
3047 * U+0000 and also if the "assigned" bit is set for this entry.
3048 * There is no way with this data structure for fallback output
3049 * for other than U+0000 to be a zero byte.
3051 /* callback(unassigned) */
3054 reason
=UCNV_UNASSIGNED
;
3055 *pErrorCode
=U_INVALID_CHAR_FOUND
;
3057 /* call the callback function with all the preparations and post-processing */
3058 /* update the arguments structure */
3059 pArgs
->source
=source
;
3060 pArgs
->target
=(char *)target
;
3061 pArgs
->offsets
=offsets
;
3063 /* set the converter state in UConverter to deal with the next character */
3064 cnv
->fromUSurrogateLead
=0;
3066 /* call the callback function */
3067 fromUCallback(cnv
, cnv
->fromUContext
, pArgs
, c
, reason
, pErrorCode
);
3069 /* get the converter state from UConverter */
3070 c
=cnv
->fromUSurrogateLead
;
3072 /* update target and deal with offsets if necessary */
3073 offsets
=ucnv_updateCallbackOffsets(offsets
, ((uint8_t *)pArgs
->target
)-target
, sourceIndex
);
3074 target
=(uint8_t *)pArgs
->target
;
3076 /* update the source pointer and index */
3077 sourceIndex
=nextSourceIndex
+(pArgs
->source
-source
);
3078 source
=pArgs
->source
;
3079 targetCapacity
=(uint8_t *)pArgs
->targetLimit
-target
;
3082 * If the callback overflowed the target, then we need to
3083 * stop here with an overflow indication.
3085 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
3087 } else if(U_FAILURE(*pErrorCode
)) {
3088 /* break on error */
3091 } else if(cnv
->charErrorBufferLength
>0) {
3092 /* target is full */
3093 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3098 * We do not need to repeat the statements from the normal
3099 * end of the conversion because we already updated all the
3100 * necessary variables.
3103 /* target is full */
3104 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3109 if(pArgs
->flush
&& source
>=sourceLimit
) {
3110 /* reset the state for the next conversion */
3111 if(c
!=0 && U_SUCCESS(*pErrorCode
)) {
3112 /* a Unicode code point remains incomplete (only a first surrogate) */
3113 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
3115 cnv
->fromUSurrogateLead
=0;
3117 /* set the converter state back into UConverter */
3118 cnv
->fromUSurrogateLead
=(UChar
)c
;
3121 /* write back the updated pointers */
3122 pArgs
->source
=source
;
3123 pArgs
->target
=(char *)target
;
3124 pArgs
->offsets
=offsets
;
3128 * This version of _MBCSFromUnicode() is optimized for single-byte codepages
3129 * that map only to and from the BMP.
3130 * In addition to single-byte/state optimizations, the offset calculations
3131 * become much easier.
3134 _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
3135 UErrorCode
*pErrorCode
) {
3137 const UChar
*source
, *sourceLimit
, *lastSource
;
3139 int32_t targetCapacity
, length
;
3142 const uint16_t *table
;
3143 const uint16_t *results
;
3147 int32_t sourceIndex
;
3149 UConverterCallbackReason reason
;
3150 uint16_t value
, minValue
;
3152 /* set up the local pointers */
3153 cnv
=pArgs
->converter
;
3154 source
=pArgs
->source
;
3155 sourceLimit
=pArgs
->sourceLimit
;
3156 target
=(uint8_t *)pArgs
->target
;
3157 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
3158 offsets
=pArgs
->offsets
;
3160 table
=cnv
->sharedData
->table
->mbcs
.fromUnicodeTable
;
3161 if((cnv
->options
&UCNV_OPTION_SWAP_LFNL
)!=0) {
3162 results
=(uint16_t *)cnv
->sharedData
->table
->mbcs
.swapLFNLFromUnicodeBytes
;
3164 results
=(uint16_t *)cnv
->sharedData
->table
->mbcs
.fromUnicodeBytes
;
3167 if(cnv
->useFallback
) {
3168 /* use all roundtrip and fallback results */
3171 /* use only roundtrips and fallbacks from private-use characters */
3175 /* get the converter state from UConverter */
3176 c
=cnv
->fromUSurrogateLead
;
3178 /* sourceIndex=-1 if the current character began in the previous buffer */
3179 sourceIndex
= c
==0 ? 0 : -1;
3183 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3184 * for the minimum of the sourceLength and targetCapacity
3186 length
=sourceLimit
-source
;
3187 if(length
<targetCapacity
) {
3188 targetCapacity
=length
;
3191 /* conversion loop */
3192 if(c
!=0 && targetCapacity
>0) {
3196 #if MBCS_UNROLL_SINGLE_FROM_BMP
3197 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3198 /* unroll the loop with the most common case */
3200 if(targetCapacity
>=4) {
3201 int32_t count
, loops
;
3202 uint16_t andedValues
;
3204 loops
=count
=targetCapacity
>>2;
3207 andedValues
=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3208 *target
++=(uint8_t)value
;
3210 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3211 *target
++=(uint8_t)value
;
3213 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3214 *target
++=(uint8_t)value
;
3216 andedValues
&=value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3217 *target
++=(uint8_t)value
;
3219 /* were all 4 entries really valid? */
3220 if(andedValues
<minValue
) {
3221 /* no, return to the first of these 4 */
3228 targetCapacity
-=4*count
;
3231 lastSource
+=4*count
;
3233 *offsets
++=sourceIndex
++;
3234 *offsets
++=sourceIndex
++;
3235 *offsets
++=sourceIndex
++;
3236 *offsets
++=sourceIndex
++;
3245 while(targetCapacity
>0) {
3247 * Get a correct Unicode code point:
3248 * a single UChar for a BMP code point or
3249 * a matched surrogate pair for a "supplementary code point".
3253 * Do not immediately check for single surrogates:
3254 * Assume that they are unassigned and check for them in that case.
3255 * This speeds up the conversion of assigned characters.
3257 /* convert the Unicode code point in c into codepage bytes */
3258 value
=MBCS_SINGLE_RESULT_FROM_U(table
, results
, c
);
3260 /* is this code point assigned, or do we use fallbacks? */
3261 if(value
>=minValue
) {
3262 /* assigned, write the output character bytes from value and length */
3264 /* this is easy because we know that there is enough space */
3265 *target
++=(uint8_t)value
;
3268 /* normal end of conversion: prepare for a new character */
3271 } else if(!UTF_IS_SURROGATE(c
)) {
3272 /* normal, unassigned BMP character */
3274 * We allow a 0 byte output if the Unicode code point is
3275 * U+0000 and also if the "assigned" bit is set for this entry.
3276 * There is no way with this data structure for fallback output
3277 * for other than U+0000 to be a zero byte.
3279 /* callback(unassigned) */
3280 reason
=UCNV_UNASSIGNED
;
3281 *pErrorCode
=U_INVALID_CHAR_FOUND
;
3282 } else if(UTF_IS_SURROGATE_FIRST(c
)) {
3284 if(source
<sourceLimit
) {
3285 /* test the following code unit */
3286 UChar trail
=*source
;
3287 if(UTF_IS_SECOND_SURROGATE(trail
)) {
3289 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
3290 /* this codepage does not map supplementary code points */
3291 /* callback(unassigned) */
3292 reason
=UCNV_UNASSIGNED
;
3293 *pErrorCode
=U_INVALID_CHAR_FOUND
;
3295 /* this is an unmatched lead code unit (1st surrogate) */
3296 /* callback(illegal) */
3297 reason
=UCNV_ILLEGAL
;
3298 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3305 /* this is an unmatched trail code unit (2nd surrogate) */
3306 /* callback(illegal) */
3307 reason
=UCNV_ILLEGAL
;
3308 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
3311 /* call the callback function with all the preparations and post-processing */
3312 /* get the number of code units for c to correctly advance sourceIndex after the callback call */
3313 length
=UTF_CHAR_LENGTH(c
);
3315 /* set offsets since the start or the last callback */
3317 int32_t count
=(int32_t)(source
-lastSource
);
3319 /* do not set the offset for the callback-causing character */
3323 *offsets
++=sourceIndex
++;
3326 /* offset and sourceIndex are now set for the current character */
3329 /* update the arguments structure */
3330 pArgs
->source
=source
;
3331 pArgs
->target
=(char *)target
;
3332 pArgs
->offsets
=offsets
;
3334 /* set the converter state in UConverter to deal with the next character */
3335 cnv
->fromUSurrogateLead
=0;
3337 /* call the callback function */
3338 fromUCallback(cnv
, cnv
->fromUContext
, pArgs
, c
, reason
, pErrorCode
);
3340 /* get the converter state from UConverter */
3341 c
=cnv
->fromUSurrogateLead
;
3343 /* update target and deal with offsets if necessary */
3344 offsets
=ucnv_updateCallbackOffsets(offsets
, ((uint8_t *)pArgs
->target
)-target
, sourceIndex
);
3345 target
=(uint8_t *)pArgs
->target
;
3347 /* update the source pointer and index */
3348 sourceIndex
+=length
+(pArgs
->source
-source
);
3349 source
=lastSource
=pArgs
->source
;
3350 targetCapacity
=(uint8_t *)pArgs
->targetLimit
-target
;
3351 length
=sourceLimit
-source
;
3352 if(length
<targetCapacity
) {
3353 targetCapacity
=length
;
3357 * If the callback overflowed the target, then we need to
3358 * stop here with an overflow indication.
3360 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
3362 } else if(U_FAILURE(*pErrorCode
)) {
3363 /* break on error */
3366 } else if(cnv
->charErrorBufferLength
>0) {
3367 /* target is full */
3368 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3372 #if MBCS_UNROLL_SINGLE_FROM_BMP
3373 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3378 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
&& target
>=(uint8_t *)pArgs
->targetLimit
) {
3379 /* target is full */
3380 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
3383 /* set offsets since the start or the last callback */
3385 size_t count
=source
-lastSource
;
3387 *offsets
++=sourceIndex
++;
3392 if(pArgs
->flush
&& source
>=sourceLimit
) {
3393 /* reset the state for the next conversion */
3394 if(c
!=0 && U_SUCCESS(*pErrorCode
)) {
3395 /* a Unicode code point remains incomplete (only a first surrogate) */
3396 *pErrorCode
=U_TRUNCATED_CHAR_FOUND
;
3398 cnv
->fromUSurrogateLead
=0;
3400 /* set the converter state back into UConverter */
3401 cnv
->fromUSurrogateLead
=(UChar
)c
;
3404 /* write back the updated pointers */
3405 pArgs
->source
=source
;
3406 pArgs
->target
=(char *)target
;
3407 pArgs
->offsets
=offsets
;
3411 * This is another simple conversion function for internal use by other
3412 * conversion implementations.
3413 * It does not use the converter state nor call callbacks.
3414 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3416 * It converts one single Unicode code point into codepage bytes, encoded
3417 * as one 32-bit value. The function returns the number of bytes in *pValue:
3418 * 1..4 the number of bytes in *pValue
3419 * 0 unassigned (*pValue undefined)
3420 * -1 illegal (currently not used, *pValue undefined)
3422 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
3423 * the second to last byte in bits 15..8, etc.
3424 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
3427 _MBCSFromUChar32(UConverterSharedData
*sharedData
,
3428 UChar32 c
, uint32_t *pValue
,
3429 UBool useFallback
) {
3430 const uint16_t *table
=sharedData
->table
->mbcs
.fromUnicodeTable
;
3432 uint32_t stage2Entry
;
3436 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3437 if(c
>=0x10000 && !(sharedData
->table
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
3441 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3442 if(sharedData
->table
->mbcs
.outputType
==MBCS_OUTPUT_1
) {
3443 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->table
->mbcs
.fromUnicodeBytes
, c
);
3444 /* is this code point assigned, or do we use fallbacks? */
3445 if(useFallback
? value
>=0x800 : value
>=0xc00) {
3453 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
3455 /* get the bytes and the length for the output */
3456 switch(sharedData
->table
->mbcs
.outputType
) {
3458 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->table
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
3466 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->table
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
3467 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
3470 } else if(value
<=0xffff) {
3477 value
=MBCS_VALUE_4_FROM_STAGE_2(sharedData
->table
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
3480 } else if(value
<=0xffff) {
3482 } else if(value
<=0xffffff) {
3488 case MBCS_OUTPUT_3_EUC
:
3489 value
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->table
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
3490 /* EUC 16-bit fixed-length representation */
3493 } else if((value
&0x8000)==0) {
3496 } else if((value
&0x80)==0) {
3503 case MBCS_OUTPUT_4_EUC
:
3504 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->table
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
3505 value
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
3506 /* EUC 16-bit fixed-length representation applied to the first two bytes */
3509 } else if(value
<=0xffff) {
3511 } else if((value
&0x800000)==0) {
3514 } else if((value
&0x8000)==0) {
3522 /* must not occur */
3526 /* is this code point assigned, or do we use fallbacks? */
3527 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry
, c
) ||
3528 (FROM_U_USE_FALLBACK(useFallback
, c
) && (value
!=0 || c
==0))
3531 * We allow a 0 byte output if the Unicode code point is
3532 * U+0000 and also if the "assigned" bit is set for this entry.
3533 * There is no way with this data structure for fallback output
3534 * for other than U+0000 to be a zero byte.
/*
 * ################################################################
 * #
 * # _MBCSSingleFromUChar32() has been moved to ucnv2022.c for inlining.
 * # The implementation below remains here for documentation only.
 * #
 * ################################################################
 */

#if 0 /* NOTE(review): documentation-only copy per the banner above - confirm this guard matches upstream */

/**
 * Variant of _MBCSFromUChar32() specialized for single-byte codepages.
 * The EBCDIC swaplfnl option (set in UConverter) is not handled.
 *
 * @return the codepage byte for code point c, or -1 if c is unassigned
 */
U_CFUNC int32_t
_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    const uint16_t *table;
    int32_t value, minimum;

    /* a BMP-only codepage is stored without stage 1 entries for supplementary code points */
    if(c>=0x10000 && (sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)==0) {
        return -1;
    }

    /* look up the result word for c (same lookup as in _MBCSFromUnicodeWithOffsets) */
    table=sharedData->table->mbcs.fromUnicodeTable;
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->table->mbcs.fromUnicodeBytes, c);

    /* roundtrip results start at 0xc00; with fallbacks enabled, 0x800 is enough */
    minimum= useFallback ? 0x800 : 0xc00;

    /* the low 8 bits of an accepted result are the output byte */
    return value>=minimum ? (int32_t)(value&0xff) : -1;
}

#endif
3587 /* miscellaneous ------------------------------------------------------------ */
3590 _MBCSGetStarters(const UConverter
* cnv
,
3591 UBool starters
[256],
3592 UErrorCode
*pErrorCode
) {
3593 const int32_t *state0
=cnv
->sharedData
->table
->mbcs
.stateTable
[0];
3596 for(i
=0; i
<256; ++i
) {
3597 /* all bytes that cause a state transition from state 0 are lead bytes */
3598 starters
[i
]= (UBool
)MBCS_ENTRY_IS_TRANSITION(state0
[i
]);
3603 * This is an internal function that allows other converter implementations
3604 * to check whether a byte is a lead byte.
3607 _MBCSIsLeadByte(UConverterSharedData
*sharedData
, char byte
) {
3608 return (UBool
)MBCS_ENTRY_IS_TRANSITION(sharedData
->table
->mbcs
.stateTable
[0][(uint8_t)byte
]);
3612 _MBCSWriteSub(UConverterFromUnicodeArgs
*pArgs
,
3613 int32_t offsetIndex
,
3614 UErrorCode
*pErrorCode
) {
3615 UConverter
*cnv
=pArgs
->converter
;
3620 /* first, select between subChar and subChar1 */
3621 if(cnv
->subChar1
!=0 && cnv
->invalidUCharBuffer
[0]<=0xff) {
3622 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
3623 subchar
=(char *)&cnv
->subChar1
;
3626 /* select subChar in all other cases */
3627 subchar
=(char *)cnv
->subChar
;
3628 length
=cnv
->subCharLen
;
3631 switch(cnv
->sharedData
->table
->mbcs
.outputType
) {
3632 case MBCS_OUTPUT_2_SISO
:
3635 /* fromUnicodeStatus contains prevLength */
3638 if(cnv
->fromUnicodeStatus
==2) {
3639 /* DBCS mode and SBCS sub char: change to SBCS */
3640 cnv
->fromUnicodeStatus
=1;
3646 if(cnv
->fromUnicodeStatus
==1) {
3647 /* SBCS mode and DBCS sub char: change to DBCS */
3648 cnv
->fromUnicodeStatus
=2;
3655 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3658 ucnv_cbFromUWriteBytes(pArgs
,
3659 buffer
, (int32_t)(p
-buffer
),
3660 offsetIndex
, pErrorCode
);
3663 ucnv_cbFromUWriteBytes(pArgs
,
3665 offsetIndex
, pErrorCode
);
3670 U_CFUNC UConverterType
3671 _MBCSGetType(const UConverter
* converter
) {
3672 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
3673 if(converter
->sharedData
->table
->mbcs
.countStates
==1) {
3674 return (UConverterType
)UCNV_SBCS
;
3675 } else if((converter
->sharedData
->table
->mbcs
.outputType
&0xff)==MBCS_OUTPUT_2_SISO
) {
3676 return (UConverterType
)UCNV_EBCDIC_STATEFUL
;
3677 } else if(converter
->sharedData
->staticData
->minBytesPerChar
==2 && converter
->sharedData
->staticData
->maxBytesPerChar
==2) {
3678 return (UConverterType
)UCNV_DBCS
;
3680 return (UConverterType
)UCNV_MBCS
;
3683 static const UConverterImpl _MBCSImpl
={
3693 _MBCSToUnicodeWithOffsets
,
3694 _MBCSToUnicodeWithOffsets
,
3695 _MBCSFromUnicodeWithOffsets
,
3696 _MBCSFromUnicodeWithOffsets
,
3707 /* Static data is in tools/makeconv/ucnvstat.c for data-based
3708 * converters. Be sure to update it as well.
3711 const UConverterSharedData _MBCSData
={
3712 sizeof(UConverterSharedData
), 1,
3713 NULL
, NULL
, NULL
, FALSE
, &_MBCSImpl
,
3717 /* GB 18030 special handling ------------------------------------------------ */
3719 /* definition of LINEAR macros and gb18030Ranges see near the beginning of the file */
3721 /* the callback functions handle GB 18030 specially */
3723 fromUCallback(UConverter
*cnv
,
3724 const void *context
, UConverterFromUnicodeArgs
*pArgs
,
3726 UConverterCallbackReason reason
, UErrorCode
*pErrorCode
) {
3729 if((cnv
->options
&_MBCS_OPTION_GB18030
)!=0 && reason
==UCNV_UNASSIGNED
) {
3730 const uint32_t *range
;
3732 range
=gb18030Ranges
[0];
3733 for(i
=0; i
<sizeof(gb18030Ranges
)/sizeof(gb18030Ranges
[0]); range
+=4, ++i
) {
3734 if(range
[0]<=(uint32_t)codePoint
&& (uint32_t)codePoint
<=range
[1]) {
3738 /* found the Unicode code point, output the four-byte sequence for it */
3739 *pErrorCode
=U_ZERO_ERROR
;
3741 /* get the linear value of the first GB 18030 code in this range */
3742 linear
=range
[2]-LINEAR_18030_BASE
;
3744 /* add the offset from the beginning of the range */
3745 linear
+=((uint32_t)codePoint
-range
[0]);
3747 /* turn this into a four-byte sequence */
3748 bytes
[3]=(char)(0x30+linear%10
); linear
/=10;
3749 bytes
[2]=(char)(0x81+linear%126
); linear
/=126;
3750 bytes
[1]=(char)(0x30+linear%10
); linear
/=10;
3751 bytes
[0]=(char)(0x81+linear
);
3753 /* output this sequence */
3754 ucnv_cbFromUWriteBytes(pArgs
, bytes
, 4, 0, pErrorCode
);
3760 /* write the code point as code units */
3762 UTF_APPEND_CHAR_UNSAFE(cnv
->invalidUCharBuffer
, i
, codePoint
);
3763 cnv
->invalidUCharLength
=(int8_t)i
;
3765 /* call the normal callback function */
3766 cnv
->fromUCharErrorBehaviour(context
, pArgs
, cnv
->invalidUCharBuffer
, i
, codePoint
, reason
, pErrorCode
);
3770 toUCallback(UConverter
*cnv
,
3771 const void *context
, UConverterToUnicodeArgs
*pArgs
,
3772 const char *codeUnits
, int32_t length
,
3773 UConverterCallbackReason reason
, UErrorCode
*pErrorCode
) {
3776 if((cnv
->options
&_MBCS_OPTION_GB18030
)!=0 && reason
==UCNV_UNASSIGNED
&& length
==4) {
3777 const uint32_t *range
;
3780 linear
=LINEAR_18030((uint8_t)codeUnits
[0], (uint8_t)codeUnits
[1], (uint8_t)codeUnits
[2], (uint8_t)codeUnits
[3]);
3781 range
=gb18030Ranges
[0];
3782 for(i
=0; i
<sizeof(gb18030Ranges
)/sizeof(gb18030Ranges
[0]); range
+=4, ++i
) {
3783 if(range
[2]<=linear
&& linear
<=range
[3]) {
3784 UChar u
[UTF_MAX_CHAR_LENGTH
];
3786 /* found the sequence, output the Unicode code point for it */
3787 *pErrorCode
=U_ZERO_ERROR
;
3789 /* add the linear difference between the input and start sequences to the start code point */
3790 linear
=range
[0]+(linear
-range
[2]);
3792 /* write the result as UChars and output */
3794 UTF_APPEND_CHAR_UNSAFE(u
, i
, linear
);
3795 ucnv_cbToUWriteUChars(pArgs
, u
, i
, 0, pErrorCode
);
3801 /* copy the current bytes to invalidCharBuffer */
3802 for(i
=0; i
<length
; ++i
) {
3803 cnv
->invalidCharBuffer
[i
]=codeUnits
[i
];
3805 cnv
->invalidCharLength
=(int8_t)length
;
3807 /* call the normal callback function */
3808 cnv
->fromCharErrorBehaviour(context
, pArgs
, codeUnits
, length
, reason
, pErrorCode
);
3811 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */