]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnvmbcs.c
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / common / ucnvmbcs.c
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
3*
46f4442e 4* Copyright (C) 2000-2008, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
8* file name: ucnvmbcs.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2000jul03
14* created by: Markus W. Scherer
15*
16* The current code in this file replaces the previous implementation
17* of conversion code from multi-byte codepages to Unicode and back.
18* This implementation supports the following:
19* - legacy variable-length codepages with up to 4 bytes per character
20* - all Unicode code points (up to 0x10ffff)
21* - efficient distinction of unassigned vs. illegal byte sequences
22* - it is possible in fromUnicode() to directly deal with simple
23* stateful encodings (used for EBCDIC_STATEFUL)
374ca955 24* - it is possible to convert Unicode code points
b75a7d8f
A
25* to a single zero byte (but not as a fallback except for SBCS)
26*
27* Remaining limitations in fromUnicode:
28* - byte sequences must not have leading zero bytes
29* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30* - limitation to up to 4 bytes per character
31*
374ca955
A
32* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
33* limitations and adds m:n character mappings and other features.
34* See ucnv_ext.h for details.
35*
b75a7d8f
A
36* Change history:
37*
38* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
39* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
40* macros to ucnvmbcs.h file
41*/
42
43#include "unicode/utypes.h"
44
374ca955 45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
46
47#include "unicode/ucnv.h"
48#include "unicode/ucnv_cb.h"
49#include "unicode/udata.h"
50#include "unicode/uset.h"
51#include "ucnv_bld.h"
52#include "ucnvmbcs.h"
374ca955 53#include "ucnv_ext.h"
b75a7d8f
A
54#include "ucnv_cnv.h"
55#include "umutex.h"
56#include "cmemory.h"
57#include "cstring.h"
58
59/* control optimizations according to the platform */
60#define MBCS_UNROLL_SINGLE_TO_BMP 1
61#define MBCS_UNROLL_SINGLE_FROM_BMP 0
62
63/*
46f4442e 64 * _MBCSHeader versions 5.3 & 4.3
b75a7d8f
A
65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
66 *
46f4442e
A
67 * This version is optional. Version 5 is used for incompatible data format changes.
68 * makeconv will continue to generate version 4 files if possible.
69 *
70 * Changes from version 4:
71 *
72 * The main difference is an additional _MBCSHeader field with
73 * - the length (number of uint32_t) of the _MBCSHeader
74 * - flags for further incompatible data format changes
75 * - flags for further, backward compatible data format changes
76 *
77 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
78 * the file and needs to be reconstituted at load time.
79 * This requires a utf8Friendly format with an additional mbcsIndex table for fast
80 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
81 * (For details about these structures see below, and see ucnvmbcs.h.)
82 *
83 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
84 * of the Unicode code points. (This requires that the .ucm file has the |0 etc.
85 * precision markers for all mappings.)
86 *
87 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
88 * omitted data that can be reconstituted from the toUnicode data.
89 *
90 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
91 * With only roundtrip mappings in the base fromUnicode data, this part is fully
92 * redundant with the mbcsIndex and will be reconstituted from that (also using the
93 * stage 1 table which contains the information about how stage 2 was compacted).
94 *
95 * The rest of the stage 2 table, the part for code points above maxFastUChar,
96 * is stored in the file and will be appended to the reconstituted part.
97 *
98 * The entire fromUBytes array is omitted from the file and will be reconstituted.
99 * This is done by enumerating all toUnicode roundtrip mappings, performing
100 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
101 * writing instead of reading the byte values.
102 *
103 * _MBCSHeader version 4.3
104 *
105 * Change from version 4.2:
106 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
107 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
108 * files which can be used instead of stages 1 & 2.
109 * Faster lookups for roundtrips from most commonly used characters,
110 * and lookups from UTF-8 byte sequences with a natural bit distribution.
111 * See ucnvmbcs.h for more details.
112 *
374ca955
A
113 * Change from version 4.1:
114 * - Added an optional extension table structure at the end of the .cnv file.
115 * It is present if the upper bits of the header flags field contains a non-zero
116 * byte offset to it.
117 * Files that contain only a conversion table and no base table
118 * use the special outputType MBCS_OUTPUT_EXT_ONLY.
119 * These contain the base table name between the MBCS header and the extension
120 * data.
121 *
b75a7d8f
A
122 * Change from version 4.0:
123 * - Replace header.reserved with header.fromUBytesLength so that all
124 * fields in the data have length.
125 *
126 * Changes from version 3 (for performance improvements):
127 * - new bit distribution for state table entries
128 * - reordered action codes
129 * - new data structure for single-byte fromUnicode
130 * + stage 2 only contains indexes
131 * + stage 3 stores 16 bits per character with classification bits 15..8
132 * - no multiplier for stage 1 entries
133 * - stage 2 for non-single-byte codepages contains the index and the flags in
134 * one 32-bit value
135 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
136 *
137 * For more details about old versions of the MBCS data structure, see
138 * the corresponding versions of this file.
139 *
140 * Converting stateless codepage data ---------------------------------------***
141 * (or codepage data with simple states) to Unicode.
142 *
143 * Data structure and algorithm for converting from complex legacy codepages
144 * to Unicode. (Designed before 2000-may-22.)
145 *
146 * The basic idea is that the structure of legacy codepages can be described
147 * with state tables.
148 * When reading a byte stream, each input byte causes a state transition.
149 * Some transitions result in the output of a code point, some result in
150 * "unassigned" or "illegal" output.
151 * This is used here for character conversion.
152 *
153 * The data structure begins with a state table consisting of a row
154 * per state, with 256 entries (columns) per row for each possible input
155 * byte value.
156 * Each entry is 32 bits wide, with two formats distinguished by
157 * the sign bit (bit 31):
158 *
159 * One format for transitional entries (bit 31 not set) for non-final bytes, and
160 * one format for final entries (bit 31 set).
161 * Both formats contain the number of the next state in the same bit
162 * positions.
163 * State 0 is the initial state.
164 *
165 * Most of the time, the offset values of subsequent states are added
166 * up to a scalar value. This value will eventually be the index of
167 * the Unicode code point in a table that follows the state table.
168 * The effect is that the code points for final state table rows
169 * are contiguous. The code points of final state rows follow each other
170 * in the order of the references to those final states by previous
171 * states, etc.
172 *
173 * For some terminal states, the offset is itself the output Unicode
174 * code point (16 bits for a BMP code point or 20 bits for a supplementary
175 * code point (stored as code point minus 0x10000 so that 20 bits are enough)).
176 * For others, the code point in the Unicode table is stored with either
177 * one or two code units: one for BMP code points, two for a pair of
178 * surrogates.
179 * All code points for a final state entry take up the same number of code
180 * units, regardless of whether they all actually _use_ the same number
181 * of code units. This is necessary for simple array access.
182 *
183 * An additional feature comes in with what in ICU is called "fallback"
184 * mappings:
185 *
186 * In addition to round-trippable, precise, 1:1 mappings, there are often
187 * mappings defined between similar, though not the same, characters.
188 * Typically, such mappings occur only in fromUnicode mapping tables because
189 * Unicode has a superset repertoire of most other codepages. However, it
190 * is possible to provide such mappings in the toUnicode tables, too.
191 * In this case, the fallback mappings are partly integrated into the
192 * general state tables because the structure of the encoding includes their
193 * byte sequences.
194 * For final entries in an initial state, fallback mappings are stored in
195 * the entry itself like with roundtrip mappings.
196 * For other final entries, they are stored in the code units table if
197 * the entry is for a pair of code units.
198 * For single-unit results in the code units table, there is no space to
199 * alternatively hold a fallback mapping; in this case, the code unit
200 * is stored as U+fffe (unassigned), and the fallback mapping needs to
201 * be looked up by the scalar offset value in a separate table.
202 *
203 * "Unassigned" state entries really mean "structurally unassigned",
204 * i.e., such a byte sequence will never have a mapping result.
205 *
206 * The interpretation of the bits in each entry is as follows:
207 *
208 * Bit 31 not set, not a terminal entry ("transitional"):
209 * 30..24 next state
210 * 23..0 offset delta, to be added up
211 *
212 * Bit 31 set, terminal ("final") entry:
213 * 30..24 next state (regardless of action code)
214 * 23..20 action code:
215 * action codes 0 and 1 result in precise-mapping Unicode code points
216 * 0 valid byte sequence
217 * 19..16 not used, 0
218 * 15..0 16-bit Unicode BMP code point
219 * never U+fffe or U+ffff
220 * 1 valid byte sequence
221 * 19..0 20-bit Unicode supplementary code point
222 * never U+fffe or U+ffff
223 *
224 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
225 * 2 valid byte sequence (fallback)
226 * 19..16 not used, 0
227 * 15..0 16-bit Unicode BMP code point as fallback result
228 * 3 valid byte sequence (fallback)
229 * 19..0 20-bit Unicode supplementary code point as fallback result
230 *
231 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
232 * depending on the code units they result in
233 * 4 valid byte sequence
234 * 19..9 not used, 0
235 * 8..0 final offset delta
236 * pointing to one 16-bit code unit which may be
237 * fffe unassigned -- look for a fallback for this offset
238 * ffff illegal
239 * 5 valid byte sequence
240 * 19..9 not used, 0
241 * 8..0 final offset delta
242 * pointing to two 16-bit code units
243 * (typically UTF-16 surrogates)
244 * the result depends on the first code unit as follows:
245 * 0000..d7ff roundtrip BMP code point (1st alone)
246 * d800..dbff roundtrip surrogate pair (1st, 2nd)
247 * dc00..dfff fallback surrogate pair (1st-400, 2nd)
248 * e000 roundtrip BMP code point (2nd alone)
249 * e001 fallback BMP code point (2nd alone)
250 * fffe unassigned
251 * ffff illegal
252 * (the final offset deltas are at most 255 * 2,
253 * times 2 because of storing code unit pairs)
254 *
255 * 6 unassigned byte sequence
256 * 19..16 not used, 0
257 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
258 * this does not contain a final offset delta because the main
259 * purpose of this action code is to save scalar offset values;
260 * therefore, fallback values cannot be assigned to byte
261 * sequences that result in this action code
262 * 7 illegal byte sequence
263 * 19..16 not used, 0
264 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
265 * 8 state change only
266 * 19..0 not used, 0
267 * useful for state changes in simple stateful encodings,
268 * at Shift-In/Shift-Out codes
269 *
270 *
271 * 9..15 reserved for future use
272 * current implementations will only perform a state change
273 * and ignore bits 19..0
274 *
275 * An encoding with contiguous ranges of unassigned byte sequences, like
276 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
277 * at least two states for the trail bytes:
278 * One trail byte state that results in code points, and one that only
279 * has "unassigned" and "illegal" terminal states.
280 *
46f4442e 281 * Note: partly by accident, this data structure supports simple stateful
b75a7d8f
A
282 * encodings without any additional logic.
283 * Currently, only simple Shift-In/Shift-Out schemes are handled with
284 * appropriate state tables (especially EBCDIC_STATEFUL!).
285 *
286 * MBCS version 2 added:
287 * unassigned and illegal action codes have U+fffe and U+ffff
288 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
289 *
290 * Converting from Unicode to codepage bytes --------------------------------***
291 *
292 * The conversion data structure for fromUnicode is designed for the known
293 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
294 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
295 * a roundtrip mapping.
296 *
297 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
298 * like in the character properties table.
299 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
300 * with the resulting bytes is at offsetFromUBytes.
301 *
302 * Beginning with version 4, single-byte codepages have a significantly different
303 * trie compared to other codepages.
304 * In all cases, the entry in stage 1 is directly the index of the block of
305 * 64 entries in stage 2.
306 *
307 * Single-byte lookup:
308 *
309 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
310 * Stage 3 contains one 16-bit word per result:
311 * Bits 15..8 indicate the kind of result:
312 * f roundtrip result
313 * c fallback result from private-use code point
314 * 8 fallback result from other code points
315 * 0 unassigned
316 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
317 *
46f4442e
A
318 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
319 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
320 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
321 * ASCII code points can be looked up with a linear array access into stage 3.
322 * See maxFastUChar and other details in ucnvmbcs.h.
323 *
b75a7d8f
A
324 * Multi-byte lookup:
325 *
326 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
327 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
328 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
329 * If this test is false, then a non-zero result will be interpreted as
330 * a fallback mapping.
331 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
332 *
333 * Stage 3 contains 2, 3, or 4 bytes per result.
334 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
335 * while 3 bytes are stored as bytes in big-endian order.
336 * Leading zero bytes are ignored, and the number of bytes is counted.
337 * A zero byte mapping result is possible as a roundtrip result.
338 * For some output types, the actual result is processed from this;
374ca955 339 * see ucnv_MBCSFromUnicodeWithOffsets().
b75a7d8f
A
340 *
341 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
342 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
343 *
46f4442e
A
344 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
345 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
346 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
347 * ASCII code points can be looked up with a linear array access into stage 3.
348 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
349 *
b75a7d8f
A
350 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
351 * for compaction.
352 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
353 * may overlap by any number of entries.
354 *
355 * MBCS version 2 added:
356 * the converter checks for known output types, which allows
357 * adding new ones without crashing an unaware converter
358 */
359
46f4442e
A
360static const UConverterImpl _SBCSUTF8Impl;
361static const UConverterImpl _DBCSUTF8Impl;
b75a7d8f
A
362
363/* GB 18030 data ------------------------------------------------------------ */
364
/*
 * Helper macros for linear values of GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) folds four separate byte values into one scalar,
 * ((a*10+b)*126+c)*10+d, so that consecutive four-byte sequences get
 * consecutive linear values.
 * LINEAR_18030_BASE is the linear value of the sequence 81 30 81 30.
 * LINEAR(x) does the same for four bytes packed into one 32-bit value,
 * most significant byte first.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/* every use of x is parenthesized so that expression arguments (e.g. a|b) are split correctly */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
371
372/*
373 * Some ranges of GB 18030 where both the Unicode code points and the
374 * GB four-byte sequences are contiguous and are handled algorithmically by
375 * the special callback functions below.
376 * The values are start & end of Unicode & GB codes.
377 *
378 * Note that single surrogates are not mapped by GB 18030
379 * as of the re-released mapping tables from 2000-nov-30.
380 */
381static const uint32_t
382gb18030Ranges[13][4]={
383 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
384 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
385 {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
386 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
387 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
388 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
389 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
390 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
391 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
392 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
393 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
394 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
395 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
396};
397
398/* bit flag for UConverter.options indicating GB 18030 special handling */
399#define _MBCS_OPTION_GB18030 0x8000
400
401/* Miscellaneous ------------------------------------------------------------ */
402
46f4442e
A
403/**
404 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
405 * consecutive sequences of bytes, starting from the one encoded in value,
406 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
407 * Does not currently support m:n mappings or reverse fallbacks.
408 * This function will not be called for sequences of bytes with leading zeros.
409 *
410 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
411 * @param value contains 1..4 bytes of the first byte sequence, right-aligned
412 * @param codePoints resulting Unicode code points, or negative if a byte sequence does
413 * not map to anything
414 * @return TRUE to continue enumeration, FALSE to stop
415 */
416typedef UBool U_CALLCONV
417UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
418
374ca955 419/* similar to ucnv_MBCSGetNextUChar() but recursive */
46f4442e
A
420static UBool
421enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
422 int32_t state, uint32_t offset,
423 uint32_t value,
424 UConverterEnumToUCallback *callback, const void *context,
425 UErrorCode *pErrorCode) {
426 UChar32 codePoints[32];
427 const int32_t *row;
428 const uint16_t *unicodeCodeUnits;
429 UChar32 anyCodePoints;
430 int32_t b, limit;
431
432 row=mbcsTable->stateTable[state];
433 unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
374ca955 434
46f4442e
A
435 value<<=8;
436 anyCodePoints=-1; /* becomes non-negative if there is a mapping */
437
438 b=(stateProps[state]&0x38)<<2;
439 if(b==0 && stateProps[state]>=0x40) {
440 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
441 codePoints[0]=U_SENTINEL;
442 b=1;
443 }
444 limit=((stateProps[state]&7)+1)<<5;
445 while(b<limit) {
446 int32_t entry=row[b];
374ca955 447 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
46f4442e
A
448 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
449 if(stateProps[nextState]>=0) {
450 /* recurse to a state with non-ignorable actions */
451 if(!enumToU(
452 mbcsTable, stateProps, nextState,
453 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
454 value|(uint32_t)b,
455 callback, context,
456 pErrorCode)) {
457 return FALSE;
458 }
459 }
460 codePoints[b&0x1f]=U_SENTINEL;
374ca955
A
461 } else {
462 UChar32 c;
46f4442e 463 int32_t action;
b75a7d8f 464
374ca955
A
465 /*
466 * An if-else-if chain provides more reliable performance for
467 * the most common cases compared to a switch.
468 */
46f4442e 469 action=MBCS_ENTRY_FINAL_ACTION(entry);
374ca955
A
470 if(action==MBCS_STATE_VALID_DIRECT_16) {
471 /* output BMP code point */
472 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
473 } else if(action==MBCS_STATE_VALID_16) {
46f4442e
A
474 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
475 c=unicodeCodeUnits[finalOffset];
374ca955
A
476 if(c<0xfffe) {
477 /* output BMP code point */
478 } else {
479 c=U_SENTINEL;
b75a7d8f 480 }
374ca955 481 } else if(action==MBCS_STATE_VALID_16_PAIR) {
46f4442e
A
482 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
483 c=unicodeCodeUnits[finalOffset++];
374ca955
A
484 if(c<0xd800) {
485 /* output BMP code point below 0xd800 */
486 } else if(c<=0xdbff) {
487 /* output roundtrip or fallback supplementary code point */
46f4442e 488 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
374ca955
A
489 } else if(c==0xe000) {
490 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
46f4442e 491 c=unicodeCodeUnits[finalOffset];
374ca955
A
492 } else {
493 c=U_SENTINEL;
b75a7d8f 494 }
374ca955
A
495 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
496 /* output supplementary code point */
497 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
46f4442e
A
498 } else {
499 c=U_SENTINEL;
500 }
501
502 codePoints[b&0x1f]=c;
503 anyCodePoints&=c;
504 }
505 if(((++b)&0x1f)==0) {
506 if(anyCodePoints>=0) {
507 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
508 return FALSE;
509 }
510 anyCodePoints=-1;
b75a7d8f 511 }
46f4442e
A
512 }
513 }
514 return TRUE;
515}
b75a7d8f 516
46f4442e
A
517/*
518 * Only called if stateProps[state]==-1.
519 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
520 * MBCS_STATE_CHANGE_ONLY.
521 */
522static int8_t
523getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
524 const int32_t *row;
525 int32_t min, max, entry, nextState;
526
527 row=stateTable[state];
528 stateProps[state]=0;
529
530 /* find first non-ignorable state */
531 for(min=0;; ++min) {
532 entry=row[min];
533 nextState=MBCS_ENTRY_STATE(entry);
534 if(stateProps[nextState]==-1) {
535 getStateProp(stateTable, stateProps, nextState);
536 }
537 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
538 if(stateProps[nextState]>=0) {
539 break;
540 }
541 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
542 break;
543 }
544 if(min==0xff) {
545 stateProps[state]=-0x40; /* (int8_t)0xc0 */
546 return stateProps[state];
547 }
548 }
549 stateProps[state]|=(int8_t)((min>>5)<<3);
550
551 /* find last non-ignorable state */
552 for(max=0xff; min<max; --max) {
553 entry=row[max];
554 nextState=MBCS_ENTRY_STATE(entry);
555 if(stateProps[nextState]==-1) {
556 getStateProp(stateTable, stateProps, nextState);
557 }
558 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
559 if(stateProps[nextState]>=0) {
560 break;
561 }
562 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
563 break;
564 }
565 }
566 stateProps[state]|=(int8_t)(max>>5);
567
568 /* recurse further and collect direct-state information */
569 while(min<=max) {
570 entry=row[min];
571 nextState=MBCS_ENTRY_STATE(entry);
572 if(stateProps[nextState]==-1) {
573 getStateProp(stateTable, stateProps, nextState);
574 }
575 if(MBCS_ENTRY_IS_FINAL(entry)) {
576 stateProps[nextState]|=0x40;
577 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
578 stateProps[state]|=0x40;
374ca955 579 }
b75a7d8f 580 }
46f4442e 581 ++min;
b75a7d8f 582 }
46f4442e 583 return stateProps[state];
b75a7d8f
A
584}
585
374ca955 586/*
46f4442e
A
587 * Internal function enumerating the toUnicode data of an MBCS converter.
588 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
589 * table, but could also be used for a future ucnv_getUnicodeSet() option
590 * that includes reverse fallbacks (after updating this function's implementation).
591 * Currently only handles roundtrip mappings.
374ca955 592 * Does not currently handle extensions.
374ca955 593 */
46f4442e
A
594static void
595ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
596 UConverterEnumToUCallback *callback, const void *context,
597 UErrorCode *pErrorCode) {
598 /*
599 * Properties for each state, to speed up the enumeration.
600 * Ignorable actions are unassigned/illegal/state-change-only:
601 * They do not lead to mappings.
602 *
603 * Bits 7..6:
604 * 1 direct/initial state (stateful converters have multiple)
605 * 0 non-initial state with transitions or with non-ignorable result actions
606 * -1 final state with only ignorable actions
607 *
608 * Bits 5..3:
609 * The lowest byte value with non-ignorable actions is
610 * value<<5 (rounded down).
611 *
612 * Bits 2..0:
613 * The highest byte value with non-ignorable actions is
614 * (value<<5)&0x1f (rounded up).
615 */
616 int8_t stateProps[MBCS_MAX_STATE_COUNT];
617 int32_t state;
618
619 uprv_memset(stateProps, -1, sizeof(stateProps));
620
621 /* recurse from state 0 and set all stateProps */
622 getStateProp(mbcsTable->stateTable, stateProps, 0);
623
624 for(state=0; state<mbcsTable->countStates; ++state) {
625 /*if(stateProps[state]==-1) {
626 printf("unused/unreachable <icu:state> %d\n", state);
627 }*/
628 if(stateProps[state]>=0x40) {
629 /* start from each direct state */
630 enumToU(
631 mbcsTable, stateProps, state, 0, 0,
632 callback, context,
633 pErrorCode);
634 }
635 }
374ca955
A
636}
637
638U_CFUNC void
46f4442e
A
639ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
640 const USetAdder *sa,
641 UConverterUnicodeSet which,
642 UConverterSetFilter filter,
643 UErrorCode *pErrorCode) {
374ca955 644 const UConverterMBCSTable *mbcsTable;
b75a7d8f
A
645 const uint16_t *table;
646
647 uint32_t st3;
648 uint16_t st1, maxStage1, st2;
649
650 UChar32 c;
651
b75a7d8f 652 /* enumerate the from-Unicode trie table */
374ca955 653 mbcsTable=&sharedData->mbcs;
b75a7d8f
A
654 table=mbcsTable->fromUnicodeTable;
655 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
656 maxStage1=0x440;
657 } else {
658 maxStage1=0x40;
659 }
660
661 c=0; /* keep track of the current code point while enumerating */
662
663 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
664 const uint16_t *stage2, *stage3, *results;
46f4442e 665 uint16_t minValue;
b75a7d8f
A
666
667 results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
668
46f4442e
A
669 /*
670 * Set a threshold variable for selecting which mappings to use.
671 * See ucnv_MBCSSingleFromBMPWithOffsets() and
672 * MBCS_SINGLE_RESULT_FROM_U() for details.
673 */
674 if(which==UCNV_ROUNDTRIP_SET) {
675 /* use only roundtrips */
676 minValue=0xf00;
677 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
678 /* use all roundtrip and fallback results */
679 minValue=0x800;
680 }
681
b75a7d8f
A
682 for(st1=0; st1<maxStage1; ++st1) {
683 st2=table[st1];
684 if(st2>maxStage1) {
685 stage2=table+st2;
686 for(st2=0; st2<64; ++st2) {
687 if((st3=stage2[st2])!=0) {
688 /* read the stage 3 block */
689 stage3=results+st3;
690
b75a7d8f 691 do {
46f4442e 692 if(*stage3++>=minValue) {
374ca955
A
693 sa->add(sa->set, c);
694 }
695 } while((++c&0xf)!=0);
696 } else {
697 c+=16; /* empty stage 3 block */
698 }
699 }
700 } else {
701 c+=1024; /* empty stage 2 block */
702 }
703 }
46f4442e 704 } else {
374ca955 705 const uint32_t *stage2;
46f4442e
A
706 const uint8_t *stage3, *bytes;
707 uint32_t st3Multiplier;
708 uint32_t value;
709 UBool useFallback;
374ca955 710
46f4442e 711 bytes=mbcsTable->fromUnicodeBytes;
374ca955 712
46f4442e 713 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
374ca955 714
46f4442e
A
715 switch(mbcsTable->outputType) {
716 case MBCS_OUTPUT_3:
717 case MBCS_OUTPUT_4_EUC:
718 st3Multiplier=3;
719 break;
720 case MBCS_OUTPUT_4:
721 st3Multiplier=4;
722 break;
723 default:
724 st3Multiplier=2;
725 break;
b75a7d8f 726 }
b75a7d8f
A
727
728 for(st1=0; st1<maxStage1; ++st1) {
729 st2=table[st1];
730 if(st2>(maxStage1>>1)) {
731 stage2=(const uint32_t *)table+st2;
732 for(st2=0; st2<64; ++st2) {
733 if((st3=stage2[st2])!=0) {
46f4442e
A
734 /* read the stage 3 block */
735 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
736
b75a7d8f
A
737 /* get the roundtrip flags for the stage 3 block */
738 st3>>=16;
739
740 /*
46f4442e
A
741 * Add code points for which the roundtrip flag is set,
742 * or which map to non-zero bytes if we use fallbacks.
374ca955 743 * See ucnv_MBCSFromUnicodeWithOffsets() for details.
b75a7d8f 744 */
46f4442e
A
745 switch(filter) {
746 case UCNV_SET_FILTER_NONE:
747 do {
748 if(st3&1) {
749 sa->add(sa->set, c);
750 stage3+=st3Multiplier;
751 } else if(useFallback) {
752 uint8_t b=0;
753 switch(st3Multiplier) {
754 case 4:
755 b|=*stage3++;
756 case 3:
757 b|=*stage3++;
758 case 2:
759 b|=stage3[0]|stage3[1];
760 stage3+=2;
761 default:
762 break;
763 }
764 if(b!=0) {
765 sa->add(sa->set, c);
766 }
767 }
768 st3>>=1;
769 } while((++c&0xf)!=0);
770 break;
771 case UCNV_SET_FILTER_DBCS_ONLY:
772 /* Ignore single-byte results (<0x100). */
773 do {
774 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
775 sa->add(sa->set, c);
776 }
777 st3>>=1;
778 stage3+=2; /* +=st3Multiplier */
779 } while((++c&0xf)!=0);
780 break;
781 case UCNV_SET_FILTER_2022_CN:
782 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
783 do {
784 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
785 sa->add(sa->set, c);
786 }
787 st3>>=1;
788 stage3+=3; /* +=st3Multiplier */
789 } while((++c&0xf)!=0);
790 break;
791 case UCNV_SET_FILTER_SJIS:
792 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
793 do {
794 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
795 sa->add(sa->set, c);
796 }
797 st3>>=1;
798 stage3+=2; /* +=st3Multiplier */
799 } while((++c&0xf)!=0);
800 break;
801 case UCNV_SET_FILTER_GR94DBCS:
802 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
803 do {
804 if( ((st3&1)!=0 || useFallback) &&
805 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
806 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
807 ) {
808 sa->add(sa->set, c);
809 }
810 st3>>=1;
811 stage3+=2; /* +=st3Multiplier */
812 } while((++c&0xf)!=0);
813 break;
814 case UCNV_SET_FILTER_HZ:
815 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
816 do {
817 if( ((st3&1)!=0 || useFallback) &&
818 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
819 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
820 ) {
821 sa->add(sa->set, c);
822 }
823 st3>>=1;
824 stage3+=2; /* +=st3Multiplier */
825 } while((++c&0xf)!=0);
826 break;
827 default:
828 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
829 return;
830 }
b75a7d8f
A
831 } else {
832 c+=16; /* empty stage 3 block */
833 }
834 }
835 } else {
836 c+=1024; /* empty stage 2 block */
837 }
838 }
839 }
374ca955 840
46f4442e
A
841 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
842}
843
844U_CFUNC void
845ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
846 const USetAdder *sa,
847 UConverterUnicodeSet which,
848 UErrorCode *pErrorCode) {
849 ucnv_MBCSGetFilteredUnicodeSetForUnicode(
850 sharedData, sa, which,
851 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
852 UCNV_SET_FILTER_DBCS_ONLY :
853 UCNV_SET_FILTER_NONE,
854 pErrorCode);
374ca955
A
855}
856
857static void
858ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
73c04bcf 859 const USetAdder *sa,
374ca955
A
860 UConverterUnicodeSet which,
861 UErrorCode *pErrorCode) {
862 if(cnv->options&_MBCS_OPTION_GB18030) {
863 sa->addRange(sa->set, 0, 0xd7ff);
864 sa->addRange(sa->set, 0xe000, 0x10ffff);
865 } else {
866 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
867 }
868}
869
870/* conversion extensions for input not in the main table -------------------- */
871
872/*
873 * Hardcoded extension handling for GB 18030.
874 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
875 *
876 * In the future, conversion extensions may handle m:n mappings and delta tables,
46f4442e 877 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
374ca955
A
878 *
879 * If an input character cannot be mapped, then these functions set an error
880 * code. The framework will then call the callback function.
881 */
882
883/*
884 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
885 * else return 0 after output has been written to the target
886 */
887static UChar32
888_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
889 UChar32 cp,
890 const UChar **source, const UChar *sourceLimit,
46f4442e 891 uint8_t **target, const uint8_t *targetLimit,
374ca955
A
892 int32_t **offsets, int32_t sourceIndex,
893 UBool flush,
894 UErrorCode *pErrorCode) {
895 const int32_t *cx;
896
897 cnv->useSubChar1=FALSE;
898
899 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
900 ucnv_extInitialMatchFromU(
901 cnv, cx,
902 cp, source, sourceLimit,
46f4442e 903 (char **)target, (char *)targetLimit,
374ca955
A
904 offsets, sourceIndex,
905 flush,
906 pErrorCode)
907 ) {
908 return 0; /* an extension mapping handled the input */
909 }
910
911 /* GB 18030 */
912 if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
913 const uint32_t *range;
914 int32_t i;
915
916 range=gb18030Ranges[0];
917 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
918 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
919 /* found the Unicode code point, output the four-byte sequence for it */
920 uint32_t linear;
921 char bytes[4];
922
923 /* get the linear value of the first GB 18030 code in this range */
924 linear=range[2]-LINEAR_18030_BASE;
925
926 /* add the offset from the beginning of the range */
927 linear+=((uint32_t)cp-range[0]);
928
929 /* turn this into a four-byte sequence */
930 bytes[3]=(char)(0x30+linear%10); linear/=10;
931 bytes[2]=(char)(0x81+linear%126); linear/=126;
932 bytes[1]=(char)(0x30+linear%10); linear/=10;
933 bytes[0]=(char)(0x81+linear);
934
935 /* output this sequence */
936 ucnv_fromUWriteBytes(cnv,
46f4442e 937 bytes, 4, (char **)target, (char *)targetLimit,
374ca955
A
938 offsets, sourceIndex, pErrorCode);
939 return 0;
940 }
941 }
942 }
943
944 /* no mapping */
945 *pErrorCode=U_INVALID_CHAR_FOUND;
946 return cp;
947}
948
949/*
950 * Input sequence: cnv->toUBytes[0..length[
951 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
952 * else return 0 after output has been written to the target
953 */
954static int8_t
955_extToU(UConverter *cnv, const UConverterSharedData *sharedData,
956 int8_t length,
46f4442e 957 const uint8_t **source, const uint8_t *sourceLimit,
374ca955
A
958 UChar **target, const UChar *targetLimit,
959 int32_t **offsets, int32_t sourceIndex,
960 UBool flush,
961 UErrorCode *pErrorCode) {
962 const int32_t *cx;
963
964 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
965 ucnv_extInitialMatchToU(
966 cnv, cx,
46f4442e 967 length, (const char **)source, (const char *)sourceLimit,
374ca955
A
968 target, targetLimit,
969 offsets, sourceIndex,
970 flush,
971 pErrorCode)
972 ) {
973 return 0; /* an extension mapping handled the input */
974 }
975
976 /* GB 18030 */
977 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
978 const uint32_t *range;
979 uint32_t linear;
980 int32_t i;
981
982 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
983 range=gb18030Ranges[0];
984 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
985 if(range[2]<=linear && linear<=range[3]) {
986 /* found the sequence, output the Unicode code point for it */
987 *pErrorCode=U_ZERO_ERROR;
988
989 /* add the linear difference between the input and start sequences to the start code point */
990 linear=range[0]+(linear-range[2]);
991
992 /* output this code point */
993 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
994
995 return 0;
996 }
997 }
998 }
999
1000 /* no mapping */
1001 *pErrorCode=U_INVALID_CHAR_FOUND;
1002 return length;
b75a7d8f
A
1003}
1004
1005/* EBCDIC swap LF<->NL ------------------------------------------------------ */
1006
1007/*
1008 * This code modifies a standard EBCDIC<->Unicode mapping table for
1009 * OS/390 (z/OS) Unix System Services (Open Edition).
1010 * The difference is in the mapping of Line Feed and New Line control codes:
1011 * Standard EBCDIC maps
1012 *
1013 * <U000A> \x25 |0
1014 * <U0085> \x15 |0
1015 *
1016 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
1017 * mapping
1018 *
1019 * <U000A> \x15 |0
1020 * <U0085> \x25 |0
1021 *
1022 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
1023 * by copying it into allocated memory and swapping the LF and NL values.
1024 * This makes it possible to support the same EBCDIC charset in both versions without
1025 * duplicating the entire installed table.
1026 */
1027
1028/* standard EBCDIC codes */
1029#define EBCDIC_LF 0x25
1030#define EBCDIC_NL 0x15
1031
1032/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
1033#define EBCDIC_RT_LF 0xf25
1034#define EBCDIC_RT_NL 0xf15
1035
1036/* Unicode code points */
1037#define U_LF 0x0a
1038#define U_NL 0x85
1039
1040static UBool
1041_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
1042 UConverterMBCSTable *mbcsTable;
1043
1044 const uint16_t *table, *results;
1045 const uint8_t *bytes;
1046
1047 int32_t (*newStateTable)[256];
1048 uint16_t *newResults;
1049 uint8_t *p;
1050 char *name;
1051
1052 uint32_t stage2Entry;
1053 uint32_t size, sizeofFromUBytes;
1054
374ca955 1055 mbcsTable=&sharedData->mbcs;
b75a7d8f
A
1056
1057 table=mbcsTable->fromUnicodeTable;
1058 bytes=mbcsTable->fromUnicodeBytes;
1059 results=(const uint16_t *)bytes;
1060
1061 /*
1062 * Check that this is an EBCDIC table with SBCS portion -
1063 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1064 *
1065 * If not, ignore the option. Options are always ignored if they do not apply.
1066 */
1067 if(!(
1068 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
1069 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
1070 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
1071 )) {
1072 return FALSE;
1073 }
1074
1075 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1076 if(!(
1077 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
1078 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
1079 )) {
1080 return FALSE;
1081 }
1082 } else /* MBCS_OUTPUT_2_SISO */ {
1083 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1084 if(!(
1085 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
1086 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
1087 )) {
1088 return FALSE;
1089 }
1090
1091 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1092 if(!(
1093 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
1094 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
1095 )) {
1096 return FALSE;
1097 }
1098 }
1099
374ca955
A
1100 if(mbcsTable->fromUBytesLength>0) {
1101 /*
1102 * We _know_ the number of bytes in the fromUnicodeBytes array
1103 * starting with header.version 4.1.
1104 */
1105 sizeofFromUBytes=mbcsTable->fromUBytesLength;
1106 } else {
1107 /*
1108 * Otherwise:
1109 * There used to be code to enumerate the fromUnicode
1110 * trie and find the highest entry, but it was removed in ICU 3.2
1111 * because it was not tested and caused a low code coverage number.
1112 * See Jitterbug 3674.
1113 * This affects only some .cnv file formats with a header.version
1114 * below 4.1, and only when swaplfnl is requested.
1115 *
1116 * ucnvmbcs.c revision 1.99 is the last one with the
1117 * ucnv_MBCSSizeofFromUBytes() function.
1118 */
1119 *pErrorCode=U_INVALID_FORMAT_ERROR;
1120 return FALSE;
1121 }
1122
b75a7d8f
A
1123 /*
1124 * The table has an appropriate format.
1125 * Allocate and build
1126 * - a modified to-Unicode state table
1127 * - a modified from-Unicode output array
1128 * - a converter name string with the swap option appended
1129 */
b75a7d8f
A
1130 size=
1131 mbcsTable->countStates*1024+
1132 sizeofFromUBytes+
1133 UCNV_MAX_CONVERTER_NAME_LENGTH+20;
1134 p=(uint8_t *)uprv_malloc(size);
1135 if(p==NULL) {
1136 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1137 return FALSE;
1138 }
1139
1140 /* copy and modify the to-Unicode state table */
1141 newStateTable=(int32_t (*)[256])p;
1142 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
1143
1144 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
1145 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
1146
1147 /* copy and modify the from-Unicode result table */
1148 newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
1149 uprv_memcpy(newResults, bytes, sizeofFromUBytes);
1150
1151 /* conveniently, the table access macros work on the left side of expressions */
1152 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1153 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
1154 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
1155 } else /* MBCS_OUTPUT_2_SISO */ {
1156 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1157 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
1158
1159 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1160 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
1161 }
1162
1163 /* set the canonical converter name */
1164 name=(char *)newResults+sizeofFromUBytes;
1165 uprv_strcpy(name, sharedData->staticData->name);
1166 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
1167
1168 /* set the pointers */
1169 umtx_lock(NULL);
1170 if(mbcsTable->swapLFNLStateTable==NULL) {
1171 mbcsTable->swapLFNLStateTable=newStateTable;
1172 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
1173 mbcsTable->swapLFNLName=name;
1174
1175 newStateTable=NULL;
1176 }
1177 umtx_unlock(NULL);
1178
1179 /* release the allocated memory if another thread beat us to it */
1180 if(newStateTable!=NULL) {
1181 uprv_free(newStateTable);
1182 }
1183 return TRUE;
1184}
1185
46f4442e
A
1186/* reconstitute omitted fromUnicode data ------------------------------------ */
1187
1188/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
1189static UBool U_CALLCONV
1190writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
1191 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
1192 const uint16_t *table;
1193 uint32_t *stage2;
1194 uint8_t *bytes, *p;
1195 UChar32 c;
1196 int32_t i, st3;
1197
1198 table=mbcsTable->fromUnicodeTable;
1199 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
1200
1201 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1202 switch(mbcsTable->outputType) {
1203 case MBCS_OUTPUT_3_EUC:
1204 if(value<=0xffff) {
1205 /* short sequences are stored directly */
1206 /* code set 0 or 1 */
1207 } else if(value<=0x8effff) {
1208 /* code set 2 */
1209 value&=0x7fff;
1210 } else /* first byte is 0x8f */ {
1211 /* code set 3 */
1212 value&=0xff7f;
1213 }
1214 break;
1215 case MBCS_OUTPUT_4_EUC:
1216 if(value<=0xffffff) {
1217 /* short sequences are stored directly */
1218 /* code set 0 or 1 */
1219 } else if(value<=0x8effffff) {
1220 /* code set 2 */
1221 value&=0x7fffff;
1222 } else /* first byte is 0x8f */ {
1223 /* code set 3 */
1224 value&=0xff7fff;
1225 }
1226 break;
1227 default:
1228 break;
1229 }
1230
1231 for(i=0; i<=0x1f; ++value, ++i) {
1232 c=codePoints[i];
1233 if(c<0) {
1234 continue;
1235 }
1236
1237 /* locate the stage 2 & 3 data */
1238 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
1239 p=bytes;
1240 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
1241
1242 /* write the codepage bytes into stage 3 */
1243 switch(mbcsTable->outputType) {
1244 case MBCS_OUTPUT_3:
1245 case MBCS_OUTPUT_4_EUC:
1246 p+=st3*3;
1247 p[0]=(uint8_t)(value>>16);
1248 p[1]=(uint8_t)(value>>8);
1249 p[2]=(uint8_t)value;
1250 break;
1251 case MBCS_OUTPUT_4:
1252 ((uint32_t *)p)[st3]=value;
1253 break;
1254 default:
1255 /* 2 bytes per character */
1256 ((uint16_t *)p)[st3]=(uint16_t)value;
1257 break;
1258 }
1259
1260 /* set the roundtrip flag */
1261 *stage2|=(1UL<<(16+(c&0xf)));
1262 }
1263 return TRUE;
1264 }
1265
1266static void
1267reconstituteData(UConverterMBCSTable *mbcsTable,
1268 uint32_t stage1Length, uint32_t stage2Length,
1269 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */
1270 UErrorCode *pErrorCode) {
1271 uint16_t *stage1;
1272 uint32_t *stage2;
1273 uint8_t *bytes;
1274 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
1275 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
1276 if(mbcsTable->reconstitutedData==NULL) {
1277 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1278 return;
1279 }
1280 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
1281
1282 /* copy existing data and reroute the pointers */
1283 stage1=(uint16_t *)mbcsTable->reconstitutedData;
1284 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
1285
1286 stage2=(uint32_t *)(stage1+stage1Length);
1287 uprv_memcpy(stage2+(fullStage2Length-stage2Length),
1288 mbcsTable->fromUnicodeTable+stage1Length,
1289 stage2Length*4);
1290
1291 mbcsTable->fromUnicodeTable=stage1;
1292 mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
1293
1294 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1295 stage2=(uint32_t *)stage1;
1296
1297 /* reconstitute the initial part of stage 2 from the mbcsIndex */
1298 {
1299 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
1300 int32_t stageUTF8Index=0;
1301 int32_t st1, st2, st3, i;
1302
1303 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
1304 st2=stage1[st1];
1305 if(st2!=stage1Length/2) {
1306 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1307 for(i=0; i<16; ++i) {
1308 st3=mbcsTable->mbcsIndex[stageUTF8Index++];
1309 if(st3!=0) {
1310 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1311 st3>>=4;
1312 /*
1313 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1314 * allocated together as a single 64-block for access from the mbcsIndex
1315 */
1316 stage2[st2++]=st3++;
1317 stage2[st2++]=st3++;
1318 stage2[st2++]=st3++;
1319 stage2[st2++]=st3;
1320 } else {
1321 /* no stage 3 block, skip */
1322 st2+=4;
1323 }
1324 }
1325 } else {
1326 /* no stage 2 block, skip */
1327 stageUTF8Index+=16;
1328 }
1329 }
1330 }
1331
1332 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1333 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
1334}
1335
b75a7d8f
A
1336/* MBCS setup functions ----------------------------------------------------- */
1337
1338static void
374ca955
A
1339ucnv_MBCSLoad(UConverterSharedData *sharedData,
1340 UConverterLoadArgs *pArgs,
b75a7d8f
A
1341 const uint8_t *raw,
1342 UErrorCode *pErrorCode) {
1343 UDataInfo info;
374ca955 1344 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
b75a7d8f 1345 _MBCSHeader *header=(_MBCSHeader *)raw;
374ca955 1346 uint32_t offset;
46f4442e
A
1347 uint32_t headerLength;
1348 UBool noFromU=FALSE;
1349
1350 if(header->version[0]==4) {
1351 headerLength=MBCS_HEADER_V4_LENGTH;
1352 } else if(header->version[0]==5 && header->version[1]>=3 &&
1353 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
1354 headerLength=header->options&MBCS_OPT_LENGTH_MASK;
1355 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
1356 } else {
b75a7d8f
A
1357 *pErrorCode=U_INVALID_TABLE_FORMAT;
1358 return;
1359 }
1360
b75a7d8f 1361 mbcsTable->outputType=(uint8_t)header->flags;
46f4442e
A
1362 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
1363 *pErrorCode=U_INVALID_TABLE_FORMAT;
1364 return;
1365 }
b75a7d8f 1366
374ca955
A
1367 /* extension data, header version 4.2 and higher */
1368 offset=header->flags>>8;
1369 if(offset!=0) {
1370 mbcsTable->extIndexes=(const int32_t *)(raw+offset);
b75a7d8f
A
1371 }
1372
374ca955
A
1373 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
1374 UConverterLoadArgs args={ 0 };
1375 UConverterSharedData *baseSharedData;
1376 const int32_t *extIndexes;
1377 const char *baseName;
b75a7d8f 1378
374ca955
A
1379 /* extension-only file, load the base table and set values appropriately */
1380 if((extIndexes=mbcsTable->extIndexes)==NULL) {
1381 /* extension-only file without extension */
1382 *pErrorCode=U_INVALID_TABLE_FORMAT;
1383 return;
1384 }
b75a7d8f 1385
374ca955
A
1386 if(pArgs->nestedLoads!=1) {
1387 /* an extension table must not be loaded as a base table */
1388 *pErrorCode=U_INVALID_TABLE_FILE;
1389 return;
1390 }
b75a7d8f 1391
374ca955 1392 /* load the base table */
46f4442e 1393 baseName=(const char *)header+headerLength*4;
374ca955
A
1394 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
1395 /* forbid loading this same extension-only file */
1396 *pErrorCode=U_INVALID_TABLE_FORMAT;
1397 return;
1398 }
b75a7d8f 1399
374ca955
A
1400 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1401 args.size=sizeof(UConverterLoadArgs);
1402 args.nestedLoads=2;
1403 args.reserved=pArgs->reserved;
1404 args.options=pArgs->options;
1405 args.pkg=pArgs->pkg;
1406 args.name=baseName;
1407 baseSharedData=ucnv_load(&args, pErrorCode);
1408 if(U_FAILURE(*pErrorCode)) {
1409 return;
1410 }
1411 if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
1412 baseSharedData->mbcs.baseSharedData!=NULL
1413 ) {
1414 ucnv_unload(baseSharedData);
1415 *pErrorCode=U_INVALID_TABLE_FORMAT;
1416 return;
1417 }
1418
1419 /* copy the base table data */
1420 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
1421
1422 /* overwrite values with relevant ones for the extension converter */
1423 mbcsTable->baseSharedData=baseSharedData;
1424 mbcsTable->extIndexes=extIndexes;
1425
1426 /*
1427 * It would be possible to share the swapLFNL data with a base converter,
1428 * but the generated name would have to be different, and the memory
1429 * would have to be free'd only once.
1430 * It is easier to just create the data for the extension converter
1431 * separately when it is requested.
1432 */
1433 mbcsTable->swapLFNLStateTable=NULL;
1434 mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1435 mbcsTable->swapLFNLName=NULL;
1436
46f4442e
A
1437 /*
1438 * The reconstitutedData must be deleted only when the base converter
1439 * is unloaded.
1440 */
1441 mbcsTable->reconstitutedData=NULL;
1442
374ca955
A
1443 /*
1444 * Set a special, runtime-only outputType if the extension converter
1445 * is a DBCS version of a base converter that also maps single bytes.
1446 */
1447 if( sharedData->staticData->conversionType==UCNV_DBCS ||
1448 (sharedData->staticData->conversionType==UCNV_MBCS &&
1449 sharedData->staticData->minBytesPerChar>=2)
1450 ) {
1451 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1452 /* the base converter is SI/SO-stateful */
1453 int32_t entry;
1454
1455 /* get the dbcs state from the state table entry for SO=0x0e */
1456 entry=mbcsTable->stateTable[0][0xe];
1457 if( MBCS_ENTRY_IS_FINAL(entry) &&
1458 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1459 MBCS_ENTRY_FINAL_STATE(entry)!=0
1460 ) {
1461 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1462
1463 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1464 }
1465 } else if(
1466 baseSharedData->staticData->conversionType==UCNV_MBCS &&
1467 baseSharedData->staticData->minBytesPerChar==1 &&
1468 baseSharedData->staticData->maxBytesPerChar==2 &&
1469 mbcsTable->countStates<=127
1470 ) {
1471 /* non-stateful base converter, need to modify the state table */
1472 int32_t (*newStateTable)[256];
1473 int32_t *state;
1474 int32_t i, count;
1475
1476 /* allocate a new state table and copy the base state table contents */
1477 count=mbcsTable->countStates;
1478 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1479 if(newStateTable==NULL) {
1480 ucnv_unload(baseSharedData);
1481 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1482 return;
1483 }
1484
1485 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1486
1487 /* change all final single-byte entries to go to a new all-illegal state */
1488 state=newStateTable[0];
1489 for(i=0; i<256; ++i) {
1490 if(MBCS_ENTRY_IS_FINAL(state[i])) {
1491 state[i]=MBCS_ENTRY_TRANSITION(count, 0);
1492 }
1493 }
1494
1495 /* build the new all-illegal state */
1496 state=newStateTable[count];
1497 for(i=0; i<256; ++i) {
1498 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
1499 }
1500 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1501 mbcsTable->countStates=(uint8_t)(count+1);
1502 mbcsTable->stateTableOwned=TRUE;
1503
1504 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1505 }
1506 }
1507
1508 /*
1509 * unlike below for files with base tables, do not get the unicodeMask
1510 * from the sharedData; instead, use the base table's unicodeMask,
1511 * which we copied in the memcpy above;
1512 * this is necessary because the static data unicodeMask, especially
1513 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1514 */
1515 } else {
1516 /* conversion file with a base table; an additional extension table is optional */
1517 /* make sure that the output type is known */
1518 switch(mbcsTable->outputType) {
1519 case MBCS_OUTPUT_1:
1520 case MBCS_OUTPUT_2:
1521 case MBCS_OUTPUT_3:
1522 case MBCS_OUTPUT_4:
1523 case MBCS_OUTPUT_3_EUC:
1524 case MBCS_OUTPUT_4_EUC:
1525 case MBCS_OUTPUT_2_SISO:
1526 /* OK */
1527 break;
1528 default:
1529 *pErrorCode=U_INVALID_TABLE_FORMAT;
1530 return;
1531 }
1532
1533 mbcsTable->countStates=(uint8_t)header->countStates;
1534 mbcsTable->countToUFallbacks=header->countToUFallbacks;
46f4442e 1535 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
374ca955
A
1536 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1537 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1538
1539 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1540 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1541 mbcsTable->fromUBytesLength=header->fromUBytesLength;
1542
1543 /*
1544 * converter versions 6.1 and up contain a unicodeMask that is
1545 * used here to select the most efficient function implementations
1546 */
1547 info.size=sizeof(UDataInfo);
1548 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1549 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1550 /* mask off possible future extensions to be safe */
1551 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1552 } else {
1553 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1554 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
1555 }
46f4442e
A
1556
1557 /*
1558 * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1559 * Check for the header version, SBCS vs. MBCS, and for whether the
1560 * data structures are optimized for code points as high as what the
1561 * runtime code is designed for.
1562 * The implementation does not handle mapping tables with entries for
1563 * unpaired surrogates.
1564 */
1565 if( header->version[1]>=3 &&
1566 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
1567 (mbcsTable->countStates==1 ?
1568 (header->version[2]>=(SBCS_FAST_MAX>>8)) :
1569 (header->version[2]>=(MBCS_FAST_MAX>>8))
1570 )
1571 ) {
1572 mbcsTable->utf8Friendly=TRUE;
1573
1574 if(mbcsTable->countStates==1) {
1575 /*
1576 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1577 * Build a table with indexes to each block, to be used instead of
1578 * the regular stage 1/2 table.
1579 */
1580 int32_t i;
1581 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
1582 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
1583 }
1584 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1585 mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1586 } else {
1587 /*
1588 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1589 * The .cnv file is prebuilt with an additional stage table with indexes
1590 * to each block.
1591 */
1592 mbcsTable->mbcsIndex=(const uint16_t *)
1593 (mbcsTable->fromUnicodeBytes+
1594 (noFromU ? 0 : mbcsTable->fromUBytesLength));
1595 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
1596 }
1597 }
1598
1599 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1600 {
1601 uint32_t asciiRoundtrips=0xffffffff;
1602 int32_t i;
1603
1604 for(i=0; i<0x80; ++i) {
1605 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
1606 asciiRoundtrips&=~((uint32_t)1<<(i>>2));
1607 }
1608 }
1609 mbcsTable->asciiRoundtrips=asciiRoundtrips;
1610 }
1611
1612 if(noFromU) {
1613 uint32_t stage1Length=
1614 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
1615 0x440 : 0x40;
1616 uint32_t stage2Length=
1617 (header->offsetFromUBytes-header->offsetFromUTable)/4-
1618 stage1Length/2;
1619 reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
1620 }
1621 }
1622
1623 /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1624 if(mbcsTable->utf8Friendly) {
1625 if(mbcsTable->countStates==1) {
1626 sharedData->impl=&_SBCSUTF8Impl;
1627 } else {
1628 if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1629 sharedData->impl=&_DBCSUTF8Impl;
1630 }
1631 }
1632 }
1633
1634 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1635 /*
1636 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1637 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1638 */
1639 mbcsTable->asciiRoundtrips=0;
374ca955
A
1640 }
1641}
1642
1643static void
1644ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1645 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1646
1647 if(mbcsTable->swapLFNLStateTable!=NULL) {
1648 uprv_free(mbcsTable->swapLFNLStateTable);
1649 }
1650 if(mbcsTable->stateTableOwned) {
1651 uprv_free((void *)mbcsTable->stateTable);
1652 }
1653 if(mbcsTable->baseSharedData!=NULL) {
1654 ucnv_unload(mbcsTable->baseSharedData);
1655 }
46f4442e
A
1656 if(mbcsTable->reconstitutedData!=NULL) {
1657 uprv_free(mbcsTable->reconstitutedData);
1658 }
374ca955
A
1659}
1660
1661static void
1662ucnv_MBCSOpen(UConverter *cnv,
1663 const char *name,
b75a7d8f
A
1664 const char *locale,
1665 uint32_t options,
1666 UErrorCode *pErrorCode) {
374ca955
A
1667 UConverterMBCSTable *mbcsTable;
1668 const int32_t *extIndexes;
1669 uint8_t outputType;
1670 int8_t maxBytesPerUChar;
1671
1672 mbcsTable=&cnv->sharedData->mbcs;
1673 outputType=mbcsTable->outputType;
1674
1675 if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1676 /* the swaplfnl option does not apply, remove it */
1677 cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
1678 }
1679
b75a7d8f
A
1680 if((options&UCNV_OPTION_SWAP_LFNL)!=0) {
1681 /* do this because double-checked locking is broken */
1682 UBool isCached;
1683
1684 umtx_lock(NULL);
374ca955 1685 isCached=mbcsTable->swapLFNLStateTable!=NULL;
b75a7d8f
A
1686 umtx_unlock(NULL);
1687
1688 if(!isCached) {
1689 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
374ca955
A
1690 if(U_FAILURE(*pErrorCode)) {
1691 return; /* something went wrong */
1692 }
1693
b75a7d8f 1694 /* the option does not apply, remove it */
374ca955 1695 cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
b75a7d8f
A
1696 }
1697 }
1698 }
1699
b75a7d8f
A
1700 if(uprv_strstr(name, "18030")!=NULL) {
1701 if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) {
1702 /* set a flag for GB 18030 mode, which changes the callback behavior */
1703 cnv->options|=_MBCS_OPTION_GB18030;
1704 }
1705 }
1706
374ca955
A
1707 /* fix maxBytesPerUChar depending on outputType and options etc. */
1708 if(outputType==MBCS_OUTPUT_2_SISO) {
1709 cnv->maxBytesPerUChar=3; /* SO+DBCS */
1710 }
1711
1712 extIndexes=mbcsTable->extIndexes;
1713 if(extIndexes!=NULL) {
1714 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1715 if(outputType==MBCS_OUTPUT_2_SISO) {
1716 ++maxBytesPerUChar; /* SO + multiple DBCS */
1717 }
1718
1719 if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1720 cnv->maxBytesPerUChar=maxBytesPerUChar;
1721 }
1722 }
1723
1724#if 0
1725 /*
1726 * documentation of UConverter fields used for status
1727 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1728 */
1729
1730 /* toUnicode */
1731 cnv->toUnicodeStatus=0; /* offset */
1732 cnv->mode=0; /* state */
1733 cnv->toULength=0; /* byteIndex */
1734
1735 /* fromUnicode */
1736 cnv->fromUChar32=0;
1737 cnv->fromUnicodeStatus=1; /* prevLength */
1738#endif
b75a7d8f
A
1739}
1740
1741static const char *
374ca955
A
1742ucnv_MBCSGetName(const UConverter *cnv) {
1743 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
1744 return cnv->sharedData->mbcs.swapLFNLName;
b75a7d8f
A
1745 } else {
1746 return cnv->sharedData->staticData->name;
1747 }
1748}
1749
1750/* MBCS-to-Unicode conversion functions ------------------------------------- */
1751
1752static UChar32
374ca955 1753ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
b75a7d8f
A
1754 const _MBCSToUFallback *toUFallbacks;
1755 uint32_t i, start, limit;
1756
1757 limit=mbcsTable->countToUFallbacks;
1758 if(limit>0) {
1759 /* do a binary search for the fallback mapping */
1760 toUFallbacks=mbcsTable->toUFallbacks;
1761 start=0;
1762 while(start<limit-1) {
1763 i=(start+limit)/2;
1764 if(offset<toUFallbacks[i].offset) {
1765 limit=i;
1766 } else {
1767 start=i;
1768 }
1769 }
1770
1771 /* did we really find it? */
1772 if(offset==toUFallbacks[start].offset) {
1773 return toUFallbacks[start].codePoint;
1774 }
1775 }
1776
1777 return 0xfffe;
1778}
1779
374ca955
A
1780/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1781static void
1782ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1783 UErrorCode *pErrorCode) {
b75a7d8f
A
1784 UConverter *cnv;
1785 const uint8_t *source, *sourceLimit;
1786 UChar *target;
1787 const UChar *targetLimit;
1788 int32_t *offsets;
1789
1790 const int32_t (*stateTable)[256];
b75a7d8f 1791
374ca955 1792 int32_t sourceIndex;
b75a7d8f
A
1793
1794 int32_t entry;
1795 UChar c;
1796 uint8_t action;
b75a7d8f
A
1797
1798 /* set up the local pointers */
374ca955 1799 cnv=pArgs->converter;
b75a7d8f
A
1800 source=(const uint8_t *)pArgs->source;
1801 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1802 target=pArgs->target;
1803 targetLimit=pArgs->targetLimit;
1804 offsets=pArgs->offsets;
1805
1806 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 1807 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 1808 } else {
374ca955 1809 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f 1810 }
b75a7d8f
A
1811
1812 /* sourceIndex=-1 if the current character began in the previous buffer */
374ca955 1813 sourceIndex=0;
b75a7d8f
A
1814
1815 /* conversion loop */
1816 while(source<sourceLimit) {
1817 /*
1818 * This following test is to see if available input would overflow the output.
1819 * It does not catch output of more than one code unit that
1820 * overflows as a result of a surrogate pair or callback output
1821 * from the last source byte.
1822 * Therefore, those situations also test for overflows and will
1823 * then break the loop, too.
1824 */
374ca955
A
1825 if(target>=targetLimit) {
1826 /* target is full */
1827 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1828 break;
1829 }
b75a7d8f 1830
374ca955
A
1831 entry=stateTable[0][*source++];
1832 /* MBCS_ENTRY_IS_FINAL(entry) */
b75a7d8f 1833
374ca955
A
1834 /* test the most common case first */
1835 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1836 /* output BMP code point */
1837 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1838 if(offsets!=NULL) {
1839 *offsets++=sourceIndex;
1840 }
b75a7d8f 1841
374ca955
A
1842 /* normal end of action codes: prepare for a new character */
1843 ++sourceIndex;
1844 continue;
1845 }
b75a7d8f 1846
374ca955
A
1847 /*
1848 * An if-else-if chain provides more reliable performance for
1849 * the most common cases compared to a switch.
1850 */
1851 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1852 if(action==MBCS_STATE_VALID_DIRECT_20 ||
1853 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1854 ) {
1855 entry=MBCS_ENTRY_FINAL_VALUE(entry);
1856 /* output surrogate pair */
1857 *target++=(UChar)(0xd800|(UChar)(entry>>10));
1858 if(offsets!=NULL) {
1859 *offsets++=sourceIndex;
1860 }
1861 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1862 if(target<targetLimit) {
1863 *target++=c;
1864 if(offsets!=NULL) {
1865 *offsets++=sourceIndex;
1866 }
1867 } else {
1868 /* target overflow */
1869 cnv->UCharErrorBuffer[0]=c;
1870 cnv->UCharErrorBufferLength=1;
1871 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1872 break;
1873 }
b75a7d8f 1874
374ca955
A
1875 ++sourceIndex;
1876 continue;
1877 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1878 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1879 /* output BMP code point */
1880 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1881 if(offsets!=NULL) {
1882 *offsets++=sourceIndex;
b75a7d8f
A
1883 }
1884
374ca955
A
1885 ++sourceIndex;
1886 continue;
b75a7d8f 1887 }
374ca955
A
1888 } else if(action==MBCS_STATE_UNASSIGNED) {
1889 /* just fall through */
1890 } else if(action==MBCS_STATE_ILLEGAL) {
1891 /* callback(illegal) */
1892 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 1893 } else {
374ca955
A
1894 /* reserved, must never occur */
1895 ++sourceIndex;
1896 continue;
b75a7d8f 1897 }
b75a7d8f 1898
374ca955
A
1899 if(U_FAILURE(*pErrorCode)) {
1900 /* callback(illegal) */
1901 break;
1902 } else /* unassigned sequences indicated with byteIndex>0 */ {
1903 /* try an extension mapping */
1904 pArgs->source=(const char *)source;
1905 cnv->toUBytes[0]=*(source-1);
1906 cnv->toULength=_extToU(cnv, cnv->sharedData,
46f4442e 1907 1, &source, sourceLimit,
374ca955
A
1908 &target, targetLimit,
1909 &offsets, sourceIndex,
1910 pArgs->flush,
1911 pErrorCode);
1912 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
1913
1914 if(U_FAILURE(*pErrorCode)) {
1915 /* not mappable or buffer overflow */
1916 break;
1917 }
b75a7d8f 1918 }
b75a7d8f
A
1919 }
1920
1921 /* write back the updated pointers */
1922 pArgs->source=(const char *)source;
1923 pArgs->target=target;
1924 pArgs->offsets=offsets;
1925}
1926
374ca955
A
1927/*
1928 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1929 * that only map to and from the BMP.
1930 * In addition to single-byte optimizations, the offset calculations
1931 * become much easier.
 *
 * The loop counter targetCapacity is the minimum of the number of remaining
 * source bytes and the remaining target capacity, so the fast paths do not
 * compare source and target against their limits separately.
 * Offsets are not written character by character: they are filled in for a
 * whole run of bytes after the unrolled loop, before an extension lookup,
 * and once more at the end of the function.
 * Bytes without a direct 16-bit result are passed to _extToU(); illegal
 * bytes end the conversion with U_ILLEGAL_CHAR_FOUND.
 * The advanced source, target and offsets pointers are written back to pArgs.
1932 */
b75a7d8f 1933static void
374ca955
A
1934ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
1935 UErrorCode *pErrorCode) {
b75a7d8f 1936 UConverter *cnv;
374ca955 1937 const uint8_t *source, *sourceLimit, *lastSource;
b75a7d8f 1938 UChar *target;
374ca955 1939 int32_t targetCapacity, length;
b75a7d8f
A
1940 int32_t *offsets;
1941
1942 const int32_t (*stateTable)[256];
1943
374ca955 1944 int32_t sourceIndex;
b75a7d8f
A
1945
1946 int32_t entry;
b75a7d8f 1947 uint8_t action;
b75a7d8f
A
1948
1949 /* set up the local pointers */
1950 cnv=pArgs->converter;
1951 source=(const uint8_t *)pArgs->source;
1952 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1953 target=pArgs->target;
73c04bcf 1954 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
1955 offsets=pArgs->offsets;
1956
1957 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 1958 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 1959 } else {
374ca955 1960 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f
A
1961 }
1962
1963 /* sourceIndex=-1 if the current character began in the previous buffer */
1964 sourceIndex=0;
 /* lastSource marks the first source byte for which no offset has been written yet */
374ca955 1965 lastSource=source;
b75a7d8f
A
1966
1967 /*
1968 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1969 * for the minimum of the sourceLength and targetCapacity
1970 */
73c04bcf 1971 length=(int32_t)(sourceLimit-source);
b75a7d8f
A
1972 if(length<targetCapacity) {
1973 targetCapacity=length;
1974 }
1975
1976#if MBCS_UNROLL_SINGLE_TO_BMP
1977 /* unrolling makes it faster on Pentium III/Windows 2000 */
1978 /* unroll the loop with the most common case */
1979unrolled:
1980 if(targetCapacity>=16) {
1981 int32_t count, loops, oredEntries;
1982
 /* number of complete 16-byte blocks that fit into targetCapacity */
1983 loops=count=targetCapacity>>4;
1984 do {
 /*
  * Convert 16 bytes unconditionally and OR their entries together,
  * so that a single test after the block tells whether all 16 were
  * valid direct 16-bit results.
  */
1985 oredEntries=entry=stateTable[0][*source++];
1986 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1987 oredEntries|=entry=stateTable[0][*source++];
1988 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1989 oredEntries|=entry=stateTable[0][*source++];
1990 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1991 oredEntries|=entry=stateTable[0][*source++];
1992 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1993 oredEntries|=entry=stateTable[0][*source++];
1994 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1995 oredEntries|=entry=stateTable[0][*source++];
1996 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1997 oredEntries|=entry=stateTable[0][*source++];
1998 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1999 oredEntries|=entry=stateTable[0][*source++];
2000 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2001 oredEntries|=entry=stateTable[0][*source++];
2002 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2003 oredEntries|=entry=stateTable[0][*source++];
2004 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2005 oredEntries|=entry=stateTable[0][*source++];
2006 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2007 oredEntries|=entry=stateTable[0][*source++];
2008 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2009 oredEntries|=entry=stateTable[0][*source++];
2010 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2011 oredEntries|=entry=stateTable[0][*source++];
2012 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2013 oredEntries|=entry=stateTable[0][*source++];
2014 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2015 oredEntries|=entry=stateTable[0][*source++];
2016 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2017
2018 /* were all 16 entries really valid? */
2019 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
2020 /* no, return to the first of these 16 */
2021 source-=16;
2022 target-=16;
2023 break;
2024 }
2025 } while(--count>0);
 /*
  * count was only decremented for blocks that passed the test above,
  * so loops-count is the number of blocks that were fully converted.
  */
2026 count=loops-count;
2027 targetCapacity-=16*count;
2028
2029 if(offsets!=NULL) {
 /* write the offsets for all bytes of the fully converted blocks */
2030 lastSource+=16*count;
2031 while(count>0) {
2032 *offsets++=sourceIndex++;
2033 *offsets++=sourceIndex++;
2034 *offsets++=sourceIndex++;
2035 *offsets++=sourceIndex++;
2036 *offsets++=sourceIndex++;
2037 *offsets++=sourceIndex++;
2038 *offsets++=sourceIndex++;
2039 *offsets++=sourceIndex++;
2040 *offsets++=sourceIndex++;
2041 *offsets++=sourceIndex++;
2042 *offsets++=sourceIndex++;
2043 *offsets++=sourceIndex++;
2044 *offsets++=sourceIndex++;
2045 *offsets++=sourceIndex++;
2046 *offsets++=sourceIndex++;
2047 *offsets++=sourceIndex++;
2048 --count;
2049 }
2050 }
2051 }
2052#endif
2053
2054 /* conversion loop */
2055 while(targetCapacity>0) {
2056 entry=stateTable[0][*source++];
2057 /* MBCS_ENTRY_IS_FINAL(entry) */
2058
2059 /* test the most common case first */
2060 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2061 /* output BMP code point */
2062 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2063 --targetCapacity;
2064 continue;
2065 }
2066
2067 /*
2068 * An if-else-if chain provides more reliable performance for
2069 * the most common cases compared to a switch.
2070 */
2071 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2072 if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
374ca955
A
2073 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2074 /* output BMP code point */
2075 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2076 --targetCapacity;
2077 continue;
b75a7d8f 2078 }
b75a7d8f 2079 } else if(action==MBCS_STATE_UNASSIGNED) {
374ca955 2080 /* just fall through */
b75a7d8f
A
2081 } else if(action==MBCS_STATE_ILLEGAL) {
2082 /* callback(illegal) */
b75a7d8f
A
2083 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2084 } else {
2085 /* reserved, must never occur */
 /*
  * NOTE(review): this path has already consumed one source byte but does
  * not decrement targetCapacity, unlike the other paths that continue.
  * Confirm that reserved action codes cannot appear in a valid table.
  */
2086 continue;
2087 }
2088
374ca955 2089 /* set offsets since the start or the last extension */
b75a7d8f
A
2090 if(offsets!=NULL) {
2091 int32_t count=(int32_t)(source-lastSource);
2092
2093 /* predecrement: do not set the offset for the callback-causing character */
2094 while(--count>0) {
2095 *offsets++=sourceIndex++;
2096 }
2097 /* offset and sourceIndex are now set for the current character */
2098 }
2099
374ca955
A
2100 if(U_FAILURE(*pErrorCode)) {
2101 /* callback(illegal) */
b75a7d8f 2102 break;
374ca955
A
2103 } else /* unassigned sequences indicated with byteIndex>0 */ {
2104 /* try an extension mapping */
2105 lastSource=source;
2106 cnv->toUBytes[0]=*(source-1);
2107 cnv->toULength=_extToU(cnv, cnv->sharedData,
46f4442e
A
2108 1, &source, sourceLimit,
2109 &target, pArgs->targetLimit,
374ca955
A
2110 &offsets, sourceIndex,
2111 pArgs->flush,
2112 pErrorCode);
 /* advance by the current byte plus any further bytes that _extToU() consumed */
2113 sourceIndex+=1+(int32_t)(source-lastSource);
2114
2115 if(U_FAILURE(*pErrorCode)) {
2116 /* not mappable or buffer overflow */
2117 break;
2118 }
2119
2120 /* recalculate the targetCapacity after an extension mapping */
73c04bcf
A
2121 targetCapacity=(int32_t)(pArgs->targetLimit-target);
2122 length=(int32_t)(sourceLimit-source);
374ca955
A
2123 if(length<targetCapacity) {
2124 targetCapacity=length;
2125 }
b75a7d8f
A
2126 }
2127
2128#if MBCS_UNROLL_SINGLE_TO_BMP
2129 /* unrolling makes it faster on Pentium III/Windows 2000 */
2130 goto unrolled;
2131#endif
2132 }
2133
 /* the loops above stop silently when targetCapacity reaches 0; report a full target here */
2134 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
2135 /* target is full */
2136 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2137 }
2138
2139 /* set offsets since the start or the last callback */
2140 if(offsets!=NULL) {
2141 size_t count=source-lastSource;
2142 while(count>0) {
2143 *offsets++=sourceIndex++;
2144 --count;
2145 }
2146 }
2147
2148 /* write back the updated pointers */
2149 pArgs->source=(const char *)source;
2150 pArgs->target=target;
2151 pArgs->offsets=offsets;
2152}
2153
fd0068a8
A
2154static UBool
2155hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
2156 const int32_t *row=stateTable[state];
2157 int32_t b, entry;
2158 /* First test for final entries in this state for some commonly valid byte values. */
2159 entry=row[0xa1];
2160 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2161 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2162 ) {
2163 return TRUE;
2164 }
2165 entry=row[0x41];
2166 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2167 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2168 ) {
2169 return TRUE;
2170 }
2171 /* Then test for final entries in this state. */
2172 for(b=0; b<=0xff; ++b) {
2173 entry=row[b];
2174 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2175 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2176 ) {
2177 return TRUE;
2178 }
2179 }
2180 /* Then recurse for transition entries. */
2181 for(b=0; b<=0xff; ++b) {
2182 entry=row[b];
2183 if( MBCS_ENTRY_IS_TRANSITION(entry) &&
2184 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
2185 ) {
2186 return TRUE;
2187 }
2188 }
2189 return FALSE;
2190}
2191
2192/*
2193 * Is byte b a single/lead byte in this state?
2194 * Recurse for transition states, because here we don't want to say that
2195 * b is a lead byte if all byte sequences that start with b are illegal.
2196 */
2197static UBool
2198isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
2199 const int32_t *row=stateTable[state];
2200 int32_t entry=row[b];
2201 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
2202 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
2203 } else {
2204 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2205 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
2206 return FALSE; /* SI/SO are illegal for DBCS-only conversion */
2207 } else {
2208 return action!=MBCS_STATE_ILLEGAL;
2209 }
2210 }
2211}
2212
374ca955
A
/*
 * Convert MBCS bytes to Unicode, optionally writing one source offset per
 * output UChar (when pArgs->offsets is not NULL).
 *
 * - If cnv->preToULength>0, a partial extension match is continued first via
 *   ucnv_extContinueMatchToU(); the function returns right away if that sets
 *   an error or leaves preToULength negative.
 * - If the table has exactly one state, the work is delegated to
 *   ucnv_MBCSSingleToBMPWithOffsets() or ucnv_MBCSSingleToUnicodeWithOffsets(),
 *   depending on the UCNV_HAS_SUPPLEMENTARY bit of the unicodeMask.
 * - Otherwise the state table is walked byte by byte. The state of an
 *   incomplete character is loaded from and stored back into
 *   cnv->toUnicodeStatus (offset), cnv->mode (state), cnv->toULength
 *   (byteIndex) and cnv->toUBytes (the bytes collected so far).
 *
 * Errors set in *pErrorCode by this function: U_BUFFER_OVERFLOW_ERROR when the
 * target is full, U_ILLEGAL_CHAR_FOUND for illegal sequences. Sequences
 * without a mapping in the base table are passed to _extToU().
 * The advanced source, target and offsets pointers are written back to pArgs.
 */
2213U_CFUNC void
2214ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2215 UErrorCode *pErrorCode) {
b75a7d8f
A
2216 UConverter *cnv;
2217 const uint8_t *source, *sourceLimit;
374ca955
A
2218 UChar *target;
2219 const UChar *targetLimit;
2220 int32_t *offsets;
b75a7d8f
A
2221
2222 const int32_t (*stateTable)[256];
2223 const uint16_t *unicodeCodeUnits;
2224
2225 uint32_t offset;
2226 uint8_t state;
2227 int8_t byteIndex;
2228 uint8_t *bytes;
2229
374ca955
A
2230 int32_t sourceIndex, nextSourceIndex;
2231
b75a7d8f 2232 int32_t entry;
374ca955 2233 UChar c;
b75a7d8f 2234 uint8_t action;
b75a7d8f
A
2235
2236 /* use optimized function if possible */
2237 cnv=pArgs->converter;
374ca955
A
2238
2239 if(cnv->preToULength>0) {
b75a7d8f 2240 /*
374ca955
A
2241 * pass sourceIndex=-1 because we continue from an earlier buffer
2242 * in the future, this may change with continuous offsets
b75a7d8f 2243 */
374ca955
A
2244 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
2245
2246 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
2247 return;
2248 }
2249 }
2250
2251 if(cnv->sharedData->mbcs.countStates==1) {
2252 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2253 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
2254 } else {
2255 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
2256 }
2257 return;
b75a7d8f
A
2258 }
2259
2260 /* set up the local pointers */
2261 source=(const uint8_t *)pArgs->source;
2262 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
374ca955
A
2263 target=pArgs->target;
2264 targetLimit=pArgs->targetLimit;
2265 offsets=pArgs->offsets;
b75a7d8f
A
2266
2267 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 2268 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 2269 } else {
374ca955 2270 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f 2271 }
374ca955 2272 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
b75a7d8f
A
2273
2274 /* get the converter state from UConverter */
2275 offset=cnv->toUnicodeStatus;
b75a7d8f
A
2276 byteIndex=cnv->toULength;
2277 bytes=cnv->toUBytes;
2278
374ca955
A
2279 /*
2280 * if we are in the SBCS state for a DBCS-only converter,
2281 * then load the DBCS state from the MBCS data
2282 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2283 */
2284 if((state=(uint8_t)(cnv->mode))==0) {
2285 state=cnv->sharedData->mbcs.dbcsOnlyState;
2286 }
2287
2288 /* sourceIndex=-1 if the current character began in the previous buffer */
2289 sourceIndex=byteIndex==0 ? 0 : -1;
 /* nextSourceIndex counts the bytes consumed from this buffer so far */
2290 nextSourceIndex=0;
2291
b75a7d8f
A
2292 /* conversion loop */
2293 while(source<sourceLimit) {
374ca955
A
2294 /*
2295 * This following test is to see if available input would overflow the output.
2296 * It does not catch output of more than one code unit that
2297 * overflows as a result of a surrogate pair or callback output
2298 * from the last source byte.
2299 * Therefore, those situations also test for overflows and will
2300 * then break the loop, too.
2301 */
2302 if(target>=targetLimit) {
2303 /* target is full */
2304 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2305 break;
2306 }
2307
2308 if(byteIndex==0) {
2309 /* optimized loop for 1/2-byte input and BMP output */
 /*
  * The two branches below run the same loop; the second one additionally
  * maintains offsets, sourceIndex and nextSourceIndex.
  * In both, *source is only peeked at first and is consumed only when
  * the byte (or byte pair) yields a direct BMP result.
  */
2310 if(offsets==NULL) {
2311 do {
2312 entry=stateTable[state][*source];
2313 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2314 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2315 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2316
2317 ++source;
2318 if( source<sourceLimit &&
2319 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2320 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2321 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2322 ) {
2323 ++source;
2324 *target++=c;
2325 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2326 offset=0;
2327 } else {
2328 /* set the state and leave the optimized loop */
2329 bytes[0]=*(source-1);
2330 byteIndex=1;
2331 break;
2332 }
2333 } else {
2334 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2335 /* output BMP code point */
2336 ++source;
2337 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2338 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2339 } else {
2340 /* leave the optimized loop */
2341 break;
2342 }
2343 }
2344 } while(source<sourceLimit && target<targetLimit);
2345 } else /* offsets!=NULL */ {
2346 do {
2347 entry=stateTable[state][*source];
2348 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2349 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2350 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2351
2352 ++source;
2353 if( source<sourceLimit &&
2354 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2355 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2356 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2357 ) {
2358 ++source;
2359 *target++=c;
2360 if(offsets!=NULL) {
2361 *offsets++=sourceIndex;
2362 sourceIndex=(nextSourceIndex+=2);
2363 }
2364 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2365 offset=0;
2366 } else {
2367 /* set the state and leave the optimized loop */
2368 ++nextSourceIndex;
2369 bytes[0]=*(source-1);
2370 byteIndex=1;
2371 break;
2372 }
2373 } else {
2374 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2375 /* output BMP code point */
2376 ++source;
2377 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2378 if(offsets!=NULL) {
2379 *offsets++=sourceIndex;
2380 sourceIndex=++nextSourceIndex;
2381 }
2382 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2383 } else {
2384 /* leave the optimized loop */
2385 break;
2386 }
2387 }
2388 } while(source<sourceLimit && target<targetLimit);
2389 }
2390
2391 /*
2392 * these tests and break statements could be put inside the loop
2393 * if C had "break outerLoop" like Java
2394 */
2395 if(source>=sourceLimit) {
2396 break;
2397 }
2398 if(target>=targetLimit) {
2399 /* target is full */
2400 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2401 break;
2402 }
2403
 /* consume the byte whose entry was already looked up in the optimized loop */
2404 ++nextSourceIndex;
2405 bytes[byteIndex++]=*source++;
2406 } else /* byteIndex>0 */ {
2407 ++nextSourceIndex;
2408 entry=stateTable[state][bytes[byteIndex++]=*source++];
2409 }
2410
b75a7d8f
A
2411 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2412 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2413 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
374ca955
A
2414 continue;
2415 }
b75a7d8f 2416
374ca955
A
2417 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2418 cnv->mode=state;
2419
2420 /* set the next state early so that we can reuse the entry variable */
2421 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2422
2423 /*
2424 * An if-else-if chain provides more reliable performance for
2425 * the most common cases compared to a switch.
2426 */
 /*
  * In the chain below, byteIndex is reset to 0 only when the byte sequence
  * has been handled completely; byteIndex>0 after the chain means that
  * the sequence is illegal (error code set) or has no mapping here.
  */
2427 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2428 if(action==MBCS_STATE_VALID_16) {
2429 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2430 c=unicodeCodeUnits[offset];
2431 if(c<0xfffe) {
2432 /* output BMP code point */
2433 *target++=c;
2434 if(offsets!=NULL) {
2435 *offsets++=sourceIndex;
2436 }
2437 byteIndex=0;
2438 } else if(c==0xfffe) {
2439 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2440 /* output fallback BMP code point */
2441 *target++=(UChar)entry;
2442 if(offsets!=NULL) {
2443 *offsets++=sourceIndex;
b75a7d8f 2444 }
374ca955 2445 byteIndex=0;
b75a7d8f 2446 }
374ca955
A
2447 } else {
2448 /* callback(illegal) */
2449 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2450 }
2451 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2452 /* output BMP code point */
2453 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2454 if(offsets!=NULL) {
2455 *offsets++=sourceIndex;
2456 }
2457 byteIndex=0;
2458 } else if(action==MBCS_STATE_VALID_16_PAIR) {
2459 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
 /* the first unit selects the case; offset then points at the second unit */
2460 c=unicodeCodeUnits[offset++];
2461 if(c<0xd800) {
2462 /* output BMP code point below 0xd800 */
2463 *target++=c;
2464 if(offsets!=NULL) {
2465 *offsets++=sourceIndex;
b75a7d8f 2466 }
374ca955
A
2467 byteIndex=0;
2468 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2469 /* output roundtrip or fallback surrogate pair */
 /*
  * For c in 0xd800..0xdfff, c&0xdbff clears bit 0x0400, so a first
  * unit in 0xdc00..0xdfff (accepted only with fallbacks enabled)
  * is written as a lead surrogate in 0xd800..0xdbff.
  */
2470 *target++=(UChar)(c&0xdbff);
2471 if(offsets!=NULL) {
2472 *offsets++=sourceIndex;
b75a7d8f 2473 }
374ca955
A
2474 byteIndex=0;
2475 if(target<targetLimit) {
2476 *target++=unicodeCodeUnits[offset];
2477 if(offsets!=NULL) {
2478 *offsets++=sourceIndex;
2479 }
2480 } else {
2481 /* target overflow */
2482 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
2483 cnv->UCharErrorBufferLength=1;
2484 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2485
2486 offset=0;
2487 break;
b75a7d8f 2488 }
374ca955
A
2489 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2490 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2491 *target++=unicodeCodeUnits[offset];
2492 if(offsets!=NULL) {
2493 *offsets++=sourceIndex;
2494 }
2495 byteIndex=0;
2496 } else if(c==0xffff) {
b75a7d8f 2497 /* callback(illegal) */
374ca955
A
2498 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2499 }
2500 } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2501 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2502 ) {
2503 entry=MBCS_ENTRY_FINAL_VALUE(entry);
2504 /* output surrogate pair */
2505 *target++=(UChar)(0xd800|(UChar)(entry>>10));
2506 if(offsets!=NULL) {
2507 *offsets++=sourceIndex;
b75a7d8f 2508 }
b75a7d8f 2509 byteIndex=0;
374ca955
A
2510 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2511 if(target<targetLimit) {
2512 *target++=c;
2513 if(offsets!=NULL) {
2514 *offsets++=sourceIndex;
b75a7d8f 2515 }
b75a7d8f 2516 } else {
374ca955
A
2517 /* target overflow */
2518 cnv->UCharErrorBuffer[0]=c;
2519 cnv->UCharErrorBufferLength=1;
2520 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f 2521
374ca955
A
2522 offset=0;
2523 break;
2524 }
2525 } else if(action==MBCS_STATE_CHANGE_ONLY) {
b75a7d8f 2526 /*
374ca955
A
2527 * This serves as a state change without any output.
2528 * It is useful for reading simple stateful encodings,
2529 * for example using just Shift-In/Shift-Out codes.
2530 * The 21 unused bits may later be used for more sophisticated
2531 * state transitions.
b75a7d8f 2532 */
374ca955
A
2533 if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
2534 byteIndex=0;
2535 } else {
2536 /* SI/SO are illegal for DBCS-only conversion */
2537 state=(uint8_t)(cnv->mode); /* restore the previous state */
2538
2539 /* callback(illegal) */
2540 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2541 }
2542 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2543 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2544 /* output BMP code point */
2545 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2546 if(offsets!=NULL) {
2547 *offsets++=sourceIndex;
2548 }
2549 byteIndex=0;
2550 }
2551 } else if(action==MBCS_STATE_UNASSIGNED) {
2552 /* just fall through */
2553 } else if(action==MBCS_STATE_ILLEGAL) {
2554 /* callback(illegal) */
2555 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2556 } else {
2557 /* reserved, must never occur */
2558 byteIndex=0;
b75a7d8f 2559 }
b75a7d8f 2560
374ca955
A
2561 /* end of action codes: prepare for a new character */
2562 offset=0;
2563
2564 if(byteIndex==0) {
2565 sourceIndex=nextSourceIndex;
2566 } else if(U_FAILURE(*pErrorCode)) {
2567 /* callback(illegal) */
fd0068a8
A
2568 if(byteIndex>1) {
2569 /*
2570 * Ticket 5691: consistent illegal sequences:
2571 * - We include at least the first byte in the illegal sequence.
2572 * - If any of the non-initial bytes could be the start of a character,
2573 * we stop the illegal sequence before the first one of those.
2574 */
2575 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2576 int8_t i;
 /* find the first non-initial byte that could start a character */
2577 for(i=1;
2578 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
2579 ++i) {}
2580 if(i<byteIndex) {
2581 /* Back out some bytes. */
2582 int8_t backOutDistance=byteIndex-i;
2583 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
2584 byteIndex=i; /* length of reported illegal byte sequence */
2585 if(backOutDistance<=bytesFromThisBuffer) {
2586 source-=backOutDistance;
2587 } else {
2588 /* Back out bytes from the previous buffer: Need to replay them. */
2589 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
2590 /* preToULength is negative! */
2591 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
2592 source=(const uint8_t *)pArgs->source;
2593 }
2594 }
2595 }
374ca955
A
2596 break;
2597 } else /* unassigned sequences indicated with byteIndex>0 */ {
2598 /* try an extension mapping */
2599 pArgs->source=(const char *)source;
2600 byteIndex=_extToU(cnv, cnv->sharedData,
46f4442e 2601 byteIndex, &source, sourceLimit,
374ca955
A
2602 &target, targetLimit,
2603 &offsets, sourceIndex,
2604 pArgs->flush,
2605 pErrorCode);
 /* account for any further bytes that _extToU() consumed */
fd0068a8 2606 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
374ca955
A
2607
2608 if(U_FAILURE(*pErrorCode)) {
2609 /* not mappable or buffer overflow */
2610 break;
2611 }
2612 }
b75a7d8f 2613 }
b75a7d8f 2614
374ca955
A
2615 /* set the converter state back into UConverter */
2616 cnv->toUnicodeStatus=offset;
b75a7d8f 2617 cnv->mode=state;
374ca955 2618 cnv->toULength=byteIndex;
b75a7d8f 2619
374ca955 2620 /* write back the updated pointers */
b75a7d8f 2621 pArgs->source=(const char *)source;
374ca955
A
2622 pArgs->target=target;
2623 pArgs->offsets=offsets;
b75a7d8f
A
2624}
2625
2626/*
374ca955
A
2627 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2628 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
b75a7d8f
A
2629 */
2630static UChar32
374ca955 2631ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
b75a7d8f 2632 UErrorCode *pErrorCode) {
b75a7d8f
A
2633 UConverter *cnv;
2634 const int32_t (*stateTable)[256];
2635 const uint8_t *source, *sourceLimit;
2636
2637 int32_t entry;
2638 uint8_t action;
b75a7d8f
A
2639
2640 /* set up the local pointers */
2641 cnv=pArgs->converter;
2642 source=(const uint8_t *)pArgs->source;
2643 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2644 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 2645 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 2646 } else {
374ca955 2647 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f
A
2648 }
2649
2650 /* conversion loop */
2651 while(source<sourceLimit) {
2652 entry=stateTable[0][*source++];
2653 /* MBCS_ENTRY_IS_FINAL(entry) */
2654
2655 /* write back the updated pointer early so that we can return directly */
2656 pArgs->source=(const char *)source;
2657
2658 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2659 /* output BMP code point */
2660 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2661 }
2662
2663 /*
2664 * An if-else-if chain provides more reliable performance for
2665 * the most common cases compared to a switch.
2666 */
2667 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
374ca955
A
2668 if( action==MBCS_STATE_VALID_DIRECT_20 ||
2669 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2670 ) {
b75a7d8f
A
2671 /* output supplementary code point */
2672 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2673 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
374ca955 2674 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
b75a7d8f
A
2675 /* output BMP code point */
2676 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2677 }
b75a7d8f 2678 } else if(action==MBCS_STATE_UNASSIGNED) {
374ca955 2679 /* just fall through */
b75a7d8f
A
2680 } else if(action==MBCS_STATE_ILLEGAL) {
2681 /* callback(illegal) */
b75a7d8f
A
2682 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2683 } else {
2684 /* reserved, must never occur */
374ca955 2685 continue;
b75a7d8f
A
2686 }
2687
374ca955
A
2688 if(U_FAILURE(*pErrorCode)) {
2689 /* callback(illegal) */
2690 break;
2691 } else /* unassigned sequence */ {
2692 /* defer to the generic implementation */
2693 pArgs->source=(const char *)source-1;
2694 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
b75a7d8f
A
2695 }
2696 }
2697
374ca955 2698 /* no output because of empty input or only state changes */
b75a7d8f
A
2699 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2700 return 0xffff;
2701}
2702
2703/*
374ca955
A
2704 * Version of ucnv_MBCSToUnicodeWithOffsets() optimized for single-character
2705 * conversion without offset handling.
b75a7d8f 2706 *
374ca955
A
2707 * When a character does not have a mapping to Unicode, then we return to the
2708 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2709 * handling.
2710 * We also defer to the generic code in other complicated cases and have them
2711 * ultimately handled by ucnv_MBCSToUnicodeWithOffsets() itself.
2712 *
2713 * All normal mappings and errors are handled here.
b75a7d8f 2714 */
374ca955
A
2715static UChar32
2716ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2717 UErrorCode *pErrorCode) {
2718 UConverter *cnv;
2719 const uint8_t *source, *sourceLimit, *lastSource;
b75a7d8f
A
2720
2721 const int32_t (*stateTable)[256];
2722 const uint16_t *unicodeCodeUnits;
2723
2724 uint32_t offset;
374ca955 2725 uint8_t state;
b75a7d8f
A
2726
2727 int32_t entry;
374ca955
A
2728 UChar32 c;
2729 uint8_t action;
b75a7d8f 2730
374ca955
A
2731 /* use optimized function if possible */
2732 cnv=pArgs->converter;
2733
2734 if(cnv->preToULength>0) {
2735 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2736 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
b75a7d8f
A
2737 }
2738
374ca955
A
2739 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2740 /*
2741 * Using the generic ucnv_getNextUChar() code lets us deal correctly
2742 * with the rare case of a codepage that maps single surrogates
2743 * without adding the complexity to this already complicated function here.
2744 */
2745 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2746 } else if(cnv->sharedData->mbcs.countStates==1) {
2747 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
b75a7d8f 2748 }
b75a7d8f 2749
374ca955
A
2750 /* set up the local pointers */
2751 source=lastSource=(const uint8_t *)pArgs->source;
2752 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
b75a7d8f 2753
374ca955
A
2754 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2755 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2756 } else {
2757 stateTable=cnv->sharedData->mbcs.stateTable;
2758 }
2759 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2760
2761 /* get the converter state from UConverter */
2762 offset=cnv->toUnicodeStatus;
2763
2764 /*
2765 * if we are in the SBCS state for a DBCS-only converter,
2766 * then load the DBCS state from the MBCS data
2767 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2768 */
2769 if((state=(uint8_t)(cnv->mode))==0) {
2770 state=cnv->sharedData->mbcs.dbcsOnlyState;
2771 }
b75a7d8f
A
2772
2773 /* conversion loop */
374ca955
A
2774 c=U_SENTINEL;
2775 while(source<sourceLimit) {
b75a7d8f
A
2776 entry=stateTable[state][*source++];
2777 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2778 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2779 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
374ca955
A
2780
2781 /* optimization for 1/2-byte input and BMP output */
2782 if( source<sourceLimit &&
2783 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2784 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2785 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2786 ) {
2787 ++source;
2788 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2789 /* output BMP code point */
2790 break;
2791 }
b75a7d8f 2792 } else {
374ca955
A
2793 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2794 cnv->mode=state;
2795
2796 /* set the next state early so that we can reuse the entry variable */
2797 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
b75a7d8f
A
2798
2799 /*
2800 * An if-else-if chain provides more reliable performance for
2801 * the most common cases compared to a switch.
2802 */
2803 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
374ca955
A
2804 if(action==MBCS_STATE_VALID_DIRECT_16) {
2805 /* output BMP code point */
2806 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2807 break;
2808 } else if(action==MBCS_STATE_VALID_16) {
b75a7d8f 2809 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
374ca955
A
2810 c=unicodeCodeUnits[offset];
2811 if(c<0xfffe) {
2812 /* output BMP code point */
2813 break;
2814 } else if(c==0xfffe) {
2815 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2816 break;
2817 }
b75a7d8f 2818 } else {
374ca955
A
2819 /* callback(illegal) */
2820 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 2821 }
b75a7d8f
A
2822 } else if(action==MBCS_STATE_VALID_16_PAIR) {
2823 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
374ca955
A
2824 c=unicodeCodeUnits[offset++];
2825 if(c<0xd800) {
b75a7d8f 2826 /* output BMP code point below 0xd800 */
374ca955
A
2827 break;
2828 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
b75a7d8f 2829 /* output roundtrip or fallback supplementary code point */
374ca955
A
2830 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
2831 break;
2832 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
b75a7d8f 2833 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
374ca955
A
2834 c=unicodeCodeUnits[offset];
2835 break;
2836 } else if(c==0xffff) {
2837 /* callback(illegal) */
2838 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 2839 }
374ca955
A
2840 } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2841 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2842 ) {
b75a7d8f 2843 /* output supplementary code point */
374ca955
A
2844 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2845 break;
b75a7d8f
A
2846 } else if(action==MBCS_STATE_CHANGE_ONLY) {
2847 /*
2848 * This serves as a state change without any output.
2849 * It is useful for reading simple stateful encodings,
2850 * for example using just Shift-In/Shift-Out codes.
2851 * The 21 unused bits may later be used for more sophisticated
2852 * state transitions.
2853 */
374ca955
A
2854 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
2855 /* SI/SO are illegal for DBCS-only conversion */
2856 state=(uint8_t)(cnv->mode); /* restore the previous state */
2857
2858 /* callback(illegal) */
2859 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2860 }
2861 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2862 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2863 /* output BMP code point */
2864 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2865 break;
b75a7d8f
A
2866 }
2867 } else if(action==MBCS_STATE_UNASSIGNED) {
374ca955 2868 /* just fall through */
b75a7d8f 2869 } else if(action==MBCS_STATE_ILLEGAL) {
374ca955
A
2870 /* callback(illegal) */
2871 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 2872 } else {
374ca955
A
2873 /* reserved (must never occur), or only state change */
2874 offset=0;
2875 lastSource=source;
2876 continue;
b75a7d8f
A
2877 }
2878
374ca955 2879 /* end of action codes: prepare for a new character */
b75a7d8f 2880 offset=0;
374ca955
A
2881
2882 if(U_FAILURE(*pErrorCode)) {
2883 /* callback(illegal) */
2884 break;
2885 } else /* unassigned sequence */ {
2886 /* defer to the generic implementation */
2887 cnv->toUnicodeStatus=0;
2888 cnv->mode=state;
2889 pArgs->source=(const char *)lastSource;
2890 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2891 }
b75a7d8f 2892 }
374ca955 2893 }
b75a7d8f 2894
374ca955
A
2895 if(c<0) {
2896 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
374ca955
A
2897 /* incomplete character byte sequence */
2898 uint8_t *bytes=cnv->toUBytes;
2899 cnv->toULength=(int8_t)(source-lastSource);
2900 do {
2901 *bytes++=*lastSource++;
2902 } while(lastSource<source);
fd0068a8
A
2903 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2904 } else if(U_FAILURE(*pErrorCode)) {
2905 /* callback(illegal) */
2906 /*
2907 * Ticket 5691: consistent illegal sequences:
2908 * - We include at least the first byte in the illegal sequence.
2909 * - If any of the non-initial bytes could be the start of a character,
2910 * we stop the illegal sequence before the first one of those.
2911 */
2912 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2913 uint8_t *bytes=cnv->toUBytes;
2914 *bytes++=*lastSource++; /* first byte */
2915 if(lastSource==source) {
2916 cnv->toULength=1;
2917 } else /* lastSource<source: multi-byte character */ {
2918 int8_t i;
2919 for(i=1;
2920 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
2921 ++i
2922 ) {
2923 *bytes++=*lastSource++;
2924 }
2925 cnv->toULength=i;
2926 source=lastSource;
2927 }
374ca955
A
2928 } else {
2929 /* no output because of empty input or only state changes */
2930 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2931 }
2932 c=0xffff;
2933 }
2934
2935 /* set the converter state back into UConverter, ready for a new character */
2936 cnv->toUnicodeStatus=0;
2937 cnv->mode=state;
2938
2939 /* write back the updated pointer */
2940 pArgs->source=(const char *)source;
2941 return c;
b75a7d8f
A
2942}
2943
#if 0
/*
 * Disabled on 2002dec09 (ICU 2.4): nothing in ICU uses this code at the moment. markus
 * Leaving it out improves code coverage.
 */
/**
 * Variant of ucnv_MBCSSimpleGetNextUChar() for single-byte, single-state codepages.
 * The EBCDIC swaplfnl option (set in UConverter) is not handled.
 * Conversion extensions (_extToU()) are not handled.
 *
 * Returns 0xfffe for an unassigned byte (or an unused fallback),
 * 0xffff for an illegal byte, otherwise the code point.
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                                  uint8_t b, UBool useFallback) {
    /* every entry of the single state is a final entry */
    int32_t entry=sharedData->mbcs.stateTable[0][b];
    uint8_t action;

    /* most common case first: a directly stored BMP code point */
    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    }

    /* remaining actions, tested in order of expected frequency */
    action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
    if(action==MBCS_STATE_VALID_DIRECT_20) {
        /* supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
    }
    if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
        if(!TO_U_USE_FALLBACK(useFallback)) {
            return 0xfffe;
        }
        /* fallback BMP code point */
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    }
    if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
        if(!TO_U_USE_FALLBACK(useFallback)) {
            return 0xfffe;
        }
        /* fallback supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
    }
    if(action==MBCS_STATE_UNASSIGNED) {
        return 0xfffe;
    }

    /* MBCS_STATE_ILLEGAL, or a reserved action code that must never occur */
    return 0xffff;
}
#endif
2998
374ca955
A
2999/*
3000 * This is a simple version of _MBCSGetNextUChar() that is used
3001 * by other converter implementations.
3002 * It only returns an "assigned" result if it consumes the entire input.
3003 * It does not use state from the converter, nor error codes.
3004 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3005 * It handles conversion extensions but not GB 18030.
3006 *
3007 * Return value:
3008 * U+fffe unassigned
3009 * U+ffff illegal
3010 * otherwise the Unicode code point
3011 */
3012U_CFUNC UChar32
3013ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
3014 const char *source, int32_t length,
3015 UBool useFallback) {
3016 const int32_t (*stateTable)[256];
3017 const uint16_t *unicodeCodeUnits;
3018
3019 uint32_t offset;
3020 uint8_t state, action;
3021
3022 UChar32 c;
3023 int32_t i, entry;
3024
3025 if(length<=0) {
3026 /* no input at all: "illegal" */
3027 return 0xffff;
3028 }
3029
3030#if 0
3031/*
3032 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3033 * TODO In future releases, verify that this function is never called for SBCS
3034 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3035 * Removal improves code coverage.
3036 */
3037 /* use optimized function if possible */
3038 if(sharedData->mbcs.countStates==1) {
3039 if(length==1) {
3040 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
3041 } else {
3042 return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3043 }
3044 }
3045#endif
3046
3047 /* set up the local pointers */
3048 stateTable=sharedData->mbcs.stateTable;
3049 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
3050
3051 /* converter state */
3052 offset=0;
3053 state=sharedData->mbcs.dbcsOnlyState;
3054
3055 /* conversion loop */
3056 for(i=0;;) {
3057 entry=stateTable[state][(uint8_t)source[i++]];
3058 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3059 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3060 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
3061
3062 if(i==length) {
3063 return 0xffff; /* truncated character */
3064 }
3065 } else {
3066 /*
3067 * An if-else-if chain provides more reliable performance for
3068 * the most common cases compared to a switch.
3069 */
3070 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
3071 if(action==MBCS_STATE_VALID_16) {
3072 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3073 c=unicodeCodeUnits[offset];
3074 if(c!=0xfffe) {
3075 /* done */
3076 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
3077 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
3078 /* else done with 0xfffe */
3079 }
3080 break;
3081 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
3082 /* output BMP code point */
3083 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3084 break;
3085 } else if(action==MBCS_STATE_VALID_16_PAIR) {
3086 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3087 c=unicodeCodeUnits[offset++];
3088 if(c<0xd800) {
3089 /* output BMP code point below 0xd800 */
3090 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
3091 /* output roundtrip or fallback supplementary code point */
3092 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
3093 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
3094 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3095 c=unicodeCodeUnits[offset];
3096 } else if(c==0xffff) {
3097 return 0xffff;
3098 } else {
3099 c=0xfffe;
3100 }
3101 break;
3102 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
3103 /* output supplementary code point */
3104 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3105 break;
3106 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3107 if(!TO_U_USE_FALLBACK(useFallback)) {
3108 c=0xfffe;
3109 break;
3110 }
3111 /* output BMP code point */
3112 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3113 break;
3114 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
3115 if(!TO_U_USE_FALLBACK(useFallback)) {
3116 c=0xfffe;
3117 break;
3118 }
3119 /* output supplementary code point */
3120 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3121 break;
3122 } else if(action==MBCS_STATE_UNASSIGNED) {
3123 c=0xfffe;
3124 break;
3125 }
3126
3127 /*
3128 * forbid MBCS_STATE_CHANGE_ONLY for this function,
3129 * and MBCS_STATE_ILLEGAL and reserved action codes
3130 */
3131 return 0xffff;
3132 }
3133 }
3134
3135 if(i!=length) {
3136 /* illegal for this function: not all input consumed */
3137 return 0xffff;
3138 }
3139
3140 if(c==0xfffe) {
3141 /* try an extension mapping */
3142 const int32_t *cx=sharedData->mbcs.extIndexes;
3143 if(cx!=NULL) {
3144 return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
3145 }
3146 }
3147
3148 return c;
3149}
3150
b75a7d8f
A
3151/* MBCS-from-Unicode conversion functions ----------------------------------- */
3152
374ca955
A
3153/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
3154static void
3155ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3156 UErrorCode *pErrorCode) {
b75a7d8f
A
3157 UConverter *cnv;
3158 const UChar *source, *sourceLimit;
3159 uint8_t *target;
3160 int32_t targetCapacity;
3161 int32_t *offsets;
3162
3163 const uint16_t *table;
46f4442e 3164 const uint16_t *mbcsIndex;
374ca955 3165 const uint8_t *bytes;
b75a7d8f
A
3166
3167 UChar32 c;
3168
374ca955 3169 int32_t sourceIndex, nextSourceIndex;
b75a7d8f 3170
b75a7d8f 3171 uint32_t stage2Entry;
46f4442e 3172 uint32_t asciiRoundtrips;
b75a7d8f 3173 uint32_t value;
b75a7d8f
A
3174 uint8_t unicodeMask;
3175
3176 /* use optimized function if possible */
3177 cnv=pArgs->converter;
374ca955 3178 unicodeMask=cnv->sharedData->mbcs.unicodeMask;
b75a7d8f
A
3179
3180 /* set up the local pointers */
3181 source=pArgs->source;
3182 sourceLimit=pArgs->sourceLimit;
3183 target=(uint8_t *)pArgs->target;
73c04bcf 3184 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
3185 offsets=pArgs->offsets;
3186
374ca955 3187 table=cnv->sharedData->mbcs.fromUnicodeTable;
46f4442e 3188 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
b75a7d8f 3189 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 3190 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
b75a7d8f 3191 } else {
374ca955 3192 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
b75a7d8f 3193 }
46f4442e 3194 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
b75a7d8f
A
3195
3196 /* get the converter state from UConverter */
374ca955 3197 c=cnv->fromUChar32;
b75a7d8f
A
3198
3199 /* sourceIndex=-1 if the current character began in the previous buffer */
b75a7d8f
A
3200 sourceIndex= c==0 ? 0 : -1;
3201 nextSourceIndex=0;
3202
3203 /* conversion loop */
b75a7d8f
A
3204 if(c!=0 && targetCapacity>0) {
3205 goto getTrail;
3206 }
3207
3208 while(source<sourceLimit) {
3209 /*
3210 * This following test is to see if available input would overflow the output.
3211 * It does not catch output of more than one byte that
3212 * overflows as a result of a multi-byte character or callback output
3213 * from the last source character.
3214 * Therefore, those situations also test for overflows and will
3215 * then break the loop, too.
3216 */
3217 if(targetCapacity>0) {
3218 /*
3219 * Get a correct Unicode code point:
3220 * a single UChar for a BMP code point or
3221 * a matched surrogate pair for a "supplementary code point".
3222 */
3223 c=*source++;
3224 ++nextSourceIndex;
46f4442e
A
3225 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3226 *target++=(uint8_t)c;
3227 if(offsets!=NULL) {
3228 *offsets++=sourceIndex;
3229 sourceIndex=nextSourceIndex;
3230 }
3231 --targetCapacity;
3232 c=0;
3233 continue;
3234 }
b75a7d8f 3235 /*
46f4442e
A
3236 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3237 * to avoid dealing with surrogates.
3238 * MBCS_FAST_MAX must be >=0xd7ff.
b75a7d8f 3239 */
46f4442e
A
3240 if(c<=0xd7ff) {
3241 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
3242 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3243 if(value==0) {
3244 goto unassigned;
3245 }
3246 /* output the value */
3247 } else {
3248 /*
3249 * This also tests if the codepage maps single surrogates.
3250 * If it does, then surrogates are not paired but mapped separately.
3251 * Note that in this case unmatched surrogates are not detected.
3252 */
3253 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3254 if(UTF_IS_SURROGATE_FIRST(c)) {
b75a7d8f 3255getTrail:
46f4442e
A
3256 if(source<sourceLimit) {
3257 /* test the following code unit */
3258 UChar trail=*source;
3259 if(UTF_IS_SECOND_SURROGATE(trail)) {
3260 ++source;
3261 ++nextSourceIndex;
3262 c=UTF16_GET_PAIR_VALUE(c, trail);
3263 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3264 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3265 /* callback(unassigned) */
3266 goto unassigned;
3267 }
3268 /* convert this supplementary code point */
3269 /* exit this condition tree */
3270 } else {
3271 /* this is an unmatched lead code unit (1st surrogate) */
3272 /* callback(illegal) */
3273 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3274 break;
b75a7d8f 3275 }
b75a7d8f 3276 } else {
46f4442e 3277 /* no more input */
374ca955 3278 break;
b75a7d8f
A
3279 }
3280 } else {
46f4442e
A
3281 /* this is an unmatched trail code unit (2nd surrogate) */
3282 /* callback(illegal) */
3283 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
3284 break;
3285 }
b75a7d8f 3286 }
b75a7d8f 3287
46f4442e
A
3288 /* convert the Unicode code point in c into codepage bytes */
3289 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
b75a7d8f 3290
46f4442e
A
3291 /* get the bytes and the length for the output */
3292 /* MBCS_OUTPUT_2 */
3293 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
b75a7d8f 3294
46f4442e
A
3295 /* is this code point assigned, or do we use fallbacks? */
3296 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3297 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3298 ) {
3299 /*
3300 * We allow a 0 byte output if the "assigned" bit is set for this entry.
3301 * There is no way with this data structure for fallback output
3302 * to be a zero byte.
3303 */
b75a7d8f 3304
374ca955 3305unassigned:
46f4442e
A
3306 /* try an extension mapping */
3307 pArgs->source=source;
3308 c=_extFromU(cnv, cnv->sharedData,
3309 c, &source, sourceLimit,
3310 &target, target+targetCapacity,
3311 &offsets, sourceIndex,
3312 pArgs->flush,
3313 pErrorCode);
3314 nextSourceIndex+=(int32_t)(source-pArgs->source);
3315
3316 if(U_FAILURE(*pErrorCode)) {
3317 /* not mappable or buffer overflow */
3318 break;
3319 } else {
3320 /* a mapping was written to the target, continue */
b75a7d8f 3321
46f4442e
A
3322 /* recalculate the targetCapacity after an extension mapping */
3323 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
374ca955 3324
46f4442e
A
3325 /* normal end of conversion: prepare for a new character */
3326 sourceIndex=nextSourceIndex;
3327 continue;
3328 }
b75a7d8f 3329 }
374ca955 3330 }
b75a7d8f 3331
374ca955
A
3332 /* write the output character bytes from value and length */
3333 /* from the first if in the loop we know that targetCapacity>0 */
46f4442e 3334 if(value<=0xff) {
374ca955
A
3335 /* this is easy because we know that there is enough space */
3336 *target++=(uint8_t)value;
3337 if(offsets!=NULL) {
3338 *offsets++=sourceIndex;
3339 }
3340 --targetCapacity;
3341 } else /* length==2 */ {
3342 *target++=(uint8_t)(value>>8);
3343 if(2<=targetCapacity) {
3344 *target++=(uint8_t)value;
b75a7d8f
A
3345 if(offsets!=NULL) {
3346 *offsets++=sourceIndex;
b75a7d8f
A
3347 *offsets++=sourceIndex;
3348 }
374ca955
A
3349 targetCapacity-=2;
3350 } else {
b75a7d8f
A
3351 if(offsets!=NULL) {
3352 *offsets++=sourceIndex;
3353 }
374ca955
A
3354 cnv->charErrorBuffer[0]=(char)value;
3355 cnv->charErrorBufferLength=1;
3356
3357 /* target overflow */
3358 targetCapacity=0;
3359 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3360 c=0;
b75a7d8f
A
3361 break;
3362 }
b75a7d8f
A
3363 }
3364
3365 /* normal end of conversion: prepare for a new character */
3366 c=0;
374ca955 3367 sourceIndex=nextSourceIndex;
b75a7d8f 3368 continue;
b75a7d8f
A
3369 } else {
3370 /* target is full */
3371 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3372 break;
3373 }
3374 }
3375
b75a7d8f 3376 /* set the converter state back into UConverter */
374ca955 3377 cnv->fromUChar32=c;
b75a7d8f
A
3378
3379 /* write back the updated pointers */
3380 pArgs->source=source;
3381 pArgs->target=(char *)target;
3382 pArgs->offsets=offsets;
3383}
3384
374ca955 3385/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
b75a7d8f 3386static void
374ca955 3387ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
b75a7d8f
A
3388 UErrorCode *pErrorCode) {
3389 UConverter *cnv;
3390 const UChar *source, *sourceLimit;
3391 uint8_t *target;
3392 int32_t targetCapacity;
3393 int32_t *offsets;
3394
3395 const uint16_t *table;
374ca955 3396 const uint16_t *results;
b75a7d8f
A
3397
3398 UChar32 c;
3399
3400 int32_t sourceIndex, nextSourceIndex;
3401
b75a7d8f
A
3402 uint16_t value, minValue;
3403 UBool hasSupplementary;
3404
3405 /* set up the local pointers */
3406 cnv=pArgs->converter;
3407 source=pArgs->source;
3408 sourceLimit=pArgs->sourceLimit;
3409 target=(uint8_t *)pArgs->target;
73c04bcf 3410 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
3411 offsets=pArgs->offsets;
3412
374ca955 3413 table=cnv->sharedData->mbcs.fromUnicodeTable;
b75a7d8f 3414 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 3415 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
b75a7d8f 3416 } else {
374ca955 3417 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
b75a7d8f
A
3418 }
3419
3420 if(cnv->useFallback) {
3421 /* use all roundtrip and fallback results */
3422 minValue=0x800;
3423 } else {
3424 /* use only roundtrips and fallbacks from private-use characters */
3425 minValue=0xc00;
3426 }
374ca955 3427 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
b75a7d8f
A
3428
3429 /* get the converter state from UConverter */
374ca955 3430 c=cnv->fromUChar32;
b75a7d8f
A
3431
3432 /* sourceIndex=-1 if the current character began in the previous buffer */
3433 sourceIndex= c==0 ? 0 : -1;
3434 nextSourceIndex=0;
3435
3436 /* conversion loop */
3437 if(c!=0 && targetCapacity>0) {
3438 goto getTrail;
3439 }
3440
3441 while(source<sourceLimit) {
3442 /*
3443 * This following test is to see if available input would overflow the output.
3444 * It does not catch output of more than one byte that
3445 * overflows as a result of a multi-byte character or callback output
3446 * from the last source character.
3447 * Therefore, those situations also test for overflows and will
3448 * then break the loop, too.
3449 */
3450 if(targetCapacity>0) {
3451 /*
3452 * Get a correct Unicode code point:
3453 * a single UChar for a BMP code point or
3454 * a matched surrogate pair for a "supplementary code point".
3455 */
3456 c=*source++;
3457 ++nextSourceIndex;
3458 if(UTF_IS_SURROGATE(c)) {
3459 if(UTF_IS_SURROGATE_FIRST(c)) {
3460getTrail:
3461 if(source<sourceLimit) {
3462 /* test the following code unit */
3463 UChar trail=*source;
3464 if(UTF_IS_SECOND_SURROGATE(trail)) {
3465 ++source;
3466 ++nextSourceIndex;
3467 c=UTF16_GET_PAIR_VALUE(c, trail);
3468 if(!hasSupplementary) {
3469 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3470 /* callback(unassigned) */
3471 goto unassigned;
3472 }
3473 /* convert this supplementary code point */
3474 /* exit this condition tree */
3475 } else {
3476 /* this is an unmatched lead code unit (1st surrogate) */
3477 /* callback(illegal) */
b75a7d8f 3478 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3479 break;
b75a7d8f
A
3480 }
3481 } else {
3482 /* no more input */
3483 break;
3484 }
3485 } else {
3486 /* this is an unmatched trail code unit (2nd surrogate) */
3487 /* callback(illegal) */
b75a7d8f 3488 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3489 break;
b75a7d8f
A
3490 }
3491 }
3492
3493 /* convert the Unicode code point in c into codepage bytes */
3494 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3495
3496 /* is this code point assigned, or do we use fallbacks? */
3497 if(value>=minValue) {
3498 /* assigned, write the output character bytes from value and length */
3499 /* length==1 */
3500 /* this is easy because we know that there is enough space */
3501 *target++=(uint8_t)value;
3502 if(offsets!=NULL) {
3503 *offsets++=sourceIndex;
3504 }
3505 --targetCapacity;
3506
3507 /* normal end of conversion: prepare for a new character */
3508 c=0;
3509 sourceIndex=nextSourceIndex;
b75a7d8f 3510 } else { /* unassigned */
b75a7d8f 3511unassigned:
374ca955
A
3512 /* try an extension mapping */
3513 pArgs->source=source;
3514 c=_extFromU(cnv, cnv->sharedData,
3515 c, &source, sourceLimit,
46f4442e 3516 &target, target+targetCapacity,
374ca955
A
3517 &offsets, sourceIndex,
3518 pArgs->flush,
3519 pErrorCode);
3520 nextSourceIndex+=(int32_t)(source-pArgs->source);
3521
3522 if(U_FAILURE(*pErrorCode)) {
3523 /* not mappable or buffer overflow */
3524 break;
3525 } else {
3526 /* a mapping was written to the target, continue */
b75a7d8f 3527
374ca955 3528 /* recalculate the targetCapacity after an extension mapping */
73c04bcf 3529 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
b75a7d8f 3530
374ca955
A
3531 /* normal end of conversion: prepare for a new character */
3532 sourceIndex=nextSourceIndex;
3533 }
b75a7d8f 3534 }
b75a7d8f
A
3535 } else {
3536 /* target is full */
3537 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3538 break;
3539 }
3540 }
3541
374ca955
A
3542 /* set the converter state back into UConverter */
3543 cnv->fromUChar32=c;
b75a7d8f
A
3544
3545 /* write back the updated pointers */
3546 pArgs->source=source;
3547 pArgs->target=(char *)target;
3548 pArgs->offsets=offsets;
3549}
3550
3551/*
374ca955 3552 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
b75a7d8f
A
3553 * that map only to and from the BMP.
3554 * In addition to single-byte/state optimizations, the offset calculations
3555 * become much easier.
46f4442e
A
3556 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3557 * but measurements have shown that this diminishes performance
3558 * in more cases than it improves it.
3559 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3560 * for various MBCS and SBCS optimizations.
b75a7d8f
A
3561 */
3562static void
374ca955 3563ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
b75a7d8f
A
3564 UErrorCode *pErrorCode) {
3565 UConverter *cnv;
3566 const UChar *source, *sourceLimit, *lastSource;
3567 uint8_t *target;
3568 int32_t targetCapacity, length;
3569 int32_t *offsets;
3570
3571 const uint16_t *table;
3572 const uint16_t *results;
3573
3574 UChar32 c;
3575
3576 int32_t sourceIndex;
3577
46f4442e 3578 uint32_t asciiRoundtrips;
b75a7d8f
A
3579 uint16_t value, minValue;
3580
3581 /* set up the local pointers */
3582 cnv=pArgs->converter;
3583 source=pArgs->source;
3584 sourceLimit=pArgs->sourceLimit;
3585 target=(uint8_t *)pArgs->target;
73c04bcf 3586 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
3587 offsets=pArgs->offsets;
3588
374ca955 3589 table=cnv->sharedData->mbcs.fromUnicodeTable;
b75a7d8f 3590 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 3591 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
b75a7d8f 3592 } else {
374ca955 3593 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
b75a7d8f 3594 }
46f4442e 3595 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
b75a7d8f
A
3596
3597 if(cnv->useFallback) {
3598 /* use all roundtrip and fallback results */
3599 minValue=0x800;
3600 } else {
3601 /* use only roundtrips and fallbacks from private-use characters */
3602 minValue=0xc00;
3603 }
3604
3605 /* get the converter state from UConverter */
374ca955 3606 c=cnv->fromUChar32;
b75a7d8f
A
3607
3608 /* sourceIndex=-1 if the current character began in the previous buffer */
3609 sourceIndex= c==0 ? 0 : -1;
3610 lastSource=source;
3611
3612 /*
3613 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3614 * for the minimum of the sourceLength and targetCapacity
3615 */
73c04bcf 3616 length=(int32_t)(sourceLimit-source);
b75a7d8f
A
3617 if(length<targetCapacity) {
3618 targetCapacity=length;
3619 }
3620
3621 /* conversion loop */
3622 if(c!=0 && targetCapacity>0) {
3623 goto getTrail;
3624 }
3625
3626#if MBCS_UNROLL_SINGLE_FROM_BMP
3627 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3628 /* unroll the loop with the most common case */
3629unrolled:
3630 if(targetCapacity>=4) {
3631 int32_t count, loops;
3632 uint16_t andedValues;
3633
3634 loops=count=targetCapacity>>2;
3635 do {
3636 c=*source++;
3637 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3638 *target++=(uint8_t)value;
3639 c=*source++;
3640 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3641 *target++=(uint8_t)value;
3642 c=*source++;
3643 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3644 *target++=(uint8_t)value;
3645 c=*source++;
3646 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3647 *target++=(uint8_t)value;
3648
3649 /* were all 4 entries really valid? */
3650 if(andedValues<minValue) {
3651 /* no, return to the first of these 4 */
3652 source-=4;
3653 target-=4;
3654 break;
3655 }
3656 } while(--count>0);
3657 count=loops-count;
3658 targetCapacity-=4*count;
3659
3660 if(offsets!=NULL) {
3661 lastSource+=4*count;
3662 while(count>0) {
3663 *offsets++=sourceIndex++;
3664 *offsets++=sourceIndex++;
3665 *offsets++=sourceIndex++;
3666 *offsets++=sourceIndex++;
3667 --count;
3668 }
3669 }
3670
3671 c=0;
3672 }
3673#endif
3674
3675 while(targetCapacity>0) {
3676 /*
3677 * Get a correct Unicode code point:
3678 * a single UChar for a BMP code point or
3679 * a matched surrogate pair for a "supplementary code point".
3680 */
3681 c=*source++;
3682 /*
3683 * Do not immediately check for single surrogates:
3684 * Assume that they are unassigned and check for them in that case.
3685 * This speeds up the conversion of assigned characters.
3686 */
3687 /* convert the Unicode code point in c into codepage bytes */
46f4442e
A
3688 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3689 *target++=(uint8_t)c;
3690 --targetCapacity;
3691 c=0;
3692 continue;
3693 }
b75a7d8f 3694 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
b75a7d8f
A
3695 /* is this code point assigned, or do we use fallbacks? */
3696 if(value>=minValue) {
3697 /* assigned, write the output character bytes from value and length */
3698 /* length==1 */
3699 /* this is easy because we know that there is enough space */
3700 *target++=(uint8_t)value;
3701 --targetCapacity;
3702
3703 /* normal end of conversion: prepare for a new character */
3704 c=0;
3705 continue;
3706 } else if(!UTF_IS_SURROGATE(c)) {
3707 /* normal, unassigned BMP character */
b75a7d8f
A
3708 } else if(UTF_IS_SURROGATE_FIRST(c)) {
3709getTrail:
3710 if(source<sourceLimit) {
3711 /* test the following code unit */
3712 UChar trail=*source;
3713 if(UTF_IS_SECOND_SURROGATE(trail)) {
3714 ++source;
3715 c=UTF16_GET_PAIR_VALUE(c, trail);
3716 /* this codepage does not map supplementary code points */
3717 /* callback(unassigned) */
b75a7d8f
A
3718 } else {
3719 /* this is an unmatched lead code unit (1st surrogate) */
3720 /* callback(illegal) */
b75a7d8f 3721 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3722 break;
b75a7d8f
A
3723 }
3724 } else {
3725 /* no more input */
46f4442e
A
3726 if (pArgs->flush) {
3727 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3728 }
b75a7d8f
A
3729 break;
3730 }
3731 } else {
3732 /* this is an unmatched trail code unit (2nd surrogate) */
3733 /* callback(illegal) */
b75a7d8f 3734 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3735 break;
b75a7d8f
A
3736 }
3737
374ca955 3738 /* c does not have a mapping */
b75a7d8f 3739
374ca955
A
3740 /* get the number of code units for c to correctly advance sourceIndex */
3741 length=U16_LENGTH(c);
3742
3743 /* set offsets since the start or the last extension */
b75a7d8f
A
3744 if(offsets!=NULL) {
3745 int32_t count=(int32_t)(source-lastSource);
3746
374ca955
A
3747 /* do not set the offset for this character */
3748 count-=length;
3749
3750 while(count>0) {
3751 *offsets++=sourceIndex++;
3752 --count;
3753 }
3754 /* offsets and sourceIndex are now set for the current character */
3755 }
3756
3757 /* try an extension mapping */
3758 lastSource=source;
3759 c=_extFromU(cnv, cnv->sharedData,
3760 c, &source, sourceLimit,
46f4442e 3761 &target, (const uint8_t *)(pArgs->targetLimit),
374ca955
A
3762 &offsets, sourceIndex,
3763 pArgs->flush,
3764 pErrorCode);
3765 sourceIndex+=length+(int32_t)(source-lastSource);
3766 lastSource=source;
3767
3768 if(U_FAILURE(*pErrorCode)) {
3769 /* not mappable or buffer overflow */
3770 break;
3771 } else {
3772 /* a mapping was written to the target, continue */
3773
3774 /* recalculate the targetCapacity after an extension mapping */
73c04bcf
A
3775 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3776 length=(int32_t)(sourceLimit-source);
374ca955
A
3777 if(length<targetCapacity) {
3778 targetCapacity=length;
3779 }
3780 }
3781
3782#if MBCS_UNROLL_SINGLE_FROM_BMP
3783 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3784 goto unrolled;
3785#endif
3786 }
3787
3788 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3789 /* target is full */
3790 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3791 }
3792
3793 /* set offsets since the start or the last callback */
3794 if(offsets!=NULL) {
3795 size_t count=source-lastSource;
46f4442e
A
3796 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
3797 /*
3798 Caller gave us a partial supplementary character,
3799 which this function couldn't convert in any case.
3800 The callback will handle the offset.
3801 */
3802 count--;
3803 }
374ca955
A
3804 while(count>0) {
3805 *offsets++=sourceIndex++;
3806 --count;
3807 }
3808 }
3809
3810 /* set the converter state back into UConverter */
3811 cnv->fromUChar32=c;
3812
3813 /* write back the updated pointers */
3814 pArgs->source=source;
3815 pArgs->target=(char *)target;
3816 pArgs->offsets=offsets;
3817}
3818
/*
 * ucnv_MBCSFromUnicodeWithOffsets:
 * General from-Unicode conversion for MBCS codepages.
 * Reads UTF-16 code units from pArgs->source and writes codepage bytes to
 * pArgs->target. If pArgs->offsets!=NULL, one source index is written per
 * output byte.
 *
 * Flow:
 * - If cnv->preFromUFirstCP>=0, ucnv_extContinueMatchFromU() is called first;
 *   the function returns right away on failure or if cnv->preFromULength<0.
 * - MBCS_OUTPUT_1 codepages without UCNV_HAS_SURROGATES are delegated to
 *   ucnv_MBCSSingleFromBMPWithOffsets() or ucnv_MBCSSingleFromUnicodeWithOffsets();
 *   MBCS_OUTPUT_2 codepages with utf8Friendly data are delegated to
 *   ucnv_MBCSDoubleFromUnicodeWithOffsets().
 * - Otherwise the loop below converts one code point at a time:
 *   ASCII roundtrip shortcut, then the mbcsIndex lookup for c<=0xd7ff
 *   (utf8Friendly data only), else surrogate pairing followed by the
 *   stage 2/stage 3 lookup. Code points without a usable mapping are
 *   handed to _extFromU().
 *
 * Errors set in *pErrorCode by this function:
 * - U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate
 * - U_BUFFER_OVERFLOW_ERROR when the target is full; bytes of the current
 *   character that do not fit are stored in cnv->charErrorBuffer
 * plus whatever _extFromU() reports.
 *
 * On exit, c is stored in cnv->fromUChar32 (a lead surrogate at the end of
 * the input stays pending there), prevLength is stored in
 * cnv->fromUnicodeStatus, and source/target/offsets are written back to pArgs.
 */
3819U_CFUNC void
3820ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3821 UErrorCode *pErrorCode) {
3822 UConverter *cnv;
3823 const UChar *source, *sourceLimit;
3824 uint8_t *target;
3825 int32_t targetCapacity;
3826 int32_t *offsets;
3827
3828 const uint16_t *table;
46f4442e 3829 const uint16_t *mbcsIndex;
374ca955
A
3830 const uint8_t *p, *bytes;
3831 uint8_t outputType;
3832
3833 UChar32 c;
3834
3835 int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
3836
3837 uint32_t stage2Entry;
46f4442e 3838 uint32_t asciiRoundtrips;
374ca955
A
3839 uint32_t value;
3840 int32_t length, prevLength;
3841 uint8_t unicodeMask;
3842
3843 cnv=pArgs->converter;
3844
/* a code point was left pending for extension matching: continue that match before anything else */
3845 if(cnv->preFromUFirstCP>=0) {
3846 /*
3847 * pass sourceIndex=-1 because we continue from an earlier buffer
3848 * in the future, this may change with continuous offsets
3849 */
3850 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
3851
/* NOTE(review): preFromULength<0 presumably means the match is still incomplete - confirm in ucnv_ext.c */
3852 if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
3853 return;
3854 }
3855 }
3856
3857 /* use optimized function if possible */
3858 outputType=cnv->sharedData->mbcs.outputType;
3859 unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3860 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3861 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3862 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
3863 } else {
3864 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
3865 }
3866 return;
46f4442e 3867 } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
374ca955
A
3868 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
3869 return;
3870 }
3871
3872 /* set up the local pointers */
3873 source=pArgs->source;
3874 sourceLimit=pArgs->sourceLimit;
3875 target=(uint8_t *)pArgs->target;
73c04bcf 3876 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
374ca955
A
3877 offsets=pArgs->offsets;
3878
3879 table=cnv->sharedData->mbcs.fromUnicodeTable;
46f4442e
A
/* mbcsIndex!=NULL is the switch for the utf8Friendly fast path in the loop below */
3880 if(cnv->sharedData->mbcs.utf8Friendly) {
3881 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3882 } else {
3883 mbcsIndex=NULL;
3884 }
374ca955
A
3885 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3886 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3887 } else {
3888 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3889 }
46f4442e 3890 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
374ca955
A
3891
3892 /* get the converter state from UConverter */
3893 c=cnv->fromUChar32;
3894
3895 if(outputType==MBCS_OUTPUT_2_SISO) {
3896 prevLength=cnv->fromUnicodeStatus;
3897 if(prevLength==0) {
3898 /* set the real value */
3899 prevLength=1;
3900 }
3901 } else {
3902 /* prevent fromUnicodeStatus from being set to something non-0 */
3903 prevLength=0;
3904 }
3905
/*
 * Offset bookkeeping:
 * sourceIndex is the index written to offsets for the current character,
 * nextSourceIndex counts the code units consumed so far,
 * prevSourceIndex remembers the previous character's index
 * (used for the final SI byte at the end of this function).
 */
3906 /* sourceIndex=-1 if the current character began in the previous buffer */
3907 prevSourceIndex=-1;
3908 sourceIndex= c==0 ? 0 : -1;
3909 nextSourceIndex=0;
3910
3911 /* conversion loop */
3912 /*
3913 * This is another piece of ugly code:
3914 * A goto into the loop if the converter state contains a first surrogate
3915 * from the previous function call.
3916 * It saves me a check of if(c==0) in each loop iteration
3917 * and duplicating the trail-surrogate-handling code in the else
3918 * branch of that check.
3919 * I could not find any other way to get around this other than
3920 * using a function call for the conversion and callback, which would
3921 * be even more inefficient.
3922 *
3923 * Markus Scherer 2000-jul-19
3924 */
3925 if(c!=0 && targetCapacity>0) {
3926 goto getTrail;
3927 }
3928
3929 while(source<sourceLimit) {
3930 /*
3931 * This following test is to see if available input would overflow the output.
3932 * It does not catch output of more than one byte that
3933 * overflows as a result of a multi-byte character or callback output
3934 * from the last source character.
3935 * Therefore, those situations also test for overflows and will
3936 * then break the loop, too.
3937 */
3938 if(targetCapacity>0) {
3939 /*
3940 * Get a correct Unicode code point:
3941 * a single UChar for a BMP code point or
3942 * a matched surrogate pair for a "supplementary code point".
3943 */
3944 c=*source++;
3945 ++nextSourceIndex;
46f4442e
A
/* shortcut: ASCII characters flagged in asciiRoundtrips are copied unchanged as a single byte */
3946 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3947 *target++=(uint8_t)c;
3948 if(offsets!=NULL) {
3949 *offsets++=sourceIndex;
3950 prevSourceIndex=sourceIndex;
3951 sourceIndex=nextSourceIndex;
3952 }
3953 --targetCapacity;
3954 c=0;
3955 continue;
3956 }
374ca955 3957 /*
46f4442e
A
3958 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3959 * to avoid dealing with surrogates.
3960 * MBCS_FAST_MAX must be >=0xd7ff.
374ca955 3961 */
46f4442e
A
3962 if(c<=0xd7ff && mbcsIndex!=NULL) {
3963 value=mbcsIndex[c>>6];
3964
3965 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
3966 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3967 switch(outputType) {
3968 case MBCS_OUTPUT_2:
3969 value=((const uint16_t *)bytes)[value +(c&0x3f)];
3970 if(value<=0xff) {
3971 if(value==0) {
3972 goto unassigned;
3973 } else {
3974 length=1;
3975 }
3976 } else {
3977 length=2;
3978 }
3979 break;
3980 case MBCS_OUTPUT_2_SISO:
3981 /* 1/2-byte stateful with Shift-In/Shift-Out */
3982 /*
3983 * Save the old state in the converter object
3984 * right here, then change the local prevLength state variable if necessary.
3985 * Then, if this character turns out to be unassigned or a fallback that
3986 * is not taken, the callback code must not save the new state in the converter
3987 * because the new state is for a character that is not output.
3988 * However, the callback must still restore the state from the converter
3989 * in case the callback function changed it for its output.
3990 */
3991 cnv->fromUnicodeStatus=prevLength; /* save the old state */
3992 value=((const uint16_t *)bytes)[value +(c&0x3f)];
3993 if(value<=0xff) {
3994 if(value==0) {
3995 goto unassigned;
3996 } else if(prevLength<=1) {
3997 length=1;
3998 } else {
3999 /* change from double-byte mode to single-byte */
4000 value|=(uint32_t)UCNV_SI<<8;
4001 length=2;
4002 prevLength=1;
4003 }
4004 } else {
4005 if(prevLength==2) {
4006 length=2;
4007 } else {
4008 /* change from single-byte mode to double-byte */
4009 value|=(uint32_t)UCNV_SO<<16;
4010 length=3;
4011 prevLength=2;
4012 }
4013 }
4014 break;
4015 case MBCS_OUTPUT_DBCS_ONLY:
4016 /* table with single-byte results, but only DBCS mappings used */
4017 value=((const uint16_t *)bytes)[value +(c&0x3f)];
4018 if(value<=0xff) {
4019 /* no mapping or SBCS result, not taken for DBCS-only */
4020 goto unassigned;
4021 } else {
4022 length=2;
4023 }
4024 break;
4025 case MBCS_OUTPUT_3:
4026 p=bytes+(value+(c&0x3f))*3;
4027 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4028 if(value<=0xff) {
4029 if(value==0) {
4030 goto unassigned;
4031 } else {
4032 length=1;
4033 }
4034 } else if(value<=0xffff) {
4035 length=2;
4036 } else {
4037 length=3;
4038 }
4039 break;
4040 case MBCS_OUTPUT_4:
4041 value=((const uint32_t *)bytes)[value +(c&0x3f)];
4042 if(value<=0xff) {
4043 if(value==0) {
4044 goto unassigned;
4045 } else {
4046 length=1;
4047 }
4048 } else if(value<=0xffff) {
4049 length=2;
4050 } else if(value<=0xffffff) {
4051 length=3;
4052 } else {
4053 length=4;
4054 }
4055 break;
4056 case MBCS_OUTPUT_3_EUC:
4057 value=((const uint16_t *)bytes)[value +(c&0x3f)];
4058 /* EUC 16-bit fixed-length representation */
4059 if(value<=0xff) {
4060 if(value==0) {
4061 goto unassigned;
4062 } else {
4063 length=1;
4064 }
4065 } else if((value&0x8000)==0) {
4066 value|=0x8e8000;
4067 length=3;
4068 } else if((value&0x80)==0) {
4069 value|=0x8f0080;
4070 length=3;
4071 } else {
4072 length=2;
4073 }
4074 break;
4075 case MBCS_OUTPUT_4_EUC:
4076 p=bytes+(value+(c&0x3f))*3;
4077 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4078 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4079 if(value<=0xff) {
4080 if(value==0) {
4081 goto unassigned;
4082 } else {
4083 length=1;
4084 }
4085 } else if(value<=0xffff) {
4086 length=2;
4087 } else if((value&0x800000)==0) {
4088 value|=0x8e800000;
4089 length=4;
4090 } else if((value&0x8000)==0) {
4091 value|=0x8f008000;
4092 length=4;
4093 } else {
4094 length=3;
4095 }
4096 break;
4097 default:
4098 /* must not occur */
4099 /*
4100 * To avoid compiler warnings that value & length may be
4101 * used without having been initialized, we set them here.
4102 * In reality, this is unreachable code.
4103 * Not having a default branch also causes warnings with
4104 * some compilers.
4105 */
4106 value=0;
4107 length=0;
4108 break;
4109 }
4110 /* output the value */
4111 } else {
4112 /*
4113 * This also tests if the codepage maps single surrogates.
4114 * If it does, then surrogates are not paired but mapped separately.
4115 * Note that in this case unmatched surrogates are not detected.
4116 */
4117 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4118 if(UTF_IS_SURROGATE_FIRST(c)) {
374ca955 4119getTrail:
46f4442e
A
4120 if(source<sourceLimit) {
4121 /* test the following code unit */
4122 UChar trail=*source;
4123 if(UTF_IS_SECOND_SURROGATE(trail)) {
4124 ++source;
4125 ++nextSourceIndex;
4126 c=UTF16_GET_PAIR_VALUE(c, trail);
4127 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4128 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4129 cnv->fromUnicodeStatus=prevLength; /* save the old state */
4130 /* callback(unassigned) */
4131 goto unassigned;
4132 }
4133 /* convert this supplementary code point */
4134 /* exit this condition tree */
4135 } else {
4136 /* this is an unmatched lead code unit (1st surrogate) */
4137 /* callback(illegal) */
4138 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4139 break;
374ca955 4140 }
374ca955 4141 } else {
46f4442e 4142 /* no more input */
374ca955
A
/* c keeps the lead surrogate; it is saved into cnv->fromUChar32 at the end of this function */
4143 break;
4144 }
4145 } else {
46f4442e
A
4146 /* this is an unmatched trail code unit (2nd surrogate) */
4147 /* callback(illegal) */
4148 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955
A
4149 break;
4150 }
374ca955 4151 }
374ca955 4152
46f4442e 4153 /* convert the Unicode code point in c into codepage bytes */
374ca955 4154
374ca955 4155 /*
46f4442e
A
4156 * The basic lookup is a triple-stage compact array (trie) lookup.
4157 * For details see the beginning of this file.
4158 *
4159 * Single-byte codepages are handled with a different data structure
4160 * by _MBCSSingle... functions.
4161 *
4162 * The result consists of a 32-bit value from stage 2 and
4163 * a pointer to as many bytes as are stored per character.
4164 * The pointer points to the character's bytes in stage 3.
4165 * Bits 15..0 of the stage 2 entry contain the stage 3 index
4166 * for that pointer, while bits 31..16 are flags for which of
4167 * the 16 characters in the block are roundtrip-assigned.
4168 *
4169 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4170 * respectively as uint32_t, in the platform encoding.
4171 * For 3-byte codepages, the bytes are always stored in big-endian order.
4172 *
4173 * For EUC encodings that use only either 0x8e or 0x8f as the first
4174 * byte of their longest byte sequences, the first two bytes in
4175 * this third stage indicate with their 7th bits whether these bytes
4176 * are to be written directly or actually need to be preceded by
4177 * one of the two Single-Shift codes. With this, the third stage
4178 * stores one byte fewer per character than the actual maximum length of
4179 * EUC byte sequences.
4180 *
4181 * Other than that, leading zero bytes are removed and the other
4182 * bytes output. A single zero byte may be output if the "assigned"
4183 * bit in stage 2 was on.
4184 * The data structure does not support zero byte output as a fallback,
4185 * and also does not allow output of leading zeros.
374ca955 4186 */
46f4442e
A
4187 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4188
4189 /* get the bytes and the length for the output */
4190 switch(outputType) {
4191 case MBCS_OUTPUT_2:
4192 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4193 if(value<=0xff) {
4194 length=1;
4195 } else {
4196 length=2;
4197 }
4198 break;
4199 case MBCS_OUTPUT_2_SISO:
4200 /* 1/2-byte stateful with Shift-In/Shift-Out */
4201 /*
4202 * Save the old state in the converter object
4203 * right here, then change the local prevLength state variable if necessary.
4204 * Then, if this character turns out to be unassigned or a fallback that
4205 * is not taken, the callback code must not save the new state in the converter
4206 * because the new state is for a character that is not output.
4207 * However, the callback must still restore the state from the converter
4208 * in case the callback function changed it for its output.
4209 */
4210 cnv->fromUnicodeStatus=prevLength; /* save the old state */
4211 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4212 if(value<=0xff) {
4213 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
4214 /* no mapping, leave value==0 */
4215 length=0;
4216 } else if(prevLength<=1) {
4217 length=1;
4218 } else {
4219 /* change from double-byte mode to single-byte */
4220 value|=(uint32_t)UCNV_SI<<8;
4221 length=2;
4222 prevLength=1;
4223 }
4224 } else {
4225 if(prevLength==2) {
4226 length=2;
4227 } else {
4228 /* change from single-byte mode to double-byte */
4229 value|=(uint32_t)UCNV_SO<<16;
4230 length=3;
4231 prevLength=2;
4232 }
4233 }
4234 break;
4235 case MBCS_OUTPUT_DBCS_ONLY:
4236 /* table with single-byte results, but only DBCS mappings used */
4237 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4238 if(value<=0xff) {
4239 /* no mapping or SBCS result, not taken for DBCS-only */
4240 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
374ca955 4241 length=0;
46f4442e
A
4242 } else {
4243 length=2;
4244 }
4245 break;
4246 case MBCS_OUTPUT_3:
4247 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4248 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4249 if(value<=0xff) {
4250 length=1;
4251 } else if(value<=0xffff) {
4252 length=2;
4253 } else {
4254 length=3;
4255 }
4256 break;
4257 case MBCS_OUTPUT_4:
4258 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
4259 if(value<=0xff) {
4260 length=1;
4261 } else if(value<=0xffff) {
4262 length=2;
4263 } else if(value<=0xffffff) {
4264 length=3;
4265 } else {
4266 length=4;
4267 }
4268 break;
4269 case MBCS_OUTPUT_3_EUC:
4270 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4271 /* EUC 16-bit fixed-length representation */
4272 if(value<=0xff) {
374ca955 4273 length=1;
46f4442e
A
4274 } else if((value&0x8000)==0) {
4275 value|=0x8e8000;
4276 length=3;
4277 } else if((value&0x80)==0) {
4278 value|=0x8f0080;
4279 length=3;
374ca955 4280 } else {
374ca955 4281 length=2;
374ca955 4282 }
46f4442e
A
4283 break;
4284 case MBCS_OUTPUT_4_EUC:
4285 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4286 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4287 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4288 if(value<=0xff) {
4289 length=1;
4290 } else if(value<=0xffff) {
374ca955 4291 length=2;
46f4442e
A
4292 } else if((value&0x800000)==0) {
4293 value|=0x8e800000;
4294 length=4;
4295 } else if((value&0x8000)==0) {
4296 value|=0x8f008000;
4297 length=4;
374ca955 4298 } else {
374ca955 4299 length=3;
374ca955 4300 }
46f4442e
A
4301 break;
4302 default:
4303 /* must not occur */
4304 /*
4305 * To avoid compiler warnings that value & length may be
4306 * used without having been initialized, we set them here.
4307 * In reality, this is unreachable code.
4308 * Not having a default branch also causes warnings with
4309 * some compilers.
4310 */
374ca955
A
4311 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4312 length=0;
46f4442e 4313 break;
374ca955 4314 }
46f4442e
A
4315
4316 /* is this code point assigned, or do we use fallbacks? */
4317 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
4318 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
4319 ) {
4320 /*
4321 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4322 * There is no way with this data structure for fallback output
4323 * to be a zero byte.
4324 */
4325
/* also reached by goto from the utf8Friendly fast path and from the BMP-only supplementary case above */
4326unassigned:
4327 /* try an extension mapping */
4328 pArgs->source=source;
4329 c=_extFromU(cnv, cnv->sharedData,
4330 c, &source, sourceLimit,
4331 &target, target+targetCapacity,
4332 &offsets, sourceIndex,
4333 pArgs->flush,
4334 pErrorCode);
/* pArgs->source still holds the position from before _extFromU(): count the code units it consumed */
4335 nextSourceIndex+=(int32_t)(source-pArgs->source);
4336 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
4337
4338 if(U_FAILURE(*pErrorCode)) {
4339 /* not mappable or buffer overflow */
4340 break;
4341 } else {
4342 /* a mapping was written to the target, continue */
4343
4344 /* recalculate the targetCapacity after an extension mapping */
4345 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4346
4347 /* normal end of conversion: prepare for a new character */
4348 if(offsets!=NULL) {
4349 prevSourceIndex=sourceIndex;
4350 sourceIndex=nextSourceIndex;
4351 }
4352 continue;
4353 }
374ca955 4354 }
46f4442e
A
4355 }
4356
4357 /* write the output character bytes from value and length */
4358 /* from the first if in the loop we know that targetCapacity>0 */
4359 if(length<=targetCapacity) {
4360 if(offsets==NULL) {
4361 switch(length) {
4362 /* each branch falls through to the next one */
4363 case 4:
4364 *target++=(uint8_t)(value>>24);
4365 case 3:
4366 *target++=(uint8_t)(value>>16);
4367 case 2:
4368 *target++=(uint8_t)(value>>8);
4369 case 1:
4370 *target++=(uint8_t)value;
4371 default:
4372 /* will never occur */
4373 break;
4374 }
374ca955
A
4375 } else {
4376 switch(length) {
4377 /* each branch falls through to the next one */
4378 case 4:
4379 *target++=(uint8_t)(value>>24);
4380 *offsets++=sourceIndex;
4381 case 3:
4382 *target++=(uint8_t)(value>>16);
4383 *offsets++=sourceIndex;
4384 case 2:
4385 *target++=(uint8_t)(value>>8);
4386 *offsets++=sourceIndex;
4387 case 1:
4388 *target++=(uint8_t)value;
4389 *offsets++=sourceIndex;
4390 default:
4391 /* will never occur */
4392 break;
4393 }
4394 }
4395 targetCapacity-=length;
4396 } else {
4397 uint8_t *charErrorBuffer;
4398
4399 /*
4400 * We actually do this backwards here:
4401 * In order to save an intermediate variable, we output
4402 * first to the overflow buffer what does not fit into the
4403 * regular target.
4404 */
4405 /* we know that 1<=targetCapacity<length<=4 */
4406 length-=targetCapacity;
4407 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
4408 switch(length) {
4409 /* each branch falls through to the next one */
4410 case 3:
4411 *charErrorBuffer++=(uint8_t)(value>>16);
4412 case 2:
4413 *charErrorBuffer++=(uint8_t)(value>>8);
4414 case 1:
4415 *charErrorBuffer=(uint8_t)value;
4416 default:
4417 /* will never occur */
4418 break;
4419 }
4420 cnv->charErrorBufferLength=(int8_t)length;
4421
4422 /* now output what fits into the regular target */
4423 value>>=8*length; /* length was reduced by targetCapacity */
4424 switch(targetCapacity) {
4425 /* each branch falls through to the next one */
4426 case 3:
4427 *target++=(uint8_t)(value>>16);
4428 if(offsets!=NULL) {
4429 *offsets++=sourceIndex;
4430 }
4431 case 2:
4432 *target++=(uint8_t)(value>>8);
4433 if(offsets!=NULL) {
4434 *offsets++=sourceIndex;
4435 }
4436 case 1:
4437 *target++=(uint8_t)value;
4438 if(offsets!=NULL) {
4439 *offsets++=sourceIndex;
4440 }
4441 default:
4442 /* will never occur */
4443 break;
4444 }
b75a7d8f 4445
374ca955
A
4446 /* target overflow */
4447 targetCapacity=0;
4448 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
/* the character is fully consumed (its remaining bytes are in charErrorBuffer), so nothing stays pending */
4449 c=0;
4450 break;
b75a7d8f 4451 }
b75a7d8f 4452
374ca955 4453 /* normal end of conversion: prepare for a new character */
b75a7d8f 4454 c=0;
374ca955
A
4455 if(offsets!=NULL) {
4456 prevSourceIndex=sourceIndex;
4457 sourceIndex=nextSourceIndex;
4458 }
4459 continue;
4460 } else {
b75a7d8f
A
4461 /* target is full */
4462 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4463 break;
4464 }
b75a7d8f
A
4465 }
4466
374ca955
A
4467 /*
4468 * the end of the input stream and detection of truncated input
4469 * are handled by the framework, but for EBCDIC_STATEFUL conversion
4470 * we need to emit an SI at the very end
4471 *
4472 * conditions:
4473 * successful
4474 * EBCDIC_STATEFUL in DBCS mode
4475 * end of input and no truncated input
4476 */
4477 if( U_SUCCESS(*pErrorCode) &&
4478 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
4479 pArgs->flush && source>=sourceLimit && c==0
4480 ) {
4481 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4482 if(targetCapacity>0) {
4483 *target++=(uint8_t)UCNV_SI;
4484 if(offsets!=NULL) {
4485 /* set the last source character's index (sourceIndex points at sourceLimit now) */
4486 *offsets++=prevSourceIndex;
4487 }
4488 } else {
4489 /* target is full */
4490 cnv->charErrorBuffer[0]=(char)UCNV_SI;
4491 cnv->charErrorBufferLength=1;
4492 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f 4493 }
374ca955 4494 prevLength=1; /* we switched into SBCS */
b75a7d8f
A
4495 }
4496
374ca955
A
4497 /* set the converter state back into UConverter */
4498 cnv->fromUChar32=c;
4499 cnv->fromUnicodeStatus=prevLength;
b75a7d8f
A
4500
4501 /* write back the updated pointers */
4502 pArgs->source=source;
4503 pArgs->target=(char *)target;
4504 pArgs->offsets=offsets;
4505}
4506
4507/*
4508 * This is another simple conversion function for internal use by other
4509 * conversion implementations.
4510 * It does not use the converter state nor call callbacks.
4511 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
374ca955 4512 * It handles conversion extensions but not GB 18030.
b75a7d8f
A
4513 *
4514 * It converts one single Unicode code point into codepage bytes, encoded
4515 * as one 32-bit value. The function returns the number of bytes in *pValue:
4516 * 1..4 the number of bytes in *pValue
4517 * 0 unassigned (*pValue undefined)
4518 * -1 illegal (currently not used, *pValue undefined)
4519 *
4520 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4521 * the second to last byte in bits 15..8, etc.
4522 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4523 */
4524U_CFUNC int32_t
374ca955 4525ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
b75a7d8f
A
4526 UChar32 c, uint32_t *pValue,
4527 UBool useFallback) {
374ca955
A
4528 const int32_t *cx;
4529 const uint16_t *table;
4530#if 0
4531/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
b75a7d8f 4532 const uint8_t *p;
374ca955 4533#endif
b75a7d8f
A
4534 uint32_t stage2Entry;
4535 uint32_t value;
4536 int32_t length;
4537
4538 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955
A
4539 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4540 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 4541
374ca955
A
4542 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4543 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
4544 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
4545 /* is this code point assigned, or do we use fallbacks? */
4546 if(useFallback ? value>=0x800 : value>=0xc00) {
4547 *pValue=value&0xff;
4548 return 1;
4549 }
4550 } else /* outputType!=MBCS_OUTPUT_1 */ {
4551 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
b75a7d8f 4552
374ca955
A
4553 /* get the bytes and the length for the output */
4554 switch(sharedData->mbcs.outputType) {
4555 case MBCS_OUTPUT_2:
4556 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4557 if(value<=0xff) {
4558 length=1;
4559 } else {
4560 length=2;
4561 }
4562 break;
4563#if 0
4564/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4565 case MBCS_OUTPUT_DBCS_ONLY:
4566 /* table with single-byte results, but only DBCS mappings used */
4567 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4568 if(value<=0xff) {
4569 /* no mapping or SBCS result, not taken for DBCS-only */
4570 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4571 length=0;
4572 } else {
4573 length=2;
4574 }
4575 break;
4576 case MBCS_OUTPUT_3:
4577 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4578 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4579 if(value<=0xff) {
4580 length=1;
4581 } else if(value<=0xffff) {
4582 length=2;
4583 } else {
4584 length=3;
4585 }
4586 break;
4587 case MBCS_OUTPUT_4:
4588 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4589 if(value<=0xff) {
4590 length=1;
4591 } else if(value<=0xffff) {
4592 length=2;
4593 } else if(value<=0xffffff) {
4594 length=3;
4595 } else {
4596 length=4;
4597 }
4598 break;
4599 case MBCS_OUTPUT_3_EUC:
4600 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4601 /* EUC 16-bit fixed-length representation */
4602 if(value<=0xff) {
4603 length=1;
4604 } else if((value&0x8000)==0) {
4605 value|=0x8e8000;
4606 length=3;
4607 } else if((value&0x80)==0) {
4608 value|=0x8f0080;
4609 length=3;
4610 } else {
4611 length=2;
4612 }
4613 break;
4614 case MBCS_OUTPUT_4_EUC:
4615 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4616 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4617 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4618 if(value<=0xff) {
4619 length=1;
4620 } else if(value<=0xffff) {
4621 length=2;
4622 } else if((value&0x800000)==0) {
4623 value|=0x8e800000;
4624 length=4;
4625 } else if((value&0x8000)==0) {
4626 value|=0x8f008000;
4627 length=4;
4628 } else {
4629 length=3;
4630 }
4631 break;
4632#endif
4633 default:
4634 /* must not occur */
4635 return -1;
4636 }
b75a7d8f 4637
374ca955
A
4638 /* is this code point assigned, or do we use fallbacks? */
4639 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4640 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
4641 ) {
4642 /*
4643 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4644 * There is no way with this data structure for fallback output
4645 * to be a zero byte.
4646 */
4647 /* assigned */
4648 *pValue=value;
4649 return length;
4650 }
b75a7d8f 4651 }
b75a7d8f
A
4652 }
4653
374ca955
A
4654 cx=sharedData->mbcs.extIndexes;
4655 if(cx!=NULL) {
46f4442e
A
4656 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4657 return length>=0 ? length : -length; /* return abs(length); */
b75a7d8f 4658 }
374ca955
A
4659
4660 /* unassigned */
4661 return 0;
b75a7d8f
A
4662}
4663
4664
#if 0
/*
 * This function has been moved to ucnv2022.c for inlining.
 * The copy below is compiled out and kept only for documentation purposes.
 */

/**
 * Variant of ucnv_MBCSFromUChar32() that is optimized for single-byte codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter),
 * and it does not handle conversion extensions (_extFromU()).
 *
 * @return the codepage byte for the code point, or -1 if it is unassigned
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                           UChar32 c,
                           UBool useFallback) {
    const uint16_t *stage1;
    int32_t result, threshold;

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        return -1;
    }

    /* look up the result word for c (same as in _MBCSFromUnicodeWithOffsets) */
    stage1=sharedData->mbcs.fromUnicodeTable;
    result=MBCS_SINGLE_RESULT_FROM_U(stage1, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);

    /* accept results >=0x800 when fallbacks are allowed, otherwise only results >=0xc00 */
    threshold= useFallback ? 0x800 : 0xc00;
    return result>=threshold ? (result&0xff) : -1;
}
#endif
4703
46f4442e
A
4704/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
4705
4706/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
4707static const UChar32
4708utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
4709
4710/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
4711static const UChar32
4712utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
4713
/*
 * Convert UTF-8 input directly into the bytes of a single-byte charset,
 * without going through UTF-16.
 *
 * pFromUArgs  supplies the target converter (cnv) and the output buffer
 *             (target/targetLimit); pFromUArgs->target is updated on return.
 * pToUArgs    supplies the UTF-8 converter (utf8) and the input buffer
 *             (source/sourceLimit); pToUArgs->source is updated on return.
 * pErrorCode  is set to U_BUFFER_OVERFLOW_ERROR when the target is full,
 *             to U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 sequence, or to
 *             whatever _extFromU() reports for an unmappable code point.
 *
 * A UTF-8 sequence that is split across buffers is carried in the utf8
 * converter: toUBytes[] holds the bytes seen so far, toUnicodeStatus the
 * raw accumulated value, toULength the number of bytes seen and mode the
 * total number of bytes expected.
 *
 * Code points with a table result below minValue are handed to _extFromU();
 * if that leaves cnv->preFromUFirstCP>=0 (partial match) the loop stops.
 */
4714 static void
4715 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
4716 UConverterToUnicodeArgs *pToUArgs,
4717 UErrorCode *pErrorCode) {
4718 UConverter *utf8, *cnv;
4719 const uint8_t *source, *sourceLimit;
4720 uint8_t *target;
4721 int32_t targetCapacity;
4722
4723 const uint16_t *table, *sbcsIndex;
4724 const uint16_t *results;
4725
4726 int8_t oldToULength, toULength, toULimit;
4727
4728 UChar32 c;
4729 uint8_t b, t1, t2;
4730
4731 uint32_t asciiRoundtrips;
4732 uint16_t value, minValue;
4733 UBool hasSupplementary;
4734
4735 /* set up the local pointers */
4736 utf8=pToUArgs->converter;
4737 cnv=pFromUArgs->converter;
4738 source=(uint8_t *)pToUArgs->source;
4739 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
4740 target=(uint8_t *)pFromUArgs->target;
4741 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
4742
4743 table=cnv->sharedData->mbcs.fromUnicodeTable;
4744 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
/* pick the result table that matches the converter's LF/NL swapping option */
4745 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4746 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4747 } else {
4748 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
4749 }
4750 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4751
4752 if(cnv->useFallback) {
4753 /* use all roundtrip and fallback results */
4754 minValue=0x800;
4755 } else {
4756 /* use only roundtrips and fallbacks from private-use characters */
4757 minValue=0xc00;
4758 }
4759 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
4760
4761 /* get the converter state from the UTF-8 UConverter */
4762 c=(UChar32)utf8->toUnicodeStatus;
4763 if(c!=0) {
4764 toULength=oldToULength=utf8->toULength;
4765 toULimit=(int8_t)utf8->mode;
4766 } else {
4767 toULength=oldToULength=toULimit=0;
4768 }
4769
4770 /*
4771 * Make sure that the last byte sequence before sourceLimit is complete
4772 * or runs into a lead byte.
4773 * Do not go back into the bytes that will be read for finishing a partial
4774 * sequence from the previous buffer.
4775 * In the conversion loop compare source with sourceLimit only once
4776 * per multi-byte character.
4777 */
4778 {
4779 int32_t i, length;
4780
/* length excludes the bytes still owed to a sequence carried over from the previous call */
4781 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
4782 for(i=0; i<3 && i<length;) {
4783 b=*(sourceLimit-i-1);
4784 if(U8_IS_TRAIL(b)) {
4785 ++i;
4786 } else {
4787 if(i<utf8_countTrailBytes[b]) {
4788 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
4789 sourceLimit-=i+1;
4790 }
4791 break;
4792 }
4793 }
4794 }
4795
4796 if(c!=0 && targetCapacity>0) {
4797 utf8->toUnicodeStatus=0;
4798 utf8->toULength=0;
4799 goto moreBytes;
4800 /*
4801 * Note: We could avoid the goto by duplicating some of the moreBytes
4802 * code, but only up to the point of collecting a complete UTF-8
4803 * sequence; then recurse for the toUBytes[toULength]
4804 * and then continue with normal conversion.
4805 *
4806 * If so, move this code to just after initializing the minimum
4807 * set of local variables for reading the UTF-8 input
4808 * (utf8, source, target, limits but not cnv, table, minValue, etc.).
4809 *
4810 * Potential advantages:
4811 * - avoid the goto
4812 * - oldToULength could become a local variable in just those code blocks
4813 * that deal with buffer boundaries
4814 * - possibly faster if the goto prevents some compiler optimizations
4815 * (this would need measuring to confirm)
4816 * Disadvantage:
4817 * - code duplication
4818 */
4819 }
4820
4821 /* conversion loop */
4822 while(source<sourceLimit) {
4823 if(targetCapacity>0) {
4824 b=*source++;
4825 if((int8_t)b>=0) {
4826 /* convert ASCII */
4827 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
4828 *target++=(uint8_t)b;
4829 --targetCapacity;
4830 continue;
4831 } else {
4832 c=b;
4833 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
4834 }
4835 } else {
4836 if(b<0xe0) {
4837 if( /* handle U+0080..U+07FF inline */
4838 b>=0xc2 &&
4839 (t1=(uint8_t)(*source-0x80)) <= 0x3f
4840 ) {
4841 c=b&0x1f;
4842 ++source;
4843 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
4844 if(value>=minValue) {
4845 *target++=(uint8_t)value;
4846 --targetCapacity;
4847 continue;
4848 } else {
/* no usable result: assemble the full code point for the extension lookup below */
4849 c=(c<<6)|t1;
4850 }
4851 } else {
4852 c=-1;
4853 }
4854 } else if(b==0xe0) {
4855 if( /* handle U+0800..U+0FFF inline */
4856 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
4857 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
4858 ) {
4859 c=t1;
4860 source+=2;
4861 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
4862 if(value>=minValue) {
4863 *target++=(uint8_t)value;
4864 --targetCapacity;
4865 continue;
4866 } else {
4867 c=(c<<6)|t2;
4868 }
4869 } else {
4870 c=-1;
4871 }
4872 } else {
4873 c=-1;
4874 }
4875
/* c<0 marks every lead byte that the inline fast paths above did not accept */
4876 if(c<0) {
4877 /* handle "complicated" and error cases, and continuing partial characters */
4878 oldToULength=0;
4879 toULength=1;
4880 toULimit=utf8_countTrailBytes[b]+1;
4881 c=b;
4882moreBytes:
4883 while(toULength<toULimit) {
4884 if(source<sourceLimit) {
4885 b=*source;
4886 if(U8_IS_TRAIL(b)) {
4887 ++source;
4888 ++toULength;
4889 c=(c<<6)+b;
4890 } else {
4891 break; /* sequence too short, stop with toULength<toULimit */
4892 }
4893 } else {
4894 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
4895 source-=(toULength-oldToULength);
4896 while(oldToULength<toULength) {
4897 utf8->toUBytes[oldToULength++]=*source++;
4898 }
4899 utf8->toUnicodeStatus=c;
4900 utf8->toULength=toULength;
4901 utf8->mode=toULimit;
4902 pToUArgs->source=(char *)source;
4903 pFromUArgs->target=(char *)target;
4904 return;
4905 }
4906 }
4907
/*
 * c still holds the raw byte accumulation here; subtracting
 * utf8_offsets[toULength] inside the conditions turns it into the code point.
 */
4908 if( toULength==toULimit && /* consumed all trail bytes */
4909 (toULength==3 || toULength==2) && /* BMP */
4910 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
4911 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
4912 ) {
4913 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4914 } else if(
4915 toULength==toULimit && toULength==4 &&
4916 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
4917 ) {
4918 /* supplementary code point */
4919 if(!hasSupplementary) {
4920 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4921 value=0;
4922 } else {
4923 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4924 }
4925 } else {
4926 /* error handling: illegal UTF-8 byte sequence */
4927 source-=(toULength-oldToULength);
4928 while(oldToULength<toULength) {
4929 utf8->toUBytes[oldToULength++]=*source++;
4930 }
4931 utf8->toULength=toULength;
4932 pToUArgs->source=(char *)source;
4933 pFromUArgs->target=(char *)target;
4934 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4935 return;
4936 }
4937 }
4938 }
4939
4940 if(value>=minValue) {
4941 /* output the mapping for c */
4942 *target++=(uint8_t)value;
4943 --targetCapacity;
4944 } else {
4945 /* value<minValue means c is unassigned (unmappable) */
4946 /*
4947 * Try an extension mapping.
4948 * Pass in no source because we don't have UTF-16 input.
4949 * If we have a partial match on c, we will return and revert
4950 * to UTF-8->UTF-16->charset conversion.
4951 */
4952 static const UChar nul=0;
4953 const UChar *noSource=&nul;
4954 c=_extFromU(cnv, cnv->sharedData,
4955 c, &noSource, noSource,
4956 &target, target+targetCapacity,
4957 NULL, -1,
4958 pFromUArgs->flush,
4959 pErrorCode);
4960
4961 if(U_FAILURE(*pErrorCode)) {
4962 /* not mappable or buffer overflow */
4963 cnv->fromUChar32=c;
4964 break;
4965 } else if(cnv->preFromUFirstCP>=0) {
4966 /*
4967 * Partial match, return and revert to pivoting.
4968 * In normal from-UTF-16 conversion, we would just continue
4969 * but then exit the loop because the extension match would
4970 * have consumed the source.
4971 */
4972 break;
4973 } else {
4974 /* a mapping was written to the target, continue */
4975
4976 /* recalculate the targetCapacity after an extension mapping */
4977 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
4978 }
4979 }
4980 } else {
4981 /* target is full */
4982 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4983 break;
4984 }
4985 }
4986
4987 /*
4988 * The sourceLimit may have been adjusted before the conversion loop
4989 * to stop before a truncated sequence.
4990 * If so, then collect the truncated sequence now.
4991 */
/* note: the condition below also restores sourceLimit to the caller's original limit */
4992 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
4993 c=utf8->toUBytes[0]=b=*source++;
4994 toULength=1;
4995 toULimit=utf8_countTrailBytes[b]+1;
4996 while(source<sourceLimit) {
4997 utf8->toUBytes[toULength++]=b=*source++;
4998 c=(c<<6)+b;
4999 }
5000 utf8->toUnicodeStatus=c;
5001 utf8->toULength=toULength;
5002 utf8->mode=toULimit;
5003 }
5004
5005 /* write back the updated pointers */
5006 pToUArgs->source=(char *)source;
5007 pFromUArgs->target=(char *)target;
5008 }
5009
5010static void
5011ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5012 UConverterToUnicodeArgs *pToUArgs,
5013 UErrorCode *pErrorCode) {
5014 UConverter *utf8, *cnv;
5015 const uint8_t *source, *sourceLimit;
5016 uint8_t *target;
5017 int32_t targetCapacity;
5018
5019 const uint16_t *table, *mbcsIndex;
5020 const uint16_t *results;
5021
5022 int8_t oldToULength, toULength, toULimit;
5023
5024 UChar32 c;
5025 uint8_t b, t1, t2;
5026
5027 uint32_t stage2Entry;
5028 uint32_t asciiRoundtrips;
5029 uint16_t value, minValue;
5030 UBool hasSupplementary;
5031
5032 /* set up the local pointers */
5033 utf8=pToUArgs->converter;
5034 cnv=pFromUArgs->converter;
5035 source=(uint8_t *)pToUArgs->source;
5036 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5037 target=(uint8_t *)pFromUArgs->target;
5038 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5039
5040 table=cnv->sharedData->mbcs.fromUnicodeTable;
5041 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
5042 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
5043 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5044 } else {
5045 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5046 }
5047 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5048
5049 if(cnv->useFallback) {
5050 /* use all roundtrip and fallback results */
5051 minValue=0x800;
5052 } else {
5053 /* use only roundtrips and fallbacks from private-use characters */
5054 minValue=0xc00;
5055 }
5056 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5057
5058 /* get the converter state from the UTF-8 UConverter */
5059 c=(UChar32)utf8->toUnicodeStatus;
5060 if(c!=0) {
5061 toULength=oldToULength=utf8->toULength;
5062 toULimit=(int8_t)utf8->mode;
5063 } else {
5064 toULength=oldToULength=toULimit=0;
5065 }
5066
5067 /*
5068 * Make sure that the last byte sequence before sourceLimit is complete
5069 * or runs into a lead byte.
5070 * Do not go back into the bytes that will be read for finishing a partial
5071 * sequence from the previous buffer.
5072 * In the conversion loop compare source with sourceLimit only once
5073 * per multi-byte character.
5074 */
5075 {
5076 int32_t i, length;
5077
5078 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
5079 for(i=0; i<3 && i<length;) {
5080 b=*(sourceLimit-i-1);
5081 if(U8_IS_TRAIL(b)) {
5082 ++i;
5083 } else {
5084 if(i<utf8_countTrailBytes[b]) {
5085 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
5086 sourceLimit-=i+1;
5087 }
5088 break;
5089 }
5090 }
5091 }
5092
5093 if(c!=0 && targetCapacity>0) {
5094 utf8->toUnicodeStatus=0;
5095 utf8->toULength=0;
5096 goto moreBytes;
5097 /* See note in ucnv_SBCSFromUTF8() about this goto. */
5098 }
5099
5100 /* conversion loop */
5101 while(source<sourceLimit) {
5102 if(targetCapacity>0) {
5103 b=*source++;
5104 if((int8_t)b>=0) {
5105 /* convert ASCII */
5106 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5107 *target++=b;
5108 --targetCapacity;
5109 continue;
5110 } else {
5111 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
5112 if(value==0) {
5113 c=b;
5114 goto unassigned;
5115 }
5116 }
5117 } else {
5118 if(b>0xe0) {
5119 if( /* handle U+1000..U+D7FF inline */
5120 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
5121 (b==0xed && (t1 <= 0x1f))) &&
5122 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5123 ) {
5124 c=((b&0xf)<<6)|t1;
5125 source+=2;
5126 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
5127 if(value==0) {
5128 c=(c<<6)|t2;
5129 goto unassigned;
5130 }
5131 } else {
5132 c=-1;
5133 }
5134 } else if(b<0xe0) {
5135 if( /* handle U+0080..U+07FF inline */
5136 b>=0xc2 &&
5137 (t1=(uint8_t)(*source-0x80)) <= 0x3f
5138 ) {
5139 c=b&0x1f;
5140 ++source;
5141 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
5142 if(value==0) {
5143 c=(c<<6)|t1;
5144 goto unassigned;
5145 }
5146 } else {
5147 c=-1;
5148 }
5149 } else {
5150 c=-1;
5151 }
5152
5153 if(c<0) {
5154 /* handle "complicated" and error cases, and continuing partial characters */
5155 oldToULength=0;
5156 toULength=1;
5157 toULimit=utf8_countTrailBytes[b]+1;
5158 c=b;
5159moreBytes:
5160 while(toULength<toULimit) {
5161 if(source<sourceLimit) {
5162 b=*source;
5163 if(U8_IS_TRAIL(b)) {
5164 ++source;
5165 ++toULength;
5166 c=(c<<6)+b;
5167 } else {
5168 break; /* sequence too short, stop with toULength<toULimit */
5169 }
5170 } else {
5171 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5172 source-=(toULength-oldToULength);
5173 while(oldToULength<toULength) {
5174 utf8->toUBytes[oldToULength++]=*source++;
5175 }
5176 utf8->toUnicodeStatus=c;
5177 utf8->toULength=toULength;
5178 utf8->mode=toULimit;
5179 pToUArgs->source=(char *)source;
5180 pFromUArgs->target=(char *)target;
5181 return;
5182 }
5183 }
5184
5185 if( toULength==toULimit && /* consumed all trail bytes */
5186 (toULength==3 || toULength==2) && /* BMP */
5187 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5188 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
5189 ) {
5190 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5191 } else if(
5192 toULength==toULimit && toULength==4 &&
5193 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5194 ) {
5195 /* supplementary code point */
5196 if(!hasSupplementary) {
5197 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5198 stage2Entry=0;
5199 } else {
5200 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5201 }
5202 } else {
5203 /* error handling: illegal UTF-8 byte sequence */
5204 source-=(toULength-oldToULength);
5205 while(oldToULength<toULength) {
5206 utf8->toUBytes[oldToULength++]=*source++;
5207 }
5208 utf8->toULength=toULength;
5209 pToUArgs->source=(char *)source;
5210 pFromUArgs->target=(char *)target;
5211 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5212 return;
5213 }
5214
5215 /* get the bytes and the length for the output */
5216 /* MBCS_OUTPUT_2 */
5217 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
5218
5219 /* is this code point assigned, or do we use fallbacks? */
5220 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
5221 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
5222 ) {
5223 goto unassigned;
5224 }
5225 }
5226 }
5227
5228 /* write the output character bytes from value and length */
5229 /* from the first if in the loop we know that targetCapacity>0 */
5230 if(value<=0xff) {
5231 /* this is easy because we know that there is enough space */
5232 *target++=(uint8_t)value;
5233 --targetCapacity;
5234 } else /* length==2 */ {
5235 *target++=(uint8_t)(value>>8);
5236 if(2<=targetCapacity) {
5237 *target++=(uint8_t)value;
5238 targetCapacity-=2;
5239 } else {
5240 cnv->charErrorBuffer[0]=(char)value;
5241 cnv->charErrorBufferLength=1;
5242
5243 /* target overflow */
5244 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5245 break;
5246 }
5247 }
5248 continue;
5249
5250unassigned:
5251 {
5252 /*
5253 * Try an extension mapping.
5254 * Pass in no source because we don't have UTF-16 input.
5255 * If we have a partial match on c, we will return and revert
5256 * to UTF-8->UTF-16->charset conversion.
5257 */
5258 static const UChar nul=0;
5259 const UChar *noSource=&nul;
5260 c=_extFromU(cnv, cnv->sharedData,
5261 c, &noSource, noSource,
5262 &target, target+targetCapacity,
5263 NULL, -1,
5264 pFromUArgs->flush,
5265 pErrorCode);
5266
5267 if(U_FAILURE(*pErrorCode)) {
5268 /* not mappable or buffer overflow */
5269 cnv->fromUChar32=c;
5270 break;
5271 } else if(cnv->preFromUFirstCP>=0) {
5272 /*
5273 * Partial match, return and revert to pivoting.
5274 * In normal from-UTF-16 conversion, we would just continue
5275 * but then exit the loop because the extension match would
5276 * have consumed the source.
5277 */
5278 break;
5279 } else {
5280 /* a mapping was written to the target, continue */
5281
5282 /* recalculate the targetCapacity after an extension mapping */
5283 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5284 continue;
5285 }
5286 }
5287 } else {
5288 /* target is full */
5289 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5290 break;
5291 }
5292 }
5293
5294 /*
5295 * The sourceLimit may have been adjusted before the conversion loop
5296 * to stop before a truncated sequence.
5297 * If so, then collect the truncated sequence now.
5298 */
5299 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5300 c=utf8->toUBytes[0]=b=*source++;
5301 toULength=1;
5302 toULimit=utf8_countTrailBytes[b]+1;
5303 while(source<sourceLimit) {
5304 utf8->toUBytes[toULength++]=b=*source++;
5305 c=(c<<6)+b;
5306 }
5307 utf8->toUnicodeStatus=c;
5308 utf8->toULength=toULength;
5309 utf8->mode=toULimit;
5310 }
5311
5312 /* write back the updated pointers */
5313 pToUArgs->source=(char *)source;
5314 pFromUArgs->target=(char *)target;
5315}
5316
b75a7d8f
A
5317/* miscellaneous ------------------------------------------------------------ */
5318
5319static void
374ca955 5320ucnv_MBCSGetStarters(const UConverter* cnv,
b75a7d8f
A
5321 UBool starters[256],
5322 UErrorCode *pErrorCode) {
374ca955 5323 const int32_t *state0;
b75a7d8f
A
5324 int i;
5325
374ca955 5326 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
b75a7d8f
A
5327 for(i=0; i<256; ++i) {
5328 /* all bytes that cause a state transition from state 0 are lead bytes */
5329 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
5330 }
5331}
5332
5333/*
5334 * This is an internal function that allows other converter implementations
5335 * to check whether a byte is a lead byte.
5336 */
5337U_CFUNC UBool
374ca955
A
5338ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
5339 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
b75a7d8f
A
5340}
5341
5342static void
374ca955 5343ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
b75a7d8f
A
5344 int32_t offsetIndex,
5345 UErrorCode *pErrorCode) {
5346 UConverter *cnv=pArgs->converter;
5347 char *p, *subchar;
5348 char buffer[4];
5349 int32_t length;
5350
5351 /* first, select between subChar and subChar1 */
374ca955
A
5352 if( cnv->subChar1!=0 &&
5353 (cnv->sharedData->mbcs.extIndexes!=NULL ?
5354 cnv->useSubChar1 :
5355 (cnv->invalidUCharBuffer[0]<=0xff))
5356 ) {
b75a7d8f
A
5357 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
5358 subchar=(char *)&cnv->subChar1;
5359 length=1;
5360 } else {
5361 /* select subChar in all other cases */
73c04bcf 5362 subchar=(char *)cnv->subChars;
b75a7d8f
A
5363 length=cnv->subCharLen;
5364 }
5365
374ca955
A
5366 /* reset the selector for the next code point */
5367 cnv->useSubChar1=FALSE;
5368
46f4442e 5369 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
b75a7d8f
A
5370 p=buffer;
5371
5372 /* fromUnicodeStatus contains prevLength */
5373 switch(length) {
5374 case 1:
5375 if(cnv->fromUnicodeStatus==2) {
5376 /* DBCS mode and SBCS sub char: change to SBCS */
5377 cnv->fromUnicodeStatus=1;
5378 *p++=UCNV_SI;
5379 }
5380 *p++=subchar[0];
5381 break;
5382 case 2:
374ca955 5383 if(cnv->fromUnicodeStatus<=1) {
b75a7d8f
A
5384 /* SBCS mode and DBCS sub char: change to DBCS */
5385 cnv->fromUnicodeStatus=2;
5386 *p++=UCNV_SO;
5387 }
5388 *p++=subchar[0];
5389 *p++=subchar[1];
5390 break;
5391 default:
5392 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
5393 return;
5394 }
46f4442e
A
5395 subchar=buffer;
5396 length=(int32_t)(p-buffer);
b75a7d8f 5397 }
46f4442e
A
5398
5399 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
b75a7d8f
A
5400}
5401
5402U_CFUNC UConverterType
374ca955 5403ucnv_MBCSGetType(const UConverter* converter) {
b75a7d8f 5404 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
374ca955 5405 if(converter->sharedData->mbcs.countStates==1) {
b75a7d8f 5406 return (UConverterType)UCNV_SBCS;
374ca955 5407 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
b75a7d8f
A
5408 return (UConverterType)UCNV_EBCDIC_STATEFUL;
5409 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
5410 return (UConverterType)UCNV_DBCS;
5411 }
5412 return (UConverterType)UCNV_MBCS;
5413}
5414
46f4442e
A
5415static const UConverterImpl _SBCSUTF8Impl={
5416 UCNV_MBCS,
5417
5418 ucnv_MBCSLoad,
5419 ucnv_MBCSUnload,
5420
5421 ucnv_MBCSOpen,
5422 NULL,
5423 NULL,
5424
5425 ucnv_MBCSToUnicodeWithOffsets,
5426 ucnv_MBCSToUnicodeWithOffsets,
5427 ucnv_MBCSFromUnicodeWithOffsets,
5428 ucnv_MBCSFromUnicodeWithOffsets,
5429 ucnv_MBCSGetNextUChar,
5430
5431 ucnv_MBCSGetStarters,
5432 ucnv_MBCSGetName,
5433 ucnv_MBCSWriteSub,
5434 NULL,
5435 ucnv_MBCSGetUnicodeSet,
5436
5437 NULL,
5438 ucnv_SBCSFromUTF8
5439};
5440
5441static const UConverterImpl _DBCSUTF8Impl={
5442 UCNV_MBCS,
5443
5444 ucnv_MBCSLoad,
5445 ucnv_MBCSUnload,
5446
5447 ucnv_MBCSOpen,
5448 NULL,
5449 NULL,
5450
5451 ucnv_MBCSToUnicodeWithOffsets,
5452 ucnv_MBCSToUnicodeWithOffsets,
5453 ucnv_MBCSFromUnicodeWithOffsets,
5454 ucnv_MBCSFromUnicodeWithOffsets,
5455 ucnv_MBCSGetNextUChar,
5456
5457 ucnv_MBCSGetStarters,
5458 ucnv_MBCSGetName,
5459 ucnv_MBCSWriteSub,
5460 NULL,
5461 ucnv_MBCSGetUnicodeSet,
5462
5463 NULL,
5464 ucnv_DBCSFromUTF8
5465};
5466
b75a7d8f
A
5467static const UConverterImpl _MBCSImpl={
5468 UCNV_MBCS,
5469
374ca955
A
5470 ucnv_MBCSLoad,
5471 ucnv_MBCSUnload,
b75a7d8f 5472
374ca955
A
5473 ucnv_MBCSOpen,
5474 NULL,
b75a7d8f 5475 NULL,
b75a7d8f 5476
374ca955
A
5477 ucnv_MBCSToUnicodeWithOffsets,
5478 ucnv_MBCSToUnicodeWithOffsets,
5479 ucnv_MBCSFromUnicodeWithOffsets,
5480 ucnv_MBCSFromUnicodeWithOffsets,
5481 ucnv_MBCSGetNextUChar,
b75a7d8f 5482
374ca955
A
5483 ucnv_MBCSGetStarters,
5484 ucnv_MBCSGetName,
5485 ucnv_MBCSWriteSub,
b75a7d8f 5486 NULL,
374ca955 5487 ucnv_MBCSGetUnicodeSet
b75a7d8f
A
5488};
5489
5490
5491/* Static data is in tools/makeconv/ucnvstat.c for data-based
5492 * converters. Be sure to update it as well.
5493 */
5494
5495const UConverterSharedData _MBCSData={
5496 sizeof(UConverterSharedData), 1,
5497 NULL, NULL, NULL, FALSE, &_MBCSImpl,
5498 0
5499};
5500
b75a7d8f 5501#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */