]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnvmbcs.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / ucnvmbcs.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2000-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ucnvmbcs.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2000jul03
14 * created by: Markus W. Scherer
15 *
16 * The current code in this file replaces the previous implementation
17 * of conversion code from multi-byte codepages to Unicode and back.
18 * This implementation supports the following:
19 * - legacy variable-length codepages with up to 4 bytes per character
20 * - all Unicode code points (up to 0x10ffff)
21 * - efficient distinction of unassigned vs. illegal byte sequences
22 * - it is possible in fromUnicode() to directly deal with simple
23 * stateful encodings (used for EBCDIC_STATEFUL)
24 * - it is possible to convert Unicode code points other than U+0000
25 * to a single zero byte (but not as a fallback except for SBCS)
26 *
27 * Remaining limitations in fromUnicode:
28 * - byte sequences must not have leading zero bytes
29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30 * - limitation to up to 4 bytes per character
31 *
32 * Change history:
33 *
34 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
35 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
36 * macros to ucnvmbcs.h file
37 */
38
39 #include "unicode/utypes.h"
40
41 #if !UCONFIG_NO_LEGACY_CONVERSION
42
43 #include "unicode/ucnv.h"
44 #include "unicode/ucnv_cb.h"
45 #include "unicode/udata.h"
46 #include "unicode/uset.h"
47 #include "ucnv_bld.h"
48 #include "ucnvmbcs.h"
49 #include "ucnv_cnv.h"
50 #include "umutex.h"
51 #include "cmemory.h"
52 #include "cstring.h"
53
54 /* control optimizations according to the platform */
55 #define MBCS_UNROLL_SINGLE_TO_BMP 1
56 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
57
58 /*
59 * _MBCSHeader versions 4.1
60 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
61 *
62 * Change from version 4.0:
63 * - Replace header.reserved with header.fromUBytesLength so that all
64 * fields in the data have length.
65 *
66 * Changes from version 3 (for performance improvements):
67 * - new bit distribution for state table entries
68 * - reordered action codes
69 * - new data structure for single-byte fromUnicode
70 * + stage 2 only contains indexes
71 * + stage 3 stores 16 bits per character with classification bits 15..8
72 * - no multiplier for stage 1 entries
73 * - stage 2 for non-single-byte codepages contains the index and the flags in
74 * one 32-bit value
75 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
76 *
77 * For more details about old versions of the MBCS data structure, see
78 * the corresponding versions of this file.
79 *
80 * Converting stateless codepage data ---------------------------------------***
81 * (or codepage data with simple states) to Unicode.
82 *
83 * Data structure and algorithm for converting from complex legacy codepages
84 * to Unicode. (Designed before 2000-may-22.)
85 *
86 * The basic idea is that the structure of legacy codepages can be described
87 * with state tables.
88 * When reading a byte stream, each input byte causes a state transition.
89 * Some transitions result in the output of a code point, some result in
90 * "unassigned" or "illegal" output.
91 * This is used here for character conversion.
92 *
93 * The data structure begins with a state table consisting of a row
94 * per state, with 256 entries (columns) per row for each possible input
95 * byte value.
96 * Each entry is 32 bits wide, with two formats distinguished by
97 * the sign bit (bit 31):
98 *
99 * One format for transitional entries (bit 31 not set) for non-final bytes, and
100 * one format for final entries (bit 31 set).
101 * Both formats contain the number of the next state in the same bit
102 * positions.
103 * State 0 is the initial state.
104 *
105 * Most of the time, the offset values of subsequent states are added
106 * up to a scalar value. This value will eventually be the index of
107 * the Unicode code point in a table that follows the state table.
108 * The effect is that the code points for final state table rows
109 * are contiguous. The code points of final state rows follow each other
110 * in the order of the references to those final states by previous
111 * states, etc.
112 *
113 * For some terminal states, the offset is itself the output Unicode
114 * code point (16 bits for a BMP code point or 20 bits for a supplementary
115 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
116 * For others, the code point in the Unicode table is stored with either
117 * one or two code units: one for BMP code points, two for a pair of
118 * surrogates.
119 * All code points for a final state entry take up the same number of code
120 * units, regardless of whether they all actually _use_ the same number
121 * of code units. This is necessary for simple array access.
122 *
123 * An additional feature comes in with what in ICU is called "fallback"
124 * mappings:
125 *
126 * In addition to round-trippable, precise, 1:1 mappings, there are often
127 * mappings defined between similar, though not the same, characters.
128 * Typically, such mappings occur only in fromUnicode mapping tables because
129 * Unicode has a superset repertoire of most other codepages. However, it
130 * is possible to provide such mappings in the toUnicode tables, too.
131 * In this case, the fallback mappings are partly integrated into the
132 * general state tables because the structure of the encoding includes their
133 * byte sequences.
134 * For final entries in an initial state, fallback mappings are stored in
135 * the entry itself like with roundtrip mappings.
136 * For other final entries, they are stored in the code units table if
137 * the entry is for a pair of code units.
138 * For single-unit results in the code units table, there is no space to
139 * alternatively hold a fallback mapping; in this case, the code unit
140 * is stored as U+fffe (unassigned), and the fallback mapping needs to
141 * be looked up by the scalar offset value in a separate table.
142 *
143 * "Unassigned" state entries really mean "structurally unassigned",
144 * i.e., such a byte sequence will never have a mapping result.
145 *
146 * The interpretation of the bits in each entry is as follows:
147 *
148 * Bit 31 not set, not a terminal entry ("transitional"):
149 * 30..24 next state
150 * 23..0 offset delta, to be added up
151 *
152 * Bit 31 set, terminal ("final") entry:
153 * 30..24 next state (regardless of action code)
154 * 23..20 action code:
155 * action codes 0 and 1 result in precise-mapping Unicode code points
156 * 0 valid byte sequence
157 * 19..16 not used, 0
158 * 15..0 16-bit Unicode BMP code point
159 * never U+fffe or U+ffff
160 * 1 valid byte sequence
161 * 19..0 20-bit Unicode supplementary code point
162 * never U+fffe or U+ffff
163 *
164 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
165 * 2 valid byte sequence (fallback)
166 * 19..16 not used, 0
167 * 15..0 16-bit Unicode BMP code point as fallback result
168 * 3 valid byte sequence (fallback)
169 * 19..0 20-bit Unicode supplementary code point as fallback result
170 *
171 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
172 * depending on the code units they result in
173 * 4 valid byte sequence
174 * 19..9 not used, 0
175 * 8..0 final offset delta
176 * pointing to one 16-bit code unit which may be
177 * fffe unassigned -- look for a fallback for this offset
178 * ffff illegal
179 * 5 valid byte sequence
180 * 19..9 not used, 0
181 * 8..0 final offset delta
182 * pointing to two 16-bit code units
183 * (typically UTF-16 surrogates)
184 * the result depends on the first code unit as follows:
185 * 0000..d7ff roundtrip BMP code point (1st alone)
186 * d800..dbff roundtrip surrogate pair (1st, 2nd)
187 * dc00..dfff fallback surrogate pair (1st-400, 2nd)
188 * e000 roundtrip BMP code point (2nd alone)
189 * e001 fallback BMP code point (2nd alone)
190 * fffe unassigned
191 * ffff illegal
192 * (the final offset deltas are at most 255 * 2,
193 * times 2 because of storing code unit pairs)
194 *
195 * 6 unassigned byte sequence
196 * 19..16 not used, 0
197 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
198 * this does not contain a final offset delta because the main
199 * purpose of this action code is to save scalar offset values;
200 * therefore, fallback values cannot be assigned to byte
201 * sequences that result in this action code
202 * 7 illegal byte sequence
203 * 19..16 not used, 0
204 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
205 * 8 state change only
206 * 19..0 not used, 0
207 * useful for state changes in simple stateful encodings,
208 * at Shift-In/Shift-Out codes
209 *
210 *
211 * 9..15 reserved for future use
212 * current implementations will only perform a state change
213 * and ignore bits 19..0
214 *
215 * An encoding with contiguous ranges of unassigned byte sequences, like
216 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
217 * at least two states for the trail bytes:
218 * One trail byte state that results in code points, and one that only
219 * has "unassigned" and "illegal" terminal states.
220 *
221 * Note: partly by accident, this data structure supports simple stateless
222 * encodings without any additional logic.
223 * Currently, only simple Shift-In/Shift-Out schemes are handled with
224 * appropriate state tables (especially EBCDIC_STATEFUL!).
225 *
226 * MBCS version 2 added:
227 * unassigned and illegal action codes have U+fffe and U+ffff
228 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
229 *
230 * Converting from Unicode to codepage bytes --------------------------------***
231 *
232 * The conversion data structure for fromUnicode is designed for the known
233 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
234 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
235 * a roundtrip mapping.
236 *
237 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
238 * like in the character properties table.
239 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
240 * with the resulting bytes is at offsetFromUBytes.
241 *
242 * Beginning with version 4, single-byte codepages have a significantly different
243 * trie compared to other codepages.
244 * In all cases, the entry in stage 1 is directly the index of the block of
245 * 64 entries in stage 2.
246 *
247 * Single-byte lookup:
248 *
249 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
250 * Stage 3 contains one 16-bit word per result:
251 * Bits 15..8 indicate the kind of result:
252 * f roundtrip result
253 * c fallback result from private-use code point
254 * 8 fallback result from other code points
255 * 0 unassigned
256 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
257 *
258 * Multi-byte lookup:
259 *
260 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
261 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
262 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
263 * If this test is false, then a non-zero result will be interpreted as
264 * a fallback mapping.
265 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
266 *
267 * Stage 3 contains 2, 3, or 4 bytes per result.
268 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
269 * while 3 bytes are stored as bytes in big-endian order.
270 * Leading zero bytes are ignored, and the number of bytes is counted.
271 * A zero byte mapping result is possible as a roundtrip result.
272 * For some output types, the actual result is processed from this;
273 * see _MBCSFromUnicodeWithOffsets().
274 *
275 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
276 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
277 *
278 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
279 * for compaction.
280 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
281 * may overlap by any number of entries.
282 *
283 * MBCS version 2 added:
284 * the converter checks for known output types, which allows
285 * adding new ones without crashing an unaware converter
286 */
287
288 /* prototypes --------------------------------------------------------------- */
289
290 static void
291 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
292 UErrorCode *pErrorCode);
293
294 static void
295 _MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
296 UErrorCode *pErrorCode);
297
298 static UChar32
299 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
300 UErrorCode *pErrorCode);
301
302 static UChar32
303 _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
304 UErrorCode *pErrorCode);
305
306 static void
307 _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
308 UErrorCode *pErrorCode);
309
310 static void
311 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
312 UErrorCode *pErrorCode);
313
314 static void
315 _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
316 UErrorCode *pErrorCode);
317
318 static void
319 fromUCallback(UConverter *cnv,
320 const void *context, UConverterFromUnicodeArgs *pArgs,
321 UChar32 codePoint,
322 UConverterCallbackReason reason, UErrorCode *pErrorCode);
323
324 static void
325 toUCallback(UConverter *cnv,
326 const void *context, UConverterToUnicodeArgs *pArgs,
327 const char *codeUnits, int32_t length,
328 UConverterCallbackReason reason, UErrorCode *pErrorCode);
329
330 /* GB 18030 data ------------------------------------------------------------ */
331
/*
 * Helper macros for linear values for GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) folds the four bytes of a sequence into one
 * number using the mixed radix 10 / 126 / 10 for bytes 2 / 3 / 4:
 *     ((a*10 + b)*126 + c)*10 + d
 * Differences between two such values are meaningful; the absolute value
 * is offset by a constant (see LINEAR_18030_BASE).
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

/* linear value of the four-byte sequence 81 30 81 30 */
#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * LINEAR(x) takes a four-byte sequence packed into one 32-bit value
 * (first byte in bits 31..24) and returns its linear value.
 * The argument is fully parenthesized so that expressions such as
 * LINEAR(hi|lo) expand correctly; it is evaluated more than once and
 * must therefore be free of side effects.
 */
#define LINEAR(x) LINEAR_18030(((x)>>24), (((x)>>16)&0xff), (((x)>>8)&0xff), ((x)&0xff))

/*
 * Some ranges of GB 18030 where both the Unicode code points and the
 * GB four-byte sequences are contiguous and are handled algorithmically by
 * the special callback functions below.
 * Each row holds: Unicode start, Unicode end, linear GB start, linear GB end.
 *
 * Note that single surrogates are not mapped by GB 18030
 * as of the re-released mapping tables from 2000-nov-30.
 */
static const uint32_t
gb18030Ranges[13][4]={
    {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
    {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
    {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
    {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
    {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
    {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
    {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
    {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
    {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
    {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
    {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
    {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
    {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
};

/* bit flag for UConverter.options indicating GB 18030 special handling */
#define _MBCS_OPTION_GB18030 0x8000
367
368 /* Miscellaneous ------------------------------------------------------------ */
369
370 static uint32_t
371 _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) {
372 const uint16_t *table;
373
374 uint32_t st3, maxStage3;
375 uint16_t st1, maxStage1, st2;
376
377 if(mbcsTable->fromUBytesLength>0) {
378 /*
379 * We _know_ the number of bytes in the fromUnicodeBytes array
380 * starting with header.version 4.1.
381 * Otherwise, below, we need to enumerate the fromUnicode
382 * trie and find the highest entry.
383 */
384 return mbcsTable->fromUBytesLength;
385 }
386
387 /* Enumerate the from-Unicode trie table to find the highest stage 3 index. */
388 table=mbcsTable->fromUnicodeTable;
389 maxStage3=0;
390 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
391 maxStage1=0x440;
392 } else {
393 maxStage1=0x40;
394 }
395
396
397 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
398 const uint16_t *stage2;
399
400 for(st1=0; st1<maxStage1; ++st1) {
401 st2=table[st1];
402 if(st2>maxStage1) {
403 stage2=table+st2;
404 for(st2=0; st2<64; ++st2) {
405 st3=stage2[st2];
406 if(st3>maxStage3) {
407 maxStage3=st3;
408 }
409 }
410 }
411 }
412
413 /*
414 * add 16 to get the limit not start index of the last stage 3 block,
415 * times 2 for number of bytes
416 */
417 return (maxStage3+16)*2;
418 } else {
419 const uint32_t *stage2;
420
421 for(st1=0; st1<maxStage1; ++st1) {
422 st2=table[st1];
423 if(st2>(maxStage1>>1)) {
424 stage2=(const uint32_t *)table+st2;
425 for(st2=0; st2<64; ++st2) {
426 st3=stage2[st2]&0xffff;
427 if(st3>maxStage3) {
428 maxStage3=st3;
429 }
430 }
431 }
432 }
433
434 /*
435 * add 16 to get the limit not start index of the last stage 3 block,
436 * times 2..4 for number of bytes
437 */
438 maxStage3=16*maxStage3+16;
439 switch(mbcsTable->outputType) {
440 case MBCS_OUTPUT_3:
441 case MBCS_OUTPUT_4_EUC:
442 maxStage3*=3;
443 break;
444 case MBCS_OUTPUT_4:
445 maxStage3*=4;
446 break;
447 default:
448 /* MBCS_OUTPUT_2... and MBCS_OUTPUT_3_EUC */
449 maxStage3*=2;
450 break;
451 }
452 return maxStage3;
453 }
454 }
455
456 static void
457 _MBCSGetUnicodeSet(const UConverter *cnv,
458 USet *set,
459 UConverterUnicodeSet which,
460 UErrorCode *pErrorCode) {
461 UConverterMBCSTable *mbcsTable;
462 const uint16_t *table;
463
464 uint32_t st3;
465 uint16_t st1, maxStage1, st2;
466
467 UChar32 c;
468
469 if(cnv->options&_MBCS_OPTION_GB18030) {
470 uset_addRange(set, 0, 0xd7ff);
471 uset_addRange(set, 0xe000, 0x10ffff);
472 return;
473 }
474
475 /* enumerate the from-Unicode trie table */
476 mbcsTable=&cnv->sharedData->table->mbcs;
477 table=mbcsTable->fromUnicodeTable;
478 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
479 maxStage1=0x440;
480 } else {
481 maxStage1=0x40;
482 }
483
484 c=0; /* keep track of the current code point while enumerating */
485
486 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
487 const uint16_t *stage2, *stage3, *results;
488
489 results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
490
491 for(st1=0; st1<maxStage1; ++st1) {
492 st2=table[st1];
493 if(st2>maxStage1) {
494 stage2=table+st2;
495 for(st2=0; st2<64; ++st2) {
496 if((st3=stage2[st2])!=0) {
497 /* read the stage 3 block */
498 stage3=results+st3;
499
500 /*
501 * Add code points for which the roundtrip flag is set.
502 * Once we get a set for fallback mappings, we have to use
503 * a threshold variable with a value of 0x800.
504 * See _MBCSSingleFromBMPWithOffsets() and
505 * MBCS_SINGLE_RESULT_FROM_U() for details.
506 */
507 do {
508 if(*stage3++>=0xf00) {
509 uset_add(set, c);
510 }
511 } while((++c&0xf)!=0);
512 } else {
513 c+=16; /* empty stage 3 block */
514 }
515 }
516 } else {
517 c+=1024; /* empty stage 2 block */
518 }
519 }
520 } else {
521 const uint32_t *stage2;
522
523 for(st1=0; st1<maxStage1; ++st1) {
524 st2=table[st1];
525 if(st2>(maxStage1>>1)) {
526 stage2=(const uint32_t *)table+st2;
527 for(st2=0; st2<64; ++st2) {
528 if((st3=stage2[st2])!=0) {
529 /* get the roundtrip flags for the stage 3 block */
530 st3>>=16;
531
532 /*
533 * Add code points for which the roundtrip flag is set.
534 * Once we get a set for fallback mappings, we have to check
535 * non-roundtrip stage 3 results for whether they are 0.
536 * See _MBCSFromUnicodeWithOffsets() for details.
537 */
538 do {
539 if(st3&1) {
540 uset_add(set, c);
541 }
542 st3>>=1;
543 } while((++c&0xf)!=0);
544 } else {
545 c+=16; /* empty stage 3 block */
546 }
547 }
548 } else {
549 c+=1024; /* empty stage 2 block */
550 }
551 }
552 }
553 }
554
555 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
556
557 /*
558 * This code modifies a standard EBCDIC<->Unicode mapping table for
559 * OS/390 (z/OS) Unix System Services (Open Edition).
560 * The difference is in the mapping of Line Feed and New Line control codes:
561 * Standard EBCDIC maps
562 *
563 * <U000A> \x25 |0
564 * <U0085> \x15 |0
565 *
566 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
567 * mapping
568 *
569 * <U000A> \x15 |0
570 * <U0085> \x25 |0
571 *
572 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
573 * by copying it into allocated memory and swapping the LF and NL values.
574 * This makes it possible to support the same EBCDIC charset in both versions without
575 * duplicating the entire installed table.
576 */
577
578 /* standard EBCDIC codes */
579 #define EBCDIC_LF 0x25
580 #define EBCDIC_NL 0x15
581
582 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
583 #define EBCDIC_RT_LF 0xf25
584 #define EBCDIC_RT_NL 0xf15
585
586 /* Unicode code points */
587 #define U_LF 0x0a
588 #define U_NL 0x85
589
590 static UBool
591 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
592 UConverterMBCSTable *mbcsTable;
593
594 const uint16_t *table, *results;
595 const uint8_t *bytes;
596
597 int32_t (*newStateTable)[256];
598 uint16_t *newResults;
599 uint8_t *p;
600 char *name;
601
602 uint32_t stage2Entry;
603 uint32_t size, sizeofFromUBytes;
604
605 mbcsTable=&sharedData->table->mbcs;
606
607 table=mbcsTable->fromUnicodeTable;
608 bytes=mbcsTable->fromUnicodeBytes;
609 results=(const uint16_t *)bytes;
610
611 /*
612 * Check that this is an EBCDIC table with SBCS portion -
613 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
614 *
615 * If not, ignore the option. Options are always ignored if they do not apply.
616 */
617 if(!(
618 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
619 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
620 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
621 )) {
622 return FALSE;
623 }
624
625 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
626 if(!(
627 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
628 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
629 )) {
630 return FALSE;
631 }
632 } else /* MBCS_OUTPUT_2_SISO */ {
633 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
634 if(!(
635 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
636 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
637 )) {
638 return FALSE;
639 }
640
641 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
642 if(!(
643 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
644 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
645 )) {
646 return FALSE;
647 }
648 }
649
650 /*
651 * The table has an appropriate format.
652 * Allocate and build
653 * - a modified to-Unicode state table
654 * - a modified from-Unicode output array
655 * - a converter name string with the swap option appended
656 */
657 sizeofFromUBytes=_MBCSSizeofFromUBytes(mbcsTable);
658 size=
659 mbcsTable->countStates*1024+
660 sizeofFromUBytes+
661 UCNV_MAX_CONVERTER_NAME_LENGTH+20;
662 p=(uint8_t *)uprv_malloc(size);
663 if(p==NULL) {
664 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
665 return FALSE;
666 }
667
668 /* copy and modify the to-Unicode state table */
669 newStateTable=(int32_t (*)[256])p;
670 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
671
672 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
673 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
674
675 /* copy and modify the from-Unicode result table */
676 newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
677 uprv_memcpy(newResults, bytes, sizeofFromUBytes);
678
679 /* conveniently, the table access macros work on the left side of expressions */
680 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
681 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
682 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
683 } else /* MBCS_OUTPUT_2_SISO */ {
684 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
685 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
686
687 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
688 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
689 }
690
691 /* set the canonical converter name */
692 name=(char *)newResults+sizeofFromUBytes;
693 uprv_strcpy(name, sharedData->staticData->name);
694 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
695
696 /* set the pointers */
697 umtx_lock(NULL);
698 if(mbcsTable->swapLFNLStateTable==NULL) {
699 mbcsTable->swapLFNLStateTable=newStateTable;
700 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
701 mbcsTable->swapLFNLName=name;
702
703 newStateTable=NULL;
704 }
705 umtx_unlock(NULL);
706
707 /* release the allocated memory if another thread beat us to it */
708 if(newStateTable!=NULL) {
709 uprv_free(newStateTable);
710 }
711 return TRUE;
712 }
713
714 /* MBCS setup functions ----------------------------------------------------- */
715
716 static void
717 _MBCSLoad(UConverterSharedData *sharedData,
718 const uint8_t *raw,
719 UErrorCode *pErrorCode) {
720 UDataInfo info;
721 UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
722 _MBCSHeader *header=(_MBCSHeader *)raw;
723
724 if(header->version[0]!=4) {
725 *pErrorCode=U_INVALID_TABLE_FORMAT;
726 return;
727 }
728
729 mbcsTable->countStates=(uint8_t)header->countStates;
730 mbcsTable->countToUFallbacks=header->countToUFallbacks;
731 mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader));
732 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
733 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
734
735 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
736 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
737 mbcsTable->fromUBytesLength=header->fromUBytesLength;
738 mbcsTable->outputType=(uint8_t)header->flags;
739
740 /* make sure that the output type is known */
741 switch(mbcsTable->outputType) {
742 case MBCS_OUTPUT_1:
743 case MBCS_OUTPUT_2:
744 case MBCS_OUTPUT_3:
745 case MBCS_OUTPUT_4:
746 case MBCS_OUTPUT_3_EUC:
747 case MBCS_OUTPUT_4_EUC:
748 case MBCS_OUTPUT_2_SISO:
749 /* OK */
750 break;
751 default:
752 *pErrorCode=U_INVALID_TABLE_FORMAT;
753 return;
754 }
755
756 /*
757 * converter versions 6.1 and up contain a unicodeMask that is
758 * used here to select the most efficient function implementations
759 */
760 info.size=sizeof(UDataInfo);
761 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
762 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
763 /* mask off possible future extensions to be safe */
764 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
765 } else {
766 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
767 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
768 }
769 }
770
771 static void
772 _MBCSUnload(UConverterSharedData *sharedData) {
773 UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
774
775 if(mbcsTable->swapLFNLStateTable!=NULL) {
776 uprv_free(mbcsTable->swapLFNLStateTable);
777 }
778 }
779
780 static void
781 _MBCSReset(UConverter *cnv, UConverterResetChoice choice) {
782 if(choice<=UCNV_RESET_TO_UNICODE) {
783 /* toUnicode */
784 cnv->toUnicodeStatus=0; /* offset */
785 cnv->mode=0; /* state */
786 cnv->toULength=0; /* byteIndex */
787 }
788 if(choice!=UCNV_RESET_TO_UNICODE) {
789 /* fromUnicode */
790 cnv->fromUSurrogateLead=0;
791 cnv->fromUnicodeStatus=1; /* prevLength */
792 }
793 }
794
795 static void
796 _MBCSOpen(UConverter *cnv,
797 const char *name,
798 const char *locale,
799 uint32_t options,
800 UErrorCode *pErrorCode) {
801 if((options&UCNV_OPTION_SWAP_LFNL)!=0) {
802 /* do this because double-checked locking is broken */
803 UBool isCached;
804
805 umtx_lock(NULL);
806 isCached=cnv->sharedData->table->mbcs.swapLFNLStateTable!=NULL;
807 umtx_unlock(NULL);
808
809 if(!isCached) {
810 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
811 /* the option does not apply, remove it */
812 cnv->options&=~UCNV_OPTION_SWAP_LFNL;
813 }
814 }
815 }
816
817
818 if(uprv_strstr(name, "18030")!=NULL) {
819 if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) {
820 /* set a flag for GB 18030 mode, which changes the callback behavior */
821 cnv->options|=_MBCS_OPTION_GB18030;
822 }
823 }
824
825 _MBCSReset(cnv, UCNV_RESET_BOTH);
826 }
827
828 static const char *
829 _MBCSGetName(const UConverter *cnv) {
830 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->table->mbcs.swapLFNLName!=NULL) {
831 return cnv->sharedData->table->mbcs.swapLFNLName;
832 } else {
833 return cnv->sharedData->staticData->name;
834 }
835 }
836
837 /* MBCS-to-Unicode conversion functions ------------------------------------- */
838
839 static UChar32
840 _MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
841 const _MBCSToUFallback *toUFallbacks;
842 uint32_t i, start, limit;
843
844 limit=mbcsTable->countToUFallbacks;
845 if(limit>0) {
846 /* do a binary search for the fallback mapping */
847 toUFallbacks=mbcsTable->toUFallbacks;
848 start=0;
849 while(start<limit-1) {
850 i=(start+limit)/2;
851 if(offset<toUFallbacks[i].offset) {
852 limit=i;
853 } else {
854 start=i;
855 }
856 }
857
858 /* did we really find it? */
859 if(offset==toUFallbacks[start].offset) {
860 return toUFallbacks[start].codePoint;
861 }
862 }
863
864 return 0xfffe;
865 }
866
867 U_CFUNC void
868 _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
869 UErrorCode *pErrorCode) {
870 UConverter *cnv;
871 const uint8_t *source, *sourceLimit;
872 UChar *target;
873 const UChar *targetLimit;
874 int32_t *offsets;
875
876 const int32_t (*stateTable)[256];
877 const uint16_t *unicodeCodeUnits;
878
879 uint32_t offset;
880 uint8_t state;
881 int8_t byteIndex;
882 uint8_t *bytes;
883
884 int32_t sourceIndex, nextSourceIndex;
885
886 int32_t entry;
887 UChar c;
888 uint8_t action;
889 UConverterCallbackReason reason;
890
891 /* use optimized function if possible */
892 cnv=pArgs->converter;
893 if(cnv->sharedData->table->mbcs.countStates==1) {
894 if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
895 _MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
896 } else {
897 _MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
898 }
899 return;
900 }
901
902 /* set up the local pointers */
903 source=(const uint8_t *)pArgs->source;
904 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
905 target=pArgs->target;
906 targetLimit=pArgs->targetLimit;
907 offsets=pArgs->offsets;
908
909 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
910 stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
911 } else {
912 stateTable=cnv->sharedData->table->mbcs.stateTable;
913 }
914 unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
915
916 /* get the converter state from UConverter */
917 offset=cnv->toUnicodeStatus;
918 state=(uint8_t)(cnv->mode);
919 byteIndex=cnv->toULength;
920 bytes=cnv->toUBytes;
921
922 /* sourceIndex=-1 if the current character began in the previous buffer */
923 sourceIndex=byteIndex==0 ? 0 : -1;
924 nextSourceIndex=0;
925
926 /* conversion loop */
927 while(source<sourceLimit) {
928 /*
929 * This following test is to see if available input would overflow the output.
930 * It does not catch output of more than one code unit that
931 * overflows as a result of a surrogate pair or callback output
932 * from the last source byte.
933 * Therefore, those situations also test for overflows and will
934 * then break the loop, too.
935 */
936 if(target<targetLimit) {
937 ++nextSourceIndex;
938 entry=stateTable[state][bytes[byteIndex++]=*source++];
939 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
940 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
941 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
942 } else {
943 /* set the next state early so that we can reuse the entry variable */
944 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
945
946 /*
947 * An if-else-if chain provides more reliable performance for
948 * the most common cases compared to a switch.
949 */
950 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
951 if(action==MBCS_STATE_VALID_16) {
952 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
953 c=unicodeCodeUnits[offset];
954 if(c<0xfffe) {
955 /* output BMP code point */
956 *target++=c;
957 if(offsets!=NULL) {
958 *offsets++=sourceIndex;
959 }
960 } else if(c==0xfffe) {
961 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
962 /* output fallback BMP code point */
963 *target++=(UChar)entry;
964 if(offsets!=NULL) {
965 *offsets++=sourceIndex;
966 }
967 } else {
968 /* callback(unassigned) */
969 goto unassigned;
970 }
971 } else {
972 /* callback(illegal) */
973 goto illegal;
974 }
975 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
976 /* output BMP code point */
977 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
978 if(offsets!=NULL) {
979 *offsets++=sourceIndex;
980 }
981 } else if(action==MBCS_STATE_VALID_16_PAIR) {
982 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
983 c=unicodeCodeUnits[offset++];
984 if(c<0xd800) {
985 /* output BMP code point below 0xd800 */
986 *target++=c;
987 if(offsets!=NULL) {
988 *offsets++=sourceIndex;
989 }
990 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
991 /* output roundtrip or fallback surrogate pair */
992 *target++=(UChar)(c&0xdbff);
993 if(offsets!=NULL) {
994 *offsets++=sourceIndex;
995 }
996 if(target<targetLimit) {
997 *target++=unicodeCodeUnits[offset];
998 if(offsets!=NULL) {
999 *offsets++=sourceIndex;
1000 }
1001 } else {
1002 /* target overflow */
1003 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
1004 cnv->UCharErrorBufferLength=1;
1005 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1006
1007 offset=0;
1008 byteIndex=0;
1009 break;
1010 }
1011 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
1012 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1013 *target++=unicodeCodeUnits[offset];
1014 if(offsets!=NULL) {
1015 *offsets++=sourceIndex;
1016 }
1017 } else if(c==0xffff) {
1018 /* callback(illegal) */
1019 goto illegal;
1020 } else {
1021 /* callback(unassigned) */
1022 goto unassigned;
1023 }
1024 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
1025 valid20:
1026 entry=MBCS_ENTRY_FINAL_VALUE(entry);
1027 /* output surrogate pair */
1028 *target++=(UChar)(0xd800|(UChar)(entry>>10));
1029 if(offsets!=NULL) {
1030 *offsets++=sourceIndex;
1031 }
1032 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1033 if(target<targetLimit) {
1034 *target++=c;
1035 if(offsets!=NULL) {
1036 *offsets++=sourceIndex;
1037 }
1038 } else {
1039 /* target overflow */
1040 cnv->UCharErrorBuffer[0]=c;
1041 cnv->UCharErrorBufferLength=1;
1042 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1043
1044 offset=0;
1045 byteIndex=0;
1046 break;
1047 }
1048 } else if(action==MBCS_STATE_CHANGE_ONLY) {
1049 /*
1050 * This serves as a state change without any output.
1051 * It is useful for reading simple stateful encodings,
1052 * for example using just Shift-In/Shift-Out codes.
1053 * The 21 unused bits may later be used for more sophisticated
1054 * state transitions.
1055 */
1056 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1057 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1058 /* callback(unassigned) */
1059 goto unassigned;
1060 }
1061 /* output BMP code point */
1062 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1063 if(offsets!=NULL) {
1064 *offsets++=sourceIndex;
1065 }
1066 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1067 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1068 /* callback(unassigned) */
1069 goto unassigned;
1070 }
1071 goto valid20;
1072 } else if(action==MBCS_STATE_UNASSIGNED) {
1073 /* callback(unassigned) */
1074 goto unassigned;
1075 } else if(action==MBCS_STATE_ILLEGAL) {
1076 /* callback(illegal) */
1077 goto illegal;
1078 } else {
1079 /* reserved, must never occur */
1080 }
1081
1082 /* normal end of action codes: prepare for a new character */
1083 offset=0;
1084 byteIndex=0;
1085 sourceIndex=nextSourceIndex;
1086 continue;
1087
1088 illegal:
1089 reason=UCNV_ILLEGAL;
1090 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1091 goto callback;
1092 unassigned:
1093 reason=UCNV_UNASSIGNED;
1094 *pErrorCode=U_INVALID_CHAR_FOUND;
1095 callback:
1096 /* call the callback function with all the preparations and post-processing */
1097 /* update the arguments structure */
1098 pArgs->source=(const char *)source;
1099 pArgs->target=target;
1100 pArgs->offsets=offsets;
1101
1102 /* set the converter state in UConverter to deal with the next character */
1103 cnv->toUnicodeStatus=0;
1104 cnv->mode=state;
1105 cnv->toULength=0;
1106
1107 /* call the callback function */
1108 toUCallback(cnv, cnv->toUContext, pArgs, (const char *)bytes, byteIndex, reason, pErrorCode);
1109
1110 /* get the converter state from UConverter */
1111 offset=cnv->toUnicodeStatus;
1112 state=(uint8_t)cnv->mode;
1113 byteIndex=cnv->toULength;
1114
1115 /* update target and deal with offsets if necessary */
1116 offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
1117 target=pArgs->target;
1118
1119 /* update the source pointer and index */
1120 sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
1121 source=(const uint8_t *)pArgs->source;
1122
1123 /*
1124 * If the callback overflowed the target, then we need to
1125 * stop here with an overflow indication.
1126 */
1127 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1128 break;
1129 } else if(U_FAILURE(*pErrorCode)) {
1130 /* break on error */
1131 offset=0;
1132 state=0;
1133 byteIndex=0;
1134 break;
1135 } else if(cnv->UCharErrorBufferLength>0) {
1136 /* target is full */
1137 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1138 break;
1139 }
1140
1141 /*
1142 * We do not need to repeat the statements from the normal
1143 * end of the action codes because we already updated all the
1144 * necessary variables.
1145 */
1146 }
1147 } else {
1148 /* target is full */
1149 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1150 break;
1151 }
1152 }
1153
1154 if(pArgs->flush && source>=sourceLimit) {
1155 /* reset the state for the next conversion */
1156 if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
1157 /* a character byte sequence remains incomplete */
1158 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1159 }
1160 cnv->toUnicodeStatus=0;
1161 cnv->mode=0;
1162 cnv->toULength=0;
1163 } else {
1164 /* set the converter state back into UConverter */
1165 cnv->toUnicodeStatus=offset;
1166 cnv->mode=state;
1167 cnv->toULength=byteIndex;
1168 }
1169
1170 /* write back the updated pointers */
1171 pArgs->source=(const char *)source;
1172 pArgs->target=target;
1173 pArgs->offsets=offsets;
1174 }
1175
1176 /* This version of _MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1177 static void
1178 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1179 UErrorCode *pErrorCode) {
1180 UConverter *cnv;
1181 const uint8_t *source, *sourceLimit;
1182 UChar *target;
1183 const UChar *targetLimit;
1184 int32_t *offsets;
1185
1186 const int32_t (*stateTable)[256];
1187
1188 int32_t sourceIndex, nextSourceIndex;
1189
1190 int32_t entry;
1191 UChar c;
1192 uint8_t action;
1193 UConverterCallbackReason reason;
1194
1195 /* set up the local pointers */
1196 cnv=pArgs->converter;
1197 source=(const uint8_t *)pArgs->source;
1198 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1199 target=pArgs->target;
1200 targetLimit=pArgs->targetLimit;
1201 offsets=pArgs->offsets;
1202
1203 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1204 stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1205 } else {
1206 stateTable=cnv->sharedData->table->mbcs.stateTable;
1207 }
1208
1209 /* sourceIndex=-1 if the current character began in the previous buffer */
1210 sourceIndex=0;
1211 nextSourceIndex=0;
1212
1213 /* conversion loop */
1214 while(source<sourceLimit) {
1215 /*
1216 * This following test is to see if available input would overflow the output.
1217 * It does not catch output of more than one code unit that
1218 * overflows as a result of a surrogate pair or callback output
1219 * from the last source byte.
1220 * Therefore, those situations also test for overflows and will
1221 * then break the loop, too.
1222 */
1223 if(target<targetLimit) {
1224 ++nextSourceIndex;
1225 entry=stateTable[0][*source++];
1226 /* MBCS_ENTRY_IS_FINAL(entry) */
1227
1228 /* test the most common case first */
1229 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1230 /* output BMP code point */
1231 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1232 if(offsets!=NULL) {
1233 *offsets++=sourceIndex;
1234 }
1235
1236 /* normal end of action codes: prepare for a new character */
1237 sourceIndex=nextSourceIndex;
1238 continue;
1239 }
1240
1241 /*
1242 * An if-else-if chain provides more reliable performance for
1243 * the most common cases compared to a switch.
1244 */
1245 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1246 if(action==MBCS_STATE_VALID_DIRECT_20) {
1247 valid20:
1248 entry=MBCS_ENTRY_FINAL_VALUE(entry);
1249 /* output surrogate pair */
1250 *target++=(UChar)(0xd800|(UChar)(entry>>10));
1251 if(offsets!=NULL) {
1252 *offsets++=sourceIndex;
1253 }
1254 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1255 if(target<targetLimit) {
1256 *target++=c;
1257 if(offsets!=NULL) {
1258 *offsets++=sourceIndex;
1259 }
1260 } else {
1261 /* target overflow */
1262 cnv->UCharErrorBuffer[0]=c;
1263 cnv->UCharErrorBufferLength=1;
1264 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1265 break;
1266 }
1267 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1268 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1269 /* callback(unassigned) */
1270 goto unassigned;
1271 }
1272 /* output BMP code point */
1273 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1274 if(offsets!=NULL) {
1275 *offsets++=sourceIndex;
1276 }
1277 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1278 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1279 /* callback(unassigned) */
1280 goto unassigned;
1281 }
1282 goto valid20;
1283 } else if(action==MBCS_STATE_UNASSIGNED) {
1284 /* callback(unassigned) */
1285 goto unassigned;
1286 } else if(action==MBCS_STATE_ILLEGAL) {
1287 /* callback(illegal) */
1288 reason=UCNV_ILLEGAL;
1289 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1290 goto callback;
1291 } else {
1292 /* reserved, must never occur */
1293 }
1294
1295 /* normal end of action codes: prepare for a new character */
1296 sourceIndex=nextSourceIndex;
1297 continue;
1298
1299 unassigned:
1300 reason=UCNV_UNASSIGNED;
1301 *pErrorCode=U_INVALID_CHAR_FOUND;
1302 callback:
1303 /* call the callback function with all the preparations and post-processing */
1304 /* update the arguments structure */
1305 pArgs->source=(const char *)source;
1306 pArgs->target=target;
1307 pArgs->offsets=offsets;
1308
1309 /* call the callback function */
1310 toUCallback(cnv, cnv->toUContext, pArgs, (const char *)(source-1), 1, reason, pErrorCode);
1311
1312 /* update target and deal with offsets if necessary */
1313 offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
1314 target=pArgs->target;
1315
1316 /* update the source pointer and index */
1317 sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
1318 source=(const uint8_t *)pArgs->source;
1319
1320 /*
1321 * If the callback overflowed the target, then we need to
1322 * stop here with an overflow indication.
1323 */
1324 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1325 break;
1326 } else if(U_FAILURE(*pErrorCode)) {
1327 /* break on error */
1328 break;
1329 } else if(cnv->UCharErrorBufferLength>0) {
1330 /* target is full */
1331 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1332 break;
1333 }
1334
1335 /*
1336 * We do not need to repeat the statements from the normal
1337 * end of the action codes because we already updated all the
1338 * necessary variables.
1339 */
1340 } else {
1341 /* target is full */
1342 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1343 break;
1344 }
1345 }
1346
1347 /* write back the updated pointers */
1348 pArgs->source=(const char *)source;
1349 pArgs->target=target;
1350 pArgs->offsets=offsets;
1351 }
1352
1353 /*
1354 * This version of _MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1355 * that only map to and from the BMP.
1356 * In addition to single-byte optimizations, the offset calculations
1357 * become much easier.
1358 */
1359 static void
1360 _MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
1361 UErrorCode *pErrorCode) {
1362 UConverter *cnv;
1363 const uint8_t *source, *sourceLimit, *lastSource;
1364 UChar *target;
1365 int32_t targetCapacity, length;
1366 int32_t *offsets;
1367
1368 const int32_t (*stateTable)[256];
1369
1370 int32_t sourceIndex;
1371
1372 int32_t entry;
1373 uint8_t action;
1374 UConverterCallbackReason reason;
1375
1376 /* set up the local pointers */
1377 cnv=pArgs->converter;
1378 source=(const uint8_t *)pArgs->source;
1379 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1380 target=pArgs->target;
1381 targetCapacity=pArgs->targetLimit-pArgs->target;
1382 offsets=pArgs->offsets;
1383
1384 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1385 stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1386 } else {
1387 stateTable=cnv->sharedData->table->mbcs.stateTable;
1388 }
1389
1390 /* sourceIndex=-1 if the current character began in the previous buffer */
1391 sourceIndex=0;
1392 lastSource=source;
1393
1394 /*
1395 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1396 * for the minimum of the sourceLength and targetCapacity
1397 */
1398 length=sourceLimit-source;
1399 if(length<targetCapacity) {
1400 targetCapacity=length;
1401 }
1402
1403 #if MBCS_UNROLL_SINGLE_TO_BMP
1404 /* unrolling makes it faster on Pentium III/Windows 2000 */
1405 /* unroll the loop with the most common case */
1406 unrolled:
1407 if(targetCapacity>=16) {
1408 int32_t count, loops, oredEntries;
1409
1410 loops=count=targetCapacity>>4;
1411 do {
1412 oredEntries=entry=stateTable[0][*source++];
1413 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1414 oredEntries|=entry=stateTable[0][*source++];
1415 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1416 oredEntries|=entry=stateTable[0][*source++];
1417 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1418 oredEntries|=entry=stateTable[0][*source++];
1419 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1420 oredEntries|=entry=stateTable[0][*source++];
1421 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1422 oredEntries|=entry=stateTable[0][*source++];
1423 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1424 oredEntries|=entry=stateTable[0][*source++];
1425 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1426 oredEntries|=entry=stateTable[0][*source++];
1427 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1428 oredEntries|=entry=stateTable[0][*source++];
1429 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1430 oredEntries|=entry=stateTable[0][*source++];
1431 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1432 oredEntries|=entry=stateTable[0][*source++];
1433 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1434 oredEntries|=entry=stateTable[0][*source++];
1435 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1436 oredEntries|=entry=stateTable[0][*source++];
1437 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1438 oredEntries|=entry=stateTable[0][*source++];
1439 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1440 oredEntries|=entry=stateTable[0][*source++];
1441 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1442 oredEntries|=entry=stateTable[0][*source++];
1443 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1444
1445 /* were all 16 entries really valid? */
1446 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
1447 /* no, return to the first of these 16 */
1448 source-=16;
1449 target-=16;
1450 break;
1451 }
1452 } while(--count>0);
1453 count=loops-count;
1454 targetCapacity-=16*count;
1455
1456 if(offsets!=NULL) {
1457 lastSource+=16*count;
1458 while(count>0) {
1459 *offsets++=sourceIndex++;
1460 *offsets++=sourceIndex++;
1461 *offsets++=sourceIndex++;
1462 *offsets++=sourceIndex++;
1463 *offsets++=sourceIndex++;
1464 *offsets++=sourceIndex++;
1465 *offsets++=sourceIndex++;
1466 *offsets++=sourceIndex++;
1467 *offsets++=sourceIndex++;
1468 *offsets++=sourceIndex++;
1469 *offsets++=sourceIndex++;
1470 *offsets++=sourceIndex++;
1471 *offsets++=sourceIndex++;
1472 *offsets++=sourceIndex++;
1473 *offsets++=sourceIndex++;
1474 *offsets++=sourceIndex++;
1475 --count;
1476 }
1477 }
1478 }
1479 #endif
1480
1481 /* conversion loop */
1482 while(targetCapacity>0) {
1483 entry=stateTable[0][*source++];
1484 /* MBCS_ENTRY_IS_FINAL(entry) */
1485
1486 /* test the most common case first */
1487 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1488 /* output BMP code point */
1489 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1490 --targetCapacity;
1491 continue;
1492 }
1493
1494 /*
1495 * An if-else-if chain provides more reliable performance for
1496 * the most common cases compared to a switch.
1497 */
1498 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1499 if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1500 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1501 /* callback(unassigned) */
1502 reason=UCNV_UNASSIGNED;
1503 *pErrorCode=U_INVALID_CHAR_FOUND;
1504 }
1505 /* output BMP code point */
1506 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1507 --targetCapacity;
1508 continue;
1509 } else if(action==MBCS_STATE_UNASSIGNED) {
1510 /* callback(unassigned) */
1511 reason=UCNV_UNASSIGNED;
1512 *pErrorCode=U_INVALID_CHAR_FOUND;
1513 } else if(action==MBCS_STATE_ILLEGAL) {
1514 /* callback(illegal) */
1515 reason=UCNV_ILLEGAL;
1516 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1517 } else {
1518 /* reserved, must never occur */
1519 continue;
1520 }
1521
1522 /* call the callback function with all the preparations and post-processing */
1523 /* set offsets since the start or the last callback */
1524 if(offsets!=NULL) {
1525 int32_t count=(int32_t)(source-lastSource);
1526
1527 /* predecrement: do not set the offset for the callback-causing character */
1528 while(--count>0) {
1529 *offsets++=sourceIndex++;
1530 }
1531 /* offset and sourceIndex are now set for the current character */
1532 }
1533
1534 /* update the arguments structure */
1535 pArgs->source=(const char *)source;
1536 pArgs->target=target;
1537 pArgs->offsets=offsets;
1538
1539 /* call the callback function */
1540 toUCallback(cnv, cnv->toUContext, pArgs, (const char *)(source-1), 1, reason, pErrorCode);
1541
1542 /* update target and deal with offsets if necessary */
1543 offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
1544 target=pArgs->target;
1545
1546 /* update the source pointer and index */
1547 sourceIndex+=1+((const uint8_t *)pArgs->source-source);
1548 source=lastSource=(const uint8_t *)pArgs->source;
1549 targetCapacity=pArgs->targetLimit-target;
1550 length=sourceLimit-source;
1551 if(length<targetCapacity) {
1552 targetCapacity=length;
1553 }
1554
1555 /*
1556 * If the callback overflowed the target, then we need to
1557 * stop here with an overflow indication.
1558 */
1559 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1560 break;
1561 } else if(U_FAILURE(*pErrorCode)) {
1562 /* break on error */
1563 break;
1564 } else if(cnv->UCharErrorBufferLength>0) {
1565 /* target is full */
1566 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1567 break;
1568 }
1569
1570 #if MBCS_UNROLL_SINGLE_TO_BMP
1571 /* unrolling makes it faster on Pentium III/Windows 2000 */
1572 goto unrolled;
1573 #endif
1574 }
1575
1576 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
1577 /* target is full */
1578 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1579 }
1580
1581 /* set offsets since the start or the last callback */
1582 if(offsets!=NULL) {
1583 size_t count=source-lastSource;
1584 while(count>0) {
1585 *offsets++=sourceIndex++;
1586 --count;
1587 }
1588 }
1589
1590 /* write back the updated pointers */
1591 pArgs->source=(const char *)source;
1592 pArgs->target=target;
1593 pArgs->offsets=offsets;
1594 }
1595
1596 static UChar32
1597 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
1598 UErrorCode *pErrorCode) {
1599 UChar buffer[UTF_MAX_CHAR_LENGTH];
1600
1601 UConverter *cnv;
1602 const uint8_t *source, *sourceLimit;
1603
1604 const int32_t (*stateTable)[256];
1605 const uint16_t *unicodeCodeUnits;
1606
1607 uint32_t offset;
1608 uint8_t state;
1609 int8_t byteIndex;
1610 uint8_t *bytes;
1611
1612 int32_t entry;
1613 UChar32 c;
1614 uint8_t action;
1615 UConverterCallbackReason reason;
1616
1617 /* use optimized function if possible */
1618 cnv=pArgs->converter;
1619 if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
1620 /*
1621 * Calling the inefficient, generic getNextUChar() lets us deal correctly
1622 * with the rare case of a codepage that maps single surrogates
1623 * without adding the complexity to this already complicated function here.
1624 */
1625 return ucnv_getNextUCharFromToUImpl(pArgs, _MBCSToUnicodeWithOffsets, TRUE, pErrorCode);
1626 } else if(cnv->sharedData->table->mbcs.countStates==1) {
1627 return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
1628 }
1629
1630 /* set up the local pointers */
1631 source=(const uint8_t *)pArgs->source;
1632 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1633
1634 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1635 stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1636 } else {
1637 stateTable=cnv->sharedData->table->mbcs.stateTable;
1638 }
1639 unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
1640
1641 /* get the converter state from UConverter */
1642 offset=cnv->toUnicodeStatus;
1643 state=(uint8_t)(cnv->mode);
1644 byteIndex=cnv->toULength;
1645 bytes=cnv->toUBytes;
1646
1647 /* conversion loop */
1648 while(source<sourceLimit) {
1649 entry=stateTable[state][bytes[byteIndex++]=*source++];
1650 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1651 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1652 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1653 } else {
1654 /* set the next state early so that we can reuse the entry variable */
1655 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1656
1657 /*
1658 * An if-else-if chain provides more reliable performance for
1659 * the most common cases compared to a switch.
1660 */
1661 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1662 if(action==MBCS_STATE_VALID_16) {
1663 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1664 c=unicodeCodeUnits[offset];
1665 if(c<0xfffe) {
1666 /* output BMP code point */
1667 goto finish;
1668 } else if(c==0xfffe) {
1669 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
1670 goto finish;
1671 }
1672 /* callback(unassigned) */
1673 goto unassigned;
1674 } else {
1675 /* callback(illegal) */
1676 goto illegal;
1677 }
1678 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
1679 /* output BMP code point */
1680 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1681 goto finish;
1682 } else if(action==MBCS_STATE_VALID_16_PAIR) {
1683 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1684 c=unicodeCodeUnits[offset++];
1685 if(c<0xd800) {
1686 /* output BMP code point below 0xd800 */
1687 goto finish;
1688 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
1689 /* output roundtrip or fallback supplementary code point */
1690 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
1691 goto finish;
1692 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
1693 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1694 c=unicodeCodeUnits[offset];
1695 goto finish;
1696 } else if(c==0xffff) {
1697 /* callback(illegal) */
1698 goto illegal;
1699 } else {
1700 /* callback(unassigned) */
1701 goto unassigned;
1702 }
1703 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
1704 /* output supplementary code point */
1705 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1706 goto finish;
1707 } else if(action==MBCS_STATE_CHANGE_ONLY) {
1708 /*
1709 * This serves as a state change without any output.
1710 * It is useful for reading simple stateful encodings,
1711 * for example using just Shift-In/Shift-Out codes.
1712 * The 21 unused bits may later be used for more sophisticated
1713 * state transitions.
1714 */
1715 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1716 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1717 /* callback(unassigned) */
1718 goto unassigned;
1719 }
1720 /* output BMP code point */
1721 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1722 goto finish;
1723 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1724 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1725 /* callback(unassigned) */
1726 goto unassigned;
1727 }
1728 /* output supplementary code point */
1729 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1730 goto finish;
1731 } else if(action==MBCS_STATE_UNASSIGNED) {
1732 /* callback(unassigned) */
1733 goto unassigned;
1734 } else if(action==MBCS_STATE_ILLEGAL) {
1735 /* callback(illegal) */
1736 goto illegal;
1737 } else {
1738 /* reserved, must never occur */
1739 }
1740
1741 /* normal end of action codes: prepare for a new character */
1742 offset=0;
1743 byteIndex=0;
1744 continue;
1745
1746 illegal:
1747 reason=UCNV_ILLEGAL;
1748 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1749 goto callback;
1750 unassigned:
1751 reason=UCNV_UNASSIGNED;
1752 *pErrorCode=U_INVALID_CHAR_FOUND;
1753 callback:
1754 /* call the callback function with all the preparations and post-processing */
1755 /* update the arguments structure */
1756 pArgs->source=(const char *)source;
1757 pArgs->target=buffer;
1758 pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
1759
1760 /* set the converter state in UConverter to deal with the next character */
1761 cnv->toUnicodeStatus=0;
1762 cnv->mode=state;
1763 cnv->toULength=0;
1764
1765 /* call the callback function */
1766 toUCallback(cnv, cnv->toUContext, pArgs, (const char *)bytes, byteIndex, reason, pErrorCode);
1767
1768 /* get the converter state from UConverter */
1769 offset=cnv->toUnicodeStatus;
1770 state=(uint8_t)cnv->mode;
1771 byteIndex=cnv->toULength;
1772
1773 /* update the source pointer */
1774 source=(const uint8_t *)pArgs->source;
1775
1776 /*
1777 * return the first character if the callback wrote some
1778 * we do not need to goto finish because the converter state is already set
1779 */
1780 if(U_SUCCESS(*pErrorCode)) {
1781 entry=pArgs->target-buffer;
1782 if(entry>0) {
1783 return ucnv_getUChar32KeepOverflow(cnv, buffer, entry);
1784 }
1785 /* else (callback did not write anything) continue */
1786 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1787 *pErrorCode=U_ZERO_ERROR;
1788 return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
1789 } else {
1790 /* break on error */
1791 /* ### what if a callback set an error but _also_ generated output?! */
1792 state=0;
1793 c=0xffff;
1794 goto finish;
1795 }
1796
1797 /*
1798 * We do not need to repeat the statements from the normal
1799 * end of the action codes because we already updated all the
1800 * necessary variables.
1801 */
1802 }
1803 }
1804
1805 if(byteIndex>0) {
1806 /* incomplete character byte sequence */
1807 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1808 state=0;
1809 } else {
1810 /* no output because of empty input or only state changes and skipping callbacks */
1811 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1812 }
1813 c=0xffff;
1814
1815 finish:
1816 /* set the converter state back into UConverter, ready for a new character */
1817 cnv->toUnicodeStatus=0;
1818 cnv->mode=state;
1819 cnv->toULength=0;
1820
1821 /* write back the updated pointer */
1822 pArgs->source=(const char *)source;
1823 return c;
1824 }
1825
1826 /*
1827 * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
1828 * We still need a conversion loop in case a skipping callback is called.
1829 */
1830 static UChar32
1831 _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
1832 UErrorCode *pErrorCode) {
1833 UChar buffer[UTF_MAX_CHAR_LENGTH];
1834
1835 UConverter *cnv;
1836 const int32_t (*stateTable)[256];
1837 const uint8_t *source, *sourceLimit;
1838
1839 int32_t entry;
1840 uint8_t action;
1841 UConverterCallbackReason reason;
1842
1843 /* set up the local pointers */
1844 cnv=pArgs->converter;
1845 source=(const uint8_t *)pArgs->source;
1846 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1847 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1848 stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1849 } else {
1850 stateTable=cnv->sharedData->table->mbcs.stateTable;
1851 }
1852
1853 /* conversion loop */
1854 while(source<sourceLimit) {
1855 entry=stateTable[0][*source++];
1856 /* MBCS_ENTRY_IS_FINAL(entry) */
1857
1858 /* write back the updated pointer early so that we can return directly */
1859 pArgs->source=(const char *)source;
1860
1861 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1862 /* output BMP code point */
1863 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1864 }
1865
1866 /*
1867 * An if-else-if chain provides more reliable performance for
1868 * the most common cases compared to a switch.
1869 */
1870 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1871 if(action==MBCS_STATE_VALID_DIRECT_20) {
1872 /* output supplementary code point */
1873 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1874 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1875 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1876 /* callback(unassigned) */
1877 reason=UCNV_UNASSIGNED;
1878 *pErrorCode=U_INVALID_CHAR_FOUND;
1879 } else {
1880 /* output BMP code point */
1881 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1882 }
1883 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1884 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1885 /* callback(unassigned) */
1886 reason=UCNV_UNASSIGNED;
1887 *pErrorCode=U_INVALID_CHAR_FOUND;
1888 } else {
1889 /* output supplementary code point */
1890 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1891 }
1892 } else if(action==MBCS_STATE_UNASSIGNED) {
1893 /* callback(unassigned) */
1894 reason=UCNV_UNASSIGNED;
1895 *pErrorCode=U_INVALID_CHAR_FOUND;
1896 } else if(action==MBCS_STATE_ILLEGAL) {
1897 /* callback(illegal) */
1898 reason=UCNV_ILLEGAL;
1899 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1900 } else {
1901 /* reserved, must never occur */
1902 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1903 return 0xffff;
1904 }
1905
1906 /* call the callback function with all the preparations and post-processing */
1907 /* update the arguments structure */
1908 pArgs->target=buffer;
1909 pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
1910
1911 /* call the callback function */
1912 toUCallback(cnv, cnv->toUContext, pArgs, (const char *)(source-1), 1, reason, pErrorCode);
1913
1914 /* update the source pointer */
1915 source=(const uint8_t *)pArgs->source;
1916
1917 /*
1918 * return the first character if the callback wrote some
1919 * we do not need to goto finish because the converter state is already set
1920 */
1921 if(U_SUCCESS(*pErrorCode)) {
1922 entry=pArgs->target-buffer;
1923 if(entry>0) {
1924 return ucnv_getUChar32KeepOverflow(cnv, buffer, entry);
1925 }
1926 /* else (callback did not write anything) continue */
1927 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1928 *pErrorCode=U_ZERO_ERROR;
1929 return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
1930 } else {
1931 /* break on error */
1932 /* ### what if a callback set an error but _also_ generated output?! */
1933 return 0xffff;
1934 }
1935 }
1936
1937 /* no output because of empty input or only state changes and skipping callbacks */
1938 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1939 return 0xffff;
1940 }
1941
1942 /*
1943 * This is a simple version of getNextUChar() that is used
1944 * by other converter implementations.
1945 * It does not use state from the converter, nor error codes.
1946 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
1947 *
1948 * Return value:
1949 * U+fffe unassigned
1950 * U+ffff illegal
1951 * otherwise the Unicode code point
1952 */
1953 U_CFUNC UChar32
1954 _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
1955 const char **pSource, const char *sourceLimit,
1956 UBool useFallback) {
1957 const uint8_t *source;
1958
1959 const int32_t (*stateTable)[256];
1960 const uint16_t *unicodeCodeUnits;
1961
1962 uint32_t offset;
1963 uint8_t state, action;
1964
1965 int32_t entry;
1966
1967 /* set up the local pointers */
1968 source=(const uint8_t *)*pSource;
1969 if(source>=(const uint8_t *)sourceLimit) {
1970 /* no input at all: "illegal" */
1971 return 0xffff;
1972 }
1973
1974 #if 0
1975 /*
1976 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
1977 * TODO In future releases, verify that this function is never called for SBCS
1978 * conversions, i.e., that sharedData->table->mbcs.countStates==1 is still true.
1979 * Removal improves code coverage.
1980 */
1981 /* use optimized function if possible */
1982 if(sharedData->table->mbcs.countStates==1) {
1983 return _MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)(*(*pSource)++), useFallback);
1984 }
1985 #endif
1986
1987 stateTable=sharedData->table->mbcs.stateTable;
1988 unicodeCodeUnits=sharedData->table->mbcs.unicodeCodeUnits;
1989
1990 /* converter state */
1991 offset=0;
1992 state=0;
1993
1994 /* conversion loop */
1995 do {
1996 entry=stateTable[state][*source++];
1997 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1998 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1999 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2000 } else {
2001 *pSource=(const char *)source;
2002
2003 /*
2004 * An if-else-if chain provides more reliable performance for
2005 * the most common cases compared to a switch.
2006 */
2007 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2008 if(action==MBCS_STATE_VALID_16) {
2009 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2010 entry=unicodeCodeUnits[offset];
2011 if(entry!=0xfffe) {
2012 return (UChar32)entry;
2013 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2014 return _MBCSGetFallback(&sharedData->table->mbcs, offset);
2015 } else {
2016 return 0xfffe;
2017 }
2018 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2019 /* output BMP code point */
2020 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2021 } else if(action==MBCS_STATE_VALID_16_PAIR) {
2022 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2023 entry=unicodeCodeUnits[offset++];
2024 if(entry<0xd800) {
2025 /* output BMP code point below 0xd800 */
2026 return (UChar32)entry;
2027 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? entry<=0xdfff : entry<=0xdbff) {
2028 /* output roundtrip or fallback supplementary code point */
2029 return (UChar32)(((entry&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
2030 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (entry&0xfffe)==0xe000 : entry==0xe000) {
2031 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2032 return unicodeCodeUnits[offset];
2033 } else if(entry==0xffff) {
2034 return 0xffff;
2035 } else {
2036 return 0xfffe;
2037 }
2038 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
2039 /* output supplementary code point */
2040 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2041 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2042 if(!TO_U_USE_FALLBACK(useFallback)) {
2043 return 0xfffe;
2044 }
2045 /* output BMP code point */
2046 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2047 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
2048 if(!TO_U_USE_FALLBACK(useFallback)) {
2049 return 0xfffe;
2050 }
2051 /* output supplementary code point */
2052 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2053 } else if(action==MBCS_STATE_CHANGE_ONLY) {
2054 /*
2055 * This serves as a state change without any output.
2056 * It is useful for reading simple stateful encodings,
2057 * for example using just Shift-In/Shift-Out codes.
2058 * The 21 unused bits may later be used for more sophisticated
2059 * state transitions.
2060 */
2061 if(source==(const uint8_t *)sourceLimit) {
2062 /* if there are only state changes, then return "unassigned" */
2063 return 0xfffe;
2064 }
2065 } else if(action==MBCS_STATE_UNASSIGNED) {
2066 return 0xfffe;
2067 } else if(action==MBCS_STATE_ILLEGAL) {
2068 return 0xffff;
2069 } else {
2070 /* reserved, must never occur */
2071 }
2072
2073 /* state change only - prepare for a new character */
2074 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2075 offset=0;
2076 }
2077 } while(source<(const uint8_t *)sourceLimit);
2078
2079 *pSource=(const char *)source;
2080 return 0xffff;
2081 }
2082
2083 #if 0
2084 /*
2085 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2086 * Removal improves code coverage.
2087 */
2088 /**
2089 * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
2090 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2091 */
2092 U_CFUNC UChar32
2093 _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
2094 uint8_t b, UBool useFallback) {
2095 int32_t entry;
2096 uint8_t action;
2097
2098 entry=sharedData->table->mbcs.stateTable[0][b];
2099 /* MBCS_ENTRY_IS_FINAL(entry) */
2100
2101 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2102 /* output BMP code point */
2103 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2104 }
2105
2106 /*
2107 * An if-else-if chain provides more reliable performance for
2108 * the most common cases compared to a switch.
2109 */
2110 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2111 if(action==MBCS_STATE_VALID_DIRECT_20) {
2112 /* output supplementary code point */
2113 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2114 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2115 if(!TO_U_USE_FALLBACK(useFallback)) {
2116 return 0xfffe;
2117 }
2118 /* output BMP code point */
2119 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2120 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
2121 if(!TO_U_USE_FALLBACK(useFallback)) {
2122 return 0xfffe;
2123 }
2124 /* output supplementary code point */
2125 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2126 } else if(action==MBCS_STATE_UNASSIGNED) {
2127 return 0xfffe;
2128 } else if(action==MBCS_STATE_ILLEGAL) {
2129 return 0xffff;
2130 } else {
2131 /* reserved, must never occur */
2132 return 0xffff;
2133 }
2134 }
2135 #endif
2136
2137 /* MBCS-from-Unicode conversion functions ----------------------------------- */
2138
/*
 * Convert UTF-16 code units from pArgs->source (up to pArgs->sourceLimit)
 * into codepage bytes at pArgs->target (up to pArgs->targetLimit), using the
 * from-Unicode tables of pArgs->converter.
 *
 * - MBCS_OUTPUT_1 tables that do not map single surrogates, and MBCS_OUTPUT_2
 *   tables, are handed to the specialized _MBCSSingle.../_MBCSDouble...
 *   functions right away; everything else is converted here.
 * - If pArgs->offsets is not NULL, one source index is stored per output byte.
 * - A pending lead surrogate and the SI/SO mode are read from and written
 *   back to cnv->fromUSurrogateLead and cnv->fromUnicodeStatus.
 * - Unassigned code points (U_INVALID_CHAR_FOUND) and unpaired surrogates
 *   (U_ILLEGAL_CHAR_FOUND) are passed to fromUCallback().
 * - When the target is full, bytes that do not fit are stored in
 *   cnv->charErrorBuffer and *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 * - With pArgs->flush at the end of the input, a pending lead surrogate yields
 *   U_TRUNCATED_CHAR_FOUND, and MBCS_OUTPUT_2_SISO output that ended in
 *   double-byte mode is terminated with UCNV_SI.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
 */
2139 U_CFUNC void
2140 _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2141 UErrorCode *pErrorCode) {
2142 UConverter *cnv;
2143 const UChar *source, *sourceLimit;
2144 uint8_t *target;
2145 int32_t targetCapacity;
2146 int32_t *offsets;
2147
2148 const uint16_t *table;
2149 const uint8_t *p, *bytes;
2150 uint8_t outputType;
2151
2152 UChar32 c;
2153
2154 int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
2155
2156 UConverterCallbackReason reason;
2157 uint32_t stage2Entry;
2158 uint32_t value;
2159 int32_t length, prevLength;
2160 uint8_t unicodeMask;
2161
2162 /* use optimized function if possible */
2163 cnv=pArgs->converter;
2164 outputType=cnv->sharedData->table->mbcs.outputType;
2165 unicodeMask=cnv->sharedData->table->mbcs.unicodeMask;
2166 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2167 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2168 _MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
2169 } else {
2170 _MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
2171 }
2172 return;
2173 } else if(outputType==MBCS_OUTPUT_2) {
2174 _MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
2175 return;
2176 }
2177
2178 /* set up the local pointers */
2179 source=pArgs->source;
2180 sourceLimit=pArgs->sourceLimit;
2181 target=(uint8_t *)pArgs->target;
2182 targetCapacity=pArgs->targetLimit-pArgs->target;
2183 offsets=pArgs->offsets;
2184
2185 table=cnv->sharedData->table->mbcs.fromUnicodeTable;
2186 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2187 bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
2188 } else {
2189 bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
2190 }
2191
2192 /* get the converter state from UConverter */
/*
 * c: lead surrogate left over from the previous call (0 if none)
 * prevLength: 1 while in single-byte mode, 2 while in double-byte mode;
 * only the MBCS_OUTPUT_2_SISO branch and the flush code look at it
 */
2193 c=cnv->fromUSurrogateLead;
2194 prevLength=cnv->fromUnicodeStatus;
2195
2196 /* sourceIndex=-1 if the current character began in the previous buffer */
2197 prevSourceIndex=-1;
2198 sourceIndex= c==0 ? 0 : -1;
2199 nextSourceIndex=0;
2200
2201 /* conversion loop */
2202 /*
2203 * This is another piece of ugly code:
2204 * A goto into the loop if the converter state contains a first surrogate
2205 * from the previous function call.
2206 * It saves me from checking if(c==0) in each loop iteration
2207 * and duplicating the trail-surrogate-handling code in the else
2208 * branch of that check.
2209 * I could not find any other way to get around this other than
2210 * using a function call for the conversion and callback, which would
2211 * be even more inefficient.
2212 *
2213 * Markus Scherer 2000-jul-19
2214 */
2215 if(c!=0 && targetCapacity>0) {
2216 goto getTrail;
2217 }
2218
2219 while(source<sourceLimit) {
2220 /*
2221 * This following test is to see if available input would overflow the output.
2222 * It does not catch output of more than one byte that
2223 * overflows as a result of a multi-byte character or callback output
2224 * from the last source character.
2225 * Therefore, those situations also test for overflows and will
2226 * then break the loop, too.
2227 */
2228 if(targetCapacity>0) {
2229 /*
2230 * Get a correct Unicode code point:
2231 * a single UChar for a BMP code point or
2232 * a matched surrogate pair for a "supplementary code point".
2233 */
2234 c=*source++;
2235 ++nextSourceIndex;
2236 /*
2237 * This also tests if the codepage maps single surrogates.
2238 * If it does, then surrogates are not paired but mapped separately.
2239 * Note that in this case unmatched surrogates are not detected.
2240 */
2241 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2242 if(UTF_IS_SURROGATE_FIRST(c)) {
2243 getTrail:
2244 if(source<sourceLimit) {
2245 /* test the following code unit */
2246 UChar trail=*source;
2247 if(UTF_IS_SECOND_SURROGATE(trail)) {
2248 ++source;
2249 ++nextSourceIndex;
2250 c=UTF16_GET_PAIR_VALUE(c, trail);
2251 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2252 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2253 /* callback(unassigned) */
2254 goto unassigned;
2255 }
2256 /* convert this supplementary code point */
2257 /* exit this condition tree */
2258 } else {
2259 /* this is an unmatched lead code unit (1st surrogate) */
2260 /* callback(illegal) */
2261 reason=UCNV_ILLEGAL;
2262 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2263 goto callback;
2264 }
2265 } else {
2266 /* no more input */
/* c keeps the lead surrogate; it is stored in the converter after the loop unless the flush code resets it */
2267 break;
2268 }
2269 } else {
2270 /* this is an unmatched trail code unit (2nd surrogate) */
2271 /* callback(illegal) */
2272 reason=UCNV_ILLEGAL;
2273 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2274 goto callback;
2275 }
2276 }
2277
2278 /* convert the Unicode code point in c into codepage bytes */
2279
2280 /*
2281 * The basic lookup is a triple-stage compact array (trie) lookup.
2282 * For details see the beginning of this file.
2283 *
2284 * Single-byte codepages are handled with a different data structure
2285 * by _MBCSSingle... functions.
2286 *
2287 * The result consists of a 32-bit value from stage 2 and
2288 * a pointer to as many bytes as are stored per character.
2289 * The pointer points to the character's bytes in stage 3.
2290 * Bits 15..0 of the stage 2 entry contain the stage 3 index
2291 * for that pointer, while bits 31..16 are flags for which of
2292 * the 16 characters in the block are roundtrip-assigned.
2293 *
2294 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
2295 * respectively as uint32_t, in the platform encoding.
2296 * For 3-byte codepages, the bytes are always stored in big-endian order.
2297 *
2298 * For EUC encodings that use only either 0x8e or 0x8f as the first
2299 * byte of their longest byte sequences, the first two bytes in
2300 * this third stage indicate with their 7th bits whether these bytes
2301 * are to be written directly or actually need to be preceded by
2302 * one of the two Single-Shift codes. With this, the third stage
2303 * stores one byte fewer per character than the actual maximum length of
2304 * EUC byte sequences.
2305 *
2306 * Other than that, leading zero bytes are removed and the other
2307 * bytes output. A single zero byte may be output if the "assigned"
2308 * bit in stage 2 was on or also if the Unicode code point is U+0000.
2309 * The data structure does not support zero byte output as a fallback
2310 * for other code points, and also does not allow output of leading zeros.
2311 */
2312 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
2313
2314 /* get the bytes and the length for the output */
2315 switch(outputType) {
2316 case MBCS_OUTPUT_2:
/* not reached from this function: MBCS_OUTPUT_2 was already handed to _MBCSDoubleFromUnicodeWithOffsets() above */
2317 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2318 if(value<=0xff) {
2319 length=1;
2320 } else {
2321 length=2;
2322 }
2323 break;
2324 case MBCS_OUTPUT_2_SISO:
2325 /* 1/2-byte stateful with Shift-In/Shift-Out */
2326 /*
2327 * Save the old state in the converter object
2328 * right here, then change the local prevLength state variable if necessary.
2329 * Then, if this character turns out to be unassigned or a fallback that
2330 * is not taken, the callback code must not save the new state in the converter
2331 * because the new state is for a character that is not output.
2332 * However, the callback must still restore the state from the converter
2333 * in case the callback function changed it for its output.
2334 */
2335 cnv->fromUnicodeStatus=prevLength; /* save the old state */
2336 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2337 if(value<=0xff) {
2338 if(prevLength==1) {
2339 length=1;
2340 } else {
2341 /* change from double-byte mode to single-byte */
2342 value|=(uint32_t)UCNV_SI<<8;
2343 length=2;
2344 prevLength=1;
2345 }
2346 } else {
2347 if(prevLength==2) {
2348 length=2;
2349 } else {
2350 /* change from single-byte mode to double-byte */
2351 value|=(uint32_t)UCNV_SO<<16;
2352 length=3;
2353 prevLength=2;
2354 }
2355 }
2356 break;
2357 case MBCS_OUTPUT_3:
2358 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
2359 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
2360 if(value<=0xff) {
2361 length=1;
2362 } else if(value<=0xffff) {
2363 length=2;
2364 } else {
2365 length=3;
2366 }
2367 break;
2368 case MBCS_OUTPUT_4:
2369 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
2370 if(value<=0xff) {
2371 length=1;
2372 } else if(value<=0xffff) {
2373 length=2;
2374 } else if(value<=0xffffff) {
2375 length=3;
2376 } else {
2377 length=4;
2378 }
2379 break;
2380 case MBCS_OUTPUT_3_EUC:
2381 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2382 /* EUC 16-bit fixed-length representation */
2383 if(value<=0xff) {
2384 length=1;
2385 } else if((value&0x8000)==0) {
2386 value|=0x8e8000;
2387 length=3;
2388 } else if((value&0x80)==0) {
2389 value|=0x8f0080;
2390 length=3;
2391 } else {
2392 length=2;
2393 }
2394 break;
2395 case MBCS_OUTPUT_4_EUC:
2396 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
2397 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
2398 /* EUC 16-bit fixed-length representation applied to the first two bytes */
2399 if(value<=0xff) {
2400 length=1;
2401 } else if(value<=0xffff) {
2402 length=2;
2403 } else if((value&0x800000)==0) {
2404 value|=0x8e800000;
2405 length=4;
2406 } else if((value&0x8000)==0) {
2407 value|=0x8f008000;
2408 length=4;
2409 } else {
2410 length=3;
2411 }
2412 break;
2413 default:
2414 /* must not occur */
2415 /*
2416 * To avoid compiler warnings that value & length may be
2417 * used without having been initialized, we set them here.
2418 * In reality, this is unreachable code.
2419 * Not having a default branch also causes warnings with
2420 * some compilers.
2421 */
2422 value=0;
2423 length=0;
2424 break;
2425 }
2426
2427 /* is this code point assigned, or do we use fallbacks? */
2428 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
2429 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && (value!=0 || c==0)))
2430 ) {
2431 /*
2432 * We allow a 0 byte output if the Unicode code point is
2433 * U+0000 and also if the "assigned" bit is set for this entry.
2434 * There is no way with this data structure for fallback output
2435 * for other than U+0000 to be a zero byte.
2436 */
2437 /* callback(unassigned) */
2438 goto unassigned;
2439 }
2440
2441 /* write the output character bytes from value and length */
2442 /* from the first if in the loop we know that targetCapacity>0 */
2443 if(length<=targetCapacity) {
2444 if(offsets==NULL) {
2445 switch(length) {
2446 /* each branch falls through to the next one */
2447 case 4:
2448 *target++=(uint8_t)(value>>24);
2449 case 3:
2450 *target++=(uint8_t)(value>>16);
2451 case 2:
2452 *target++=(uint8_t)(value>>8);
2453 case 1:
2454 *target++=(uint8_t)value;
2455 default:
2456 /* will never occur */
2457 break;
2458 }
2459 } else {
2460 switch(length) {
2461 /* each branch falls through to the next one */
2462 case 4:
2463 *target++=(uint8_t)(value>>24);
2464 *offsets++=sourceIndex;
2465 case 3:
2466 *target++=(uint8_t)(value>>16);
2467 *offsets++=sourceIndex;
2468 case 2:
2469 *target++=(uint8_t)(value>>8);
2470 *offsets++=sourceIndex;
2471 case 1:
2472 *target++=(uint8_t)value;
2473 *offsets++=sourceIndex;
2474 default:
2475 /* will never occur */
2476 break;
2477 }
2478 }
2479 targetCapacity-=length;
2480 } else {
2481 uint8_t *charErrorBuffer;
2482
2483 /*
2484 * We actually do this backwards here:
2485 * In order to save an intermediate variable, we output
2486 * first to the overflow buffer what does not fit into the
2487 * regular target.
2488 */
2489 /* we know that 1<=targetCapacity<length<=4 */
/* from here on, length is the number of bytes that go into the overflow buffer */
2490 length-=targetCapacity;
2491 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
2492 switch(length) {
2493 /* each branch falls through to the next one */
2494 case 3:
2495 *charErrorBuffer++=(uint8_t)(value>>16);
2496 case 2:
2497 *charErrorBuffer++=(uint8_t)(value>>8);
2498 case 1:
2499 *charErrorBuffer=(uint8_t)value;
2500 default:
2501 /* will never occur */
2502 break;
2503 }
2504 cnv->charErrorBufferLength=(int8_t)length;
2505
2506 /* now output what fits into the regular target */
2507 value>>=8*length; /* length was reduced by targetCapacity */
2508 switch(targetCapacity) {
2509 /* each branch falls through to the next one */
2510 case 3:
2511 *target++=(uint8_t)(value>>16);
2512 if(offsets!=NULL) {
2513 *offsets++=sourceIndex;
2514 }
2515 case 2:
2516 *target++=(uint8_t)(value>>8);
2517 if(offsets!=NULL) {
2518 *offsets++=sourceIndex;
2519 }
2520 case 1:
2521 *target++=(uint8_t)value;
2522 if(offsets!=NULL) {
2523 *offsets++=sourceIndex;
2524 }
2525 default:
2526 /* will never occur */
2527 break;
2528 }
2529
2530 /* target overflow */
2531 targetCapacity=0;
2532 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2533 c=0;
2534 break;
2535 }
2536
2537 /* normal end of conversion: prepare for a new character */
2538 c=0;
2539 if(offsets!=NULL) {
2540 prevSourceIndex=sourceIndex;
2541 sourceIndex=nextSourceIndex;
2542 }
2543 continue;
2544
2545 /*
2546 * This is the same ugly trick as in ToUnicode(), for the
2547 * same reasons...
2548 */
/* the code below is reached only via goto: the normal path ends with the continue above */
2549 unassigned:
2550 reason=UCNV_UNASSIGNED;
2551 *pErrorCode=U_INVALID_CHAR_FOUND;
2552 callback:
2553 /* call the callback function with all the preparations and post-processing */
2554 /* update the arguments structure */
2555 pArgs->source=source;
2556 pArgs->target=(char *)target;
2557 pArgs->offsets=offsets;
2558
2559 /* set the converter state in UConverter to deal with the next character */
2560 cnv->fromUSurrogateLead=0;
2561 /*
2562 * Do not save the prevLength SISO state because prevLength is set for
2563 * the character that is now not output because it is unassigned or it is
2564 * a fallback that is not taken.
2565 * The above branch for MBCS_OUTPUT_2_SISO has saved the previous state already.
2566 * See comments there.
2567 */
2568 prevSourceIndex=sourceIndex;
2569
2570 /* call the callback function */
2571 fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
2572
2573 /* get the converter state from UConverter */
2574 c=cnv->fromUSurrogateLead;
2575 prevLength=cnv->fromUnicodeStatus;
2576
2577 /* update target and deal with offsets if necessary */
2578 offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
2579 target=(uint8_t *)pArgs->target;
2580
2581 /* update the source pointer and index */
2582 sourceIndex=nextSourceIndex+(pArgs->source-source);
2583 source=pArgs->source;
2584 targetCapacity=(uint8_t *)pArgs->targetLimit-target;
2585
2586 /*
2587 * If the callback overflowed the target, then we need to
2588 * stop here with an overflow indication.
2589 */
2590 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
2591 break;
2592 } else if(U_FAILURE(*pErrorCode)) {
2593 /* break on error */
2594 c=0;
2595 break;
2596 } else if(cnv->charErrorBufferLength>0) {
2597 /* target is full */
2598 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2599 break;
2600 }
2601
2602 /*
2603 * We do not need to repeat the statements from the normal
2604 * end of the conversion because we already updated all the
2605 * necessary variables.
2606 */
2607 } else {
2608 /* target is full */
2609 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2610 break;
2611 }
2612 }
2613
2614 if(pArgs->flush && source>=sourceLimit && U_SUCCESS(*pErrorCode)) {
2615 /* end of input stream */
2616 if(c!=0) {
2617 /* a Unicode code point remains incomplete (only a first surrogate) */
2618 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2619 /* the following may change with Jitterbug 2449: would prepare for callback instead of resetting */
2620 c=0;
2621 prevLength=1;
2622 } else if(outputType==MBCS_OUTPUT_2_SISO && prevLength==2) {
2623 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
2624 if(targetCapacity>0) {
2625 *target++=(uint8_t)UCNV_SI;
2626 if(offsets!=NULL) {
2627 /* set the last source character's index (sourceIndex points at sourceLimit now) */
2628 *offsets++=prevSourceIndex;
2629 }
2630 } else {
2631 /* target is full */
2632 cnv->charErrorBuffer[0]=(char)UCNV_SI;
2633 cnv->charErrorBufferLength=1;
2634 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2635 }
2636 prevLength=1; /* we switched into SBCS */
2637 }
2638
2639 /* reset the state for the next conversion */
2640 if(U_SUCCESS(*pErrorCode)) {
2641 c=0;
2642 prevLength=1;
2643 }
2644 }
2645
2646 /* set the converter state back into UConverter */
2647 cnv->fromUSurrogateLead=(UChar)c;
2648 cnv->fromUnicodeStatus=prevLength;
2649
2650 /* write back the updated pointers */
2651 pArgs->source=source;
2652 pArgs->target=(char *)target;
2653 pArgs->offsets=offsets;
2654 }
2655
2656 /* This version of _MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
/*
 * Every code point is looked up with MBCS_VALUE_2_FROM_STAGE_2() and written
 * as one byte (value<=0xff) or as two bytes, high byte first.
 *
 * - If pArgs->offsets is not NULL, one source index is stored per output byte.
 * - Unassigned code points (U_INVALID_CHAR_FOUND) and unpaired surrogates
 *   (U_ILLEGAL_CHAR_FOUND) are passed to fromUCallback().
 * - If only the first of two bytes fits into the target, the second one is
 *   stored in cnv->charErrorBuffer and *pErrorCode is set to
 *   U_BUFFER_OVERFLOW_ERROR.
 * - With pArgs->flush at the end of the input, a pending lead surrogate
 *   yields U_TRUNCATED_CHAR_FOUND and the converter state is reset.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
 */
2657 static void
2658 _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2659 UErrorCode *pErrorCode) {
2660 UConverter *cnv;
2661 const UChar *source, *sourceLimit;
2662 uint8_t *target;
2663 int32_t targetCapacity;
2664 int32_t *offsets;
2665
2666 const uint16_t *table;
2667 const uint8_t *bytes;
2668
2669 UChar32 c;
2670
2671 int32_t sourceIndex, nextSourceIndex;
2672
2673 UConverterCallbackReason reason;
2674 uint32_t stage2Entry;
2675 uint32_t value;
2676 int32_t length, prevLength;
2677 uint8_t unicodeMask;
2678
2679 /* get the converter and its unicodeMask flags (no further dispatch happens here) */
2680 cnv=pArgs->converter;
2681 unicodeMask=cnv->sharedData->table->mbcs.unicodeMask;
2682
2683 /* set up the local pointers */
2684 source=pArgs->source;
2685 sourceLimit=pArgs->sourceLimit;
2686 target=(uint8_t *)pArgs->target;
2687 targetCapacity=pArgs->targetLimit-pArgs->target;
2688 offsets=pArgs->offsets;
2689
2690 table=cnv->sharedData->table->mbcs.fromUnicodeTable;
2691 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2692 bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
2693 } else {
2694 bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
2695 }
2696
2697 /* get the converter state from UConverter */
/*
 * c: lead surrogate left over from the previous call (0 if none)
 * prevLength: not interpreted in this function; it is only handed to the
 * callback via cnv->fromUnicodeStatus and stored back at the end
 */
2698 c=cnv->fromUSurrogateLead;
2699 prevLength=cnv->fromUnicodeStatus;
2700
2701 /* sourceIndex=-1 if the current character began in the previous buffer */
2702 sourceIndex= c==0 ? 0 : -1;
2703 nextSourceIndex=0;
2704
2705 /* conversion loop */
/* with a pending lead surrogate, jump straight to the trail-surrogate test inside the loop */
2706 if(c!=0 && targetCapacity>0) {
2707 goto getTrail;
2708 }
2709
2710 while(source<sourceLimit) {
2711 /*
2712 * This following test is to see if available input would overflow the output.
2713 * It does not catch output of more than one byte that
2714 * overflows as a result of a multi-byte character or callback output
2715 * from the last source character.
2716 * Therefore, those situations also test for overflows and will
2717 * then break the loop, too.
2718 */
2719 if(targetCapacity>0) {
2720 /*
2721 * Get a correct Unicode code point:
2722 * a single UChar for a BMP code point or
2723 * a matched surrogate pair for a "supplementary code point".
2724 */
2725 c=*source++;
2726 ++nextSourceIndex;
2727 /*
2728 * This also tests if the codepage maps single surrogates.
2729 * If it does, then surrogates are not paired but mapped separately.
2730 * Note that in this case unmatched surrogates are not detected.
2731 */
2732 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2733 if(UTF_IS_SURROGATE_FIRST(c)) {
2734 getTrail:
2735 if(source<sourceLimit) {
2736 /* test the following code unit */
2737 UChar trail=*source;
2738 if(UTF_IS_SECOND_SURROGATE(trail)) {
2739 ++source;
2740 ++nextSourceIndex;
2741 c=UTF16_GET_PAIR_VALUE(c, trail);
2742 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2743 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2744 /* callback(unassigned) */
2745 goto unassigned;
2746 }
2747 /* convert this supplementary code point */
2748 /* exit this condition tree */
2749 } else {
2750 /* this is an unmatched lead code unit (1st surrogate) */
2751 /* callback(illegal) */
2752 reason=UCNV_ILLEGAL;
2753 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2754 goto callback;
2755 }
2756 } else {
2757 /* no more input */
2758 break;
2759 }
2760 } else {
2761 /* this is an unmatched trail code unit (2nd surrogate) */
2762 /* callback(illegal) */
2763 reason=UCNV_ILLEGAL;
2764 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2765 goto callback;
2766 }
2767 }
2768
2769 /* convert the Unicode code point in c into codepage bytes */
2770 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
2771
2772 /* get the bytes and the length for the output */
2773 /* MBCS_OUTPUT_2 */
2774 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2775 if(value<=0xff) {
2776 length=1;
2777 } else {
2778 length=2;
2779 }
2780
2781 /* is this code point assigned, or do we use fallbacks? */
2782 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
2783 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && (value!=0 || c==0)))
2784 ) {
2785 /*
2786 * We allow a 0 byte output if the Unicode code point is
2787 * U+0000 and also if the "assigned" bit is set for this entry.
2788 * There is no way with this data structure for fallback output
2789 * for other than U+0000 to be a zero byte.
2790 */
2791 /* callback(unassigned) */
2792 goto unassigned;
2793 }
2794
2795 /* write the output character bytes from value and length */
2796 /* from the first if in the loop we know that targetCapacity>0 */
2797 if(length==1) {
2798 /* this is easy because we know that there is enough space */
2799 *target++=(uint8_t)value;
2800 if(offsets!=NULL) {
2801 *offsets++=sourceIndex;
2802 }
2803 --targetCapacity;
2804 } else /* length==2 */ {
/* the first byte always fits because targetCapacity>0; only the second byte may overflow */
2805 *target++=(uint8_t)(value>>8);
2806 if(2<=targetCapacity) {
2807 *target++=(uint8_t)value;
2808 if(offsets!=NULL) {
2809 *offsets++=sourceIndex;
2810 *offsets++=sourceIndex;
2811 }
2812 targetCapacity-=2;
2813 } else {
2814 if(offsets!=NULL) {
2815 *offsets++=sourceIndex;
2816 }
2817 cnv->charErrorBuffer[0]=(char)value;
2818 cnv->charErrorBufferLength=1;
2819
2820 /* target overflow */
2821 targetCapacity=0;
2822 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2823 c=0;
2824 break;
2825 }
2826 }
2827
2828 /* normal end of conversion: prepare for a new character */
2829 c=0;
2830 sourceIndex=nextSourceIndex;
2831 continue;
2832
2833 /*
2834 * This is the same ugly trick as in ToUnicode(), for the
2835 * same reasons...
2836 */
/* the code below is reached only via goto: the normal path ends with the continue above */
2837 unassigned:
2838 reason=UCNV_UNASSIGNED;
2839 *pErrorCode=U_INVALID_CHAR_FOUND;
2840 callback:
2841 /* call the callback function with all the preparations and post-processing */
2842 /* update the arguments structure */
2843 pArgs->source=source;
2844 pArgs->target=(char *)target;
2845 pArgs->offsets=offsets;
2846
2847 /* set the converter state in UConverter to deal with the next character */
2848 cnv->fromUSurrogateLead=0;
2849 cnv->fromUnicodeStatus=prevLength;
2850
2851 /* call the callback function */
2852 fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
2853
2854 /* get the converter state from UConverter */
2855 c=cnv->fromUSurrogateLead;
2856 prevLength=cnv->fromUnicodeStatus;
2857
2858 /* update target and deal with offsets if necessary */
2859 offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
2860 target=(uint8_t *)pArgs->target;
2861
2862 /* update the source pointer and index */
2863 sourceIndex=nextSourceIndex+(pArgs->source-source);
2864 source=pArgs->source;
2865 targetCapacity=(uint8_t *)pArgs->targetLimit-target;
2866
2867 /*
2868 * If the callback overflowed the target, then we need to
2869 * stop here with an overflow indication.
2870 */
2871 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
2872 break;
2873 } else if(U_FAILURE(*pErrorCode)) {
2874 /* break on error */
2875 c=0;
2876 break;
2877 } else if(cnv->charErrorBufferLength>0) {
2878 /* target is full */
2879 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2880 break;
2881 }
2882
2883 /*
2884 * We do not need to repeat the statements from the normal
2885 * end of the conversion because we already updated all the
2886 * necessary variables.
2887 */
2888 } else {
2889 /* target is full */
2890 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2891 break;
2892 }
2893 }
2894
/*
 * NOTE(review): unlike _MBCSFromUnicodeWithOffsets(), the state reset below
 * is not conditional on U_SUCCESS(*pErrorCode); only the
 * U_TRUNCATED_CHAR_FOUND assignment is. Confirm that this is intended.
 */
2895 if(pArgs->flush && source>=sourceLimit) {
2896 /* reset the state for the next conversion */
2897 if(c!=0 && U_SUCCESS(*pErrorCode)) {
2898 /* a Unicode code point remains incomplete (only a first surrogate) */
2899 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2900 }
2901 cnv->fromUSurrogateLead=0;
2902 cnv->fromUnicodeStatus=1;
2903 } else {
2904 /* set the converter state back into UConverter */
2905 cnv->fromUSurrogateLead=(UChar)c;
2906 cnv->fromUnicodeStatus=prevLength;
2907 }
2908
2909 /* write back the updated pointers */
2910 pArgs->source=source;
2911 pArgs->target=(char *)target;
2912 pArgs->offsets=offsets;
2913 }
2914
2915 /* This version of _MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
/*
 * Converts UTF-16 text from pArgs->source (up to pArgs->sourceLimit) into
 * single bytes written to pArgs->target (up to pArgs->targetLimit).
 *
 * - Each code point is looked up with MBCS_SINGLE_RESULT_FROM_U(); a result
 *   of at least minValue counts as assigned and its low 8 bits are written
 *   as the one output byte. minValue is 0x800 when cnv->useFallback is set
 *   and 0xc00 otherwise.
 * - The result table is swapLFNLFromUnicodeBytes if UCNV_OPTION_SWAP_LFNL is
 *   set in cnv->options, fromUnicodeBytes otherwise.
 * - If pArgs->offsets is not NULL, one source index is stored per output byte.
 * - Unassigned code points go to fromUCallback() with UCNV_UNASSIGNED and
 *   U_INVALID_CHAR_FOUND; unmatched surrogates go there with UCNV_ILLEGAL and
 *   U_ILLEGAL_CHAR_FOUND. Supplementary code points are treated as unassigned
 *   when the table does not have UCNV_HAS_SUPPLEMENTARY set.
 * - A lead surrogate at the end of the input is kept in
 *   cnv->fromUSurrogateLead for the next call, or reported as
 *   U_TRUNCATED_CHAR_FOUND when pArgs->flush is set.
 * - *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when input remains but the
 *   target has no room, or when cnv->charErrorBufferLength>0 after a callback.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are updated to
 * the positions reached.
 */
2916 static void
2917 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2918 UErrorCode *pErrorCode) {
2919 UConverter *cnv;
2920 const UChar *source, *sourceLimit;
2921 uint8_t *target;
2922 int32_t targetCapacity;
2923 int32_t *offsets;
2924
2925 const uint16_t *table;
2926 const uint16_t *results;
2927
2928 UChar32 c;
2929
2930 int32_t sourceIndex, nextSourceIndex;
2931
2932 UConverterCallbackReason reason;
2933 uint16_t value, minValue;
2934 UBool hasSupplementary;
2935
2936 /* set up the local pointers */
2937 cnv=pArgs->converter;
2938 source=pArgs->source;
2939 sourceLimit=pArgs->sourceLimit;
2940 target=(uint8_t *)pArgs->target;
2941 targetCapacity=pArgs->targetLimit-pArgs->target;
2942 offsets=pArgs->offsets;
2943
2944 table=cnv->sharedData->table->mbcs.fromUnicodeTable;
2945 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2946 results=(uint16_t *)cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
2947 } else {
2948 results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes;
2949 }
2950
2951 if(cnv->useFallback) {
2952 /* use all roundtrip and fallback results */
2953 minValue=0x800;
2954 } else {
2955 /* use only roundtrips and fallbacks from private-use characters */
2956 minValue=0xc00;
2957 }
2958 hasSupplementary=(UBool)(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
2959
2960 /* get the converter state from UConverter */
2961 c=cnv->fromUSurrogateLead;
2962
2963 /* sourceIndex=-1 if the current character began in the previous buffer */
2964 sourceIndex= c==0 ? 0 : -1;
2965 nextSourceIndex=0;
2966
2967 /* conversion loop */
/*
 * Resume a surrogate pair that was split across calls: c holds the lead
 * surrogate saved in fromUSurrogateLead, so jump straight to the trail-unit
 * check inside the loop body. Without output space the jump is skipped and,
 * if input is available, the loop's own capacity test reports the overflow.
 */
2968 if(c!=0 && targetCapacity>0) {
2969 goto getTrail;
2970 }
2971
2972 while(source<sourceLimit) {
2973 /*
2974 * This following test is to see if available input would overflow the output.
2975 * It does not catch output of more than one byte that
2976 * overflows as a result of a multi-byte character or callback output
2977 * from the last source character.
2978 * Therefore, those situations also test for overflows and will
2979 * then break the loop, too.
2980 */
2981 if(targetCapacity>0) {
2982 /*
2983 * Get a correct Unicode code point:
2984 * a single UChar for a BMP code point or
2985 * a matched surrogate pair for a "supplementary code point".
2986 */
2987 c=*source++;
2988 ++nextSourceIndex;
2989 if(UTF_IS_SURROGATE(c)) {
2990 if(UTF_IS_SURROGATE_FIRST(c)) {
2991 getTrail:
2992 if(source<sourceLimit) {
2993 /* test the following code unit */
2994 UChar trail=*source;
2995 if(UTF_IS_SECOND_SURROGATE(trail)) {
2996 ++source;
2997 ++nextSourceIndex;
2998 c=UTF16_GET_PAIR_VALUE(c, trail);
2999 if(!hasSupplementary) {
3000 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3001 /* callback(unassigned) */
3002 goto unassigned;
3003 }
3004 /* convert this supplementary code point */
3005 /* exit this condition tree */
3006 } else {
3007 /* this is an unmatched lead code unit (1st surrogate) */
3008 /* callback(illegal) */
3009 reason=UCNV_ILLEGAL;
3010 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3011 goto callback;
3012 }
3013 } else {
3014 /* no more input */
/* c still holds the lead surrogate; the code after the loop stores it or reports truncation */
3015 break;
3016 }
3017 } else {
3018 /* this is an unmatched trail code unit (2nd surrogate) */
3019 /* callback(illegal) */
3020 reason=UCNV_ILLEGAL;
3021 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3022 goto callback;
3023 }
3024 }
3025
3026 /* convert the Unicode code point in c into codepage bytes */
3027 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3028
3029 /* is this code point assigned, or do we use fallbacks? */
3030 if(value>=minValue) {
3031 /* assigned, write the output character bytes from value and length */
3032 /* length==1 */
3033 /* this is easy because we know that there is enough space */
/* only the low 8 bits of the table result form the output byte */
3034 *target++=(uint8_t)value;
3035 if(offsets!=NULL) {
3036 *offsets++=sourceIndex;
3037 }
3038 --targetCapacity;
3039
3040 /* normal end of conversion: prepare for a new character */
3041 c=0;
3042 sourceIndex=nextSourceIndex;
3043 continue;
3044 } else { /* unassigned */
3045 /*
3046 * We allow a 0 byte output if the Unicode code point is
3047 * U+0000 and also if the "assigned" bit is set for this entry.
3048 * There is no way with this data structure for fallback output
3049 * for other than U+0000 to be a zero byte.
3050 */
3051 /* callback(unassigned) */
3052 }
/* the empty else branch above falls through to here; supplementary code points of BMP-only tables jump here directly */
3053 unassigned:
3054 reason=UCNV_UNASSIGNED;
3055 *pErrorCode=U_INVALID_CHAR_FOUND;
3056 callback:
3057 /* call the callback function with all the preparations and post-processing */
3058 /* update the arguments structure */
3059 pArgs->source=source;
3060 pArgs->target=(char *)target;
3061 pArgs->offsets=offsets;
3062
3063 /* set the converter state in UConverter to deal with the next character */
3064 cnv->fromUSurrogateLead=0;
3065
3066 /* call the callback function */
3067 fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
3068
3069 /* get the converter state from UConverter */
3070 c=cnv->fromUSurrogateLead;
3071
3072 /* update target and deal with offsets if necessary */
3073 offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
3074 target=(uint8_t *)pArgs->target;
3075
3076 /* update the source pointer and index */
/* the callback may have moved pArgs->source; account for that difference in sourceIndex */
3077 sourceIndex=nextSourceIndex+(pArgs->source-source);
3078 source=pArgs->source;
3079 targetCapacity=(uint8_t *)pArgs->targetLimit-target;
3080
3081 /*
3082 * If the callback overflowed the target, then we need to
3083 * stop here with an overflow indication.
3084 */
3085 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3086 break;
3087 } else if(U_FAILURE(*pErrorCode)) {
3088 /* break on error */
/* make sure no pending lead surrogate is stored back into the converter below */
3089 c=0;
3090 break;
3091 } else if(cnv->charErrorBufferLength>0) {
3092 /* target is full */
/* NOTE(review): presumably the callback left output in the converter's overflow buffer - confirm against ucnv_cb */
3093 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3094 break;
3095 }
3096
3097 /*
3098 * We do not need to repeat the statements from the normal
3099 * end of the conversion because we already updated all the
3100 * necessary variables.
3101 */
3102 } else {
3103 /* target is full */
3104 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3105 break;
3106 }
3107 }
3108
3109 if(pArgs->flush && source>=sourceLimit) {
3110 /* reset the state for the next conversion */
3111 if(c!=0 && U_SUCCESS(*pErrorCode)) {
3112 /* a Unicode code point remains incomplete (only a first surrogate) */
3113 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3114 }
3115 cnv->fromUSurrogateLead=0;
3116 } else {
3117 /* set the converter state back into UConverter */
3118 cnv->fromUSurrogateLead=(UChar)c;
3119 }
3120
3121 /* write back the updated pointers */
3122 pArgs->source=source;
3123 pArgs->target=(char *)target;
3124 pArgs->offsets=offsets;
3125 }
3126
3127 /*
3128 * This version of _MBCSFromUnicode() is optimized for single-byte codepages
3129 * that map only to and from the BMP.
3130 * In addition to single-byte/state optimizations, the offset calculations
3131 * become much easier.
3132 */
/*
 * Converts UTF-16 text from pArgs->source (up to pArgs->sourceLimit) into
 * single bytes written to pArgs->target (up to pArgs->targetLimit).
 *
 * - Because one UChar yields one byte, the loop counter targetCapacity is the
 *   minimum of the remaining input length and the remaining output space.
 * - A table result of at least minValue (0x800 with cnv->useFallback, 0xc00
 *   without) counts as assigned; its low 8 bits are the output byte.
 * - Surrogates are only examined after the table lookup reports "unassigned".
 *   A matched surrogate pair is always passed to fromUCallback() as
 *   UCNV_UNASSIGNED, an unmatched surrogate as UCNV_ILLEGAL.
 * - Offsets are not written per character: they are filled in for the span
 *   lastSource..source before each callback and once more after the loop.
 * - *pErrorCode becomes U_BUFFER_OVERFLOW_ERROR when input remains and the
 *   target is full, or when cnv->charErrorBufferLength>0 after a callback,
 *   and U_TRUNCATED_CHAR_FOUND when pArgs->flush is set and the input ends
 *   with a lone lead surrogate.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are updated and
 * a pending lead surrogate (if any) is stored in cnv->fromUSurrogateLead.
 */
3133 static void
3134 _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
3135 UErrorCode *pErrorCode) {
3136 UConverter *cnv;
3137 const UChar *source, *sourceLimit, *lastSource;
3138 uint8_t *target;
3139 int32_t targetCapacity, length;
3140 int32_t *offsets;
3141
3142 const uint16_t *table;
3143 const uint16_t *results;
3144
3145 UChar32 c;
3146
3147 int32_t sourceIndex;
3148
3149 UConverterCallbackReason reason;
3150 uint16_t value, minValue;
3151
3152 /* set up the local pointers */
3153 cnv=pArgs->converter;
3154 source=pArgs->source;
3155 sourceLimit=pArgs->sourceLimit;
3156 target=(uint8_t *)pArgs->target;
3157 targetCapacity=pArgs->targetLimit-pArgs->target;
3158 offsets=pArgs->offsets;
3159
3160 table=cnv->sharedData->table->mbcs.fromUnicodeTable;
3161 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3162 results=(uint16_t *)cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
3163 } else {
3164 results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes;
3165 }
3166
3167 if(cnv->useFallback) {
3168 /* use all roundtrip and fallback results */
3169 minValue=0x800;
3170 } else {
3171 /* use only roundtrips and fallbacks from private-use characters */
3172 minValue=0xc00;
3173 }
3174
3175 /* get the converter state from UConverter */
3176 c=cnv->fromUSurrogateLead;
3177
3178 /* sourceIndex=-1 if the current character began in the previous buffer */
3179 sourceIndex= c==0 ? 0 : -1;
/* lastSource marks the first source unit whose offset has not been written yet */
3180 lastSource=source;
3181
3182 /*
3183 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3184 * for the minimum of the sourceLength and targetCapacity
3185 */
3186 length=sourceLimit-source;
3187 if(length<targetCapacity) {
3188 targetCapacity=length;
3189 }
3190
3191 /* conversion loop */
/*
 * A lead surrogate is pending from the previous call: jump to the trail-unit
 * check inside the loop body. The jump is skipped when the counter is 0.
 */
3192 if(c!=0 && targetCapacity>0) {
3193 goto getTrail;
3194 }
3195
3196 #if MBCS_UNROLL_SINGLE_FROM_BMP
3197 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3198 /* unroll the loop with the most common case */
3199 unrolled:
3200 if(targetCapacity>=4) {
3201 int32_t count, loops;
3202 uint16_t andedValues;
3203
/* number of complete groups of four code units that fit */
3204 loops=count=targetCapacity>>2;
3205 do {
/* convert four units optimistically; the ANDed results are validated once per group */
3206 c=*source++;
3207 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3208 *target++=(uint8_t)value;
3209 c=*source++;
3210 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3211 *target++=(uint8_t)value;
3212 c=*source++;
3213 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3214 *target++=(uint8_t)value;
3215 c=*source++;
3216 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3217 *target++=(uint8_t)value;
3218
3219 /* were all 4 entries really valid? */
3220 if(andedValues<minValue) {
3221 /* no, return to the first of these 4 */
3222 source-=4;
3223 target-=4;
3224 break;
3225 }
3226 } while(--count>0);
/* count becomes the number of groups that were converted completely */
3227 count=loops-count;
3228 targetCapacity-=4*count;
3229
3230 if(offsets!=NULL) {
/* the offsets for these groups are written right here, so move lastSource past them */
3231 lastSource+=4*count;
3232 while(count>0) {
3233 *offsets++=sourceIndex++;
3234 *offsets++=sourceIndex++;
3235 *offsets++=sourceIndex++;
3236 *offsets++=sourceIndex++;
3237 --count;
3238 }
3239 }
3240
3241 c=0;
3242 }
3243 #endif
3244
3245 while(targetCapacity>0) {
3246 /*
3247 * Get a correct Unicode code point:
3248 * a single UChar for a BMP code point or
3249 * a matched surrogate pair for a "supplementary code point".
3250 */
3251 c=*source++;
3252 /*
3253 * Do not immediately check for single surrogates:
3254 * Assume that they are unassigned and check for them in that case.
3255 * This speeds up the conversion of assigned characters.
3256 */
3257 /* convert the Unicode code point in c into codepage bytes */
3258 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3259
3260 /* is this code point assigned, or do we use fallbacks? */
3261 if(value>=minValue) {
3262 /* assigned, write the output character bytes from value and length */
3263 /* length==1 */
3264 /* this is easy because we know that there is enough space */
3265 *target++=(uint8_t)value;
3266 --targetCapacity;
3267
3268 /* normal end of conversion: prepare for a new character */
3269 c=0;
3270 continue;
3271 } else if(!UTF_IS_SURROGATE(c)) {
3272 /* normal, unassigned BMP character */
3273 /*
3274 * We allow a 0 byte output if the Unicode code point is
3275 * U+0000 and also if the "assigned" bit is set for this entry.
3276 * There is no way with this data structure for fallback output
3277 * for other than U+0000 to be a zero byte.
3278 */
3279 /* callback(unassigned) */
3280 reason=UCNV_UNASSIGNED;
3281 *pErrorCode=U_INVALID_CHAR_FOUND;
3282 } else if(UTF_IS_SURROGATE_FIRST(c)) {
3283 getTrail:
3284 if(source<sourceLimit) {
3285 /* test the following code unit */
3286 UChar trail=*source;
3287 if(UTF_IS_SECOND_SURROGATE(trail)) {
3288 ++source;
3289 c=UTF16_GET_PAIR_VALUE(c, trail);
3290 /* this codepage does not map supplementary code points */
3291 /* callback(unassigned) */
3292 reason=UCNV_UNASSIGNED;
3293 *pErrorCode=U_INVALID_CHAR_FOUND;
3294 } else {
3295 /* this is an unmatched lead code unit (1st surrogate) */
3296 /* callback(illegal) */
3297 reason=UCNV_ILLEGAL;
3298 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3299 }
3300 } else {
3301 /* no more input */
/* c still holds the lead surrogate; the code after the loop stores it or reports truncation */
3302 break;
3303 }
3304 } else {
3305 /* this is an unmatched trail code unit (2nd surrogate) */
3306 /* callback(illegal) */
3307 reason=UCNV_ILLEGAL;
3308 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3309 }
3310
3311 /* call the callback function with all the preparations and post-processing */
3312 /* get the number of code units for c to correctly advance sourceIndex after the callback call */
3313 length=UTF_CHAR_LENGTH(c);
3314
3315 /* set offsets since the start or the last callback */
3316 if(offsets!=NULL) {
3317 int32_t count=(int32_t)(source-lastSource);
3318
3319 /* do not set the offset for the callback-causing character */
3320 count-=length;
3321
3322 while(count>0) {
3323 *offsets++=sourceIndex++;
3324 --count;
3325 }
3326 /* offset and sourceIndex are now set for the current character */
3327 }
3328
3329 /* update the arguments structure */
3330 pArgs->source=source;
3331 pArgs->target=(char *)target;
3332 pArgs->offsets=offsets;
3333
3334 /* set the converter state in UConverter to deal with the next character */
3335 cnv->fromUSurrogateLead=0;
3336
3337 /* call the callback function */
3338 fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
3339
3340 /* get the converter state from UConverter */
3341 c=cnv->fromUSurrogateLead;
3342
3343 /* update target and deal with offsets if necessary */
3344 offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
3345 target=(uint8_t *)pArgs->target;
3346
3347 /* update the source pointer and index */
/* skip the code units of the callback character plus whatever the callback itself consumed */
3348 sourceIndex+=length+(pArgs->source-source);
3349 source=lastSource=pArgs->source;
/* recompute the combined loop counter: minimum of remaining output space and remaining input */
3350 targetCapacity=(uint8_t *)pArgs->targetLimit-target;
3351 length=sourceLimit-source;
3352 if(length<targetCapacity) {
3353 targetCapacity=length;
3354 }
3355
3356 /*
3357 * If the callback overflowed the target, then we need to
3358 * stop here with an overflow indication.
3359 */
3360 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3361 break;
3362 } else if(U_FAILURE(*pErrorCode)) {
3363 /* break on error */
/* make sure no pending lead surrogate is stored back into the converter below */
3364 c=0;
3365 break;
3366 } else if(cnv->charErrorBufferLength>0) {
3367 /* target is full */
3368 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3369 break;
3370 }
3371
3372 #if MBCS_UNROLL_SINGLE_FROM_BMP
3373 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3374 goto unrolled;
3375 #endif
3376 }
3377
/*
 * The loop counter does not tell whether input or output ran out first:
 * leftover input together with a full target means overflow.
 */
3378 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3379 /* target is full */
3380 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3381 }
3382
3383 /* set offsets since the start or the last callback */
3384 if(offsets!=NULL) {
3385 size_t count=source-lastSource;
3386 while(count>0) {
3387 *offsets++=sourceIndex++;
3388 --count;
3389 }
3390 }
3391
3392 if(pArgs->flush && source>=sourceLimit) {
3393 /* reset the state for the next conversion */
3394 if(c!=0 && U_SUCCESS(*pErrorCode)) {
3395 /* a Unicode code point remains incomplete (only a first surrogate) */
3396 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3397 }
3398 cnv->fromUSurrogateLead=0;
3399 } else {
3400 /* set the converter state back into UConverter */
3401 cnv->fromUSurrogateLead=(UChar)c;
3402 }
3403
3404 /* write back the updated pointers */
3405 pArgs->source=source;
3406 pArgs->target=(char *)target;
3407 pArgs->offsets=offsets;
3408 }
3409
3410 /*
3411 * This is another simple conversion function for internal use by other
3412 * conversion implementations.
3413 * It does not use the converter state nor call callbacks.
3414 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3415 *
3416 * It converts one single Unicode code point into codepage bytes, encoded
3417 * as one 32-bit value. The function returns the number of bytes in *pValue:
3418 * 1..4 the number of bytes in *pValue
3419 * 0 unassigned (*pValue undefined)
3420 * -1 illegal (currently not used, *pValue undefined)
3421 *
3422 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
3423 * the second to last byte in bits 15..8, etc.
3424 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
3425 */
3426 U_CFUNC int32_t
3427 _MBCSFromUChar32(UConverterSharedData *sharedData,
3428 UChar32 c, uint32_t *pValue,
3429 UBool useFallback) {
3430 const uint16_t *table=sharedData->table->mbcs.fromUnicodeTable;
3431 const uint8_t *p;
3432 uint32_t stage2Entry;
3433 uint32_t value;
3434 int32_t length;
3435
3436 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3437 if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3438 return 0;
3439 }
3440
3441 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3442 if(sharedData->table->mbcs.outputType==MBCS_OUTPUT_1) {
3443 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->table->mbcs.fromUnicodeBytes, c);
3444 /* is this code point assigned, or do we use fallbacks? */
3445 if(useFallback ? value>=0x800 : value>=0xc00) {
3446 *pValue=value&0xff;
3447 return 1;
3448 } else {
3449 return 0;
3450 }
3451 }
3452
3453 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3454
3455 /* get the bytes and the length for the output */
3456 switch(sharedData->table->mbcs.outputType) {
3457 case MBCS_OUTPUT_2:
3458 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3459 if(value<=0xff) {
3460 length=1;
3461 } else {
3462 length=2;
3463 }
3464 break;
3465 case MBCS_OUTPUT_3:
3466 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3467 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3468 if(value<=0xff) {
3469 length=1;
3470 } else if(value<=0xffff) {
3471 length=2;
3472 } else {
3473 length=3;
3474 }
3475 break;
3476 case MBCS_OUTPUT_4:
3477 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3478 if(value<=0xff) {
3479 length=1;
3480 } else if(value<=0xffff) {
3481 length=2;
3482 } else if(value<=0xffffff) {
3483 length=3;
3484 } else {
3485 length=4;
3486 }
3487 break;
3488 case MBCS_OUTPUT_3_EUC:
3489 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3490 /* EUC 16-bit fixed-length representation */
3491 if(value<=0xff) {
3492 length=1;
3493 } else if((value&0x8000)==0) {
3494 value|=0x8e8000;
3495 length=3;
3496 } else if((value&0x80)==0) {
3497 value|=0x8f0080;
3498 length=3;
3499 } else {
3500 length=2;
3501 }
3502 break;
3503 case MBCS_OUTPUT_4_EUC:
3504 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3505 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3506 /* EUC 16-bit fixed-length representation applied to the first two bytes */
3507 if(value<=0xff) {
3508 length=1;
3509 } else if(value<=0xffff) {
3510 length=2;
3511 } else if((value&0x800000)==0) {
3512 value|=0x8e800000;
3513 length=4;
3514 } else if((value&0x8000)==0) {
3515 value|=0x8f008000;
3516 length=4;
3517 } else {
3518 length=3;
3519 }
3520 break;
3521 default:
3522 /* must not occur */
3523 return -1;
3524 }
3525
3526 /* is this code point assigned, or do we use fallbacks? */
3527 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3528 (FROM_U_USE_FALLBACK(useFallback, c) && (value!=0 || c==0))
3529 ) {
3530 /*
3531 * We allow a 0 byte output if the Unicode code point is
3532 * U+0000 and also if the "assigned" bit is set for this entry.
3533 * There is no way with this data structure for fallback output
3534 * for other than U+0000 to be a zero byte.
3535 */
3536 /* assigned */
3537 *pValue=value;
3538 return length;
3539 } else {
3540 return 0;
3541 }
3542 }
3543
3544
3545 #if 0
3546 /**
3547 * ################################################################
3548 * #
3549 * # This function has been moved to ucnv2022.c for inlining.
3550 * # This implementation is here only for documentation purposes
3551 * #
3552 * ################################################################
3553 */
3554
3555 /**
3556 * This version of _MBCSFromUChar32() is optimized for single-byte codepages.
3557 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3558 *
3559 * It returns the codepage byte for the code point, or -1 if it is unassigned.
3560 */
3561 U_CFUNC int32_t
3562 _MBCSSingleFromUChar32(UConverterSharedData *sharedData,
3563 UChar32 c,
3564 UBool useFallback) {
3565 const uint16_t *table;
3566 int32_t value;
3567
3568 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3569 if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3570 return -1;
3571 }
3572
3573 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3574 table=sharedData->table->mbcs.fromUnicodeTable;
3575
3576 /* get the byte for the output */
3577 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->table->mbcs.fromUnicodeBytes, c);
3578 /* is this code point assigned, or do we use fallbacks? */
3579 if(useFallback ? value>=0x800 : value>=0xc00) {
3580 return value&0xff;
3581 } else {
3582 return -1;
3583 }
3584 }
3585 #endif
3586
3587 /* miscellaneous ------------------------------------------------------------ */
3588
3589 static void
3590 _MBCSGetStarters(const UConverter* cnv,
3591 UBool starters[256],
3592 UErrorCode *pErrorCode) {
3593 const int32_t *state0=cnv->sharedData->table->mbcs.stateTable[0];
3594 int i;
3595
3596 for(i=0; i<256; ++i) {
3597 /* all bytes that cause a state transition from state 0 are lead bytes */
3598 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
3599 }
3600 }
3601
3602 /*
3603 * This is an internal function that allows other converter implementations
3604 * to check whether a byte is a lead byte.
3605 */
3606 U_CFUNC UBool
3607 _MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
3608 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->table->mbcs.stateTable[0][(uint8_t)byte]);
3609 }
3610
3611 static void
3612 _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
3613 int32_t offsetIndex,
3614 UErrorCode *pErrorCode) {
3615 UConverter *cnv=pArgs->converter;
3616 char *p, *subchar;
3617 char buffer[4];
3618 int32_t length;
3619
3620 /* first, select between subChar and subChar1 */
3621 if(cnv->subChar1!=0 && cnv->invalidUCharBuffer[0]<=0xff) {
3622 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
3623 subchar=(char *)&cnv->subChar1;
3624 length=1;
3625 } else {
3626 /* select subChar in all other cases */
3627 subchar=(char *)cnv->subChar;
3628 length=cnv->subCharLen;
3629 }
3630
3631 switch(cnv->sharedData->table->mbcs.outputType) {
3632 case MBCS_OUTPUT_2_SISO:
3633 p=buffer;
3634
3635 /* fromUnicodeStatus contains prevLength */
3636 switch(length) {
3637 case 1:
3638 if(cnv->fromUnicodeStatus==2) {
3639 /* DBCS mode and SBCS sub char: change to SBCS */
3640 cnv->fromUnicodeStatus=1;
3641 *p++=UCNV_SI;
3642 }
3643 *p++=subchar[0];
3644 break;
3645 case 2:
3646 if(cnv->fromUnicodeStatus==1) {
3647 /* SBCS mode and DBCS sub char: change to DBCS */
3648 cnv->fromUnicodeStatus=2;
3649 *p++=UCNV_SO;
3650 }
3651 *p++=subchar[0];
3652 *p++=subchar[1];
3653 break;
3654 default:
3655 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3656 return;
3657 }
3658 ucnv_cbFromUWriteBytes(pArgs,
3659 buffer, (int32_t)(p-buffer),
3660 offsetIndex, pErrorCode);
3661 break;
3662 default:
3663 ucnv_cbFromUWriteBytes(pArgs,
3664 subchar, length,
3665 offsetIndex, pErrorCode);
3666 break;
3667 }
3668 }
3669
3670 U_CFUNC UConverterType
3671 _MBCSGetType(const UConverter* converter) {
3672 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
3673 if(converter->sharedData->table->mbcs.countStates==1) {
3674 return (UConverterType)UCNV_SBCS;
3675 } else if((converter->sharedData->table->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
3676 return (UConverterType)UCNV_EBCDIC_STATEFUL;
3677 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
3678 return (UConverterType)UCNV_DBCS;
3679 }
3680 return (UConverterType)UCNV_MBCS;
3681 }
3682
/*
 * Function table of the MBCS converter implementation: the converter type
 * UCNV_MBCS followed by the MBCS entry points, in the member order of
 * UConverterImpl.
 * _MBCSToUnicodeWithOffsets and _MBCSFromUnicodeWithOffsets are each listed
 * twice in a row.
 * NOTE(review): presumably the plain and the with-offsets slots share one
 * implementation, and the NULL entries leave optional slots unimplemented -
 * confirm against the UConverterImpl declaration, which is not visible here.
 */
3683 static const UConverterImpl _MBCSImpl={
3684 UCNV_MBCS,
3685
3686 _MBCSLoad,
3687 _MBCSUnload,
3688
3689 _MBCSOpen,
3690 NULL,
3691 _MBCSReset,
3692
3693 _MBCSToUnicodeWithOffsets,
3694 _MBCSToUnicodeWithOffsets,
3695 _MBCSFromUnicodeWithOffsets,
3696 _MBCSFromUnicodeWithOffsets,
3697 _MBCSGetNextUChar,
3698
3699 _MBCSGetStarters,
3700 _MBCSGetName,
3701 _MBCSWriteSub,
3702 NULL,
3703 _MBCSGetUnicodeSet
3704 };
3705
3706
3707 /* Static data is in tools/makeconv/ucnvstat.c for data-based
3708 * converters. Be sure to update it as well.
3709 */
/*
 * Statically initialized UConverterSharedData for the MBCS implementation.
 * It records the structure size, points to the _MBCSImpl function table and
 * leaves the other pointer members NULL.
 * NOTE(review): the meaning of the positional values 1, FALSE and 0 depends
 * on the member order of UConverterSharedData, which is not visible here.
 */
3710
3711 const UConverterSharedData _MBCSData={
3712 sizeof(UConverterSharedData), 1,
3713 NULL, NULL, NULL, FALSE, &_MBCSImpl,
3714 0
3715 };
3716
3717 /* GB 18030 special handling ------------------------------------------------ */
3718
3719 /* definition of LINEAR macros and gb18030Ranges see near the beginning of the file */
3720
3721 /* the callback functions handle GB 18030 specially */
3722 static void
3723 fromUCallback(UConverter *cnv,
3724 const void *context, UConverterFromUnicodeArgs *pArgs,
3725 UChar32 codePoint,
3726 UConverterCallbackReason reason, UErrorCode *pErrorCode) {
3727 int32_t i;
3728
3729 if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED) {
3730 const uint32_t *range;
3731
3732 range=gb18030Ranges[0];
3733 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
3734 if(range[0]<=(uint32_t)codePoint && (uint32_t)codePoint<=range[1]) {
3735 uint32_t linear;
3736 char bytes[4];
3737
3738 /* found the Unicode code point, output the four-byte sequence for it */
3739 *pErrorCode=U_ZERO_ERROR;
3740
3741 /* get the linear value of the first GB 18030 code in this range */
3742 linear=range[2]-LINEAR_18030_BASE;
3743
3744 /* add the offset from the beginning of the range */
3745 linear+=((uint32_t)codePoint-range[0]);
3746
3747 /* turn this into a four-byte sequence */
3748 bytes[3]=(char)(0x30+linear%10); linear/=10;
3749 bytes[2]=(char)(0x81+linear%126); linear/=126;
3750 bytes[1]=(char)(0x30+linear%10); linear/=10;
3751 bytes[0]=(char)(0x81+linear);
3752
3753 /* output this sequence */
3754 ucnv_cbFromUWriteBytes(pArgs, bytes, 4, 0, pErrorCode);
3755 return;
3756 }
3757 }
3758 }
3759
3760 /* write the code point as code units */
3761 i=0;
3762 UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, codePoint);
3763 cnv->invalidUCharLength=(int8_t)i;
3764
3765 /* call the normal callback function */
3766 cnv->fromUCharErrorBehaviour(context, pArgs, cnv->invalidUCharBuffer, i, codePoint, reason, pErrorCode);
3767 }
3768
3769 static void
3770 toUCallback(UConverter *cnv,
3771 const void *context, UConverterToUnicodeArgs *pArgs,
3772 const char *codeUnits, int32_t length,
3773 UConverterCallbackReason reason, UErrorCode *pErrorCode) {
3774 int32_t i;
3775
3776 if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED && length==4) {
3777 const uint32_t *range;
3778 uint32_t linear;
3779
3780 linear=LINEAR_18030((uint8_t)codeUnits[0], (uint8_t)codeUnits[1], (uint8_t)codeUnits[2], (uint8_t)codeUnits[3]);
3781 range=gb18030Ranges[0];
3782 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
3783 if(range[2]<=linear && linear<=range[3]) {
3784 UChar u[UTF_MAX_CHAR_LENGTH];
3785
3786 /* found the sequence, output the Unicode code point for it */
3787 *pErrorCode=U_ZERO_ERROR;
3788
3789 /* add the linear difference between the input and start sequences to the start code point */
3790 linear=range[0]+(linear-range[2]);
3791
3792 /* write the result as UChars and output */
3793 i=0;
3794 UTF_APPEND_CHAR_UNSAFE(u, i, linear);
3795 ucnv_cbToUWriteUChars(pArgs, u, i, 0, pErrorCode);
3796 return;
3797 }
3798 }
3799 }
3800
3801 /* copy the current bytes to invalidCharBuffer */
3802 for(i=0; i<length; ++i) {
3803 cnv->invalidCharBuffer[i]=codeUnits[i];
3804 }
3805 cnv->invalidCharLength=(int8_t)length;
3806
3807 /* call the normal callback function */
3808 cnv->fromCharErrorBehaviour(context, pArgs, codeUnits, length, reason, pErrorCode);
3809 }
3810
3811 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */