2 *******************************************************************************
4 * Copyright (C) 2000-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2000jul06
14 * created by: Markus W. Scherer
18 #include "unicode/utypes.h"
29 typedef struct MBCSData
{
30 NewConverter newConverter
;
34 /* toUnicode (state table in ucm->states) */
35 _MBCSToUFallback toUFallbacks
[MBCS_MAX_FALLBACK_COUNT
];
36 int32_t countToUFallbacks
;
37 uint16_t *unicodeCodeUnits
;
40 uint16_t stage1
[MBCS_STAGE_1_SIZE
];
41 uint16_t stage2Single
[MBCS_STAGE_2_SIZE
]; /* stage 2 for single-byte codepages */
42 uint32_t stage2
[MBCS_STAGE_2_SIZE
]; /* stage 2 for MBCS */
44 uint32_t stage2Top
, stage3Top
;
49 MBCSClose(NewConverter
*cnvData
);
52 MBCSStartMappings(MBCSData
*mbcsData
);
55 MBCSAddToUnicode(MBCSData
*mbcsData
,
56 const uint8_t *bytes
, int32_t length
,
61 MBCSIsValid(NewConverter
*cnvData
,
62 const uint8_t *bytes
, int32_t length
);
65 MBCSSingleAddFromUnicode(MBCSData
*mbcsData
,
66 const uint8_t *bytes
, int32_t length
,
71 MBCSAddFromUnicode(MBCSData
*mbcsData
,
72 const uint8_t *bytes
, int32_t length
,
77 MBCSPostprocess(MBCSData
*mbcsData
, const UConverterStaticData
*staticData
);
80 MBCSAddTable(NewConverter
*cnvData
, UCMTable
*table
, UConverterStaticData
*staticData
);
83 MBCSWrite(NewConverter
*cnvData
, const UConverterStaticData
*staticData
,
84 UNewDataMemory
*pData
, int32_t tableType
);
86 /* helper ------------------------------------------------------------------- */
/*
 * Return the lowercase hexadecimal digit character for a nibble value.
 * Values 0..9 map to '0'..'9' and 10..15 map to 'a'..'f'; callers are
 * expected to pass only 0..15 (printBytes() masks/shifts before calling).
 */
static char
hexDigit(uint8_t digit) {
    return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
}
93 static U_INLINE
char *
94 printBytes(char *buffer
, const uint8_t *bytes
, int32_t length
) {
97 *s
++=hexDigit((uint8_t)(*bytes
>>4));
98 *s
++=hexDigit((uint8_t)(*bytes
&0xf));
107 /* implementation ----------------------------------------------------------- */
110 MBCSInit(MBCSData
*mbcsData
, UCMFile
*ucm
) {
111 int32_t i
, maxCharLength
;
113 uprv_memset(mbcsData
, 0, sizeof(MBCSData
));
115 maxCharLength
=ucm
->states
.maxCharLength
;
117 mbcsData
->ucm
=ucm
; /* aliased, not owned */
119 mbcsData
->newConverter
.close
=MBCSClose
;
120 mbcsData
->newConverter
.isValid
=MBCSIsValid
;
121 mbcsData
->newConverter
.addTable
=MBCSAddTable
;
122 mbcsData
->newConverter
.write
=MBCSWrite
;
124 mbcsData
->stage2Top
=MBCS_STAGE_2_FIRST_ASSIGNED
; /* after stage 1 and one all-unassigned stage 2 block */
125 mbcsData
->stage3Top
=16*maxCharLength
; /* after one all-unassigned stage 3 block */
127 /* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
128 for(i
=0; i
<MBCS_STAGE_1_SIZE
; ++i
) {
129 mbcsData
->stage1
[i
]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
;
134 MBCSOpen(UCMFile
*ucm
) {
135 MBCSData
*mbcsData
=(MBCSData
*)uprv_malloc(sizeof(MBCSData
));
137 MBCSInit(mbcsData
, ucm
);
139 return &mbcsData
->newConverter
;
143 MBCSClose(NewConverter
*cnvData
) {
144 MBCSData
*mbcsData
=(MBCSData
*)cnvData
;
146 uprv_free(mbcsData
->unicodeCodeUnits
);
147 uprv_free(mbcsData
->fromUBytes
);
153 MBCSStartMappings(MBCSData
*mbcsData
) {
156 /* allocate the code unit array and prefill it with "unassigned" values */
157 sum
=mbcsData
->ucm
->states
.countToUCodeUnits
;
159 printf("the total number of offsets is 0x%lx=%ld\n", (long)sum
, (long)sum
);
163 mbcsData
->unicodeCodeUnits
=(uint16_t *)uprv_malloc(sum
*sizeof(uint16_t));
164 if(mbcsData
->unicodeCodeUnits
==NULL
) {
165 fprintf(stderr
, "error: out of memory allocating %ld 16-bit code units\n",
169 for(i
=0; i
<sum
; ++i
) {
170 mbcsData
->unicodeCodeUnits
[i
]=0xfffe;
174 /* allocate the codepage mappings and preset the first 16 characters to 0 */
175 if(mbcsData
->ucm
->states
.maxCharLength
==1) {
176 /* allocate 64k 16-bit results for single-byte codepages */
179 /* allocate 1M * maxCharLength bytes for at most 1M mappings */
180 sum
=0x100000*mbcsData
->ucm
->states
.maxCharLength
;
182 mbcsData
->fromUBytes
=(uint8_t *)uprv_malloc(sum
);
183 if(mbcsData
->fromUBytes
==NULL
) {
184 fprintf(stderr
, "error: out of memory allocating %ld B for target mappings\n", (long)sum
);
187 /* initialize the all-unassigned first stage 3 block */
188 uprv_memset(mbcsData
->fromUBytes
, 0, 64);
193 /* return TRUE for success */
195 setFallback(MBCSData
*mbcsData
, uint32_t offset
, UChar32 c
) {
196 int32_t i
=ucm_findFallback(mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
, offset
);
198 /* if there is already a fallback for this offset, then overwrite it */
199 mbcsData
->toUFallbacks
[i
].codePoint
=c
;
202 /* if there is no fallback for this offset, then add one */
203 i
=mbcsData
->countToUFallbacks
;
204 if(i
>=MBCS_MAX_FALLBACK_COUNT
) {
205 fprintf(stderr
, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c
);
208 mbcsData
->toUFallbacks
[i
].offset
=offset
;
209 mbcsData
->toUFallbacks
[i
].codePoint
=c
;
210 mbcsData
->countToUFallbacks
=i
+1;
216 /* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
218 removeFallback(MBCSData
*mbcsData
, uint32_t offset
) {
219 int32_t i
=ucm_findFallback(mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
, offset
);
221 _MBCSToUFallback
*toUFallbacks
;
224 toUFallbacks
=mbcsData
->toUFallbacks
;
225 limit
=mbcsData
->countToUFallbacks
;
226 old
=(int32_t)toUFallbacks
[i
].codePoint
;
228 /* copy the last fallback entry here to keep the list contiguous */
229 toUFallbacks
[i
].offset
=toUFallbacks
[limit
-1].offset
;
230 toUFallbacks
[i
].codePoint
=toUFallbacks
[limit
-1].codePoint
;
231 mbcsData
->countToUFallbacks
=limit
-1;
239 * isFallback is almost a boolean:
240 * 1 (TRUE) this is a fallback mapping
241 * 0 (FALSE) this is a precise mapping
242 * -1 the precision of this mapping is not specified
245 MBCSAddToUnicode(MBCSData
*mbcsData
,
246 const uint8_t *bytes
, int32_t length
,
251 int32_t i
=0, entry
, old
;
254 if(mbcsData
->ucm
->states
.countStates
==0) {
255 fprintf(stderr
, "error: there is no state information!\n");
259 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
260 if(length
==2 && mbcsData
->ucm
->states
.outputType
==MBCS_OUTPUT_2_SISO
) {
265 * Walk down the state table like in conversion,
266 * much like getNextUChar().
267 * We assume that c<=0x10ffff.
270 entry
=mbcsData
->ucm
->states
.stateTable
[state
][bytes
[i
++]];
271 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
273 fprintf(stderr
, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
274 (short)state
, printBytes(buffer
, bytes
, length
), (int)c
);
277 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
278 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
281 fprintf(stderr
, "error: byte sequence too long by %d bytes, final state %hu: 0x%s (U+%x)\n",
282 (int)(length
-i
), state
, printBytes(buffer
, bytes
, length
), (int)c
);
285 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
286 case MBCS_STATE_ILLEGAL
:
287 fprintf(stderr
, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
288 (int)c
, printBytes(buffer
, bytes
, length
));
290 case MBCS_STATE_CHANGE_ONLY
:
291 fprintf(stderr
, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
292 (int)c
, printBytes(buffer
, bytes
, length
));
294 case MBCS_STATE_UNASSIGNED
:
295 fprintf(stderr
, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
296 (int)c
, printBytes(buffer
, bytes
, length
));
298 case MBCS_STATE_FALLBACK_DIRECT_16
:
299 case MBCS_STATE_VALID_DIRECT_16
:
300 case MBCS_STATE_FALLBACK_DIRECT_20
:
301 case MBCS_STATE_VALID_DIRECT_20
:
302 if(MBCS_ENTRY_SET_STATE(entry
, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, 0xfffe)) {
303 /* the "direct" action's value is not "valid-direct-16-unassigned" any more */
304 if(MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_DIRECT_16
|| MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_FALLBACK_DIRECT_16
) {
305 old
=MBCS_ENTRY_FINAL_VALUE(entry
);
307 old
=0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
310 fprintf(stderr
, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
311 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
314 fprintf(stderr
, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
315 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
318 * Continue after the above warning
319 * if the precision of the mapping is unspecified.
322 /* reassign the correct action code */
323 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, (MBCS_STATE_VALID_DIRECT_16
+(flag
==3 ? 2 : 0)+(c
>=0x10000 ? 1 : 0)));
325 /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
327 entry
=MBCS_ENTRY_FINAL_SET_VALUE(entry
, c
);
329 entry
=MBCS_ENTRY_FINAL_SET_VALUE(entry
, c
-0x10000);
331 mbcsData
->ucm
->states
.stateTable
[state
][bytes
[i
-1]]=entry
;
333 case MBCS_STATE_VALID_16
:
334 /* bits 26..16 are not used, 0 */
335 /* bits 15..7 contain the final offset delta to one 16-bit code unit */
336 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
337 /* check that this byte sequence is still unassigned */
338 if((old
=mbcsData
->unicodeCodeUnits
[offset
])!=0xfffe || (old
=removeFallback(mbcsData
, offset
))!=-1) {
340 fprintf(stderr
, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
341 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
344 fprintf(stderr
, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
345 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
349 fprintf(stderr
, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
350 (int)c
, printBytes(buffer
, bytes
, length
));
354 /* assign only if there is no precise mapping */
355 if(mbcsData
->unicodeCodeUnits
[offset
]==0xfffe) {
356 return setFallback(mbcsData
, offset
, c
);
359 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
362 case MBCS_STATE_VALID_16_PAIR
:
363 /* bits 26..16 are not used, 0 */
364 /* bits 15..7 contain the final offset delta to two 16-bit code units */
365 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
366 /* check that this byte sequence is still unassigned */
367 old
=mbcsData
->unicodeCodeUnits
[offset
];
372 } else if(old
<=0xdfff) {
373 real
=0x10000+((old
&0x3ff)<<10)+((mbcsData
->unicodeCodeUnits
[offset
+1])&0x3ff);
374 } else /* old<=0xe001 */ {
375 real
=mbcsData
->unicodeCodeUnits
[offset
+1];
378 fprintf(stderr
, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
379 (int)c
, printBytes(buffer
, bytes
, length
), (int)real
);
382 fprintf(stderr
, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
383 (int)c
, printBytes(buffer
, bytes
, length
), (int)real
);
387 /* assign only if there is no precise mapping */
388 if(old
<=0xdbff || old
==0xe000) {
390 } else if(c
<=0xffff) {
391 /* set a BMP fallback code point as a pair with 0xe001 */
392 mbcsData
->unicodeCodeUnits
[offset
++]=0xe001;
393 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
395 /* set a fallback surrogate pair with two second surrogates */
396 mbcsData
->unicodeCodeUnits
[offset
++]=(uint16_t)(0xdbc0+(c
>>10));
397 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)(0xdc00+(c
&0x3ff));
401 /* set a BMP code point */
402 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
403 } else if(c
<=0xffff) {
404 /* set a BMP code point above 0xd800 as a pair with 0xe000 */
405 mbcsData
->unicodeCodeUnits
[offset
++]=0xe000;
406 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
408 /* set a surrogate pair */
409 mbcsData
->unicodeCodeUnits
[offset
++]=(uint16_t)(0xd7c0+(c
>>10));
410 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)(0xdc00+(c
&0x3ff));
415 /* reserved, must never occur */
416 fprintf(stderr
, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
417 (int)entry
, printBytes(buffer
, bytes
, length
), (int)c
);
426 /* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
428 MBCSIsValid(NewConverter
*cnvData
,
429 const uint8_t *bytes
, int32_t length
) {
430 MBCSData
*mbcsData
=(MBCSData
*)cnvData
;
432 return (UBool
)(1==ucm_countChars(&mbcsData
->ucm
->states
, bytes
, length
));
436 MBCSSingleAddFromUnicode(MBCSData
*mbcsData
,
437 const uint8_t *bytes
, int32_t length
,
445 /* ignore |2 SUB mappings */
451 * Walk down the triple-stage compact array ("trie") and
452 * allocate parts as necessary.
453 * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
454 * We assume that length<=maxCharLength and that c<=0x10ffff.
458 /* inspect stage 1 */
460 if(mbcsData
->stage1
[index
]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
) {
461 /* allocate another block in stage 2 */
462 if(mbcsData
->stage2Top
>=MBCS_MAX_STAGE_2_TOP
) {
463 fprintf(stderr
, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c
, b
);
468 * each stage 2 block contains 64 16-bit words:
469 * 6 code point bits 9..4 with 1 stage 3 index
471 mbcsData
->stage1
[index
]=(uint16_t)mbcsData
->stage2Top
;
472 mbcsData
->stage2Top
+=MBCS_STAGE_2_BLOCK_SIZE
;
475 /* inspect stage 2 */
476 index
=(uint32_t)mbcsData
->stage1
[index
]+((c
>>4)&0x3f);
477 if(mbcsData
->stage2Single
[index
]==0) {
478 /* allocate another block in stage 3 */
479 if(mbcsData
->stage3Top
>=0x10000) {
480 fprintf(stderr
, "error: too many code points at U+%04x<->0x%02x\n", (int)c
, b
);
483 /* each block has 16 uint16_t entries */
484 mbcsData
->stage2Single
[index
]=(uint16_t)mbcsData
->stage3Top
;
485 uprv_memset(mbcsData
->fromUBytes
+2*mbcsData
->stage3Top
, 0, 32);
486 mbcsData
->stage3Top
+=16;
489 /* write the codepage entry into stage 3 and get the previous entry */
490 p
=(uint16_t *)mbcsData
->fromUBytes
+mbcsData
->stage2Single
[index
]+(c
&0xf);
493 *p
=(uint16_t)(0xf00|b
);
494 } else if(IS_PRIVATE_USE(c
)) {
495 *p
=(uint16_t)(0xc00|b
);
497 *p
=(uint16_t)(0x800|b
);
500 /* check that this Unicode code point was still unassigned */
503 fprintf(stderr
, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
504 (int)c
, b
, old
&0xff);
507 fprintf(stderr
, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
508 (int)c
, b
, old
&0xff);
510 /* continue after the above warning if the precision of the mapping is unspecified */
517 MBCSAddFromUnicode(MBCSData
*mbcsData
,
518 const uint8_t *bytes
, int32_t length
,
524 uint32_t index
, b
, old
;
525 int32_t maxCharLength
;
527 /* ignore |2 SUB mappings */
532 maxCharLength
=mbcsData
->ucm
->states
.maxCharLength
;
534 if(maxCharLength
==1) {
535 return MBCSSingleAddFromUnicode(mbcsData
, bytes
, length
, c
, flag
);
538 if( mbcsData
->ucm
->states
.outputType
==MBCS_OUTPUT_2_SISO
&&
539 (*bytes
==0xe || *bytes
==0xf)
541 fprintf(stderr
, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
542 (int)c
, printBytes(buffer
, bytes
, length
));
546 if(flag
==1 && length
==1 && *bytes
==0) {
547 fprintf(stderr
, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
553 * Walk down the triple-stage compact array ("trie") and
554 * allocate parts as necessary.
555 * Note that the first stage 2 and 3 blocks are reserved for
556 * all-unassigned mappings.
557 * We assume that length<=maxCharLength and that c<=0x10ffff.
560 /* inspect stage 1 */
562 if(mbcsData
->stage1
[index
]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
) {
563 /* allocate another block in stage 2 */
564 if(mbcsData
->stage2Top
>=MBCS_MAX_STAGE_2_TOP
) {
565 fprintf(stderr
, "error: too many stage 2 entries at U+%04x<->0x%s\n",
566 (int)c
, printBytes(buffer
, bytes
, length
));
571 * each stage 2 block contains 64 32-bit words:
572 * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
574 mbcsData
->stage1
[index
]=(uint16_t)mbcsData
->stage2Top
;
575 mbcsData
->stage2Top
+=MBCS_STAGE_2_BLOCK_SIZE
;
578 /* inspect stage 2 */
579 index
=mbcsData
->stage1
[index
]+((c
>>4)&0x3f);
580 if(mbcsData
->stage2
[index
]==0) {
581 /* allocate another block in stage 3 */
582 if(mbcsData
->stage3Top
>=0x100000*(uint32_t)maxCharLength
) {
583 fprintf(stderr
, "error: too many code points at U+%04x<->0x%s\n",
584 (int)c
, printBytes(buffer
, bytes
, length
));
587 /* each block has 16*maxCharLength bytes */
588 mbcsData
->stage2
[index
]=(mbcsData
->stage3Top
/16)/maxCharLength
;
589 uprv_memset(mbcsData
->fromUBytes
+mbcsData
->stage3Top
, 0, 16*maxCharLength
);
590 mbcsData
->stage3Top
+=16*maxCharLength
;
593 /* write the codepage bytes into stage 3 and get the previous bytes */
595 /* assemble the bytes into a single integer */
612 p
=mbcsData
->fromUBytes
+(16*(uint32_t)(uint16_t)mbcsData
->stage2
[index
]+(c
&0xf))*maxCharLength
;
613 switch(maxCharLength
) {
616 *(uint16_t *)p
=(uint16_t)b
;
619 old
=(uint32_t)*p
<<16;
620 *p
++=(uint8_t)(b
>>16);
621 old
|=(uint32_t)*p
<<8;
622 *p
++=(uint8_t)(b
>>8);
631 /* will never occur */
635 /* check that this Unicode code point was still unassigned */
636 if((mbcsData
->stage2
[index
]&(1UL<<(16+(c
&0xf))))!=0 || old
!=0) {
638 fprintf(stderr
, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
639 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
642 fprintf(stderr
, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
643 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
645 /* continue after the above warning if the precision of the mapping is
649 /* set the roundtrip flag */
650 mbcsData
->stage2
[index
]|=(1UL<<(16+(c
&0xf)));
656 /* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
658 MBCSAddTable(NewConverter
*cnvData
, UCMTable
*table
, UConverterStaticData
*staticData
) {
665 staticData
->unicodeMask
=table
->unicodeMask
;
666 if(staticData
->unicodeMask
==3) {
667 fprintf(stderr
, "error: contains mappings for both supplementary and surrogate code points\n");
671 staticData
->conversionType
=UCNV_MBCS
;
673 mbcsData
=(MBCSData
*)cnvData
;
675 if(!MBCSStartMappings(mbcsData
)) {
682 for(i
=0; i
<table
->mappingsLength
; ++m
, ++i
) {
687 /* there was no precision/fallback indicator */
688 /* fall through to set the mappings */
690 /* set roundtrip mappings */
691 isOK
&=MBCSAddToUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
) &&
692 MBCSAddFromUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
);
695 /* set only a fallback mapping from Unicode to codepage */
696 staticData
->hasFromUnicodeFallback
=TRUE
;
697 isOK
&=MBCSAddFromUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
);
700 /* ignore |2 SUB mappings */
703 /* set only a fallback mapping from codepage to Unicode */
704 staticData
->hasToUnicodeFallback
=TRUE
;
705 isOK
&=MBCSAddToUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
);
708 /* will not occur because the parser checked it already */
709 fprintf(stderr
, "error: illegal fallback indicator %d\n", m
->f
);
714 MBCSPostprocess(mbcsData
, staticData
);
720 transformEUC(MBCSData
*mbcsData
) {
722 uint32_t i
, value
, oldLength
, old3Top
, new3Top
;
725 oldLength
=mbcsData
->ucm
->states
.maxCharLength
;
730 old3Top
=mbcsData
->stage3Top
;
732 /* careful: 2-byte and 4-byte codes are stored in platform endianness! */
734 /* test if all first bytes are in {0, 0x8e, 0x8f} */
735 p8
=mbcsData
->fromUBytes
;
743 for(i
=0; i
<old3Top
; i
+=oldLength
) {
745 if(b
!=0 && b
!=0x8e && b
!=0x8f) {
746 /* some first byte does not fit the EUC pattern, nothing to be done */
750 /* restore p if it was modified above */
751 p8
=mbcsData
->fromUBytes
;
753 /* modify outputType and adjust stage3Top */
754 mbcsData
->ucm
->states
.outputType
=(int8_t)(MBCS_OUTPUT_3_EUC
+oldLength
-3);
755 mbcsData
->stage3Top
=new3Top
=(old3Top
*(oldLength
-1))/oldLength
;
758 * EUC-encode all byte sequences;
759 * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
760 * p. 161 in chapter 4 "Encoding Methods"
762 * This also must reverse the byte order if the platform is little-endian!
765 uint16_t *q
=(uint16_t *)p8
;
766 for(i
=0; i
<old3Top
; i
+=oldLength
) {
769 /* short sequences are stored directly */
770 /* code set 0 or 1 */
771 (*q
++)=(uint16_t)((p8
[1]<<8)|p8
[2]);
774 (*q
++)=(uint16_t)(((p8
[1]&0x7f)<<8)|p8
[2]);
775 } else /* b==0x8f */ {
777 (*q
++)=(uint16_t)((p8
[1]<<8)|(p8
[2]&0x7f));
781 } else /* oldLength==4 */ {
783 uint32_t *p32
=(uint32_t *)p8
;
784 for(i
=0; i
<old3Top
; i
+=4) {
786 if(value
<=0xffffff) {
787 /* short sequences are stored directly */
788 /* code set 0 or 1 */
789 (*q
++)=(uint8_t)(value
>>16);
790 (*q
++)=(uint8_t)(value
>>8);
791 (*q
++)=(uint8_t)value
;
792 } else if(value
<=0x8effffff) {
794 (*q
++)=(uint8_t)((value
>>16)&0x7f);
795 (*q
++)=(uint8_t)(value
>>8);
796 (*q
++)=(uint8_t)value
;
797 } else /* first byte is 0x8f */ {
799 (*q
++)=(uint8_t)(value
>>16);
800 (*q
++)=(uint8_t)((value
>>8)&0x7f);
801 (*q
++)=(uint8_t)value
;
810 * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
811 * as possible. Overlapping is done on unassigned head and tail
812 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
813 * Stage 1 indexes need to be adjusted accordingly.
814 * This function is very similar to genprops/store.c/compactStage().
817 singleCompactStage2(MBCSData
*mbcsData
) {
818 /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
819 uint16_t map
[MBCS_STAGE_2_MAX_BLOCKS
];
820 uint16_t i
, start
, prevEnd
, newStart
;
822 /* enter the all-unassigned first stage 2 block into the map */
823 map
[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
;
825 /* begin with the first block after the all-unassigned one */
826 start
=newStart
=MBCS_STAGE_2_FIRST_ASSIGNED
;
827 while(start
<mbcsData
->stage2Top
) {
828 prevEnd
=(uint16_t)(newStart
-1);
830 /* find the size of the overlap */
831 for(i
=0; i
<MBCS_STAGE_2_BLOCK_SIZE
&& mbcsData
->stage2Single
[start
+i
]==0 && mbcsData
->stage2Single
[prevEnd
-i
]==0; ++i
) {}
834 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=(uint16_t)(newStart
-i
);
836 /* move the non-overlapping indexes to their new positions */
838 for(i
=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE
-i
); i
>0; --i
) {
839 mbcsData
->stage2Single
[newStart
++]=mbcsData
->stage2Single
[start
++];
841 } else if(newStart
<start
) {
842 /* move the indexes to their new positions */
843 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=newStart
;
844 for(i
=MBCS_STAGE_2_BLOCK_SIZE
; i
>0; --i
) {
845 mbcsData
->stage2Single
[newStart
++]=mbcsData
->stage2Single
[start
++];
847 } else /* no overlap && newStart==start */ {
848 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=start
;
849 start
=newStart
+=MBCS_STAGE_2_BLOCK_SIZE
;
853 /* adjust stage2Top */
854 if(VERBOSE
&& newStart
<mbcsData
->stage2Top
) {
855 printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
856 (unsigned long)mbcsData
->stage2Top
, (unsigned long)newStart
,
857 (long)(mbcsData
->stage2Top
-newStart
)*2);
859 mbcsData
->stage2Top
=newStart
;
861 /* now adjust stage 1 */
862 for(i
=0; i
<MBCS_STAGE_1_SIZE
; ++i
) {
863 mbcsData
->stage1
[i
]=map
[mbcsData
->stage1
[i
]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
];
867 /* Compact stage 3 for SBCS - same algorithm as above. */
869 singleCompactStage3(MBCSData
*mbcsData
) {
870 uint16_t *stage3
=(uint16_t *)mbcsData
->fromUBytes
;
872 /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
873 uint16_t map
[0x1000];
874 uint16_t i
, start
, prevEnd
, newStart
;
876 /* enter the all-unassigned first stage 3 block into the map */
879 /* begin with the first block after the all-unassigned one */
881 while(start
<mbcsData
->stage3Top
) {
882 prevEnd
=(uint16_t)(newStart
-1);
884 /* find the size of the overlap */
885 for(i
=0; i
<16 && stage3
[start
+i
]==0 && stage3
[prevEnd
-i
]==0; ++i
) {}
888 map
[start
>>4]=(uint16_t)(newStart
-i
);
890 /* move the non-overlapping indexes to their new positions */
892 for(i
=(uint16_t)(16-i
); i
>0; --i
) {
893 stage3
[newStart
++]=stage3
[start
++];
895 } else if(newStart
<start
) {
896 /* move the indexes to their new positions */
897 map
[start
>>4]=newStart
;
898 for(i
=16; i
>0; --i
) {
899 stage3
[newStart
++]=stage3
[start
++];
901 } else /* no overlap && newStart==start */ {
907 /* adjust stage3Top */
908 if(VERBOSE
&& newStart
<mbcsData
->stage3Top
) {
909 printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
910 (unsigned long)mbcsData
->stage3Top
, (unsigned long)newStart
,
911 (long)(mbcsData
->stage3Top
-newStart
)*2);
913 mbcsData
->stage3Top
=newStart
;
915 /* now adjust stage 2 */
916 for(i
=0; i
<mbcsData
->stage2Top
; ++i
) {
917 mbcsData
->stage2Single
[i
]=map
[mbcsData
->stage2Single
[i
]>>4];
922 * Compact stage 2 by overlapping adjacent stage 2 blocks as far
923 * as possible. Overlapping is done on unassigned head and tail
924 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
925 * Stage 1 indexes need to be adjusted accordingly.
926 * This function is very similar to genprops/store.c/compactStage().
929 compactStage2(MBCSData
*mbcsData
) {
930 /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
931 uint16_t map
[MBCS_STAGE_2_MAX_BLOCKS
];
932 uint16_t i
, start
, prevEnd
, newStart
;
934 /* enter the all-unassigned first stage 2 block into the map */
935 map
[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
;
937 /* begin with the first block after the all-unassigned one */
938 start
=newStart
=MBCS_STAGE_2_FIRST_ASSIGNED
;
939 while(start
<mbcsData
->stage2Top
) {
940 prevEnd
=(uint16_t)(newStart
-1);
942 /* find the size of the overlap */
943 for(i
=0; i
<MBCS_STAGE_2_BLOCK_SIZE
&& mbcsData
->stage2
[start
+i
]==0 && mbcsData
->stage2
[prevEnd
-i
]==0; ++i
) {}
946 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=(uint16_t)(newStart
-i
);
948 /* move the non-overlapping indexes to their new positions */
950 for(i
=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE
-i
); i
>0; --i
) {
951 mbcsData
->stage2
[newStart
++]=mbcsData
->stage2
[start
++];
953 } else if(newStart
<start
) {
954 /* move the indexes to their new positions */
955 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=newStart
;
956 for(i
=MBCS_STAGE_2_BLOCK_SIZE
; i
>0; --i
) {
957 mbcsData
->stage2
[newStart
++]=mbcsData
->stage2
[start
++];
959 } else /* no overlap && newStart==start */ {
960 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=start
;
961 start
=newStart
+=MBCS_STAGE_2_BLOCK_SIZE
;
965 /* adjust stage2Top */
966 if(VERBOSE
&& newStart
<mbcsData
->stage2Top
) {
967 printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
968 (unsigned long)mbcsData
->stage2Top
, (unsigned long)newStart
,
969 (long)(mbcsData
->stage2Top
-newStart
)*4);
971 mbcsData
->stage2Top
=newStart
;
973 /* now adjust stage 1 */
974 for(i
=0; i
<MBCS_STAGE_1_SIZE
; ++i
) {
975 mbcsData
->stage1
[i
]=map
[mbcsData
->stage1
[i
]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
];
980 MBCSPostprocess(MBCSData
*mbcsData
, const UConverterStaticData
*staticData
) {
982 int32_t maxCharLength
;
984 states
=&mbcsData
->ucm
->states
;
985 maxCharLength
=states
->maxCharLength
;
987 /* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
989 printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
990 (unsigned long)mbcsData
->stage3Top
/maxCharLength
,
991 (unsigned long)mbcsData
->stage3Top
/maxCharLength
);
994 ucm_optimizeStates(states
,
995 &mbcsData
->unicodeCodeUnits
,
996 mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
,
999 /* try to compact the fromUnicode tables */
1000 transformEUC(mbcsData
);
1001 if(maxCharLength
==1) {
1002 singleCompactStage3(mbcsData
);
1003 singleCompactStage2(mbcsData
);
1005 compactStage2(mbcsData
);
1010 MBCSWrite(NewConverter
*cnvData
, const UConverterStaticData
*staticData
,
1011 UNewDataMemory
*pData
, int32_t tableType
) {
1012 MBCSData
*mbcsData
=(MBCSData
*)cnvData
;
1014 int32_t i
, stage1Top
;
1016 _MBCSHeader header
={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
1018 /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
1019 if(mbcsData
->ucm
->states
.maxCharLength
==1) {
1020 if(staticData
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
1021 stage1Top
=MBCS_STAGE_1_SIZE
; /* 0x440==1088 */
1023 stage1Top
=0x40; /* 0x40==64 */
1025 for(i
=0; i
<stage1Top
; ++i
) {
1026 mbcsData
->stage1
[i
]+=(uint16_t)stage1Top
;
1029 /* stage2Top has counted 16-bit results, now we need to count bytes */
1030 mbcsData
->stage2Top
*=2;
1032 /* stage3Top has counted 16-bit results, now we need to count bytes */
1033 mbcsData
->stage3Top
*=2;
1035 if(staticData
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
1036 stage1Top
=MBCS_STAGE_1_SIZE
; /* 0x440==1088 */
1038 stage1Top
=0x40; /* 0x40==64 */
1040 for(i
=0; i
<stage1Top
; ++i
) {
1041 mbcsData
->stage1
[i
]+=(uint16_t)stage1Top
/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
1044 /* stage2Top has counted 32-bit results, now we need to count bytes */
1045 mbcsData
->stage2Top
*=4;
1047 /* stage3Top has already counted bytes */
1050 /* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */
1051 mbcsData
->stage2Top
=(mbcsData
->stage2Top
+3)&~3;
1052 mbcsData
->stage3Top
=(mbcsData
->stage3Top
+3)&~3;
1054 /* fill the header */
1055 header
.version
[0]=4;
1056 header
.version
[1]=2;
1057 header
.countStates
=mbcsData
->ucm
->states
.countStates
;
1058 header
.countToUFallbacks
=mbcsData
->countToUFallbacks
;
1060 header
.offsetToUCodeUnits
=
1061 sizeof(_MBCSHeader
)+
1062 mbcsData
->ucm
->states
.countStates
*1024+
1063 mbcsData
->countToUFallbacks
*sizeof(_MBCSToUFallback
);
1064 header
.offsetFromUTable
=
1065 header
.offsetToUCodeUnits
+
1066 mbcsData
->ucm
->states
.countToUCodeUnits
*2;
1067 header
.offsetFromUBytes
=
1068 header
.offsetFromUTable
+
1070 mbcsData
->stage2Top
;
1071 header
.fromUBytesLength
=mbcsData
->stage3Top
;
1073 top
=header
.offsetFromUBytes
+header
.fromUBytesLength
;
1075 header
.flags
=(uint8_t)(mbcsData
->ucm
->states
.outputType
);
1077 if(tableType
&TABLE_EXT
) {
1079 fprintf(stderr
, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top
);
1083 header
.flags
|=top
<<8;
1086 /* write the MBCS data */
1087 udata_writeBlock(pData
, &header
, sizeof(_MBCSHeader
));
1088 udata_writeBlock(pData
, mbcsData
->ucm
->states
.stateTable
, header
.countStates
*1024);
1089 udata_writeBlock(pData
, mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
*sizeof(_MBCSToUFallback
));
1090 udata_writeBlock(pData
, mbcsData
->unicodeCodeUnits
, mbcsData
->ucm
->states
.countToUCodeUnits
*2);
1091 udata_writeBlock(pData
, mbcsData
->stage1
, stage1Top
*2);
1092 if(mbcsData
->ucm
->states
.maxCharLength
==1) {
1093 udata_writeBlock(pData
, mbcsData
->stage2Single
, mbcsData
->stage2Top
);
1095 udata_writeBlock(pData
, mbcsData
->stage2
, mbcsData
->stage2Top
);
1097 udata_writeBlock(pData
, mbcsData
->fromUBytes
, mbcsData
->stage3Top
);
1099 /* return the number of bytes that should have been written */
1100 return header
.offsetFromUBytes
+header
.fromUBytesLength
;