2 *******************************************************************************
4 * Copyright (C) 2000-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2000jul06
14 * created by: Markus W. Scherer
18 #include "unicode/utypes.h"
28 typedef struct MBCSData
{
29 NewConverter newConverter
;
33 /* toUnicode (state table in ucm->states) */
34 _MBCSToUFallback toUFallbacks
[MBCS_MAX_FALLBACK_COUNT
];
35 int32_t countToUFallbacks
;
36 uint16_t *unicodeCodeUnits
;
39 uint16_t stage1
[MBCS_STAGE_1_SIZE
];
40 uint16_t stage2Single
[MBCS_STAGE_2_SIZE
]; /* stage 2 for single-byte codepages */
41 uint32_t stage2
[MBCS_STAGE_2_SIZE
]; /* stage 2 for MBCS */
43 uint32_t stage2Top
, stage3Top
;
48 MBCSClose(NewConverter
*cnvData
);
51 MBCSStartMappings(MBCSData
*mbcsData
);
54 MBCSAddToUnicode(MBCSData
*mbcsData
,
55 const uint8_t *bytes
, int32_t length
,
60 MBCSIsValid(NewConverter
*cnvData
,
61 const uint8_t *bytes
, int32_t length
);
64 MBCSSingleAddFromUnicode(MBCSData
*mbcsData
,
65 const uint8_t *bytes
, int32_t length
,
70 MBCSAddFromUnicode(MBCSData
*mbcsData
,
71 const uint8_t *bytes
, int32_t length
,
76 MBCSPostprocess(MBCSData
*mbcsData
, const UConverterStaticData
*staticData
);
79 MBCSAddTable(NewConverter
*cnvData
, UCMTable
*table
, UConverterStaticData
*staticData
);
82 MBCSWrite(NewConverter
*cnvData
, const UConverterStaticData
*staticData
,
83 UNewDataMemory
*pData
, int32_t tableType
);
85 /* helper ------------------------------------------------------------------- */
88 hexDigit(uint8_t digit
) {
89 return digit
<=9 ? (char)('0'+digit
) : (char)('a'-10+digit
);
92 static U_INLINE
char *
93 printBytes(char *buffer
, const uint8_t *bytes
, int32_t length
) {
96 *s
++=hexDigit((uint8_t)(*bytes
>>4));
97 *s
++=hexDigit((uint8_t)(*bytes
&0xf));
106 /* implementation ----------------------------------------------------------- */
109 MBCSInit(MBCSData
*mbcsData
, UCMFile
*ucm
) {
110 int32_t i
, maxCharLength
;
112 uprv_memset(mbcsData
, 0, sizeof(MBCSData
));
114 maxCharLength
=ucm
->states
.maxCharLength
;
116 mbcsData
->ucm
=ucm
; /* aliased, not owned */
118 mbcsData
->newConverter
.close
=MBCSClose
;
119 mbcsData
->newConverter
.isValid
=MBCSIsValid
;
120 mbcsData
->newConverter
.addTable
=MBCSAddTable
;
121 mbcsData
->newConverter
.write
=MBCSWrite
;
123 mbcsData
->stage2Top
=MBCS_STAGE_2_FIRST_ASSIGNED
; /* after stage 1 and one all-unassigned stage 2 block */
124 mbcsData
->stage3Top
=16*maxCharLength
; /* after one all-unassigned stage 3 block */
126 /* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
127 for(i
=0; i
<MBCS_STAGE_1_SIZE
; ++i
) {
128 mbcsData
->stage1
[i
]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
;
133 MBCSOpen(UCMFile
*ucm
) {
134 MBCSData
*mbcsData
=(MBCSData
*)uprv_malloc(sizeof(MBCSData
));
136 MBCSInit(mbcsData
, ucm
);
138 return &mbcsData
->newConverter
;
142 MBCSClose(NewConverter
*cnvData
) {
143 MBCSData
*mbcsData
=(MBCSData
*)cnvData
;
145 uprv_free(mbcsData
->unicodeCodeUnits
);
146 uprv_free(mbcsData
->fromUBytes
);
152 MBCSStartMappings(MBCSData
*mbcsData
) {
155 /* allocate the code unit array and prefill it with "unassigned" values */
156 sum
=mbcsData
->ucm
->states
.countToUCodeUnits
;
158 printf("the total number of offsets is 0x%lx=%ld\n", (long)sum
, (long)sum
);
162 mbcsData
->unicodeCodeUnits
=(uint16_t *)uprv_malloc(sum
*sizeof(uint16_t));
163 if(mbcsData
->unicodeCodeUnits
==NULL
) {
164 fprintf(stderr
, "error: out of memory allocating %ld 16-bit code units\n",
168 for(i
=0; i
<sum
; ++i
) {
169 mbcsData
->unicodeCodeUnits
[i
]=0xfffe;
173 /* allocate the codepage mappings and preset the first 16 characters to 0 */
174 if(mbcsData
->ucm
->states
.maxCharLength
==1) {
175 /* allocate 64k 16-bit results for single-byte codepages */
178 /* allocate 1M * maxCharLength bytes for at most 1M mappings */
179 sum
=0x100000*mbcsData
->ucm
->states
.maxCharLength
;
181 mbcsData
->fromUBytes
=(uint8_t *)uprv_malloc(sum
);
182 if(mbcsData
->fromUBytes
==NULL
) {
183 fprintf(stderr
, "error: out of memory allocating %ld B for target mappings\n", (long)sum
);
186 /* initialize the all-unassigned first stage 3 block */
187 uprv_memset(mbcsData
->fromUBytes
, 0, 64);
192 /* return TRUE for success */
194 setFallback(MBCSData
*mbcsData
, uint32_t offset
, UChar32 c
) {
195 int32_t i
=ucm_findFallback(mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
, offset
);
197 /* if there is already a fallback for this offset, then overwrite it */
198 mbcsData
->toUFallbacks
[i
].codePoint
=c
;
201 /* if there is no fallback for this offset, then add one */
202 i
=mbcsData
->countToUFallbacks
;
203 if(i
>=MBCS_MAX_FALLBACK_COUNT
) {
204 fprintf(stderr
, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c
);
207 mbcsData
->toUFallbacks
[i
].offset
=offset
;
208 mbcsData
->toUFallbacks
[i
].codePoint
=c
;
209 mbcsData
->countToUFallbacks
=i
+1;
215 /* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
217 removeFallback(MBCSData
*mbcsData
, uint32_t offset
) {
218 int32_t i
=ucm_findFallback(mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
, offset
);
220 _MBCSToUFallback
*toUFallbacks
;
223 toUFallbacks
=mbcsData
->toUFallbacks
;
224 limit
=mbcsData
->countToUFallbacks
;
225 old
=(int32_t)toUFallbacks
[i
].codePoint
;
227 /* copy the last fallback entry here to keep the list contiguous */
228 toUFallbacks
[i
].offset
=toUFallbacks
[limit
-1].offset
;
229 toUFallbacks
[i
].codePoint
=toUFallbacks
[limit
-1].codePoint
;
230 mbcsData
->countToUFallbacks
=limit
-1;
238 * isFallback is almost a boolean:
239 * 1 (TRUE) this is a fallback mapping
240 * 0 (FALSE) this is a precise mapping
241 * -1 the precision of this mapping is not specified
244 MBCSAddToUnicode(MBCSData
*mbcsData
,
245 const uint8_t *bytes
, int32_t length
,
250 int32_t i
=0, entry
, old
;
253 if(mbcsData
->ucm
->states
.countStates
==0) {
254 fprintf(stderr
, "error: there is no state information!\n");
258 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
259 if(length
==2 && mbcsData
->ucm
->states
.outputType
==MBCS_OUTPUT_2_SISO
) {
264 * Walk down the state table like in conversion,
265 * much like getNextUChar().
266 * We assume that c<=0x10ffff.
269 entry
=mbcsData
->ucm
->states
.stateTable
[state
][bytes
[i
++]];
270 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
272 fprintf(stderr
, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
273 (short)state
, printBytes(buffer
, bytes
, length
), (int)c
);
276 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
277 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
280 fprintf(stderr
, "error: byte sequence too long by %d bytes, final state %hu: 0x%s (U+%x)\n",
281 (int)(length
-i
), state
, printBytes(buffer
, bytes
, length
), (int)c
);
284 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
285 case MBCS_STATE_ILLEGAL
:
286 fprintf(stderr
, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
287 (int)c
, printBytes(buffer
, bytes
, length
));
289 case MBCS_STATE_CHANGE_ONLY
:
290 fprintf(stderr
, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
291 (int)c
, printBytes(buffer
, bytes
, length
));
293 case MBCS_STATE_UNASSIGNED
:
294 fprintf(stderr
, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
295 (int)c
, printBytes(buffer
, bytes
, length
));
297 case MBCS_STATE_FALLBACK_DIRECT_16
:
298 case MBCS_STATE_VALID_DIRECT_16
:
299 case MBCS_STATE_FALLBACK_DIRECT_20
:
300 case MBCS_STATE_VALID_DIRECT_20
:
301 if(MBCS_ENTRY_SET_STATE(entry
, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, 0xfffe)) {
302 /* the "direct" action's value is not "valid-direct-16-unassigned" any more */
303 if(MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_DIRECT_16
|| MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_FALLBACK_DIRECT_16
) {
304 old
=MBCS_ENTRY_FINAL_VALUE(entry
);
306 old
=0x10000+MBCS_ENTRY_FINAL_VALUE(entry
);
309 fprintf(stderr
, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
310 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
313 fprintf(stderr
, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
314 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
317 * Continue after the above warning
318 * if the precision of the mapping is unspecified.
321 /* reassign the correct action code */
322 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, (MBCS_STATE_VALID_DIRECT_16
+(flag
==3 ? 2 : 0)+(c
>=0x10000 ? 1 : 0)));
324 /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
326 entry
=MBCS_ENTRY_FINAL_SET_VALUE(entry
, c
);
328 entry
=MBCS_ENTRY_FINAL_SET_VALUE(entry
, c
-0x10000);
330 mbcsData
->ucm
->states
.stateTable
[state
][bytes
[i
-1]]=entry
;
332 case MBCS_STATE_VALID_16
:
333 /* bits 26..16 are not used, 0 */
334 /* bits 15..7 contain the final offset delta to one 16-bit code unit */
335 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
336 /* check that this byte sequence is still unassigned */
337 if((old
=mbcsData
->unicodeCodeUnits
[offset
])!=0xfffe || (old
=removeFallback(mbcsData
, offset
))!=-1) {
339 fprintf(stderr
, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
340 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
343 fprintf(stderr
, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
344 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
348 fprintf(stderr
, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
349 (int)c
, printBytes(buffer
, bytes
, length
));
353 /* assign only if there is no precise mapping */
354 if(mbcsData
->unicodeCodeUnits
[offset
]==0xfffe) {
355 return setFallback(mbcsData
, offset
, c
);
358 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
361 case MBCS_STATE_VALID_16_PAIR
:
362 /* bits 26..16 are not used, 0 */
363 /* bits 15..7 contain the final offset delta to two 16-bit code units */
364 offset
+=MBCS_ENTRY_FINAL_VALUE_16(entry
);
365 /* check that this byte sequence is still unassigned */
366 old
=mbcsData
->unicodeCodeUnits
[offset
];
371 } else if(old
<=0xdfff) {
372 real
=0x10000+((old
&0x3ff)<<10)+((mbcsData
->unicodeCodeUnits
[offset
+1])&0x3ff);
373 } else /* old<=0xe001 */ {
374 real
=mbcsData
->unicodeCodeUnits
[offset
+1];
377 fprintf(stderr
, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
378 (int)c
, printBytes(buffer
, bytes
, length
), (int)real
);
381 fprintf(stderr
, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
382 (int)c
, printBytes(buffer
, bytes
, length
), (int)real
);
386 /* assign only if there is no precise mapping */
387 if(old
<=0xdbff || old
==0xe000) {
389 } else if(c
<=0xffff) {
390 /* set a BMP fallback code point as a pair with 0xe001 */
391 mbcsData
->unicodeCodeUnits
[offset
++]=0xe001;
392 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
394 /* set a fallback surrogate pair with two second surrogates */
395 mbcsData
->unicodeCodeUnits
[offset
++]=(uint16_t)(0xdbc0+(c
>>10));
396 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)(0xdc00+(c
&0x3ff));
400 /* set a BMP code point */
401 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
402 } else if(c
<=0xffff) {
403 /* set a BMP code point above 0xd800 as a pair with 0xe000 */
404 mbcsData
->unicodeCodeUnits
[offset
++]=0xe000;
405 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)c
;
407 /* set a surrogate pair */
408 mbcsData
->unicodeCodeUnits
[offset
++]=(uint16_t)(0xd7c0+(c
>>10));
409 mbcsData
->unicodeCodeUnits
[offset
]=(uint16_t)(0xdc00+(c
&0x3ff));
414 /* reserved, must never occur */
415 fprintf(stderr
, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
416 (int)entry
, printBytes(buffer
, bytes
, length
), (int)c
);
425 /* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
427 MBCSIsValid(NewConverter
*cnvData
,
428 const uint8_t *bytes
, int32_t length
) {
429 MBCSData
*mbcsData
=(MBCSData
*)cnvData
;
431 return (UBool
)(1==ucm_countChars(&mbcsData
->ucm
->states
, bytes
, length
));
435 MBCSSingleAddFromUnicode(MBCSData
*mbcsData
,
436 const uint8_t *bytes
, int32_t length
,
444 /* ignore |2 SUB mappings */
450 * Walk down the triple-stage compact array ("trie") and
451 * allocate parts as necessary.
452 * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
453 * We assume that length<=maxCharLength and that c<=0x10ffff.
457 /* inspect stage 1 */
459 if(mbcsData
->stage1
[index
]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
) {
460 /* allocate another block in stage 2 */
461 if(mbcsData
->stage2Top
>=MBCS_MAX_STAGE_2_TOP
) {
462 fprintf(stderr
, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c
, b
);
467 * each stage 2 block contains 64 16-bit words:
468 * 6 code point bits 9..4 with 1 stage 3 index
470 mbcsData
->stage1
[index
]=(uint16_t)mbcsData
->stage2Top
;
471 mbcsData
->stage2Top
+=MBCS_STAGE_2_BLOCK_SIZE
;
474 /* inspect stage 2 */
475 index
=(uint32_t)mbcsData
->stage1
[index
]+((c
>>4)&0x3f);
476 if(mbcsData
->stage2Single
[index
]==0) {
477 /* allocate another block in stage 3 */
478 if(mbcsData
->stage3Top
>=0x10000) {
479 fprintf(stderr
, "error: too many code points at U+%04x<->0x%02x\n", (int)c
, b
);
482 /* each block has 16 uint16_t entries */
483 mbcsData
->stage2Single
[index
]=(uint16_t)mbcsData
->stage3Top
;
484 uprv_memset(mbcsData
->fromUBytes
+2*mbcsData
->stage3Top
, 0, 32);
485 mbcsData
->stage3Top
+=16;
488 /* write the codepage entry into stage 3 and get the previous entry */
489 p
=(uint16_t *)mbcsData
->fromUBytes
+mbcsData
->stage2Single
[index
]+(c
&0xf);
492 *p
=(uint16_t)(0xf00|b
);
493 } else if(IS_PRIVATE_USE(c
)) {
494 *p
=(uint16_t)(0xc00|b
);
496 *p
=(uint16_t)(0x800|b
);
499 /* check that this Unicode code point was still unassigned */
502 fprintf(stderr
, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
503 (int)c
, b
, old
&0xff);
506 fprintf(stderr
, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
507 (int)c
, b
, old
&0xff);
509 /* continue after the above warning if the precision of the mapping is unspecified */
516 MBCSAddFromUnicode(MBCSData
*mbcsData
,
517 const uint8_t *bytes
, int32_t length
,
523 uint32_t index
, b
, old
;
524 int32_t maxCharLength
;
526 /* ignore |2 SUB mappings */
531 maxCharLength
=mbcsData
->ucm
->states
.maxCharLength
;
533 if(maxCharLength
==1) {
534 return MBCSSingleAddFromUnicode(mbcsData
, bytes
, length
, c
, flag
);
537 if( mbcsData
->ucm
->states
.outputType
==MBCS_OUTPUT_2_SISO
&&
538 (*bytes
==0xe || *bytes
==0xf)
540 fprintf(stderr
, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
541 (int)c
, printBytes(buffer
, bytes
, length
));
545 if(flag
==1 && length
==1 && *bytes
==0) {
546 fprintf(stderr
, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
552 * Walk down the triple-stage compact array ("trie") and
553 * allocate parts as necessary.
554 * Note that the first stage 2 and 3 blocks are reserved for
555 * all-unassigned mappings.
556 * We assume that length<=maxCharLength and that c<=0x10ffff.
559 /* inspect stage 1 */
561 if(mbcsData
->stage1
[index
]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
) {
562 /* allocate another block in stage 2 */
563 if(mbcsData
->stage2Top
>=MBCS_MAX_STAGE_2_TOP
) {
564 fprintf(stderr
, "error: too many stage 2 entries at U+%04x<->0x%s\n",
565 (int)c
, printBytes(buffer
, bytes
, length
));
570 * each stage 2 block contains 64 32-bit words:
571 * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
573 mbcsData
->stage1
[index
]=(uint16_t)mbcsData
->stage2Top
;
574 mbcsData
->stage2Top
+=MBCS_STAGE_2_BLOCK_SIZE
;
577 /* inspect stage 2 */
578 index
=mbcsData
->stage1
[index
]+((c
>>4)&0x3f);
579 if(mbcsData
->stage2
[index
]==0) {
580 /* allocate another block in stage 3 */
581 if(mbcsData
->stage3Top
>=0x100000*(uint32_t)maxCharLength
) {
582 fprintf(stderr
, "error: too many code points at U+%04x<->0x%s\n",
583 (int)c
, printBytes(buffer
, bytes
, length
));
586 /* each block has 16*maxCharLength bytes */
587 mbcsData
->stage2
[index
]=(mbcsData
->stage3Top
/16)/maxCharLength
;
588 uprv_memset(mbcsData
->fromUBytes
+mbcsData
->stage3Top
, 0, 16*maxCharLength
);
589 mbcsData
->stage3Top
+=16*maxCharLength
;
592 /* write the codepage bytes into stage 3 and get the previous bytes */
594 /* assemble the bytes into a single integer */
611 p
=mbcsData
->fromUBytes
+(16*(uint32_t)(uint16_t)mbcsData
->stage2
[index
]+(c
&0xf))*maxCharLength
;
612 switch(maxCharLength
) {
615 *(uint16_t *)p
=(uint16_t)b
;
618 old
=(uint32_t)*p
<<16;
619 *p
++=(uint8_t)(b
>>16);
620 old
|=(uint32_t)*p
<<8;
621 *p
++=(uint8_t)(b
>>8);
630 /* will never occur */
634 /* check that this Unicode code point was still unassigned */
635 if((mbcsData
->stage2
[index
]&(1UL<<(16+(c
&0xf))))!=0 || old
!=0) {
637 fprintf(stderr
, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
638 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
641 fprintf(stderr
, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
642 (int)c
, printBytes(buffer
, bytes
, length
), (int)old
);
644 /* continue after the above warning if the precision of the mapping is
648 /* set the roundtrip flag */
649 mbcsData
->stage2
[index
]|=(1UL<<(16+(c
&0xf)));
655 /* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
657 MBCSAddTable(NewConverter
*cnvData
, UCMTable
*table
, UConverterStaticData
*staticData
) {
664 staticData
->unicodeMask
=table
->unicodeMask
;
665 if(staticData
->unicodeMask
==3) {
666 fprintf(stderr
, "error: contains mappings for both supplementary and surrogate code points\n");
670 staticData
->conversionType
=UCNV_MBCS
;
672 mbcsData
=(MBCSData
*)cnvData
;
674 if(!MBCSStartMappings(mbcsData
)) {
681 for(i
=0; i
<table
->mappingsLength
; ++m
, ++i
) {
686 /* there was no precision/fallback indicator */
687 /* fall through to set the mappings */
689 /* set roundtrip mappings */
690 isOK
&=MBCSAddToUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
) &&
691 MBCSAddFromUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
);
694 /* set only a fallback mapping from Unicode to codepage */
695 staticData
->hasFromUnicodeFallback
=TRUE
;
696 isOK
&=MBCSAddFromUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
);
699 /* ignore |2 SUB mappings */
702 /* set only a fallback mapping from codepage to Unicode */
703 staticData
->hasToUnicodeFallback
=TRUE
;
704 isOK
&=MBCSAddToUnicode(mbcsData
, m
->b
.bytes
, m
->bLen
, c
, m
->f
);
707 /* will not occur because the parser checked it already */
708 fprintf(stderr
, "error: illegal fallback indicator %d\n", m
->f
);
713 MBCSPostprocess(mbcsData
, staticData
);
719 transformEUC(MBCSData
*mbcsData
) {
721 uint32_t i
, value
, oldLength
, old3Top
, new3Top
;
724 oldLength
=mbcsData
->ucm
->states
.maxCharLength
;
729 old3Top
=mbcsData
->stage3Top
;
731 /* careful: 2-byte and 4-byte codes are stored in platform endianness! */
733 /* test if all first bytes are in {0, 0x8e, 0x8f} */
734 p8
=mbcsData
->fromUBytes
;
742 for(i
=0; i
<old3Top
; i
+=oldLength
) {
744 if(b
!=0 && b
!=0x8e && b
!=0x8f) {
745 /* some first byte does not fit the EUC pattern, nothing to be done */
749 /* restore p if it was modified above */
750 p8
=mbcsData
->fromUBytes
;
752 /* modify outputType and adjust stage3Top */
753 mbcsData
->ucm
->states
.outputType
=(int8_t)(MBCS_OUTPUT_3_EUC
+oldLength
-3);
754 mbcsData
->stage3Top
=new3Top
=(old3Top
*(oldLength
-1))/oldLength
;
757 * EUC-encode all byte sequences;
758 * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
759 * p. 161 in chapter 4 "Encoding Methods"
761 * This also must reverse the byte order if the platform is little-endian!
764 uint16_t *q
=(uint16_t *)p8
;
765 for(i
=0; i
<old3Top
; i
+=oldLength
) {
768 /* short sequences are stored directly */
769 /* code set 0 or 1 */
770 (*q
++)=(uint16_t)((p8
[1]<<8)|p8
[2]);
773 (*q
++)=(uint16_t)(((p8
[1]&0x7f)<<8)|p8
[2]);
774 } else /* b==0x8f */ {
776 (*q
++)=(uint16_t)((p8
[1]<<8)|(p8
[2]&0x7f));
780 } else /* oldLength==4 */ {
782 uint32_t *p32
=(uint32_t *)p8
;
783 for(i
=0; i
<old3Top
; i
+=4) {
785 if(value
<=0xffffff) {
786 /* short sequences are stored directly */
787 /* code set 0 or 1 */
788 (*q
++)=(uint8_t)(value
>>16);
789 (*q
++)=(uint8_t)(value
>>8);
790 (*q
++)=(uint8_t)value
;
791 } else if(value
<=0x8effffff) {
793 (*q
++)=(uint8_t)((value
>>16)&0x7f);
794 (*q
++)=(uint8_t)(value
>>8);
795 (*q
++)=(uint8_t)value
;
796 } else /* first byte is 0x8f */ {
798 (*q
++)=(uint8_t)(value
>>16);
799 (*q
++)=(uint8_t)((value
>>8)&0x7f);
800 (*q
++)=(uint8_t)value
;
809 * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
810 * as possible. Overlapping is done on unassigned head and tail
811 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
812 * Stage 1 indexes need to be adjusted accordingly.
813 * This function is very similar to genprops/store.c/compactStage().
816 singleCompactStage2(MBCSData
*mbcsData
) {
817 /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
818 uint16_t map
[MBCS_STAGE_2_MAX_BLOCKS
];
819 uint16_t i
, start
, prevEnd
, newStart
;
821 /* enter the all-unassigned first stage 2 block into the map */
822 map
[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
;
824 /* begin with the first block after the all-unassigned one */
825 start
=newStart
=MBCS_STAGE_2_FIRST_ASSIGNED
;
826 while(start
<mbcsData
->stage2Top
) {
827 prevEnd
=(uint16_t)(newStart
-1);
829 /* find the size of the overlap */
830 for(i
=0; i
<MBCS_STAGE_2_BLOCK_SIZE
&& mbcsData
->stage2Single
[start
+i
]==0 && mbcsData
->stage2Single
[prevEnd
-i
]==0; ++i
) {}
833 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=(uint16_t)(newStart
-i
);
835 /* move the non-overlapping indexes to their new positions */
837 for(i
=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE
-i
); i
>0; --i
) {
838 mbcsData
->stage2Single
[newStart
++]=mbcsData
->stage2Single
[start
++];
840 } else if(newStart
<start
) {
841 /* move the indexes to their new positions */
842 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=newStart
;
843 for(i
=MBCS_STAGE_2_BLOCK_SIZE
; i
>0; --i
) {
844 mbcsData
->stage2Single
[newStart
++]=mbcsData
->stage2Single
[start
++];
846 } else /* no overlap && newStart==start */ {
847 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=start
;
848 start
=newStart
+=MBCS_STAGE_2_BLOCK_SIZE
;
852 /* adjust stage2Top */
853 if(VERBOSE
&& newStart
<mbcsData
->stage2Top
) {
854 printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
855 (unsigned long)mbcsData
->stage2Top
, (unsigned long)newStart
,
856 (long)(mbcsData
->stage2Top
-newStart
)*2);
858 mbcsData
->stage2Top
=newStart
;
860 /* now adjust stage 1 */
861 for(i
=0; i
<MBCS_STAGE_1_SIZE
; ++i
) {
862 mbcsData
->stage1
[i
]=map
[mbcsData
->stage1
[i
]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
];
866 /* Compact stage 3 for SBCS - same algorithm as above. */
868 singleCompactStage3(MBCSData
*mbcsData
) {
869 uint16_t *stage3
=(uint16_t *)mbcsData
->fromUBytes
;
871 /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
872 uint16_t map
[0x1000];
873 uint16_t i
, start
, prevEnd
, newStart
;
875 /* enter the all-unassigned first stage 3 block into the map */
878 /* begin with the first block after the all-unassigned one */
880 while(start
<mbcsData
->stage3Top
) {
881 prevEnd
=(uint16_t)(newStart
-1);
883 /* find the size of the overlap */
884 for(i
=0; i
<16 && stage3
[start
+i
]==0 && stage3
[prevEnd
-i
]==0; ++i
) {}
887 map
[start
>>4]=(uint16_t)(newStart
-i
);
889 /* move the non-overlapping indexes to their new positions */
891 for(i
=(uint16_t)(16-i
); i
>0; --i
) {
892 stage3
[newStart
++]=stage3
[start
++];
894 } else if(newStart
<start
) {
895 /* move the indexes to their new positions */
896 map
[start
>>4]=newStart
;
897 for(i
=16; i
>0; --i
) {
898 stage3
[newStart
++]=stage3
[start
++];
900 } else /* no overlap && newStart==start */ {
906 /* adjust stage3Top */
907 if(VERBOSE
&& newStart
<mbcsData
->stage3Top
) {
908 printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
909 (unsigned long)mbcsData
->stage3Top
, (unsigned long)newStart
,
910 (long)(mbcsData
->stage3Top
-newStart
)*2);
912 mbcsData
->stage3Top
=newStart
;
914 /* now adjust stage 2 */
915 for(i
=0; i
<mbcsData
->stage2Top
; ++i
) {
916 mbcsData
->stage2Single
[i
]=map
[mbcsData
->stage2Single
[i
]>>4];
921 * Compact stage 2 by overlapping adjacent stage 2 blocks as far
922 * as possible. Overlapping is done on unassigned head and tail
923 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
924 * Stage 1 indexes need to be adjusted accordingly.
925 * This function is very similar to genprops/store.c/compactStage().
928 compactStage2(MBCSData
*mbcsData
) {
929 /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
930 uint16_t map
[MBCS_STAGE_2_MAX_BLOCKS
];
931 uint16_t i
, start
, prevEnd
, newStart
;
933 /* enter the all-unassigned first stage 2 block into the map */
934 map
[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
;
936 /* begin with the first block after the all-unassigned one */
937 start
=newStart
=MBCS_STAGE_2_FIRST_ASSIGNED
;
938 while(start
<mbcsData
->stage2Top
) {
939 prevEnd
=(uint16_t)(newStart
-1);
941 /* find the size of the overlap */
942 for(i
=0; i
<MBCS_STAGE_2_BLOCK_SIZE
&& mbcsData
->stage2
[start
+i
]==0 && mbcsData
->stage2
[prevEnd
-i
]==0; ++i
) {}
945 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=(uint16_t)(newStart
-i
);
947 /* move the non-overlapping indexes to their new positions */
949 for(i
=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE
-i
); i
>0; --i
) {
950 mbcsData
->stage2
[newStart
++]=mbcsData
->stage2
[start
++];
952 } else if(newStart
<start
) {
953 /* move the indexes to their new positions */
954 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=newStart
;
955 for(i
=MBCS_STAGE_2_BLOCK_SIZE
; i
>0; --i
) {
956 mbcsData
->stage2
[newStart
++]=mbcsData
->stage2
[start
++];
958 } else /* no overlap && newStart==start */ {
959 map
[start
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
]=start
;
960 start
=newStart
+=MBCS_STAGE_2_BLOCK_SIZE
;
964 /* adjust stage2Top */
965 if(VERBOSE
&& newStart
<mbcsData
->stage2Top
) {
966 printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
967 (unsigned long)mbcsData
->stage2Top
, (unsigned long)newStart
,
968 (long)(mbcsData
->stage2Top
-newStart
)*4);
970 mbcsData
->stage2Top
=newStart
;
972 /* now adjust stage 1 */
973 for(i
=0; i
<MBCS_STAGE_1_SIZE
; ++i
) {
974 mbcsData
->stage1
[i
]=map
[mbcsData
->stage1
[i
]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
];
979 MBCSPostprocess(MBCSData
*mbcsData
, const UConverterStaticData
*staticData
) {
981 int32_t maxCharLength
;
983 states
=&mbcsData
->ucm
->states
;
984 maxCharLength
=states
->maxCharLength
;
986 /* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
988 printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
989 (unsigned long)mbcsData
->stage3Top
/maxCharLength
,
990 (unsigned long)mbcsData
->stage3Top
/maxCharLength
);
993 ucm_optimizeStates(states
,
994 &mbcsData
->unicodeCodeUnits
,
995 mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
,
998 /* try to compact the fromUnicode tables */
999 transformEUC(mbcsData
);
1000 if(maxCharLength
==1) {
1001 singleCompactStage3(mbcsData
);
1002 singleCompactStage2(mbcsData
);
1004 compactStage2(mbcsData
);
1009 MBCSWrite(NewConverter
*cnvData
, const UConverterStaticData
*staticData
,
1010 UNewDataMemory
*pData
, int32_t tableType
) {
1011 MBCSData
*mbcsData
=(MBCSData
*)cnvData
;
1013 int32_t i
, stage1Top
;
1015 _MBCSHeader header
={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
1017 /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
1018 if(mbcsData
->ucm
->states
.maxCharLength
==1) {
1019 if(staticData
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
1020 stage1Top
=MBCS_STAGE_1_SIZE
; /* 0x440==1088 */
1022 stage1Top
=0x40; /* 0x40==64 */
1024 for(i
=0; i
<stage1Top
; ++i
) {
1025 mbcsData
->stage1
[i
]+=(uint16_t)stage1Top
;
1028 /* stage2Top has counted 16-bit results, now we need to count bytes */
1029 mbcsData
->stage2Top
*=2;
1031 /* stage3Top has counted 16-bit results, now we need to count bytes */
1032 mbcsData
->stage3Top
*=2;
1034 if(staticData
->unicodeMask
&UCNV_HAS_SUPPLEMENTARY
) {
1035 stage1Top
=MBCS_STAGE_1_SIZE
; /* 0x440==1088 */
1037 stage1Top
=0x40; /* 0x40==64 */
1039 for(i
=0; i
<stage1Top
; ++i
) {
1040 mbcsData
->stage1
[i
]+=(uint16_t)stage1Top
/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
1043 /* stage2Top has counted 32-bit results, now we need to count bytes */
1044 mbcsData
->stage2Top
*=4;
1046 /* stage3Top has already counted bytes */
1049 /* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */
1050 mbcsData
->stage2Top
=(mbcsData
->stage2Top
+3)&~3;
1051 mbcsData
->stage3Top
=(mbcsData
->stage3Top
+3)&~3;
1053 /* fill the header */
1054 header
.version
[0]=4;
1055 header
.version
[1]=2;
1056 header
.countStates
=mbcsData
->ucm
->states
.countStates
;
1057 header
.countToUFallbacks
=mbcsData
->countToUFallbacks
;
1059 header
.offsetToUCodeUnits
=
1060 sizeof(_MBCSHeader
)+
1061 mbcsData
->ucm
->states
.countStates
*1024+
1062 mbcsData
->countToUFallbacks
*sizeof(_MBCSToUFallback
);
1063 header
.offsetFromUTable
=
1064 header
.offsetToUCodeUnits
+
1065 mbcsData
->ucm
->states
.countToUCodeUnits
*2;
1066 header
.offsetFromUBytes
=
1067 header
.offsetFromUTable
+
1069 mbcsData
->stage2Top
;
1070 header
.fromUBytesLength
=mbcsData
->stage3Top
;
1072 top
=header
.offsetFromUBytes
+header
.fromUBytesLength
;
1074 header
.flags
=(uint8_t)(mbcsData
->ucm
->states
.outputType
);
1076 if(tableType
&TABLE_EXT
) {
1078 fprintf(stderr
, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top
);
1082 header
.flags
|=top
<<8;
1085 /* write the MBCS data */
1086 udata_writeBlock(pData
, &header
, sizeof(_MBCSHeader
));
1087 udata_writeBlock(pData
, mbcsData
->ucm
->states
.stateTable
, header
.countStates
*1024);
1088 udata_writeBlock(pData
, mbcsData
->toUFallbacks
, mbcsData
->countToUFallbacks
*sizeof(_MBCSToUFallback
));
1089 udata_writeBlock(pData
, mbcsData
->unicodeCodeUnits
, mbcsData
->ucm
->states
.countToUCodeUnits
*2);
1090 udata_writeBlock(pData
, mbcsData
->stage1
, stage1Top
*2);
1091 if(mbcsData
->ucm
->states
.maxCharLength
==1) {
1092 udata_writeBlock(pData
, mbcsData
->stage2Single
, mbcsData
->stage2Top
);
1094 udata_writeBlock(pData
, mbcsData
->stage2
, mbcsData
->stage2Top
);
1096 udata_writeBlock(pData
, mbcsData
->fromUBytes
, mbcsData
->stage3Top
);
1098 /* return the number of bytes that should have been written */
1099 return header
.offsetFromUBytes
+header
.fromUBytesLength
;