]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/makeconv/genmbcs.c
ICU-6.2.10.tar.gz
[apple/icu.git] / icuSources / tools / makeconv / genmbcs.c
... / ...
CommitLineData
1/*
2*******************************************************************************
3*
4* Copyright (C) 2000-2004, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: genmbcs.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2000jul06
14* created by: Markus W. Scherer
15*/
16
17#include <stdio.h>
18#include "unicode/utypes.h"
19#include "cstring.h"
20#include "cmemory.h"
21#include "unewdata.h"
22#include "ucnv_cnv.h"
23#include "ucnvmbcs.h"
24#include "ucm.h"
25#include "makeconv.h"
26#include "genmbcs.h"
27
28typedef struct MBCSData {
29 NewConverter newConverter;
30
31 UCMFile *ucm;
32
33 /* toUnicode (state table in ucm->states) */
34 _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
35 int32_t countToUFallbacks;
36 uint16_t *unicodeCodeUnits;
37
38 /* fromUnicode */
39 uint16_t stage1[MBCS_STAGE_1_SIZE];
40 uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
41 uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
42 uint8_t *fromUBytes;
43 uint32_t stage2Top, stage3Top;
44} MBCSData;
45
46/* prototypes */
47static void
48MBCSClose(NewConverter *cnvData);
49
50static UBool
51MBCSStartMappings(MBCSData *mbcsData);
52
53static UBool
54MBCSAddToUnicode(MBCSData *mbcsData,
55 const uint8_t *bytes, int32_t length,
56 UChar32 c,
57 int8_t flag);
58
59static UBool
60MBCSIsValid(NewConverter *cnvData,
61 const uint8_t *bytes, int32_t length);
62
63static UBool
64MBCSSingleAddFromUnicode(MBCSData *mbcsData,
65 const uint8_t *bytes, int32_t length,
66 UChar32 c,
67 int8_t flag);
68
69static UBool
70MBCSAddFromUnicode(MBCSData *mbcsData,
71 const uint8_t *bytes, int32_t length,
72 UChar32 c,
73 int8_t flag);
74
75static void
76MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData);
77
78static UBool
79MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
80
81static uint32_t
82MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
83 UNewDataMemory *pData, int32_t tableType);
84
85/* helper ------------------------------------------------------------------- */
86
87static U_INLINE char
88hexDigit(uint8_t digit) {
89 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
90}
91
92static U_INLINE char *
93printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
94 char *s=buffer;
95 while(length>0) {
96 *s++=hexDigit((uint8_t)(*bytes>>4));
97 *s++=hexDigit((uint8_t)(*bytes&0xf));
98 ++bytes;
99 --length;
100 }
101
102 *s=0;
103 return buffer;
104}
105
106/* implementation ----------------------------------------------------------- */
107
108static void
109MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
110 int32_t i, maxCharLength;
111
112 uprv_memset(mbcsData, 0, sizeof(MBCSData));
113
114 maxCharLength=ucm->states.maxCharLength;
115
116 mbcsData->ucm=ucm; /* aliased, not owned */
117
118 mbcsData->newConverter.close=MBCSClose;
119 mbcsData->newConverter.isValid=MBCSIsValid;
120 mbcsData->newConverter.addTable=MBCSAddTable;
121 mbcsData->newConverter.write=MBCSWrite;
122
123 mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */
124 mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */
125
126 /* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
127 for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
128 mbcsData->stage1[i]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
129 }
130}
131
132NewConverter *
133MBCSOpen(UCMFile *ucm) {
134 MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData));
135 if(mbcsData!=NULL) {
136 MBCSInit(mbcsData, ucm);
137 }
138 return &mbcsData->newConverter;
139}
140
141static void
142MBCSClose(NewConverter *cnvData) {
143 MBCSData *mbcsData=(MBCSData *)cnvData;
144 if(mbcsData!=NULL) {
145 uprv_free(mbcsData->unicodeCodeUnits);
146 uprv_free(mbcsData->fromUBytes);
147 uprv_free(mbcsData);
148 }
149}
150
151static UBool
152MBCSStartMappings(MBCSData *mbcsData) {
153 int32_t i, sum;
154
155 /* allocate the code unit array and prefill it with "unassigned" values */
156 sum=mbcsData->ucm->states.countToUCodeUnits;
157 if(VERBOSE) {
158 printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum);
159 }
160
161 if(sum>0) {
162 mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
163 if(mbcsData->unicodeCodeUnits==NULL) {
164 fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n",
165 (long)sum);
166 return FALSE;
167 }
168 for(i=0; i<sum; ++i) {
169 mbcsData->unicodeCodeUnits[i]=0xfffe;
170 }
171 }
172
173 /* allocate the codepage mappings and preset the first 16 characters to 0 */
174 if(mbcsData->ucm->states.maxCharLength==1) {
175 /* allocate 64k 16-bit results for single-byte codepages */
176 sum=0x20000;
177 } else {
178 /* allocate 1M * maxCharLength bytes for at most 1M mappings */
179 sum=0x100000*mbcsData->ucm->states.maxCharLength;
180 }
181 mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
182 if(mbcsData->fromUBytes==NULL) {
183 fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum);
184 return FALSE;
185 }
186 /* initialize the all-unassigned first stage 3 block */
187 uprv_memset(mbcsData->fromUBytes, 0, 64);
188
189 return TRUE;
190}
191
192/* return TRUE for success */
193static UBool
194setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
195 int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
196 if(i>=0) {
197 /* if there is already a fallback for this offset, then overwrite it */
198 mbcsData->toUFallbacks[i].codePoint=c;
199 return TRUE;
200 } else {
201 /* if there is no fallback for this offset, then add one */
202 i=mbcsData->countToUFallbacks;
203 if(i>=MBCS_MAX_FALLBACK_COUNT) {
204 fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c);
205 return FALSE;
206 } else {
207 mbcsData->toUFallbacks[i].offset=offset;
208 mbcsData->toUFallbacks[i].codePoint=c;
209 mbcsData->countToUFallbacks=i+1;
210 return TRUE;
211 }
212 }
213}
214
215/* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
216static int32_t
217removeFallback(MBCSData *mbcsData, uint32_t offset) {
218 int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
219 if(i>=0) {
220 _MBCSToUFallback *toUFallbacks;
221 int32_t limit, old;
222
223 toUFallbacks=mbcsData->toUFallbacks;
224 limit=mbcsData->countToUFallbacks;
225 old=(int32_t)toUFallbacks[i].codePoint;
226
227 /* copy the last fallback entry here to keep the list contiguous */
228 toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
229 toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
230 mbcsData->countToUFallbacks=limit-1;
231 return old;
232 } else {
233 return -1;
234 }
235}
236
237/*
238 * isFallback is almost a boolean:
239 * 1 (TRUE) this is a fallback mapping
240 * 0 (FALSE) this is a precise mapping
241 * -1 the precision of this mapping is not specified
242 */
243static UBool
244MBCSAddToUnicode(MBCSData *mbcsData,
245 const uint8_t *bytes, int32_t length,
246 UChar32 c,
247 int8_t flag) {
248 char buffer[10];
249 uint32_t offset=0;
250 int32_t i=0, entry, old;
251 uint8_t state=0;
252
253 if(mbcsData->ucm->states.countStates==0) {
254 fprintf(stderr, "error: there is no state information!\n");
255 return FALSE;
256 }
257
258 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
259 if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) {
260 state=1;
261 }
262
263 /*
264 * Walk down the state table like in conversion,
265 * much like getNextUChar().
266 * We assume that c<=0x10ffff.
267 */
268 for(i=0;;) {
269 entry=mbcsData->ucm->states.stateTable[state][bytes[i++]];
270 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
271 if(i==length) {
272 fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
273 (short)state, printBytes(buffer, bytes, length), (int)c);
274 return FALSE;
275 }
276 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
277 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
278 } else {
279 if(i<length) {
280 fprintf(stderr, "error: byte sequence too long by %d bytes, final state %hu: 0x%s (U+%x)\n",
281 (int)(length-i), state, printBytes(buffer, bytes, length), (int)c);
282 return FALSE;
283 }
284 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
285 case MBCS_STATE_ILLEGAL:
286 fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
287 (int)c, printBytes(buffer, bytes, length));
288 return FALSE;
289 case MBCS_STATE_CHANGE_ONLY:
290 fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
291 (int)c, printBytes(buffer, bytes, length));
292 return FALSE;
293 case MBCS_STATE_UNASSIGNED:
294 fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
295 (int)c, printBytes(buffer, bytes, length));
296 return FALSE;
297 case MBCS_STATE_FALLBACK_DIRECT_16:
298 case MBCS_STATE_VALID_DIRECT_16:
299 case MBCS_STATE_FALLBACK_DIRECT_20:
300 case MBCS_STATE_VALID_DIRECT_20:
301 if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
302 /* the "direct" action's value is not "valid-direct-16-unassigned" any more */
303 if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) {
304 old=MBCS_ENTRY_FINAL_VALUE(entry);
305 } else {
306 old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
307 }
308 if(flag>=0) {
309 fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
310 (int)c, printBytes(buffer, bytes, length), (int)old);
311 return FALSE;
312 } else if(VERBOSE) {
313 fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
314 (int)c, printBytes(buffer, bytes, length), (int)old);
315 }
316 /*
317 * Continue after the above warning
318 * if the precision of the mapping is unspecified.
319 */
320 }
321 /* reassign the correct action code */
322 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
323
324 /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
325 if(c<=0xffff) {
326 entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c);
327 } else {
328 entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
329 }
330 mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry;
331 break;
332 case MBCS_STATE_VALID_16:
333 /* bits 26..16 are not used, 0 */
334 /* bits 15..7 contain the final offset delta to one 16-bit code unit */
335 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
336 /* check that this byte sequence is still unassigned */
337 if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
338 if(flag>=0) {
339 fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
340 (int)c, printBytes(buffer, bytes, length), (int)old);
341 return FALSE;
342 } else if(VERBOSE) {
343 fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
344 (int)c, printBytes(buffer, bytes, length), (int)old);
345 }
346 }
347 if(c>=0x10000) {
348 fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
349 (int)c, printBytes(buffer, bytes, length));
350 return FALSE;
351 }
352 if(flag>0) {
353 /* assign only if there is no precise mapping */
354 if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
355 return setFallback(mbcsData, offset, c);
356 }
357 } else {
358 mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
359 }
360 break;
361 case MBCS_STATE_VALID_16_PAIR:
362 /* bits 26..16 are not used, 0 */
363 /* bits 15..7 contain the final offset delta to two 16-bit code units */
364 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
365 /* check that this byte sequence is still unassigned */
366 old=mbcsData->unicodeCodeUnits[offset];
367 if(old<0xfffe) {
368 int32_t real;
369 if(old<0xd800) {
370 real=old;
371 } else if(old<=0xdfff) {
372 real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff);
373 } else /* old<=0xe001 */ {
374 real=mbcsData->unicodeCodeUnits[offset+1];
375 }
376 if(flag>=0) {
377 fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
378 (int)c, printBytes(buffer, bytes, length), (int)real);
379 return FALSE;
380 } else if(VERBOSE) {
381 fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
382 (int)c, printBytes(buffer, bytes, length), (int)real);
383 }
384 }
385 if(flag>0) {
386 /* assign only if there is no precise mapping */
387 if(old<=0xdbff || old==0xe000) {
388 /* do nothing */
389 } else if(c<=0xffff) {
390 /* set a BMP fallback code point as a pair with 0xe001 */
391 mbcsData->unicodeCodeUnits[offset++]=0xe001;
392 mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
393 } else {
394 /* set a fallback surrogate pair with two second surrogates */
395 mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10));
396 mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
397 }
398 } else {
399 if(c<0xd800) {
400 /* set a BMP code point */
401 mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
402 } else if(c<=0xffff) {
403 /* set a BMP code point above 0xd800 as a pair with 0xe000 */
404 mbcsData->unicodeCodeUnits[offset++]=0xe000;
405 mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
406 } else {
407 /* set a surrogate pair */
408 mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10));
409 mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
410 }
411 }
412 break;
413 default:
414 /* reserved, must never occur */
415 fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
416 (int)entry, printBytes(buffer, bytes, length), (int)c);
417 return FALSE;
418 }
419
420 return TRUE;
421 }
422 }
423}
424
425/* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
426static UBool
427MBCSIsValid(NewConverter *cnvData,
428 const uint8_t *bytes, int32_t length) {
429 MBCSData *mbcsData=(MBCSData *)cnvData;
430
431 return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length));
432}
433
434static UBool
435MBCSSingleAddFromUnicode(MBCSData *mbcsData,
436 const uint8_t *bytes, int32_t length,
437 UChar32 c,
438 int8_t flag) {
439 uint16_t *p;
440 uint32_t index;
441 uint16_t old;
442 uint8_t b;
443
444 /* ignore |2 SUB mappings */
445 if(flag==2) {
446 return TRUE;
447 }
448
449 /*
450 * Walk down the triple-stage compact array ("trie") and
451 * allocate parts as necessary.
452 * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
453 * We assume that length<=maxCharLength and that c<=0x10ffff.
454 */
455 b=*bytes;
456
457 /* inspect stage 1 */
458 index=c>>10;
459 if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
460 /* allocate another block in stage 2 */
461 if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
462 fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b);
463 return FALSE;
464 }
465
466 /*
467 * each stage 2 block contains 64 16-bit words:
468 * 6 code point bits 9..4 with 1 stage 3 index
469 */
470 mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
471 mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
472 }
473
474 /* inspect stage 2 */
475 index=(uint32_t)mbcsData->stage1[index]+((c>>4)&0x3f);
476 if(mbcsData->stage2Single[index]==0) {
477 /* allocate another block in stage 3 */
478 if(mbcsData->stage3Top>=0x10000) {
479 fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b);
480 return FALSE;
481 }
482 /* each block has 16 uint16_t entries */
483 mbcsData->stage2Single[index]=(uint16_t)mbcsData->stage3Top;
484 uprv_memset(mbcsData->fromUBytes+2*mbcsData->stage3Top, 0, 32);
485 mbcsData->stage3Top+=16;
486 }
487
488 /* write the codepage entry into stage 3 and get the previous entry */
489 p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf);
490 old=*p;
491 if(flag<=0) {
492 *p=(uint16_t)(0xf00|b);
493 } else if(IS_PRIVATE_USE(c)) {
494 *p=(uint16_t)(0xc00|b);
495 } else {
496 *p=(uint16_t)(0x800|b);
497 }
498
499 /* check that this Unicode code point was still unassigned */
500 if(old>=0x100) {
501 if(flag>=0) {
502 fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
503 (int)c, b, old&0xff);
504 return FALSE;
505 } else if(VERBOSE) {
506 fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
507 (int)c, b, old&0xff);
508 }
509 /* continue after the above warning if the precision of the mapping is unspecified */
510 }
511
512 return TRUE;
513}
514
515static UBool
516MBCSAddFromUnicode(MBCSData *mbcsData,
517 const uint8_t *bytes, int32_t length,
518 UChar32 c,
519 int8_t flag) {
520 char buffer[10];
521 const uint8_t *pb;
522 uint8_t *p;
523 uint32_t index, b, old;
524 int32_t maxCharLength;
525
526 /* ignore |2 SUB mappings */
527 if(flag==2) {
528 return TRUE;
529 }
530
531 maxCharLength=mbcsData->ucm->states.maxCharLength;
532
533 if(maxCharLength==1) {
534 return MBCSSingleAddFromUnicode(mbcsData, bytes, length, c, flag);
535 }
536
537 if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
538 (*bytes==0xe || *bytes==0xf)
539 ) {
540 fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
541 (int)c, printBytes(buffer, bytes, length));
542 return FALSE;
543 }
544
545 if(flag==1 && length==1 && *bytes==0) {
546 fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
547 (int)c, *bytes);
548 return FALSE;
549 }
550
551 /*
552 * Walk down the triple-stage compact array ("trie") and
553 * allocate parts as necessary.
554 * Note that the first stage 2 and 3 blocks are reserved for
555 * all-unassigned mappings.
556 * We assume that length<=maxCharLength and that c<=0x10ffff.
557 */
558
559 /* inspect stage 1 */
560 index=c>>10;
561 if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
562 /* allocate another block in stage 2 */
563 if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
564 fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
565 (int)c, printBytes(buffer, bytes, length));
566 return FALSE;
567 }
568
569 /*
570 * each stage 2 block contains 64 32-bit words:
571 * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
572 */
573 mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
574 mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
575 }
576
577 /* inspect stage 2 */
578 index=mbcsData->stage1[index]+((c>>4)&0x3f);
579 if(mbcsData->stage2[index]==0) {
580 /* allocate another block in stage 3 */
581 if(mbcsData->stage3Top>=0x100000*(uint32_t)maxCharLength) {
582 fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
583 (int)c, printBytes(buffer, bytes, length));
584 return FALSE;
585 }
586 /* each block has 16*maxCharLength bytes */
587 mbcsData->stage2[index]=(mbcsData->stage3Top/16)/maxCharLength;
588 uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*maxCharLength);
589 mbcsData->stage3Top+=16*maxCharLength;
590 }
591
592 /* write the codepage bytes into stage 3 and get the previous bytes */
593
594 /* assemble the bytes into a single integer */
595 pb=bytes;
596 b=0;
597 switch(length) {
598 case 4:
599 b=*pb++;
600 case 3:
601 b=(b<<8)|*pb++;
602 case 2:
603 b=(b<<8)|*pb++;
604 case 1:
605 default:
606 b=(b<<8)|*pb++;
607 break;
608 }
609
610 old=0;
611 p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*maxCharLength;
612 switch(maxCharLength) {
613 case 2:
614 old=*(uint16_t *)p;
615 *(uint16_t *)p=(uint16_t)b;
616 break;
617 case 3:
618 old=(uint32_t)*p<<16;
619 *p++=(uint8_t)(b>>16);
620 old|=(uint32_t)*p<<8;
621 *p++=(uint8_t)(b>>8);
622 old|=*p;
623 *p=(uint8_t)b;
624 break;
625 case 4:
626 old=*(uint32_t *)p;
627 *(uint32_t *)p=b;
628 break;
629 default:
630 /* will never occur */
631 break;
632 }
633
634 /* check that this Unicode code point was still unassigned */
635 if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
636 if(flag>=0) {
637 fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
638 (int)c, printBytes(buffer, bytes, length), (int)old);
639 return FALSE;
640 } else if(VERBOSE) {
641 fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
642 (int)c, printBytes(buffer, bytes, length), (int)old);
643 }
644 /* continue after the above warning if the precision of the mapping is
645 unspecified */
646 }
647 if(flag<=0) {
648 /* set the roundtrip flag */
649 mbcsData->stage2[index]|=(1UL<<(16+(c&0xf)));
650 }
651
652 return TRUE;
653}
654
655/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
656static UBool
657MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
658 MBCSData *mbcsData;
659 UCMapping *m;
660 UChar32 c;
661 int32_t i;
662 UBool isOK;
663
664 staticData->unicodeMask=table->unicodeMask;
665 if(staticData->unicodeMask==3) {
666 fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n");
667 return FALSE;
668 }
669
670 staticData->conversionType=UCNV_MBCS;
671
672 mbcsData=(MBCSData *)cnvData;
673
674 if(!MBCSStartMappings(mbcsData)) {
675 return FALSE;
676 }
677
678 isOK=TRUE;
679
680 m=table->mappings;
681 for(i=0; i<table->mappingsLength; ++m, ++i) {
682 c=m->u;
683
684 switch(m->f) {
685 case -1:
686 /* there was no precision/fallback indicator */
687 /* fall through to set the mappings */
688 case 0:
689 /* set roundtrip mappings */
690 isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f) &&
691 MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
692 break;
693 case 1:
694 /* set only a fallback mapping from Unicode to codepage */
695 staticData->hasFromUnicodeFallback=TRUE;
696 isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
697 break;
698 case 2:
699 /* ignore |2 SUB mappings */
700 break;
701 case 3:
702 /* set only a fallback mapping from codepage to Unicode */
703 staticData->hasToUnicodeFallback=TRUE;
704 isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
705 break;
706 default:
707 /* will not occur because the parser checked it already */
708 fprintf(stderr, "error: illegal fallback indicator %d\n", m->f);
709 return FALSE;
710 }
711 }
712
713 MBCSPostprocess(mbcsData, staticData);
714
715 return isOK;
716}
717
718static UBool
719transformEUC(MBCSData *mbcsData) {
720 uint8_t *p8;
721 uint32_t i, value, oldLength, old3Top, new3Top;
722 uint8_t b;
723
724 oldLength=mbcsData->ucm->states.maxCharLength;
725 if(oldLength<3) {
726 return FALSE;
727 }
728
729 old3Top=mbcsData->stage3Top;
730
731 /* careful: 2-byte and 4-byte codes are stored in platform endianness! */
732
733 /* test if all first bytes are in {0, 0x8e, 0x8f} */
734 p8=mbcsData->fromUBytes;
735
736#if !U_IS_BIG_ENDIAN
737 if(oldLength==4) {
738 p8+=3;
739 }
740#endif
741
742 for(i=0; i<old3Top; i+=oldLength) {
743 b=p8[i];
744 if(b!=0 && b!=0x8e && b!=0x8f) {
745 /* some first byte does not fit the EUC pattern, nothing to be done */
746 return FALSE;
747 }
748 }
749 /* restore p if it was modified above */
750 p8=mbcsData->fromUBytes;
751
752 /* modify outputType and adjust stage3Top */
753 mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3);
754 mbcsData->stage3Top=new3Top=(old3Top*(oldLength-1))/oldLength;
755
756 /*
757 * EUC-encode all byte sequences;
758 * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
759 * p. 161 in chapter 4 "Encoding Methods"
760 *
761 * This also must reverse the byte order if the platform is little-endian!
762 */
763 if(oldLength==3) {
764 uint16_t *q=(uint16_t *)p8;
765 for(i=0; i<old3Top; i+=oldLength) {
766 b=*p8;
767 if(b==0) {
768 /* short sequences are stored directly */
769 /* code set 0 or 1 */
770 (*q++)=(uint16_t)((p8[1]<<8)|p8[2]);
771 } else if(b==0x8e) {
772 /* code set 2 */
773 (*q++)=(uint16_t)(((p8[1]&0x7f)<<8)|p8[2]);
774 } else /* b==0x8f */ {
775 /* code set 3 */
776 (*q++)=(uint16_t)((p8[1]<<8)|(p8[2]&0x7f));
777 }
778 p8+=3;
779 }
780 } else /* oldLength==4 */ {
781 uint8_t *q=p8;
782 uint32_t *p32=(uint32_t *)p8;
783 for(i=0; i<old3Top; i+=4) {
784 value=(*p32++);
785 if(value<=0xffffff) {
786 /* short sequences are stored directly */
787 /* code set 0 or 1 */
788 (*q++)=(uint8_t)(value>>16);
789 (*q++)=(uint8_t)(value>>8);
790 (*q++)=(uint8_t)value;
791 } else if(value<=0x8effffff) {
792 /* code set 2 */
793 (*q++)=(uint8_t)((value>>16)&0x7f);
794 (*q++)=(uint8_t)(value>>8);
795 (*q++)=(uint8_t)value;
796 } else /* first byte is 0x8f */ {
797 /* code set 3 */
798 (*q++)=(uint8_t)(value>>16);
799 (*q++)=(uint8_t)((value>>8)&0x7f);
800 (*q++)=(uint8_t)value;
801 }
802 }
803 }
804
805 return TRUE;
806}
807
808/*
809 * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
810 * as possible. Overlapping is done on unassigned head and tail
811 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
812 * Stage 1 indexes need to be adjusted accordingly.
813 * This function is very similar to genprops/store.c/compactStage().
814 */
815static void
816singleCompactStage2(MBCSData *mbcsData) {
817 /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
818 uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
819 uint16_t i, start, prevEnd, newStart;
820
821 /* enter the all-unassigned first stage 2 block into the map */
822 map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
823
824 /* begin with the first block after the all-unassigned one */
825 start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
826 while(start<mbcsData->stage2Top) {
827 prevEnd=(uint16_t)(newStart-1);
828
829 /* find the size of the overlap */
830 for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {}
831
832 if(i>0) {
833 map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
834
835 /* move the non-overlapping indexes to their new positions */
836 start+=i;
837 for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
838 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
839 }
840 } else if(newStart<start) {
841 /* move the indexes to their new positions */
842 map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
843 for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
844 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
845 }
846 } else /* no overlap && newStart==start */ {
847 map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
848 start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
849 }
850 }
851
852 /* adjust stage2Top */
853 if(VERBOSE && newStart<mbcsData->stage2Top) {
854 printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
855 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
856 (long)(mbcsData->stage2Top-newStart)*2);
857 }
858 mbcsData->stage2Top=newStart;
859
860 /* now adjust stage 1 */
861 for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
862 mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
863 }
864}
865
866/* Compact stage 3 for SBCS - same algorithm as above. */
867static void
868singleCompactStage3(MBCSData *mbcsData) {
869 uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes;
870
871 /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
872 uint16_t map[0x1000];
873 uint16_t i, start, prevEnd, newStart;
874
875 /* enter the all-unassigned first stage 3 block into the map */
876 map[0]=0;
877
878 /* begin with the first block after the all-unassigned one */
879 start=newStart=16;
880 while(start<mbcsData->stage3Top) {
881 prevEnd=(uint16_t)(newStart-1);
882
883 /* find the size of the overlap */
884 for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {}
885
886 if(i>0) {
887 map[start>>4]=(uint16_t)(newStart-i);
888
889 /* move the non-overlapping indexes to their new positions */
890 start+=i;
891 for(i=(uint16_t)(16-i); i>0; --i) {
892 stage3[newStart++]=stage3[start++];
893 }
894 } else if(newStart<start) {
895 /* move the indexes to their new positions */
896 map[start>>4]=newStart;
897 for(i=16; i>0; --i) {
898 stage3[newStart++]=stage3[start++];
899 }
900 } else /* no overlap && newStart==start */ {
901 map[start>>4]=start;
902 start=newStart+=16;
903 }
904 }
905
906 /* adjust stage3Top */
907 if(VERBOSE && newStart<mbcsData->stage3Top) {
908 printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
909 (unsigned long)mbcsData->stage3Top, (unsigned long)newStart,
910 (long)(mbcsData->stage3Top-newStart)*2);
911 }
912 mbcsData->stage3Top=newStart;
913
914 /* now adjust stage 2 */
915 for(i=0; i<mbcsData->stage2Top; ++i) {
916 mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4];
917 }
918}
919
920/*
921 * Compact stage 2 by overlapping adjacent stage 2 blocks as far
922 * as possible. Overlapping is done on unassigned head and tail
923 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
924 * Stage 1 indexes need to be adjusted accordingly.
925 * This function is very similar to genprops/store.c/compactStage().
926 */
927static void
928compactStage2(MBCSData *mbcsData) {
929 /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
930 uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
931 uint16_t i, start, prevEnd, newStart;
932
933 /* enter the all-unassigned first stage 2 block into the map */
934 map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
935
936 /* begin with the first block after the all-unassigned one */
937 start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
938 while(start<mbcsData->stage2Top) {
939 prevEnd=(uint16_t)(newStart-1);
940
941 /* find the size of the overlap */
942 for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {}
943
944 if(i>0) {
945 map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
946
947 /* move the non-overlapping indexes to their new positions */
948 start+=i;
949 for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
950 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
951 }
952 } else if(newStart<start) {
953 /* move the indexes to their new positions */
954 map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
955 for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
956 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
957 }
958 } else /* no overlap && newStart==start */ {
959 map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
960 start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
961 }
962 }
963
964 /* adjust stage2Top */
965 if(VERBOSE && newStart<mbcsData->stage2Top) {
966 printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
967 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
968 (long)(mbcsData->stage2Top-newStart)*4);
969 }
970 mbcsData->stage2Top=newStart;
971
972 /* now adjust stage 1 */
973 for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
974 mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
975 }
976}
977
978static void
979MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) {
980 UCMStates *states;
981 int32_t maxCharLength;
982
983 states=&mbcsData->ucm->states;
984 maxCharLength=states->maxCharLength;
985
986 /* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
987 if(VERBOSE) {
988 printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
989 (unsigned long)mbcsData->stage3Top/maxCharLength,
990 (unsigned long)mbcsData->stage3Top/maxCharLength);
991 }
992
993 ucm_optimizeStates(states,
994 &mbcsData->unicodeCodeUnits,
995 mbcsData->toUFallbacks, mbcsData->countToUFallbacks,
996 VERBOSE);
997
998 /* try to compact the fromUnicode tables */
999 transformEUC(mbcsData);
1000 if(maxCharLength==1) {
1001 singleCompactStage3(mbcsData);
1002 singleCompactStage2(mbcsData);
1003 } else {
1004 compactStage2(mbcsData);
1005 }
1006}
1007
1008static uint32_t
1009MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
1010 UNewDataMemory *pData, int32_t tableType) {
1011 MBCSData *mbcsData=(MBCSData *)cnvData;
1012 uint32_t top;
1013 int32_t i, stage1Top;
1014
1015 _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
1016
1017 /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
1018 if(mbcsData->ucm->states.maxCharLength==1) {
1019 if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
1020 stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
1021 } else {
1022 stage1Top=0x40; /* 0x40==64 */
1023 }
1024 for(i=0; i<stage1Top; ++i) {
1025 mbcsData->stage1[i]+=(uint16_t)stage1Top;
1026 }
1027
1028 /* stage2Top has counted 16-bit results, now we need to count bytes */
1029 mbcsData->stage2Top*=2;
1030
1031 /* stage3Top has counted 16-bit results, now we need to count bytes */
1032 mbcsData->stage3Top*=2;
1033 } else {
1034 if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
1035 stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
1036 } else {
1037 stage1Top=0x40; /* 0x40==64 */
1038 }
1039 for(i=0; i<stage1Top; ++i) {
1040 mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
1041 }
1042
1043 /* stage2Top has counted 32-bit results, now we need to count bytes */
1044 mbcsData->stage2Top*=4;
1045
1046 /* stage3Top has already counted bytes */
1047 }
1048
1049 /* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */
1050 mbcsData->stage2Top=(mbcsData->stage2Top+3)&~3;
1051 mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
1052
1053 /* fill the header */
1054 header.version[0]=4;
1055 header.version[1]=2;
1056 header.countStates=mbcsData->ucm->states.countStates;
1057 header.countToUFallbacks=mbcsData->countToUFallbacks;
1058
1059 header.offsetToUCodeUnits=
1060 sizeof(_MBCSHeader)+
1061 mbcsData->ucm->states.countStates*1024+
1062 mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
1063 header.offsetFromUTable=
1064 header.offsetToUCodeUnits+
1065 mbcsData->ucm->states.countToUCodeUnits*2;
1066 header.offsetFromUBytes=
1067 header.offsetFromUTable+
1068 stage1Top*2+
1069 mbcsData->stage2Top;
1070 header.fromUBytesLength=mbcsData->stage3Top;
1071
1072 top=header.offsetFromUBytes+header.fromUBytesLength;
1073
1074 header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
1075
1076 if(tableType&TABLE_EXT) {
1077 if(top>0xffffff) {
1078 fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top);
1079 return 0;
1080 }
1081
1082 header.flags|=top<<8;
1083 }
1084
1085 /* write the MBCS data */
1086 udata_writeBlock(pData, &header, sizeof(_MBCSHeader));
1087 udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
1088 udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
1089 udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
1090 udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
1091 if(mbcsData->ucm->states.maxCharLength==1) {
1092 udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top);
1093 } else {
1094 udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top);
1095 }
1096 udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
1097
1098 /* return the number of bytes that should have been written */
1099 return header.offsetFromUBytes+header.fromUBytesLength;
1100}