]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
374ca955 | 4 | * Copyright (C) 2000-2004, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: genmbcs.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2000jul06 | |
14 | * created by: Markus W. Scherer | |
15 | */ | |
16 | ||
17 | #include <stdio.h> | |
18 | #include "unicode/utypes.h" | |
19 | #include "cstring.h" | |
20 | #include "cmemory.h" | |
21 | #include "unewdata.h" | |
22 | #include "ucnv_cnv.h" | |
23 | #include "ucnvmbcs.h" | |
374ca955 | 24 | #include "ucm.h" |
b75a7d8f A |
25 | #include "makeconv.h" |
26 | #include "genmbcs.h" | |
27 | ||
b75a7d8f A |
28 | typedef struct MBCSData { |
29 | NewConverter newConverter; | |
30 | ||
374ca955 A |
31 | UCMFile *ucm; |
32 | ||
33 | /* toUnicode (state table in ucm->states) */ | |
b75a7d8f | 34 | _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT]; |
374ca955 | 35 | int32_t countToUFallbacks; |
b75a7d8f | 36 | uint16_t *unicodeCodeUnits; |
b75a7d8f A |
37 | |
38 | /* fromUnicode */ | |
39 | uint16_t stage1[MBCS_STAGE_1_SIZE]; | |
40 | uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */ | |
41 | uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */ | |
42 | uint8_t *fromUBytes; | |
374ca955 | 43 | uint32_t stage2Top, stage3Top; |
b75a7d8f A |
44 | } MBCSData; |
45 | ||
46 | /* prototypes */ | |
47 | static void | |
48 | MBCSClose(NewConverter *cnvData); | |
49 | ||
50 | static UBool | |
374ca955 | 51 | MBCSStartMappings(MBCSData *mbcsData); |
b75a7d8f A |
52 | |
53 | static UBool | |
374ca955 | 54 | MBCSAddToUnicode(MBCSData *mbcsData, |
b75a7d8f | 55 | const uint8_t *bytes, int32_t length, |
374ca955 A |
56 | UChar32 c, |
57 | int8_t flag); | |
b75a7d8f A |
58 | |
59 | static UBool | |
60 | MBCSIsValid(NewConverter *cnvData, | |
374ca955 | 61 | const uint8_t *bytes, int32_t length); |
b75a7d8f A |
62 | |
63 | static UBool | |
374ca955 | 64 | MBCSSingleAddFromUnicode(MBCSData *mbcsData, |
b75a7d8f | 65 | const uint8_t *bytes, int32_t length, |
374ca955 A |
66 | UChar32 c, |
67 | int8_t flag); | |
b75a7d8f A |
68 | |
69 | static UBool | |
374ca955 | 70 | MBCSAddFromUnicode(MBCSData *mbcsData, |
b75a7d8f | 71 | const uint8_t *bytes, int32_t length, |
374ca955 A |
72 | UChar32 c, |
73 | int8_t flag); | |
b75a7d8f A |
74 | |
75 | static void | |
374ca955 A |
76 | MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData); |
77 | ||
78 | static UBool | |
79 | MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); | |
b75a7d8f A |
80 | |
81 | static uint32_t | |
374ca955 A |
82 | MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, |
83 | UNewDataMemory *pData, int32_t tableType); | |
84 | ||
85 | /* helper ------------------------------------------------------------------- */ | |
86 | ||
/* Turn a nibble value (0..15) into its lowercase hexadecimal digit. */
static U_INLINE char
hexDigit(uint8_t digit) {
    if(digit<=9) {
        return (char)('0'+digit);
    }
    return (char)('a'-10+digit);
}
91 | ||
92 | static U_INLINE char * | |
93 | printBytes(char *buffer, const uint8_t *bytes, int32_t length) { | |
94 | char *s=buffer; | |
95 | while(length>0) { | |
96 | *s++=hexDigit((uint8_t)(*bytes>>4)); | |
97 | *s++=hexDigit((uint8_t)(*bytes&0xf)); | |
98 | ++bytes; | |
99 | --length; | |
100 | } | |
101 | ||
102 | *s=0; | |
103 | return buffer; | |
104 | } | |
b75a7d8f A |
105 | |
106 | /* implementation ----------------------------------------------------------- */ | |
107 | ||
108 | static void | |
374ca955 A |
109 | MBCSInit(MBCSData *mbcsData, UCMFile *ucm) { |
110 | int32_t i, maxCharLength; | |
b75a7d8f A |
111 | |
112 | uprv_memset(mbcsData, 0, sizeof(MBCSData)); | |
113 | ||
374ca955 A |
114 | maxCharLength=ucm->states.maxCharLength; |
115 | ||
116 | mbcsData->ucm=ucm; /* aliased, not owned */ | |
117 | ||
b75a7d8f | 118 | mbcsData->newConverter.close=MBCSClose; |
b75a7d8f | 119 | mbcsData->newConverter.isValid=MBCSIsValid; |
374ca955 | 120 | mbcsData->newConverter.addTable=MBCSAddTable; |
b75a7d8f A |
121 | mbcsData->newConverter.write=MBCSWrite; |
122 | ||
b75a7d8f A |
123 | mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */ |
124 | mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */ | |
b75a7d8f A |
125 | |
126 | /* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */ | |
127 | for(i=0; i<MBCS_STAGE_1_SIZE; ++i) { | |
128 | mbcsData->stage1[i]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX; | |
129 | } | |
130 | } | |
131 | ||
132 | NewConverter * | |
374ca955 | 133 | MBCSOpen(UCMFile *ucm) { |
b75a7d8f A |
134 | MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData)); |
135 | if(mbcsData!=NULL) { | |
374ca955 | 136 | MBCSInit(mbcsData, ucm); |
b75a7d8f A |
137 | } |
138 | return &mbcsData->newConverter; | |
139 | } | |
140 | ||
141 | static void | |
142 | MBCSClose(NewConverter *cnvData) { | |
143 | MBCSData *mbcsData=(MBCSData *)cnvData; | |
144 | if(mbcsData!=NULL) { | |
374ca955 A |
145 | uprv_free(mbcsData->unicodeCodeUnits); |
146 | uprv_free(mbcsData->fromUBytes); | |
b75a7d8f A |
147 | uprv_free(mbcsData); |
148 | } | |
149 | } | |
150 | ||
b75a7d8f | 151 | static UBool |
374ca955 A |
152 | MBCSStartMappings(MBCSData *mbcsData) { |
153 | int32_t i, sum; | |
b75a7d8f | 154 | |
374ca955 A |
155 | /* allocate the code unit array and prefill it with "unassigned" values */ |
156 | sum=mbcsData->ucm->states.countToUCodeUnits; | |
157 | if(VERBOSE) { | |
158 | printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum); | |
b75a7d8f A |
159 | } |
160 | ||
b75a7d8f A |
161 | if(sum>0) { |
162 | mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); | |
163 | if(mbcsData->unicodeCodeUnits==NULL) { | |
164 | fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n", | |
165 | (long)sum); | |
166 | return FALSE; | |
167 | } | |
168 | for(i=0; i<sum; ++i) { | |
169 | mbcsData->unicodeCodeUnits[i]=0xfffe; | |
170 | } | |
171 | } | |
172 | ||
173 | /* allocate the codepage mappings and preset the first 16 characters to 0 */ | |
374ca955 | 174 | if(mbcsData->ucm->states.maxCharLength==1) { |
b75a7d8f A |
175 | /* allocate 64k 16-bit results for single-byte codepages */ |
176 | sum=0x20000; | |
177 | } else { | |
178 | /* allocate 1M * maxCharLength bytes for at most 1M mappings */ | |
374ca955 | 179 | sum=0x100000*mbcsData->ucm->states.maxCharLength; |
b75a7d8f A |
180 | } |
181 | mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum); | |
182 | if(mbcsData->fromUBytes==NULL) { | |
374ca955 | 183 | fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum); |
b75a7d8f A |
184 | return FALSE; |
185 | } | |
186 | /* initialize the all-unassigned first stage 3 block */ | |
187 | uprv_memset(mbcsData->fromUBytes, 0, 64); | |
188 | ||
189 | return TRUE; | |
190 | } | |
191 | ||
b75a7d8f A |
192 | /* return TRUE for success */ |
193 | static UBool | |
194 | setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) { | |
374ca955 | 195 | int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset); |
b75a7d8f A |
196 | if(i>=0) { |
197 | /* if there is already a fallback for this offset, then overwrite it */ | |
198 | mbcsData->toUFallbacks[i].codePoint=c; | |
199 | return TRUE; | |
200 | } else { | |
201 | /* if there is no fallback for this offset, then add one */ | |
374ca955 | 202 | i=mbcsData->countToUFallbacks; |
b75a7d8f | 203 | if(i>=MBCS_MAX_FALLBACK_COUNT) { |
374ca955 | 204 | fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c); |
b75a7d8f A |
205 | return FALSE; |
206 | } else { | |
207 | mbcsData->toUFallbacks[i].offset=offset; | |
208 | mbcsData->toUFallbacks[i].codePoint=c; | |
374ca955 | 209 | mbcsData->countToUFallbacks=i+1; |
b75a7d8f A |
210 | return TRUE; |
211 | } | |
212 | } | |
213 | } | |
214 | ||
215 | /* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */ | |
216 | static int32_t | |
217 | removeFallback(MBCSData *mbcsData, uint32_t offset) { | |
374ca955 | 218 | int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset); |
b75a7d8f A |
219 | if(i>=0) { |
220 | _MBCSToUFallback *toUFallbacks; | |
221 | int32_t limit, old; | |
222 | ||
223 | toUFallbacks=mbcsData->toUFallbacks; | |
374ca955 | 224 | limit=mbcsData->countToUFallbacks; |
b75a7d8f A |
225 | old=(int32_t)toUFallbacks[i].codePoint; |
226 | ||
227 | /* copy the last fallback entry here to keep the list contiguous */ | |
228 | toUFallbacks[i].offset=toUFallbacks[limit-1].offset; | |
229 | toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint; | |
374ca955 | 230 | mbcsData->countToUFallbacks=limit-1; |
b75a7d8f A |
231 | return old; |
232 | } else { | |
233 | return -1; | |
234 | } | |
235 | } | |
236 | ||
237 | /* | |
238 | * isFallback is almost a boolean: | |
239 | * 1 (TRUE) this is a fallback mapping | |
240 | * 0 (FALSE) this is a precise mapping | |
241 | * -1 the precision of this mapping is not specified | |
242 | */ | |
243 | static UBool | |
374ca955 | 244 | MBCSAddToUnicode(MBCSData *mbcsData, |
b75a7d8f | 245 | const uint8_t *bytes, int32_t length, |
374ca955 A |
246 | UChar32 c, |
247 | int8_t flag) { | |
248 | char buffer[10]; | |
b75a7d8f A |
249 | uint32_t offset=0; |
250 | int32_t i=0, entry, old; | |
251 | uint8_t state=0; | |
252 | ||
374ca955 | 253 | if(mbcsData->ucm->states.countStates==0) { |
b75a7d8f A |
254 | fprintf(stderr, "error: there is no state information!\n"); |
255 | return FALSE; | |
256 | } | |
257 | ||
258 | /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ | |
374ca955 | 259 | if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) { |
b75a7d8f A |
260 | state=1; |
261 | } | |
262 | ||
263 | /* | |
264 | * Walk down the state table like in conversion, | |
265 | * much like getNextUChar(). | |
266 | * We assume that c<=0x10ffff. | |
267 | */ | |
268 | for(i=0;;) { | |
374ca955 | 269 | entry=mbcsData->ucm->states.stateTable[state][bytes[i++]]; |
b75a7d8f A |
270 | if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
271 | if(i==length) { | |
374ca955 A |
272 | fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n", |
273 | (short)state, printBytes(buffer, bytes, length), (int)c); | |
b75a7d8f A |
274 | return FALSE; |
275 | } | |
276 | state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); | |
277 | offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); | |
278 | } else { | |
279 | if(i<length) { | |
374ca955 A |
280 | fprintf(stderr, "error: byte sequence too long by %d bytes, final state %hu: 0x%s (U+%x)\n", |
281 | (int)(length-i), state, printBytes(buffer, bytes, length), (int)c); | |
b75a7d8f A |
282 | return FALSE; |
283 | } | |
284 | switch(MBCS_ENTRY_FINAL_ACTION(entry)) { | |
285 | case MBCS_STATE_ILLEGAL: | |
374ca955 A |
286 | fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n", |
287 | (int)c, printBytes(buffer, bytes, length)); | |
b75a7d8f A |
288 | return FALSE; |
289 | case MBCS_STATE_CHANGE_ONLY: | |
374ca955 A |
290 | fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n", |
291 | (int)c, printBytes(buffer, bytes, length)); | |
b75a7d8f A |
292 | return FALSE; |
293 | case MBCS_STATE_UNASSIGNED: | |
374ca955 A |
294 | fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n", |
295 | (int)c, printBytes(buffer, bytes, length)); | |
b75a7d8f A |
296 | return FALSE; |
297 | case MBCS_STATE_FALLBACK_DIRECT_16: | |
298 | case MBCS_STATE_VALID_DIRECT_16: | |
299 | case MBCS_STATE_FALLBACK_DIRECT_20: | |
300 | case MBCS_STATE_VALID_DIRECT_20: | |
301 | if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { | |
302 | /* the "direct" action's value is not "valid-direct-16-unassigned" any more */ | |
303 | if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) { | |
304 | old=MBCS_ENTRY_FINAL_VALUE(entry); | |
305 | } else { | |
306 | old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); | |
307 | } | |
374ca955 A |
308 | if(flag>=0) { |
309 | fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", | |
310 | (int)c, printBytes(buffer, bytes, length), (int)old); | |
b75a7d8f A |
311 | return FALSE; |
312 | } else if(VERBOSE) { | |
374ca955 A |
313 | fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", |
314 | (int)c, printBytes(buffer, bytes, length), (int)old); | |
b75a7d8f A |
315 | } |
316 | /* | |
317 | * Continue after the above warning | |
318 | * if the precision of the mapping is unspecified. | |
319 | */ | |
320 | } | |
321 | /* reassign the correct action code */ | |
374ca955 | 322 | entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0))); |
b75a7d8f A |
323 | |
324 | /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */ | |
325 | if(c<=0xffff) { | |
326 | entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c); | |
327 | } else { | |
328 | entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000); | |
329 | } | |
374ca955 | 330 | mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry; |
b75a7d8f A |
331 | break; |
332 | case MBCS_STATE_VALID_16: | |
333 | /* bits 26..16 are not used, 0 */ | |
334 | /* bits 15..7 contain the final offset delta to one 16-bit code unit */ | |
335 | offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); | |
336 | /* check that this byte sequence is still unassigned */ | |
337 | if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) { | |
374ca955 A |
338 | if(flag>=0) { |
339 | fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", | |
340 | (int)c, printBytes(buffer, bytes, length), (int)old); | |
b75a7d8f A |
341 | return FALSE; |
342 | } else if(VERBOSE) { | |
374ca955 A |
343 | fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", |
344 | (int)c, printBytes(buffer, bytes, length), (int)old); | |
b75a7d8f A |
345 | } |
346 | } | |
347 | if(c>=0x10000) { | |
374ca955 A |
348 | fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n", |
349 | (int)c, printBytes(buffer, bytes, length)); | |
b75a7d8f A |
350 | return FALSE; |
351 | } | |
374ca955 | 352 | if(flag>0) { |
b75a7d8f A |
353 | /* assign only if there is no precise mapping */ |
354 | if(mbcsData->unicodeCodeUnits[offset]==0xfffe) { | |
355 | return setFallback(mbcsData, offset, c); | |
356 | } | |
357 | } else { | |
358 | mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; | |
359 | } | |
360 | break; | |
361 | case MBCS_STATE_VALID_16_PAIR: | |
362 | /* bits 26..16 are not used, 0 */ | |
363 | /* bits 15..7 contain the final offset delta to two 16-bit code units */ | |
364 | offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); | |
365 | /* check that this byte sequence is still unassigned */ | |
366 | old=mbcsData->unicodeCodeUnits[offset]; | |
367 | if(old<0xfffe) { | |
368 | int32_t real; | |
369 | if(old<0xd800) { | |
370 | real=old; | |
371 | } else if(old<=0xdfff) { | |
372 | real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff); | |
373 | } else /* old<=0xe001 */ { | |
374 | real=mbcsData->unicodeCodeUnits[offset+1]; | |
375 | } | |
374ca955 A |
376 | if(flag>=0) { |
377 | fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", | |
378 | (int)c, printBytes(buffer, bytes, length), (int)real); | |
b75a7d8f A |
379 | return FALSE; |
380 | } else if(VERBOSE) { | |
374ca955 A |
381 | fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", |
382 | (int)c, printBytes(buffer, bytes, length), (int)real); | |
b75a7d8f A |
383 | } |
384 | } | |
374ca955 | 385 | if(flag>0) { |
b75a7d8f A |
386 | /* assign only if there is no precise mapping */ |
387 | if(old<=0xdbff || old==0xe000) { | |
388 | /* do nothing */ | |
389 | } else if(c<=0xffff) { | |
390 | /* set a BMP fallback code point as a pair with 0xe001 */ | |
391 | mbcsData->unicodeCodeUnits[offset++]=0xe001; | |
392 | mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; | |
393 | } else { | |
394 | /* set a fallback surrogate pair with two second surrogates */ | |
395 | mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10)); | |
396 | mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff)); | |
397 | } | |
398 | } else { | |
399 | if(c<0xd800) { | |
400 | /* set a BMP code point */ | |
401 | mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; | |
402 | } else if(c<=0xffff) { | |
403 | /* set a BMP code point above 0xd800 as a pair with 0xe000 */ | |
404 | mbcsData->unicodeCodeUnits[offset++]=0xe000; | |
405 | mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; | |
406 | } else { | |
407 | /* set a surrogate pair */ | |
408 | mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10)); | |
409 | mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff)); | |
410 | } | |
411 | } | |
412 | break; | |
413 | default: | |
414 | /* reserved, must never occur */ | |
374ca955 A |
415 | fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n", |
416 | (int)entry, printBytes(buffer, bytes, length), (int)c); | |
b75a7d8f A |
417 | return FALSE; |
418 | } | |
419 | ||
420 | return TRUE; | |
421 | } | |
422 | } | |
423 | } | |
424 | ||
425 | /* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */ | |
426 | static UBool | |
427 | MBCSIsValid(NewConverter *cnvData, | |
374ca955 | 428 | const uint8_t *bytes, int32_t length) { |
b75a7d8f | 429 | MBCSData *mbcsData=(MBCSData *)cnvData; |
b75a7d8f | 430 | |
374ca955 | 431 | return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length)); |
b75a7d8f A |
432 | } |
433 | ||
434 | static UBool | |
374ca955 | 435 | MBCSSingleAddFromUnicode(MBCSData *mbcsData, |
b75a7d8f | 436 | const uint8_t *bytes, int32_t length, |
374ca955 A |
437 | UChar32 c, |
438 | int8_t flag) { | |
b75a7d8f A |
439 | uint16_t *p; |
440 | uint32_t index; | |
441 | uint16_t old; | |
374ca955 A |
442 | uint8_t b; |
443 | ||
444 | /* ignore |2 SUB mappings */ | |
445 | if(flag==2) { | |
446 | return TRUE; | |
447 | } | |
b75a7d8f A |
448 | |
449 | /* | |
450 | * Walk down the triple-stage compact array ("trie") and | |
451 | * allocate parts as necessary. | |
452 | * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings. | |
453 | * We assume that length<=maxCharLength and that c<=0x10ffff. | |
454 | */ | |
374ca955 | 455 | b=*bytes; |
b75a7d8f A |
456 | |
457 | /* inspect stage 1 */ | |
458 | index=c>>10; | |
459 | if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { | |
460 | /* allocate another block in stage 2 */ | |
461 | if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) { | |
374ca955 | 462 | fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b); |
b75a7d8f A |
463 | return FALSE; |
464 | } | |
465 | ||
466 | /* | |
467 | * each stage 2 block contains 64 16-bit words: | |
468 | * 6 code point bits 9..4 with 1 stage 3 index | |
469 | */ | |
470 | mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top; | |
471 | mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE; | |
472 | } | |
473 | ||
474 | /* inspect stage 2 */ | |
475 | index=(uint32_t)mbcsData->stage1[index]+((c>>4)&0x3f); | |
476 | if(mbcsData->stage2Single[index]==0) { | |
477 | /* allocate another block in stage 3 */ | |
478 | if(mbcsData->stage3Top>=0x10000) { | |
374ca955 | 479 | fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b); |
b75a7d8f A |
480 | return FALSE; |
481 | } | |
482 | /* each block has 16 uint16_t entries */ | |
483 | mbcsData->stage2Single[index]=(uint16_t)mbcsData->stage3Top; | |
484 | uprv_memset(mbcsData->fromUBytes+2*mbcsData->stage3Top, 0, 32); | |
485 | mbcsData->stage3Top+=16; | |
486 | } | |
487 | ||
488 | /* write the codepage entry into stage 3 and get the previous entry */ | |
489 | p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf); | |
490 | old=*p; | |
374ca955 | 491 | if(flag<=0) { |
b75a7d8f A |
492 | *p=(uint16_t)(0xf00|b); |
493 | } else if(IS_PRIVATE_USE(c)) { | |
494 | *p=(uint16_t)(0xc00|b); | |
495 | } else { | |
496 | *p=(uint16_t)(0x800|b); | |
497 | } | |
498 | ||
499 | /* check that this Unicode code point was still unassigned */ | |
500 | if(old>=0x100) { | |
374ca955 A |
501 | if(flag>=0) { |
502 | fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", | |
503 | (int)c, b, old&0xff); | |
b75a7d8f A |
504 | return FALSE; |
505 | } else if(VERBOSE) { | |
374ca955 A |
506 | fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", |
507 | (int)c, b, old&0xff); | |
b75a7d8f A |
508 | } |
509 | /* continue after the above warning if the precision of the mapping is unspecified */ | |
510 | } | |
511 | ||
512 | return TRUE; | |
513 | } | |
514 | ||
515 | static UBool | |
374ca955 | 516 | MBCSAddFromUnicode(MBCSData *mbcsData, |
b75a7d8f | 517 | const uint8_t *bytes, int32_t length, |
374ca955 A |
518 | UChar32 c, |
519 | int8_t flag) { | |
520 | char buffer[10]; | |
521 | const uint8_t *pb; | |
b75a7d8f | 522 | uint8_t *p; |
374ca955 A |
523 | uint32_t index, b, old; |
524 | int32_t maxCharLength; | |
525 | ||
526 | /* ignore |2 SUB mappings */ | |
527 | if(flag==2) { | |
528 | return TRUE; | |
529 | } | |
530 | ||
531 | maxCharLength=mbcsData->ucm->states.maxCharLength; | |
532 | ||
533 | if(maxCharLength==1) { | |
534 | return MBCSSingleAddFromUnicode(mbcsData, bytes, length, c, flag); | |
535 | } | |
b75a7d8f | 536 | |
374ca955 | 537 | if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO && |
b75a7d8f A |
538 | (*bytes==0xe || *bytes==0xf) |
539 | ) { | |
374ca955 A |
540 | fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n", |
541 | (int)c, printBytes(buffer, bytes, length)); | |
b75a7d8f A |
542 | return FALSE; |
543 | } | |
374ca955 A |
544 | |
545 | if(flag==1 && length==1 && *bytes==0) { | |
546 | fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n", | |
547 | (int)c, *bytes); | |
548 | return FALSE; | |
549 | } | |
550 | ||
b75a7d8f A |
551 | /* |
552 | * Walk down the triple-stage compact array ("trie") and | |
553 | * allocate parts as necessary. | |
554 | * Note that the first stage 2 and 3 blocks are reserved for | |
555 | * all-unassigned mappings. | |
556 | * We assume that length<=maxCharLength and that c<=0x10ffff. | |
557 | */ | |
558 | ||
559 | /* inspect stage 1 */ | |
560 | index=c>>10; | |
561 | if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { | |
562 | /* allocate another block in stage 2 */ | |
563 | if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) { | |
374ca955 A |
564 | fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n", |
565 | (int)c, printBytes(buffer, bytes, length)); | |
b75a7d8f A |
566 | return FALSE; |
567 | } | |
568 | ||
569 | /* | |
570 | * each stage 2 block contains 64 32-bit words: | |
571 | * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index | |
572 | */ | |
573 | mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top; | |
574 | mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE; | |
575 | } | |
576 | ||
577 | /* inspect stage 2 */ | |
578 | index=mbcsData->stage1[index]+((c>>4)&0x3f); | |
579 | if(mbcsData->stage2[index]==0) { | |
580 | /* allocate another block in stage 3 */ | |
374ca955 A |
581 | if(mbcsData->stage3Top>=0x100000*(uint32_t)maxCharLength) { |
582 | fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n", | |
583 | (int)c, printBytes(buffer, bytes, length)); | |
b75a7d8f A |
584 | return FALSE; |
585 | } | |
586 | /* each block has 16*maxCharLength bytes */ | |
374ca955 A |
587 | mbcsData->stage2[index]=(mbcsData->stage3Top/16)/maxCharLength; |
588 | uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*maxCharLength); | |
589 | mbcsData->stage3Top+=16*maxCharLength; | |
b75a7d8f A |
590 | } |
591 | ||
592 | /* write the codepage bytes into stage 3 and get the previous bytes */ | |
374ca955 A |
593 | |
594 | /* assemble the bytes into a single integer */ | |
595 | pb=bytes; | |
596 | b=0; | |
597 | switch(length) { | |
598 | case 4: | |
599 | b=*pb++; | |
600 | case 3: | |
601 | b=(b<<8)|*pb++; | |
602 | case 2: | |
603 | b=(b<<8)|*pb++; | |
604 | case 1: | |
605 | default: | |
606 | b=(b<<8)|*pb++; | |
607 | break; | |
608 | } | |
609 | ||
b75a7d8f | 610 | old=0; |
374ca955 A |
611 | p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*maxCharLength; |
612 | switch(maxCharLength) { | |
b75a7d8f A |
613 | case 2: |
614 | old=*(uint16_t *)p; | |
615 | *(uint16_t *)p=(uint16_t)b; | |
616 | break; | |
617 | case 3: | |
618 | old=(uint32_t)*p<<16; | |
619 | *p++=(uint8_t)(b>>16); | |
620 | old|=(uint32_t)*p<<8; | |
621 | *p++=(uint8_t)(b>>8); | |
622 | old|=*p; | |
623 | *p=(uint8_t)b; | |
624 | break; | |
625 | case 4: | |
626 | old=*(uint32_t *)p; | |
627 | *(uint32_t *)p=b; | |
628 | break; | |
629 | default: | |
630 | /* will never occur */ | |
631 | break; | |
632 | } | |
633 | ||
634 | /* check that this Unicode code point was still unassigned */ | |
635 | if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) { | |
374ca955 A |
636 | if(flag>=0) { |
637 | fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n", | |
638 | (int)c, printBytes(buffer, bytes, length), (int)old); | |
b75a7d8f A |
639 | return FALSE; |
640 | } else if(VERBOSE) { | |
374ca955 A |
641 | fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n", |
642 | (int)c, printBytes(buffer, bytes, length), (int)old); | |
b75a7d8f A |
643 | } |
644 | /* continue after the above warning if the precision of the mapping is | |
645 | unspecified */ | |
646 | } | |
374ca955 A |
647 | if(flag<=0) { |
648 | /* set the roundtrip flag */ | |
b75a7d8f A |
649 | mbcsData->stage2[index]|=(1UL<<(16+(c&0xf))); |
650 | } | |
651 | ||
652 | return TRUE; | |
653 | } | |
654 | ||
374ca955 A |
655 | /* we can assume that the table only contains 1:1 mappings with <=4 bytes each */ |
656 | static UBool | |
657 | MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) { | |
658 | MBCSData *mbcsData; | |
659 | UCMapping *m; | |
660 | UChar32 c; | |
661 | int32_t i; | |
662 | UBool isOK; | |
663 | ||
664 | staticData->unicodeMask=table->unicodeMask; | |
665 | if(staticData->unicodeMask==3) { | |
666 | fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n"); | |
667 | return FALSE; | |
b75a7d8f A |
668 | } |
669 | ||
374ca955 | 670 | staticData->conversionType=UCNV_MBCS; |
b75a7d8f | 671 | |
374ca955 | 672 | mbcsData=(MBCSData *)cnvData; |
b75a7d8f | 673 | |
374ca955 A |
674 | if(!MBCSStartMappings(mbcsData)) { |
675 | return FALSE; | |
b75a7d8f A |
676 | } |
677 | ||
374ca955 | 678 | isOK=TRUE; |
b75a7d8f | 679 | |
374ca955 A |
680 | m=table->mappings; |
681 | for(i=0; i<table->mappingsLength; ++m, ++i) { | |
682 | c=m->u; | |
b75a7d8f | 683 | |
374ca955 A |
684 | switch(m->f) { |
685 | case -1: | |
686 | /* there was no precision/fallback indicator */ | |
687 | /* fall through to set the mappings */ | |
688 | case 0: | |
689 | /* set roundtrip mappings */ | |
690 | isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f) && | |
691 | MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f); | |
692 | break; | |
693 | case 1: | |
694 | /* set only a fallback mapping from Unicode to codepage */ | |
695 | staticData->hasFromUnicodeFallback=TRUE; | |
696 | isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f); | |
697 | break; | |
698 | case 2: | |
699 | /* ignore |2 SUB mappings */ | |
700 | break; | |
701 | case 3: | |
702 | /* set only a fallback mapping from codepage to Unicode */ | |
703 | staticData->hasToUnicodeFallback=TRUE; | |
704 | isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f); | |
705 | break; | |
706 | default: | |
707 | /* will not occur because the parser checked it already */ | |
708 | fprintf(stderr, "error: illegal fallback indicator %d\n", m->f); | |
709 | return FALSE; | |
b75a7d8f A |
710 | } |
711 | } | |
b75a7d8f | 712 | |
374ca955 | 713 | MBCSPostprocess(mbcsData, staticData); |
b75a7d8f | 714 | |
374ca955 | 715 | return isOK; |
b75a7d8f A |
716 | } |
717 | ||
718 | static UBool | |
719 | transformEUC(MBCSData *mbcsData) { | |
720 | uint8_t *p8; | |
374ca955 | 721 | uint32_t i, value, oldLength, old3Top, new3Top; |
b75a7d8f A |
722 | uint8_t b; |
723 | ||
374ca955 | 724 | oldLength=mbcsData->ucm->states.maxCharLength; |
b75a7d8f A |
725 | if(oldLength<3) { |
726 | return FALSE; | |
727 | } | |
728 | ||
374ca955 A |
729 | old3Top=mbcsData->stage3Top; |
730 | ||
b75a7d8f A |
731 | /* careful: 2-byte and 4-byte codes are stored in platform endianness! */ |
732 | ||
733 | /* test if all first bytes are in {0, 0x8e, 0x8f} */ | |
734 | p8=mbcsData->fromUBytes; | |
735 | ||
736 | #if !U_IS_BIG_ENDIAN | |
737 | if(oldLength==4) { | |
738 | p8+=3; | |
739 | } | |
740 | #endif | |
741 | ||
742 | for(i=0; i<old3Top; i+=oldLength) { | |
743 | b=p8[i]; | |
744 | if(b!=0 && b!=0x8e && b!=0x8f) { | |
745 | /* some first byte does not fit the EUC pattern, nothing to be done */ | |
746 | return FALSE; | |
747 | } | |
748 | } | |
749 | /* restore p if it was modified above */ | |
750 | p8=mbcsData->fromUBytes; | |
751 | ||
752 | /* modify outputType and adjust stage3Top */ | |
374ca955 | 753 | mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3); |
b75a7d8f A |
754 | mbcsData->stage3Top=new3Top=(old3Top*(oldLength-1))/oldLength; |
755 | ||
756 | /* | |
757 | * EUC-encode all byte sequences; | |
758 | * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly, | |
759 | * p. 161 in chapter 4 "Encoding Methods" | |
760 | * | |
761 | * This also must reverse the byte order if the platform is little-endian! | |
762 | */ | |
763 | if(oldLength==3) { | |
764 | uint16_t *q=(uint16_t *)p8; | |
765 | for(i=0; i<old3Top; i+=oldLength) { | |
766 | b=*p8; | |
767 | if(b==0) { | |
768 | /* short sequences are stored directly */ | |
769 | /* code set 0 or 1 */ | |
770 | (*q++)=(uint16_t)((p8[1]<<8)|p8[2]); | |
771 | } else if(b==0x8e) { | |
772 | /* code set 2 */ | |
773 | (*q++)=(uint16_t)(((p8[1]&0x7f)<<8)|p8[2]); | |
774 | } else /* b==0x8f */ { | |
775 | /* code set 3 */ | |
776 | (*q++)=(uint16_t)((p8[1]<<8)|(p8[2]&0x7f)); | |
777 | } | |
778 | p8+=3; | |
779 | } | |
780 | } else /* oldLength==4 */ { | |
781 | uint8_t *q=p8; | |
782 | uint32_t *p32=(uint32_t *)p8; | |
783 | for(i=0; i<old3Top; i+=4) { | |
784 | value=(*p32++); | |
785 | if(value<=0xffffff) { | |
786 | /* short sequences are stored directly */ | |
787 | /* code set 0 or 1 */ | |
788 | (*q++)=(uint8_t)(value>>16); | |
789 | (*q++)=(uint8_t)(value>>8); | |
790 | (*q++)=(uint8_t)value; | |
791 | } else if(value<=0x8effffff) { | |
792 | /* code set 2 */ | |
793 | (*q++)=(uint8_t)((value>>16)&0x7f); | |
794 | (*q++)=(uint8_t)(value>>8); | |
795 | (*q++)=(uint8_t)value; | |
796 | } else /* first byte is 0x8f */ { | |
797 | /* code set 3 */ | |
798 | (*q++)=(uint8_t)(value>>16); | |
799 | (*q++)=(uint8_t)((value>>8)&0x7f); | |
800 | (*q++)=(uint8_t)value; | |
801 | } | |
802 | } | |
803 | } | |
804 | ||
805 | return TRUE; | |
806 | } | |
807 | ||
808 | /* | |
809 | * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far | |
810 | * as possible. Overlapping is done on unassigned head and tail | |
811 | * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER. | |
812 | * Stage 1 indexes need to be adjusted accordingly. | |
813 | * This function is very similar to genprops/store.c/compactStage(). | |
814 | */ | |
815 | static void | |
816 | singleCompactStage2(MBCSData *mbcsData) { | |
817 | /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */ | |
818 | uint16_t map[MBCS_STAGE_2_MAX_BLOCKS]; | |
819 | uint16_t i, start, prevEnd, newStart; | |
820 | ||
821 | /* enter the all-unassigned first stage 2 block into the map */ | |
822 | map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX; | |
823 | ||
824 | /* begin with the first block after the all-unassigned one */ | |
825 | start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED; | |
826 | while(start<mbcsData->stage2Top) { | |
827 | prevEnd=(uint16_t)(newStart-1); | |
828 | ||
829 | /* find the size of the overlap */ | |
830 | for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {} | |
831 | ||
832 | if(i>0) { | |
833 | map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i); | |
834 | ||
835 | /* move the non-overlapping indexes to their new positions */ | |
836 | start+=i; | |
837 | for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) { | |
838 | mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++]; | |
839 | } | |
840 | } else if(newStart<start) { | |
841 | /* move the indexes to their new positions */ | |
842 | map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart; | |
843 | for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) { | |
844 | mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++]; | |
845 | } | |
846 | } else /* no overlap && newStart==start */ { | |
847 | map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start; | |
848 | start=newStart+=MBCS_STAGE_2_BLOCK_SIZE; | |
849 | } | |
850 | } | |
851 | ||
852 | /* adjust stage2Top */ | |
853 | if(VERBOSE && newStart<mbcsData->stage2Top) { | |
854 | printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n", | |
855 | (unsigned long)mbcsData->stage2Top, (unsigned long)newStart, | |
856 | (long)(mbcsData->stage2Top-newStart)*2); | |
857 | } | |
858 | mbcsData->stage2Top=newStart; | |
859 | ||
860 | /* now adjust stage 1 */ | |
861 | for(i=0; i<MBCS_STAGE_1_SIZE; ++i) { | |
862 | mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]; | |
863 | } | |
864 | } | |
865 | ||
866 | /* Compact stage 3 for SBCS - same algorithm as above. */ | |
867 | static void | |
868 | singleCompactStage3(MBCSData *mbcsData) { | |
869 | uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes; | |
870 | ||
871 | /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */ | |
872 | uint16_t map[0x1000]; | |
873 | uint16_t i, start, prevEnd, newStart; | |
874 | ||
875 | /* enter the all-unassigned first stage 3 block into the map */ | |
876 | map[0]=0; | |
877 | ||
878 | /* begin with the first block after the all-unassigned one */ | |
879 | start=newStart=16; | |
880 | while(start<mbcsData->stage3Top) { | |
881 | prevEnd=(uint16_t)(newStart-1); | |
882 | ||
883 | /* find the size of the overlap */ | |
884 | for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {} | |
885 | ||
886 | if(i>0) { | |
887 | map[start>>4]=(uint16_t)(newStart-i); | |
888 | ||
889 | /* move the non-overlapping indexes to their new positions */ | |
890 | start+=i; | |
891 | for(i=(uint16_t)(16-i); i>0; --i) { | |
892 | stage3[newStart++]=stage3[start++]; | |
893 | } | |
894 | } else if(newStart<start) { | |
895 | /* move the indexes to their new positions */ | |
896 | map[start>>4]=newStart; | |
897 | for(i=16; i>0; --i) { | |
898 | stage3[newStart++]=stage3[start++]; | |
899 | } | |
900 | } else /* no overlap && newStart==start */ { | |
901 | map[start>>4]=start; | |
902 | start=newStart+=16; | |
903 | } | |
904 | } | |
905 | ||
906 | /* adjust stage3Top */ | |
907 | if(VERBOSE && newStart<mbcsData->stage3Top) { | |
908 | printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n", | |
909 | (unsigned long)mbcsData->stage3Top, (unsigned long)newStart, | |
910 | (long)(mbcsData->stage3Top-newStart)*2); | |
911 | } | |
912 | mbcsData->stage3Top=newStart; | |
913 | ||
914 | /* now adjust stage 2 */ | |
915 | for(i=0; i<mbcsData->stage2Top; ++i) { | |
916 | mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4]; | |
917 | } | |
918 | } | |
919 | ||
920 | /* | |
921 | * Compact stage 2 by overlapping adjacent stage 2 blocks as far | |
922 | * as possible. Overlapping is done on unassigned head and tail | |
923 | * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER. | |
924 | * Stage 1 indexes need to be adjusted accordingly. | |
925 | * This function is very similar to genprops/store.c/compactStage(). | |
926 | */ | |
927 | static void | |
928 | compactStage2(MBCSData *mbcsData) { | |
929 | /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */ | |
930 | uint16_t map[MBCS_STAGE_2_MAX_BLOCKS]; | |
931 | uint16_t i, start, prevEnd, newStart; | |
932 | ||
933 | /* enter the all-unassigned first stage 2 block into the map */ | |
934 | map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX; | |
935 | ||
936 | /* begin with the first block after the all-unassigned one */ | |
937 | start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED; | |
938 | while(start<mbcsData->stage2Top) { | |
939 | prevEnd=(uint16_t)(newStart-1); | |
940 | ||
941 | /* find the size of the overlap */ | |
942 | for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {} | |
943 | ||
944 | if(i>0) { | |
945 | map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i); | |
946 | ||
947 | /* move the non-overlapping indexes to their new positions */ | |
948 | start+=i; | |
949 | for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) { | |
950 | mbcsData->stage2[newStart++]=mbcsData->stage2[start++]; | |
951 | } | |
952 | } else if(newStart<start) { | |
953 | /* move the indexes to their new positions */ | |
954 | map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart; | |
955 | for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) { | |
956 | mbcsData->stage2[newStart++]=mbcsData->stage2[start++]; | |
957 | } | |
958 | } else /* no overlap && newStart==start */ { | |
959 | map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start; | |
960 | start=newStart+=MBCS_STAGE_2_BLOCK_SIZE; | |
961 | } | |
962 | } | |
963 | ||
964 | /* adjust stage2Top */ | |
965 | if(VERBOSE && newStart<mbcsData->stage2Top) { | |
966 | printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n", | |
967 | (unsigned long)mbcsData->stage2Top, (unsigned long)newStart, | |
968 | (long)(mbcsData->stage2Top-newStart)*4); | |
969 | } | |
970 | mbcsData->stage2Top=newStart; | |
971 | ||
972 | /* now adjust stage 1 */ | |
973 | for(i=0; i<MBCS_STAGE_1_SIZE; ++i) { | |
974 | mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]; | |
975 | } | |
976 | } | |
977 | ||
978 | static void | |
374ca955 A |
979 | MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) { |
980 | UCMStates *states; | |
981 | int32_t maxCharLength; | |
982 | ||
983 | states=&mbcsData->ucm->states; | |
984 | maxCharLength=states->maxCharLength; | |
b75a7d8f A |
985 | |
986 | /* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */ | |
987 | if(VERBOSE) { | |
988 | printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n", | |
374ca955 A |
989 | (unsigned long)mbcsData->stage3Top/maxCharLength, |
990 | (unsigned long)mbcsData->stage3Top/maxCharLength); | |
b75a7d8f A |
991 | } |
992 | ||
374ca955 A |
993 | ucm_optimizeStates(states, |
994 | &mbcsData->unicodeCodeUnits, | |
995 | mbcsData->toUFallbacks, mbcsData->countToUFallbacks, | |
996 | VERBOSE); | |
b75a7d8f A |
997 | |
998 | /* try to compact the fromUnicode tables */ | |
999 | transformEUC(mbcsData); | |
374ca955 | 1000 | if(maxCharLength==1) { |
b75a7d8f A |
1001 | singleCompactStage3(mbcsData); |
1002 | singleCompactStage2(mbcsData); | |
1003 | } else { | |
1004 | compactStage2(mbcsData); | |
1005 | } | |
1006 | } | |
1007 | ||
1008 | static uint32_t | |
374ca955 A |
1009 | MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, |
1010 | UNewDataMemory *pData, int32_t tableType) { | |
b75a7d8f | 1011 | MBCSData *mbcsData=(MBCSData *)cnvData; |
374ca955 | 1012 | uint32_t top; |
b75a7d8f A |
1013 | int32_t i, stage1Top; |
1014 | ||
374ca955 A |
1015 | _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 }; |
1016 | ||
b75a7d8f | 1017 | /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */ |
374ca955 | 1018 | if(mbcsData->ucm->states.maxCharLength==1) { |
b75a7d8f A |
1019 | if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { |
1020 | stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ | |
1021 | } else { | |
1022 | stage1Top=0x40; /* 0x40==64 */ | |
1023 | } | |
1024 | for(i=0; i<stage1Top; ++i) { | |
1025 | mbcsData->stage1[i]+=(uint16_t)stage1Top; | |
1026 | } | |
1027 | ||
1028 | /* stage2Top has counted 16-bit results, now we need to count bytes */ | |
1029 | mbcsData->stage2Top*=2; | |
1030 | ||
1031 | /* stage3Top has counted 16-bit results, now we need to count bytes */ | |
1032 | mbcsData->stage3Top*=2; | |
1033 | } else { | |
1034 | if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { | |
1035 | stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ | |
1036 | } else { | |
1037 | stage1Top=0x40; /* 0x40==64 */ | |
1038 | } | |
1039 | for(i=0; i<stage1Top; ++i) { | |
1040 | mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */ | |
1041 | } | |
1042 | ||
1043 | /* stage2Top has counted 32-bit results, now we need to count bytes */ | |
1044 | mbcsData->stage2Top*=4; | |
1045 | ||
1046 | /* stage3Top has already counted bytes */ | |
1047 | } | |
1048 | ||
1049 | /* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */ | |
1050 | mbcsData->stage2Top=(mbcsData->stage2Top+3)&~3; | |
1051 | mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3; | |
1052 | ||
1053 | /* fill the header */ | |
374ca955 A |
1054 | header.version[0]=4; |
1055 | header.version[1]=2; | |
1056 | header.countStates=mbcsData->ucm->states.countStates; | |
1057 | header.countToUFallbacks=mbcsData->countToUFallbacks; | |
1058 | ||
1059 | header.offsetToUCodeUnits= | |
b75a7d8f | 1060 | sizeof(_MBCSHeader)+ |
374ca955 A |
1061 | mbcsData->ucm->states.countStates*1024+ |
1062 | mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback); | |
1063 | header.offsetFromUTable= | |
1064 | header.offsetToUCodeUnits+ | |
1065 | mbcsData->ucm->states.countToUCodeUnits*2; | |
1066 | header.offsetFromUBytes= | |
1067 | header.offsetFromUTable+ | |
b75a7d8f A |
1068 | stage1Top*2+ |
1069 | mbcsData->stage2Top; | |
374ca955 A |
1070 | header.fromUBytesLength=mbcsData->stage3Top; |
1071 | ||
1072 | top=header.offsetFromUBytes+header.fromUBytesLength; | |
1073 | ||
1074 | header.flags=(uint8_t)(mbcsData->ucm->states.outputType); | |
1075 | ||
1076 | if(tableType&TABLE_EXT) { | |
1077 | if(top>0xffffff) { | |
1078 | fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top); | |
1079 | return 0; | |
1080 | } | |
1081 | ||
1082 | header.flags|=top<<8; | |
1083 | } | |
b75a7d8f A |
1084 | |
1085 | /* write the MBCS data */ | |
374ca955 A |
1086 | udata_writeBlock(pData, &header, sizeof(_MBCSHeader)); |
1087 | udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024); | |
1088 | udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback)); | |
1089 | udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2); | |
b75a7d8f | 1090 | udata_writeBlock(pData, mbcsData->stage1, stage1Top*2); |
374ca955 | 1091 | if(mbcsData->ucm->states.maxCharLength==1) { |
b75a7d8f A |
1092 | udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top); |
1093 | } else { | |
1094 | udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top); | |
1095 | } | |
1096 | udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top); | |
1097 | ||
1098 | /* return the number of bytes that should have been written */ | |
374ca955 | 1099 | return header.offsetFromUBytes+header.fromUBytesLength; |
b75a7d8f | 1100 | } |