]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/toolutil/ucmstate.c
ICU-6.2.15.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / ucmstate.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucmstate.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003oct09
14 * created by: Markus W. Scherer
15 *
16 * This file handles ICU .ucm file state information as part of the ucm module.
17 * Most of this code used to be in makeconv.c.
18 */
19
20 #include "unicode/utypes.h"
21 #include "cstring.h"
22 #include "cmemory.h"
23 #include "uarrsort.h"
24 #include "ucnvmbcs.h"
25 #include "ucnv_ext.h"
26 #include "uparse.h"
27 #include "ucm.h"
28 #include <stdio.h>
29
30 /* MBCS state handling ------------------------------------------------------ */
31
32 /*
33 * state table row grammar (ebnf-style):
34 * (whitespace is allowed between all tokens)
35 *
36 * row=[[firstentry ','] entry (',' entry)*]
37 * firstentry="initial" | "surrogates"
38 * (initial state (default for state 0), output is all surrogate pairs)
39 * entry=range [':' nextstate] ['.' action]
40 * range=number ['-' number]
41 * nextstate=number
42 * (0..7f)
43 * action='u' | 's' | 'p' | 'i'
44 * (unassigned, state change only, surrogate pair, illegal)
45 * number=(1- or 2-digit hexadecimal number)
46 */
47 static const char *
48 parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
49 const char *t;
50 uint32_t start, end, i;
51 int32_t entry;
52
53 /* initialize the state: all illegal with U+ffff */
54 for(i=0; i<256; ++i) {
55 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
56 }
57
58 /* skip leading white space */
59 s=u_skipWhitespace(s);
60
61 /* is there an "initial" or "surrogates" directive? */
62 if(uprv_strncmp("initial", s, 7)==0) {
63 *pFlags=MBCS_STATE_FLAG_DIRECT;
64 s=u_skipWhitespace(s+7);
65 if(*s++!=',') {
66 return s-1;
67 }
68 } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
69 *pFlags=MBCS_STATE_FLAG_SURROGATES;
70 s=u_skipWhitespace(s+10);
71 if(*s++!=',') {
72 return s-1;
73 }
74 } else if(*s==0) {
75 /* empty state row: all-illegal */
76 return NULL;
77 }
78
79 for(;;) {
80 /* read an entry, the start of the range first */
81 s=u_skipWhitespace(s);
82 start=uprv_strtoul(s, (char **)&t, 16);
83 if(s==t || 0xff<start) {
84 return s;
85 }
86 s=u_skipWhitespace(t);
87
88 /* read the end of the range if there is one */
89 if(*s=='-') {
90 s=u_skipWhitespace(s+1);
91 end=uprv_strtoul(s, (char **)&t, 16);
92 if(s==t || end<start || 0xff<end) {
93 return s;
94 }
95 s=u_skipWhitespace(t);
96 } else {
97 end=start;
98 }
99
100 /* determine the state entrys for this range */
101 if(*s!=':' && *s!='.') {
102 /* the default is: final state with valid entries */
103 entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
104 } else {
105 entry=MBCS_ENTRY_TRANSITION(0, 0);
106 if(*s==':') {
107 /* get the next state, default to 0 */
108 s=u_skipWhitespace(s+1);
109 i=uprv_strtoul(s, (char **)&t, 16);
110 if(s!=t) {
111 if(0x7f<i) {
112 return s;
113 }
114 s=u_skipWhitespace(t);
115 entry=MBCS_ENTRY_SET_STATE(entry, i);
116 }
117 }
118
119 /* get the state action, default to valid */
120 if(*s=='.') {
121 /* this is a final state */
122 entry=MBCS_ENTRY_SET_FINAL(entry);
123
124 s=u_skipWhitespace(s+1);
125 if(*s=='u') {
126 /* unassigned set U+fffe */
127 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
128 s=u_skipWhitespace(s+1);
129 } else if(*s=='p') {
130 if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
131 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
132 } else {
133 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
134 }
135 s=u_skipWhitespace(s+1);
136 } else if(*s=='s') {
137 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
138 s=u_skipWhitespace(s+1);
139 } else if(*s=='i') {
140 /* illegal set U+ffff */
141 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
142 s=u_skipWhitespace(s+1);
143 } else {
144 /* default to valid */
145 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
146 }
147 } else {
148 /* this is an intermediate state, nothing to do */
149 }
150 }
151
152 /* adjust "final valid" states according to the state flags */
153 if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
154 switch(*pFlags) {
155 case 0:
156 /* no adjustment */
157 break;
158 case MBCS_STATE_FLAG_DIRECT:
159 /* set the valid-direct code point to "unassigned"==0xfffe */
160 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
161 break;
162 case MBCS_STATE_FLAG_SURROGATES:
163 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
164 break;
165 default:
166 break;
167 }
168 }
169
170 /* set this entry for the range */
171 for(i=start; i<=end; ++i) {
172 state[i]=entry;
173 }
174
175 if(*s==',') {
176 ++s;
177 } else {
178 return *s==0 ? NULL : s;
179 }
180 }
181 }
182
183 U_CAPI void U_EXPORT2
184 ucm_addState(UCMStates *states, const char *s) {
185 const char *error;
186
187 if(states->countStates==MBCS_MAX_STATE_COUNT) {
188 fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
189 exit(U_INVALID_TABLE_FORMAT);
190 }
191
192 error=parseState(s, states->stateTable[states->countStates],
193 &states->stateFlags[states->countStates]);
194 if(error!=NULL) {
195 fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
196 exit(U_INVALID_TABLE_FORMAT);
197 }
198
199 ++states->countStates;
200 }
201
202 U_CAPI UBool U_EXPORT2
203 ucm_parseHeaderLine(UCMFile *ucm,
204 char *line, char **pKey, char **pValue) {
205 UCMStates *states;
206 char *s, *end;
207 char c;
208
209 states=&ucm->states;
210
211 /* remove comments and trailing CR and LF and remove whitespace from the end */
212 for(end=line; (c=*end)!=0; ++end) {
213 if(c=='#' || c=='\r' || c=='\n') {
214 break;
215 }
216 }
217 while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
218 --end;
219 }
220 *end=0;
221
222 /* skip leading white space and ignore empty lines */
223 s=(char *)u_skipWhitespace(line);
224 if(*s==0) {
225 return TRUE;
226 }
227
228 /* stop at the beginning of the mapping section */
229 if(uprv_memcmp(s, "CHARMAP", 7)==0) {
230 return FALSE;
231 }
232
233 /* get the key name, bracketed in <> */
234 if(*s!='<') {
235 fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
236 exit(U_INVALID_TABLE_FORMAT);
237 }
238 *pKey=++s;
239 while(*s!='>') {
240 if(*s==0) {
241 fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
242 exit(U_INVALID_TABLE_FORMAT);
243 }
244 ++s;
245 }
246 *s=0;
247
248 /* get the value string, possibly quoted */
249 s=(char *)u_skipWhitespace(s+1);
250 if(*s!='"') {
251 *pValue=s;
252 } else {
253 /* remove the quotes */
254 *pValue=s+1;
255 if(end>*pValue && *(end-1)=='"') {
256 *--end=0;
257 }
258 }
259
260 /* collect the information from the header field, ignore unknown keys */
261 if(uprv_strcmp(*pKey, "uconv_class")==0) {
262 if(uprv_strcmp(*pValue, "DBCS")==0) {
263 states->conversionType=UCNV_DBCS;
264 } else if(uprv_strcmp(*pValue, "SBCS")==0) {
265 states->conversionType = UCNV_SBCS;
266 } else if(uprv_strcmp(*pValue, "MBCS")==0) {
267 states->conversionType = UCNV_MBCS;
268 } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
269 states->conversionType = UCNV_EBCDIC_STATEFUL;
270 } else {
271 fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
272 exit(U_INVALID_TABLE_FORMAT);
273 }
274 return TRUE;
275 } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
276 c=**pValue;
277 if('1'<=c && c<='4' && (*pValue)[1]==0) {
278 states->maxCharLength=(int8_t)(c-'0');
279 states->outputType=(int8_t)(states->maxCharLength-1);
280 } else {
281 fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
282 exit(U_INVALID_TABLE_FORMAT);
283 }
284 return TRUE;
285 } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
286 c=**pValue;
287 if('1'<=c && c<='4' && (*pValue)[1]==0) {
288 states->minCharLength=(int8_t)(c-'0');
289 } else {
290 fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
291 exit(U_INVALID_TABLE_FORMAT);
292 }
293 return TRUE;
294 } else if(uprv_strcmp(*pKey, "icu:state")==0) {
295 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
296 switch(states->conversionType) {
297 case UCNV_SBCS:
298 case UCNV_DBCS:
299 case UCNV_EBCDIC_STATEFUL:
300 states->conversionType=UCNV_MBCS;
301 break;
302 case UCNV_MBCS:
303 break;
304 default:
305 fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
306 exit(U_INVALID_TABLE_FORMAT);
307 }
308
309 if(states->maxCharLength==0) {
310 fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
311 exit(U_INVALID_TABLE_FORMAT);
312 }
313 ucm_addState(states, *pValue);
314 return TRUE;
315 } else if(uprv_strcmp(*pKey, "icu:base")==0) {
316 if(**pValue==0) {
317 fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
318 exit(U_INVALID_TABLE_FORMAT);
319 }
320 uprv_strcpy(ucm->baseName, *pValue);
321 return TRUE;
322 }
323
324 return FALSE;
325 }
326
327 /* post-processing ---------------------------------------------------------- */
328
329 static int32_t
330 sumUpStates(UCMStates *states) {
331 int32_t entry, sum, state, cell, count;
332 UBool allStatesReady;
333
334 /*
335 * Sum up the offsets for all states.
336 * In each final state (where there are only final entries),
337 * the offsets add up directly.
338 * In all other state table rows, for each transition entry to another state,
339 * the offsets sum of that state needs to be added.
340 * This is achieved in at most countStates iterations.
341 */
342 allStatesReady=FALSE;
343 for(count=states->countStates; !allStatesReady && count>=0; --count) {
344 allStatesReady=TRUE;
345 for(state=states->countStates-1; state>=0; --state) {
346 if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
347 allStatesReady=FALSE;
348 sum=0;
349
350 /* at first, add up only the final delta offsets to keep them <512 */
351 for(cell=0; cell<256; ++cell) {
352 entry=states->stateTable[state][cell];
353 if(MBCS_ENTRY_IS_FINAL(entry)) {
354 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
355 case MBCS_STATE_VALID_16:
356 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
357 sum+=1;
358 break;
359 case MBCS_STATE_VALID_16_PAIR:
360 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
361 sum+=2;
362 break;
363 default:
364 /* no addition */
365 break;
366 }
367 }
368 }
369
370 /* now, add up the delta offsets for the transitional entries */
371 for(cell=0; cell<256; ++cell) {
372 entry=states->stateTable[state][cell];
373 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
374 if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
375 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
376 sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
377 } else {
378 /* that next state does not have a sum yet, we cannot finish the one for this state */
379 sum=-1;
380 break;
381 }
382 }
383 }
384
385 if(sum!=-1) {
386 states->stateOffsetSum[state]=sum;
387 states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
388 }
389 }
390 }
391 }
392
393 if(!allStatesReady) {
394 fprintf(stderr, "ucm error: the state table contains loops\n");
395 exit(U_INVALID_TABLE_FORMAT);
396 }
397
398 /*
399 * For all "direct" (i.e., initial) states>0,
400 * the offsets need to be increased by the sum of
401 * the previous initial states.
402 */
403 sum=states->stateOffsetSum[0];
404 for(state=1; state<states->countStates; ++state) {
405 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
406 int32_t sum2=sum;
407 sum+=states->stateOffsetSum[state];
408 for(cell=0; cell<256; ++cell) {
409 entry=states->stateTable[state][cell];
410 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
411 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
412 }
413 }
414 }
415 }
416
417 /* round up to the next even number to have the following data 32-bit-aligned */
418 return states->countToUCodeUnits=(sum+1)&~1;
419 }
420
421 U_CAPI void U_EXPORT2
422 ucm_processStates(UCMStates *states) {
423 int32_t entry, state, cell, count;
424
425 if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
426 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
427 exit(U_INVALID_TABLE_FORMAT);
428 }
429
430 if(states->countStates==0) {
431 switch(states->conversionType) {
432 case UCNV_SBCS:
433 /* SBCS: use MBCS data structure with a default state table */
434 if(states->maxCharLength!=1) {
435 fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
436 exit(U_INVALID_TABLE_FORMAT);
437 }
438 states->conversionType=UCNV_MBCS;
439 ucm_addState(states, "0-ff");
440 break;
441 case UCNV_MBCS:
442 fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
443 exit(U_INVALID_TABLE_FORMAT);
444 break;
445 case UCNV_EBCDIC_STATEFUL:
446 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
447 if(states->minCharLength!=1 || states->maxCharLength!=2) {
448 fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
449 exit(U_INVALID_TABLE_FORMAT);
450 }
451 states->conversionType=UCNV_MBCS;
452 ucm_addState(states, "0-ff, e:1.s, f:0.s");
453 ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
454 ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
455 ucm_addState(states, "0-ff:1.i, 40:1.");
456 ucm_addState(states, "0-ff:1.i");
457 break;
458 case UCNV_DBCS:
459 /* DBCS: use MBCS data structure with a default state table */
460 if(states->minCharLength!=2 || states->maxCharLength!=2) {
461 fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
462 exit(U_INVALID_TABLE_FORMAT);
463 }
464 states->conversionType = UCNV_MBCS;
465 ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
466 ucm_addState(states, "41-fe");
467 ucm_addState(states, "40");
468 ucm_addState(states, "");
469 break;
470 default:
471 fprintf(stderr, "ucm error: unknown charset structure\n");
472 exit(U_INVALID_TABLE_FORMAT);
473 break;
474 }
475 }
476
477 /*
478 * check that the min/max character lengths are reasonable;
479 * to do this right, all paths through the state table would have to be
480 * recursively walked while keeping track of the sequence lengths,
481 * but these simple checks cover most state tables in practice
482 */
483 if(states->maxCharLength<states->minCharLength) {
484 fprintf(stderr, "ucm error: max B/char < min B/char\n");
485 exit(U_INVALID_TABLE_FORMAT);
486 }
487
488 /* count non-direct states and compare with max B/char */
489 count=0;
490 for(state=0; state<states->countStates; ++state) {
491 if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
492 ++count;
493 }
494 }
495 if(states->maxCharLength>count+1) {
496 fprintf(stderr, "ucm error: max B/char too large\n");
497 exit(U_INVALID_TABLE_FORMAT);
498 }
499
500 if(states->minCharLength==1) {
501 int32_t action;
502
503 /*
504 * if there are single-byte characters,
505 * then the initial state must have direct result states
506 */
507 for(cell=0; cell<256; ++cell) {
508 entry=states->stateTable[0][cell];
509 if( MBCS_ENTRY_IS_FINAL(entry) &&
510 ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
511 action==MBCS_STATE_UNASSIGNED)
512 ) {
513 break;
514 }
515 }
516
517 if(cell==256) {
518 fprintf(stderr, "ucm warning: min B/char too small\n");
519 }
520 }
521
522 /*
523 * make sure that all "next state" values are within limits
524 * and that all next states after final ones have the "direct"
525 * flag of initial states
526 */
527 for(state=states->countStates-1; state>=0; --state) {
528 for(cell=0; cell<256; ++cell) {
529 entry=states->stateTable[state][cell];
530 if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
531 fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
532 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
533 exit(U_INVALID_TABLE_FORMAT);
534 }
535 if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
536 fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
537 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
538 exit(U_INVALID_TABLE_FORMAT);
539 } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
540 fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
541 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
542 exit(U_INVALID_TABLE_FORMAT);
543 }
544 }
545 }
546
547 /* is this an SI/SO (like EBCDIC-stateful) state table? */
548 if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
549 if(states->maxCharLength!=2) {
550 fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
551 exit(U_INVALID_TABLE_FORMAT);
552 }
553 if(states->countStates<3) {
554 fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
555 exit(U_INVALID_TABLE_FORMAT);
556 }
557 /* are the SI/SO all in the right places? */
558 if( states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
559 states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
560 states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
561 states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)
562 ) {
563 states->outputType=MBCS_OUTPUT_2_SISO;
564 } else {
565 fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
566 exit(U_INVALID_TABLE_FORMAT);
567 }
568 state=2;
569 } else {
570 state=1;
571 }
572
573 /* check that no unexpected state is a "direct" one */
574 while(state<states->countStates) {
575 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
576 fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
577 exit(U_INVALID_TABLE_FORMAT);
578 }
579 ++state;
580 }
581
582 sumUpStates(states);
583 }
584
585 /* find a fallback for this offset; return the index or -1 if not found */
586 U_CAPI int32_t U_EXPORT2
587 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
588 uint32_t offset) {
589 int32_t i;
590
591 if(countToUFallbacks==0) {
592 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
593 return -1;
594 }
595
596 /* do a linear search for the fallback mapping (the table is not yet sorted) */
597 for(i=0; i<countToUFallbacks; ++i) {
598 if(offset==toUFallbacks[i].offset) {
599 return i;
600 }
601 }
602 return -1;
603 }
604
605 /*
606 * This function tries to compact toUnicode tables for 2-byte codepages
607 * by finding lead bytes with all-unassigned trail bytes and adding another state
608 * for them.
609 */
610 static void
611 compactToUnicode2(UCMStates *states,
612 uint16_t **pUnicodeCodeUnits,
613 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
614 UBool verbose) {
615 int32_t (*oldStateTable)[256];
616 uint16_t count[256];
617 uint16_t *oldUnicodeCodeUnits;
618 int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
619 int32_t i, j, leadState, trailState, newState, fallback;
620 uint16_t unit;
621
622 /* find the lead state */
623 if(states->outputType==MBCS_OUTPUT_2_SISO) {
624 /* use the DBCS lead state for SI/SO codepages */
625 leadState=1;
626 } else {
627 leadState=0;
628 }
629
630 /* find the main trail state: the most used target state */
631 uprv_memset(count, 0, sizeof(count));
632 for(i=0; i<256; ++i) {
633 entry=states->stateTable[leadState][i];
634 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
635 ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
636 }
637 }
638 trailState=0;
639 for(i=1; i<states->countStates; ++i) {
640 if(count[i]>count[trailState]) {
641 trailState=i;
642 }
643 }
644
645 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
646 uprv_memset(count, 0, sizeof(count));
647 savings=0;
648 /* for each lead byte */
649 for(i=0; i<256; ++i) {
650 entry=states->stateTable[leadState][i];
651 if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
652 /* the offset is different for each lead byte */
653 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
654 /* for each trail byte for this lead byte */
655 for(j=0; j<256; ++j) {
656 entry=states->stateTable[trailState][j];
657 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
658 case MBCS_STATE_VALID_16:
659 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
660 if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
661 ++count[i];
662 } else {
663 j=999; /* do not count for this lead byte because there are assignments */
664 }
665 break;
666 case MBCS_STATE_VALID_16_PAIR:
667 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
668 if((*pUnicodeCodeUnits)[entry]==0xfffe) {
669 count[i]+=2;
670 } else {
671 j=999; /* do not count for this lead byte because there are assignments */
672 }
673 break;
674 default:
675 break;
676 }
677 }
678 if(j==256) {
679 /* all trail bytes for this lead byte are unassigned */
680 savings+=count[i];
681 } else {
682 count[i]=0;
683 }
684 }
685 }
686 /* subtract from the possible savings the cost of an additional state */
687 savings=savings*2-1024; /* count bytes, not 16-bit words */
688 if(savings<=0) {
689 return;
690 }
691 if(verbose) {
692 printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
693 }
694 if(states->countStates>=MBCS_MAX_STATE_COUNT) {
695 fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
696 return;
697 }
698
699 /* make a copy of the state table */
700 oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
701 if(oldStateTable==NULL) {
702 fprintf(stderr, "cannot compact toUnicode: out of memory\n");
703 return;
704 }
705 uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
706
707 /* add the new state */
708 /*
709 * this function does not catch the degenerate case where all lead bytes
710 * have all-unassigned trail bytes and the lead state could be removed
711 */
712 newState=states->countStates++;
713 states->stateFlags[newState]=0;
714 /* copy the old trail state, turning all assigned states into unassigned ones */
715 for(i=0; i<256; ++i) {
716 entry=states->stateTable[trailState][i];
717 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
718 case MBCS_STATE_VALID_16:
719 case MBCS_STATE_VALID_16_PAIR:
720 states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
721 break;
722 default:
723 states->stateTable[newState][i]=entry;
724 break;
725 }
726 }
727
728 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
729 for(i=0; i<256; ++i) {
730 if(count[i]>0) {
731 states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
732 }
733 }
734
735 /* sum up the new state table */
736 for(i=0; i<states->countStates; ++i) {
737 states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
738 }
739 sum=sumUpStates(states);
740
741 /* allocate a new, smaller code units array */
742 oldUnicodeCodeUnits=*pUnicodeCodeUnits;
743 if(sum==0) {
744 *pUnicodeCodeUnits=NULL;
745 if(oldUnicodeCodeUnits!=NULL) {
746 uprv_free(oldUnicodeCodeUnits);
747 }
748 uprv_free(oldStateTable);
749 return;
750 }
751 *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
752 if(*pUnicodeCodeUnits==NULL) {
753 fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
754 (long)sum);
755 /* revert to the old state table */
756 *pUnicodeCodeUnits=oldUnicodeCodeUnits;
757 --states->countStates;
758 uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
759 uprv_free(oldStateTable);
760 return;
761 }
762 for(i=0; i<sum; ++i) {
763 (*pUnicodeCodeUnits)[i]=0xfffe;
764 }
765
766 /* copy the code units for all assigned characters */
767 /*
768 * The old state table has the same lead _and_ trail states for assigned characters!
769 * The differences are in the offsets, and in the trail states for some unassigned characters.
770 * For each character with an assigned state in the new table, it was assigned in the old one.
771 * Only still-assigned characters are copied.
772 * Note that fallback mappings need to get their offset values adjusted.
773 */
774
775 /* for each initial state */
776 for(leadState=0; leadState<states->countStates; ++leadState) {
777 if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
778 /* for each lead byte from there */
779 for(i=0; i<256; ++i) {
780 entry=states->stateTable[leadState][i];
781 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
782 trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
783 /* the new state does not have assigned states */
784 if(trailState!=newState) {
785 trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
786 oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
787 /* for each trail byte */
788 for(j=0; j<256; ++j) {
789 entry=states->stateTable[trailState][j];
790 /* copy assigned-character code units and adjust fallback offsets */
791 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
792 case MBCS_STATE_VALID_16:
793 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
794 /* find the old offset according to the old state table */
795 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
796 unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
797 if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
798 toUFallbacks[fallback].offset=0x80000000|offset;
799 }
800 break;
801 case MBCS_STATE_VALID_16_PAIR:
802 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
803 /* find the old offset according to the old state table */
804 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
805 (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
806 (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
807 break;
808 default:
809 break;
810 }
811 }
812 }
813 }
814 }
815 }
816 }
817
818 /* remove temporary flags from fallback offsets that protected them from being modified twice */
819 for(i=0; i<countToUFallbacks; ++i) {
820 toUFallbacks[i].offset&=0x7fffffff;
821 }
822
823 /* free temporary memory */
824 uprv_free(oldUnicodeCodeUnits);
825 uprv_free(oldStateTable);
826 }
827
828 /*
829 * recursive sub-function of compactToUnicodeHelper()
830 * returns:
831 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
832 * if all sequences from this state are unassigned, returns the
833 * <0 there are assignments in unicodeCodeUnits[]
834 * 0 no use of unicodeCodeUnits[]
835 */
836 static int32_t
837 findUnassigned(UCMStates *states,
838 uint16_t *unicodeCodeUnits,
839 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
840 int32_t state, int32_t offset, uint32_t b) {
841 int32_t i, entry, savings, localSavings, belowSavings;
842 UBool haveAssigned;
843
844 localSavings=belowSavings=0;
845 haveAssigned=FALSE;
846 for(i=0; i<256; ++i) {
847 entry=states->stateTable[state][i];
848 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
849 savings=findUnassigned(states,
850 unicodeCodeUnits,
851 toUFallbacks, countToUFallbacks,
852 MBCS_ENTRY_TRANSITION_STATE(entry),
853 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
854 (b<<8)|(uint32_t)i);
855 if(savings<0) {
856 haveAssigned=TRUE;
857 } else if(savings>0) {
858 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
859 (unsigned long)((b<<8)|i), (long)state, (long)savings);
860 belowSavings+=savings;
861 }
862 } else if(!haveAssigned) {
863 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
864 case MBCS_STATE_VALID_16:
865 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
866 if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
867 localSavings+=2;
868 } else {
869 haveAssigned=TRUE;
870 }
871 break;
872 case MBCS_STATE_VALID_16_PAIR:
873 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
874 if(unicodeCodeUnits[entry]==0xfffe) {
875 localSavings+=4;
876 } else {
877 haveAssigned=TRUE;
878 }
879 break;
880 default:
881 break;
882 }
883 }
884 }
885 if(haveAssigned) {
886 return -1;
887 } else {
888 return localSavings+belowSavings;
889 }
890 }
891
892 /* helper function for finding compaction opportunities */
893 static void
894 compactToUnicodeHelper(UCMStates *states,
895 uint16_t *unicodeCodeUnits,
896 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
897 int32_t state, savings;
898
899 /* for each initial state */
900 for(state=0; state<states->countStates; ++state) {
901 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
902 savings=findUnassigned(states,
903 unicodeCodeUnits,
904 toUFallbacks, countToUFallbacks,
905 state, 0, 0);
906 if(savings>0) {
907 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
908 (long)state, (long)savings);
909 }
910 }
911 }
912 }
913
914 static int32_t
915 compareFallbacks(const void *context, const void *fb1, const void *fb2) {
916 return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
917 }
918
919 U_CAPI void U_EXPORT2
920 ucm_optimizeStates(UCMStates *states,
921 uint16_t **pUnicodeCodeUnits,
922 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
923 UBool verbose) {
924 UErrorCode errorCode;
925 int32_t state, cell, entry;
926
927 /* test each state table entry */
928 for(state=0; state<states->countStates; ++state) {
929 for(cell=0; cell<256; ++cell) {
930 entry=states->stateTable[state][cell];
931 /*
932 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
933 * and the code point is "unassigned" (0xfffe), then change it to
934 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
935 */
936 if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
937 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
938 }
939 }
940 }
941
942 /* try to compact the toUnicode tables */
943 if(states->maxCharLength==2) {
944 compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
945 } else if(states->maxCharLength>2) {
946 if(verbose) {
947 compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
948 }
949 }
950
951 /* sort toUFallbacks */
952 /*
953 * It should be safe to sort them before compactToUnicode2() is called,
954 * because it should not change the relative order of the offset values
955 * that it adjusts, but they need to be sorted at some point, and
956 * it is safest here.
957 */
958 if(countToUFallbacks>0) {
959 errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
960 uprv_sortArray(toUFallbacks, countToUFallbacks,
961 sizeof(_MBCSToUFallback),
962 compareFallbacks, NULL, FALSE, &errorCode);
963 }
964 }
965
966 /* use a complete state table ----------------------------------------------- */
967
968 U_CAPI int32_t U_EXPORT2
969 ucm_countChars(UCMStates *states,
970 const uint8_t *bytes, int32_t length) {
971 uint32_t offset;
972 int32_t i, entry, count;
973 uint8_t state;
974
975 offset=0;
976 i=count=0;
977 state=0;
978
979 if(states->countStates==0) {
980 fprintf(stderr, "ucm error: there is no state information!\n");
981 return -1;
982 }
983
984 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
985 if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
986 state=1;
987 }
988
989 /*
990 * Walk down the state table like in conversion,
991 * much like getNextUChar().
992 * We assume that c<=0x10ffff.
993 */
994 for(i=0; i<length; ++i) {
995 entry=states->stateTable[state][bytes[i]];
996 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
997 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
998 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
999 } else {
1000 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1001 case MBCS_STATE_ILLEGAL:
1002 fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
1003 return -1;
1004 case MBCS_STATE_CHANGE_ONLY:
1005 fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
1006 return -1;
1007 case MBCS_STATE_UNASSIGNED:
1008 case MBCS_STATE_FALLBACK_DIRECT_16:
1009 case MBCS_STATE_VALID_DIRECT_16:
1010 case MBCS_STATE_FALLBACK_DIRECT_20:
1011 case MBCS_STATE_VALID_DIRECT_20:
1012 case MBCS_STATE_VALID_16:
1013 case MBCS_STATE_VALID_16_PAIR:
1014 /* count a complete character and prepare for a new one */
1015 ++count;
1016 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1017 offset=0;
1018 break;
1019 default:
1020 /* reserved, must never occur */
1021 fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
1022 return -1;
1023 }
1024 }
1025 }
1026
1027 if(offset!=0) {
1028 fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %hu\n", state);
1029 return -1;
1030 }
1031
1032 /*
1033 * for SI/SO (like EBCDIC-stateful), multiple-character results
1034 * must consist of only double-byte sequences
1035 */
1036 if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
1037 fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
1038 return -1;
1039 }
1040
1041 return count;
1042 }