/*
*******************************************************************************
*
*   Copyright (C) 2003-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucmstate.c
*   tab size:   8 (not used)
*
*   created on: 2003oct09
*   created by: Markus W. Scherer
*
*   This file handles ICU .ucm file state information as part of the ucm module.
*   Most of this code used to be in makeconv.c.
*/

20 #include "unicode/utypes.h"
/* MBCS state handling ------------------------------------------------------ */

/*
 * state table row grammar (ebnf-style):
 * (whitespace is allowed between all tokens)
 *
 * row=[[firstentry ','] entry (',' entry)*]
 * firstentry="initial" | "surrogates"
 *            (initial state (default for state 0), output is all surrogate pairs)
 * entry=range [':' nextstate] ['.' action]
 * range=number ['-' number]
 * nextstate=number
 * action='u' | 's' | 'p' | 'i'
 *        (unassigned, state change only, surrogate pair, illegal)
 * number=(1- or 2-digit hexadecimal number)
 */
48 parseState(const char *s
, int32_t state
[256], uint32_t *pFlags
) {
50 uint32_t start
, end
, i
;
53 /* initialize the state: all illegal with U+ffff */
54 for(i
=0; i
<256; ++i
) {
55 state
[i
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL
, 0xffff);
58 /* skip leading white space */
59 s
=u_skipWhitespace(s
);
61 /* is there an "initial" or "surrogates" directive? */
62 if(uprv_strncmp("initial", s
, 7)==0) {
63 *pFlags
=MBCS_STATE_FLAG_DIRECT
;
64 s
=u_skipWhitespace(s
+7);
68 } else if(*pFlags
==0 && uprv_strncmp("surrogates", s
, 10)==0) {
69 *pFlags
=MBCS_STATE_FLAG_SURROGATES
;
70 s
=u_skipWhitespace(s
+10);
75 /* empty state row: all-illegal */
80 /* read an entry, the start of the range first */
81 s
=u_skipWhitespace(s
);
82 start
=uprv_strtoul(s
, (char **)&t
, 16);
83 if(s
==t
|| 0xff<start
) {
86 s
=u_skipWhitespace(t
);
88 /* read the end of the range if there is one */
90 s
=u_skipWhitespace(s
+1);
91 end
=uprv_strtoul(s
, (char **)&t
, 16);
92 if(s
==t
|| end
<start
|| 0xff<end
) {
95 s
=u_skipWhitespace(t
);
100 /* determine the state entrys for this range */
101 if(*s
!=':' && *s
!='.') {
102 /* the default is: final state with valid entries */
103 entry
=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16
, 0);
105 entry
=MBCS_ENTRY_TRANSITION(0, 0);
107 /* get the next state, default to 0 */
108 s
=u_skipWhitespace(s
+1);
109 i
=uprv_strtoul(s
, (char **)&t
, 16);
114 s
=u_skipWhitespace(t
);
115 entry
=MBCS_ENTRY_SET_STATE(entry
, i
);
119 /* get the state action, default to valid */
121 /* this is a final state */
122 entry
=MBCS_ENTRY_SET_FINAL(entry
);
124 s
=u_skipWhitespace(s
+1);
126 /* unassigned set U+fffe */
127 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
128 s
=u_skipWhitespace(s
+1);
130 if(*pFlags
!=MBCS_STATE_FLAG_DIRECT
) {
131 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16_PAIR
);
133 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
135 s
=u_skipWhitespace(s
+1);
137 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_CHANGE_ONLY
);
138 s
=u_skipWhitespace(s
+1);
140 /* illegal set U+ffff */
141 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_ILLEGAL
, 0xffff);
142 s
=u_skipWhitespace(s
+1);
144 /* default to valid */
145 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
148 /* this is an intermediate state, nothing to do */
152 /* adjust "final valid" states according to the state flags */
153 if(MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
) {
158 case MBCS_STATE_FLAG_DIRECT
:
159 /* set the valid-direct code point to "unassigned"==0xfffe */
160 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_DIRECT_16
, 0xfffe);
162 case MBCS_STATE_FLAG_SURROGATES
:
163 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_16_PAIR
, 0);
170 /* set this entry for the range */
171 for(i
=start
; i
<=end
; ++i
) {
178 return *s
==0 ? NULL
: s
;
183 U_CAPI
void U_EXPORT2
184 ucm_addState(UCMStates
*states
, const char *s
) {
187 if(states
->countStates
==MBCS_MAX_STATE_COUNT
) {
188 fprintf(stderr
, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT
);
189 exit(U_INVALID_TABLE_FORMAT
);
192 error
=parseState(s
, states
->stateTable
[states
->countStates
],
193 &states
->stateFlags
[states
->countStates
]);
195 fprintf(stderr
, "ucm error: parse error in state definition at '%s'\n", error
);
196 exit(U_INVALID_TABLE_FORMAT
);
199 ++states
->countStates
;
202 U_CAPI UBool U_EXPORT2
203 ucm_parseHeaderLine(UCMFile
*ucm
,
204 char *line
, char **pKey
, char **pValue
) {
211 /* remove comments and trailing CR and LF and remove whitespace from the end */
212 for(end
=line
; (c
=*end
)!=0; ++end
) {
213 if(c
=='#' || c
=='\r' || c
=='\n') {
217 while(end
>line
&& (*(end
-1)==' ' || *(end
-1)=='\t')) {
222 /* skip leading white space and ignore empty lines */
223 s
=(char *)u_skipWhitespace(line
);
228 /* stop at the beginning of the mapping section */
229 if(uprv_memcmp(s
, "CHARMAP", 7)==0) {
233 /* get the key name, bracketed in <> */
235 fprintf(stderr
, "ucm error: no header field <key> in line \"%s\"\n", line
);
236 exit(U_INVALID_TABLE_FORMAT
);
241 fprintf(stderr
, "ucm error: incomplete header field <key> in line \"%s\"\n", line
);
242 exit(U_INVALID_TABLE_FORMAT
);
248 /* get the value string, possibly quoted */
249 s
=(char *)u_skipWhitespace(s
+1);
253 /* remove the quotes */
255 if(end
>*pValue
&& *(end
-1)=='"') {
260 /* collect the information from the header field, ignore unknown keys */
261 if(uprv_strcmp(*pKey
, "uconv_class")==0) {
262 if(uprv_strcmp(*pValue
, "DBCS")==0) {
263 states
->conversionType
=UCNV_DBCS
;
264 } else if(uprv_strcmp(*pValue
, "SBCS")==0) {
265 states
->conversionType
= UCNV_SBCS
;
266 } else if(uprv_strcmp(*pValue
, "MBCS")==0) {
267 states
->conversionType
= UCNV_MBCS
;
268 } else if(uprv_strcmp(*pValue
, "EBCDIC_STATEFUL")==0) {
269 states
->conversionType
= UCNV_EBCDIC_STATEFUL
;
271 fprintf(stderr
, "ucm error: unknown <uconv_class> %s\n", *pValue
);
272 exit(U_INVALID_TABLE_FORMAT
);
275 } else if(uprv_strcmp(*pKey
, "mb_cur_max")==0) {
277 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
278 states
->maxCharLength
=(int8_t)(c
-'0');
279 states
->outputType
=(int8_t)(states
->maxCharLength
-1);
281 fprintf(stderr
, "ucm error: illegal <mb_cur_max> %s\n", *pValue
);
282 exit(U_INVALID_TABLE_FORMAT
);
285 } else if(uprv_strcmp(*pKey
, "mb_cur_min")==0) {
287 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
288 states
->minCharLength
=(int8_t)(c
-'0');
290 fprintf(stderr
, "ucm error: illegal <mb_cur_min> %s\n", *pValue
);
291 exit(U_INVALID_TABLE_FORMAT
);
294 } else if(uprv_strcmp(*pKey
, "icu:state")==0) {
295 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
296 switch(states
->conversionType
) {
299 case UCNV_EBCDIC_STATEFUL
:
300 states
->conversionType
=UCNV_MBCS
;
305 fprintf(stderr
, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
306 exit(U_INVALID_TABLE_FORMAT
);
309 if(states
->maxCharLength
==0) {
310 fprintf(stderr
, "ucm error: <icu:state> before the <mb_cur_max> line\n");
311 exit(U_INVALID_TABLE_FORMAT
);
313 ucm_addState(states
, *pValue
);
315 } else if(uprv_strcmp(*pKey
, "icu:base")==0) {
317 fprintf(stderr
, "ucm error: <icu:base> without a base table name\n");
318 exit(U_INVALID_TABLE_FORMAT
);
320 uprv_strcpy(ucm
->baseName
, *pValue
);
327 /* post-processing ---------------------------------------------------------- */
330 sumUpStates(UCMStates
*states
) {
331 int32_t entry
, sum
, state
, cell
, count
;
332 UBool allStatesReady
;
335 * Sum up the offsets for all states.
336 * In each final state (where there are only final entries),
337 * the offsets add up directly.
338 * In all other state table rows, for each transition entry to another state,
339 * the offsets sum of that state needs to be added.
340 * This is achieved in at most countStates iterations.
342 allStatesReady
=FALSE
;
343 for(count
=states
->countStates
; !allStatesReady
&& count
>=0; --count
) {
345 for(state
=states
->countStates
-1; state
>=0; --state
) {
346 if(!(states
->stateFlags
[state
]&MBCS_STATE_FLAG_READY
)) {
347 allStatesReady
=FALSE
;
350 /* at first, add up only the final delta offsets to keep them <512 */
351 for(cell
=0; cell
<256; ++cell
) {
352 entry
=states
->stateTable
[state
][cell
];
353 if(MBCS_ENTRY_IS_FINAL(entry
)) {
354 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
355 case MBCS_STATE_VALID_16
:
356 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
359 case MBCS_STATE_VALID_16_PAIR
:
360 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
370 /* now, add up the delta offsets for the transitional entries */
371 for(cell
=0; cell
<256; ++cell
) {
372 entry
=states
->stateTable
[state
][cell
];
373 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
374 if(states
->stateFlags
[MBCS_ENTRY_TRANSITION_STATE(entry
)]&MBCS_STATE_FLAG_READY
) {
375 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry
, sum
);
376 sum
+=states
->stateOffsetSum
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
378 /* that next state does not have a sum yet, we cannot finish the one for this state */
386 states
->stateOffsetSum
[state
]=sum
;
387 states
->stateFlags
[state
]|=MBCS_STATE_FLAG_READY
;
393 if(!allStatesReady
) {
394 fprintf(stderr
, "ucm error: the state table contains loops\n");
395 exit(U_INVALID_TABLE_FORMAT
);
399 * For all "direct" (i.e., initial) states>0,
400 * the offsets need to be increased by the sum of
401 * the previous initial states.
403 sum
=states
->stateOffsetSum
[0];
404 for(state
=1; state
<states
->countStates
; ++state
) {
405 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
407 sum
+=states
->stateOffsetSum
[state
];
408 for(cell
=0; cell
<256; ++cell
) {
409 entry
=states
->stateTable
[state
][cell
];
410 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
411 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry
, sum2
);
417 /* round up to the next even number to have the following data 32-bit-aligned */
418 return states
->countToUCodeUnits
=(sum
+1)&~1;
421 U_CAPI
void U_EXPORT2
422 ucm_processStates(UCMStates
*states
) {
423 int32_t entry
, state
, cell
, count
;
425 if(states
->conversionType
==UCNV_UNSUPPORTED_CONVERTER
) {
426 fprintf(stderr
, "ucm error: missing conversion type (<uconv_class>)\n");
427 exit(U_INVALID_TABLE_FORMAT
);
430 if(states
->countStates
==0) {
431 switch(states
->conversionType
) {
433 /* SBCS: use MBCS data structure with a default state table */
434 if(states
->maxCharLength
!=1) {
435 fprintf(stderr
, "error: SBCS codepage with max B/char!=1\n");
436 exit(U_INVALID_TABLE_FORMAT
);
438 states
->conversionType
=UCNV_MBCS
;
439 ucm_addState(states
, "0-ff");
442 fprintf(stderr
, "ucm error: missing state table information (<icu:state>) for MBCS\n");
443 exit(U_INVALID_TABLE_FORMAT
);
445 case UCNV_EBCDIC_STATEFUL
:
446 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
447 if(states
->minCharLength
!=1 || states
->maxCharLength
!=2) {
448 fprintf(stderr
, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
449 exit(U_INVALID_TABLE_FORMAT
);
451 states
->conversionType
=UCNV_MBCS
;
452 ucm_addState(states
, "0-ff, e:1.s, f:0.s");
453 ucm_addState(states
, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
454 ucm_addState(states
, "0-40:1.i, 41-fe:1., ff:1.i");
455 ucm_addState(states
, "0-ff:1.i, 40:1.");
456 ucm_addState(states
, "0-ff:1.i");
459 /* DBCS: use MBCS data structure with a default state table */
460 if(states
->minCharLength
!=2 || states
->maxCharLength
!=2) {
461 fprintf(stderr
, "error: DBCS codepage with min or max B/char!=2\n");
462 exit(U_INVALID_TABLE_FORMAT
);
464 states
->conversionType
= UCNV_MBCS
;
465 ucm_addState(states
, "0-3f:3, 40:2, 41-fe:1, ff:3");
466 ucm_addState(states
, "41-fe");
467 ucm_addState(states
, "40");
468 ucm_addState(states
, "");
471 fprintf(stderr
, "ucm error: unknown charset structure\n");
472 exit(U_INVALID_TABLE_FORMAT
);
478 * check that the min/max character lengths are reasonable;
479 * to do this right, all paths through the state table would have to be
480 * recursively walked while keeping track of the sequence lengths,
481 * but these simple checks cover most state tables in practice
483 if(states
->maxCharLength
<states
->minCharLength
) {
484 fprintf(stderr
, "ucm error: max B/char < min B/char\n");
485 exit(U_INVALID_TABLE_FORMAT
);
488 /* count non-direct states and compare with max B/char */
490 for(state
=0; state
<states
->countStates
; ++state
) {
491 if((states
->stateFlags
[state
]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
495 if(states
->maxCharLength
>count
+1) {
496 fprintf(stderr
, "ucm error: max B/char too large\n");
497 exit(U_INVALID_TABLE_FORMAT
);
500 if(states
->minCharLength
==1) {
504 * if there are single-byte characters,
505 * then the initial state must have direct result states
507 for(cell
=0; cell
<256; ++cell
) {
508 entry
=states
->stateTable
[0][cell
];
509 if( MBCS_ENTRY_IS_FINAL(entry
) &&
510 ((action
=MBCS_ENTRY_FINAL_ACTION(entry
))==MBCS_STATE_VALID_DIRECT_16
||
511 action
==MBCS_STATE_UNASSIGNED
)
518 fprintf(stderr
, "ucm warning: min B/char too small\n");
523 * make sure that all "next state" values are within limits
524 * and that all next states after final ones have the "direct"
525 * flag of initial states
527 for(state
=states
->countStates
-1; state
>=0; --state
) {
528 for(cell
=0; cell
<256; ++cell
) {
529 entry
=states
->stateTable
[state
][cell
];
530 if((uint8_t)MBCS_ENTRY_STATE(entry
)>=states
->countStates
) {
531 fprintf(stderr
, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
532 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
533 exit(U_INVALID_TABLE_FORMAT
);
535 if(MBCS_ENTRY_IS_FINAL(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
536 fprintf(stderr
, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
537 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
538 exit(U_INVALID_TABLE_FORMAT
);
539 } else if(MBCS_ENTRY_IS_TRANSITION(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
540 fprintf(stderr
, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
541 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
542 exit(U_INVALID_TABLE_FORMAT
);
547 /* is this an SI/SO (like EBCDIC-stateful) state table? */
548 if(states
->countStates
>=2 && (states
->stateFlags
[1]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
549 if(states
->maxCharLength
!=2) {
550 fprintf(stderr
, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states
->maxCharLength
);
551 exit(U_INVALID_TABLE_FORMAT
);
553 if(states
->countStates
<3) {
554 fprintf(stderr
, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states
->countStates
);
555 exit(U_INVALID_TABLE_FORMAT
);
557 /* are the SI/SO all in the right places? */
558 if( states
->stateTable
[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
559 states
->stateTable
[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0) &&
560 states
->stateTable
[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
561 states
->stateTable
[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0)
563 states
->outputType
=MBCS_OUTPUT_2_SISO
;
565 fprintf(stderr
, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
566 exit(U_INVALID_TABLE_FORMAT
);
573 /* check that no unexpected state is a "direct" one */
574 while(state
<states
->countStates
) {
575 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
576 fprintf(stderr
, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state
);
577 exit(U_INVALID_TABLE_FORMAT
);
585 /* find a fallback for this offset; return the index or -1 if not found */
586 U_CAPI
int32_t U_EXPORT2
587 ucm_findFallback(_MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
591 if(countToUFallbacks
==0) {
592 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
596 /* do a linear search for the fallback mapping (the table is not yet sorted) */
597 for(i
=0; i
<countToUFallbacks
; ++i
) {
598 if(offset
==toUFallbacks
[i
].offset
) {
606 * This function tries to compact toUnicode tables for 2-byte codepages
607 * by finding lead bytes with all-unassigned trail bytes and adding another state
611 compactToUnicode2(UCMStates
*states
,
612 uint16_t **pUnicodeCodeUnits
,
613 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
615 int32_t (*oldStateTable
)[256];
617 uint16_t *oldUnicodeCodeUnits
;
618 int32_t entry
, offset
, oldOffset
, trailOffset
, oldTrailOffset
, savings
, sum
;
619 int32_t i
, j
, leadState
, trailState
, newState
, fallback
;
622 /* find the lead state */
623 if(states
->outputType
==MBCS_OUTPUT_2_SISO
) {
624 /* use the DBCS lead state for SI/SO codepages */
630 /* find the main trail state: the most used target state */
631 uprv_memset(count
, 0, sizeof(count
));
632 for(i
=0; i
<256; ++i
) {
633 entry
=states
->stateTable
[leadState
][i
];
634 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
635 ++count
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
639 for(i
=1; i
<states
->countStates
; ++i
) {
640 if(count
[i
]>count
[trailState
]) {
645 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
646 uprv_memset(count
, 0, sizeof(count
));
648 /* for each lead byte */
649 for(i
=0; i
<256; ++i
) {
650 entry
=states
->stateTable
[leadState
][i
];
651 if(MBCS_ENTRY_IS_TRANSITION(entry
) && (MBCS_ENTRY_TRANSITION_STATE(entry
))==trailState
) {
652 /* the offset is different for each lead byte */
653 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
654 /* for each trail byte for this lead byte */
655 for(j
=0; j
<256; ++j
) {
656 entry
=states
->stateTable
[trailState
][j
];
657 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
658 case MBCS_STATE_VALID_16
:
659 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
660 if((*pUnicodeCodeUnits
)[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
663 j
=999; /* do not count for this lead byte because there are assignments */
666 case MBCS_STATE_VALID_16_PAIR
:
667 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
668 if((*pUnicodeCodeUnits
)[entry
]==0xfffe) {
671 j
=999; /* do not count for this lead byte because there are assignments */
679 /* all trail bytes for this lead byte are unassigned */
686 /* subtract from the possible savings the cost of an additional state */
687 savings
=savings
*2-1024; /* count bytes, not 16-bit words */
692 printf("compacting toUnicode data saves %ld bytes\n", (long)savings
);
694 if(states
->countStates
>=MBCS_MAX_STATE_COUNT
) {
695 fprintf(stderr
, "cannot compact toUnicode because the maximum number of states is reached\n");
699 /* make a copy of the state table */
700 oldStateTable
=(int32_t (*)[256])uprv_malloc(states
->countStates
*1024);
701 if(oldStateTable
==NULL
) {
702 fprintf(stderr
, "cannot compact toUnicode: out of memory\n");
705 uprv_memcpy(oldStateTable
, states
->stateTable
, states
->countStates
*1024);
707 /* add the new state */
709 * this function does not catch the degenerate case where all lead bytes
710 * have all-unassigned trail bytes and the lead state could be removed
712 newState
=states
->countStates
++;
713 states
->stateFlags
[newState
]=0;
714 /* copy the old trail state, turning all assigned states into unassigned ones */
715 for(i
=0; i
<256; ++i
) {
716 entry
=states
->stateTable
[trailState
][i
];
717 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
718 case MBCS_STATE_VALID_16
:
719 case MBCS_STATE_VALID_16_PAIR
:
720 states
->stateTable
[newState
][i
]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
723 states
->stateTable
[newState
][i
]=entry
;
728 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
729 for(i
=0; i
<256; ++i
) {
731 states
->stateTable
[leadState
][i
]=MBCS_ENTRY_SET_STATE(states
->stateTable
[leadState
][i
], newState
);
735 /* sum up the new state table */
736 for(i
=0; i
<states
->countStates
; ++i
) {
737 states
->stateFlags
[i
]&=~MBCS_STATE_FLAG_READY
;
739 sum
=sumUpStates(states
);
741 /* allocate a new, smaller code units array */
742 oldUnicodeCodeUnits
=*pUnicodeCodeUnits
;
744 *pUnicodeCodeUnits
=NULL
;
745 if(oldUnicodeCodeUnits
!=NULL
) {
746 uprv_free(oldUnicodeCodeUnits
);
748 uprv_free(oldStateTable
);
751 *pUnicodeCodeUnits
=(uint16_t *)uprv_malloc(sum
*sizeof(uint16_t));
752 if(*pUnicodeCodeUnits
==NULL
) {
753 fprintf(stderr
, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
755 /* revert to the old state table */
756 *pUnicodeCodeUnits
=oldUnicodeCodeUnits
;
757 --states
->countStates
;
758 uprv_memcpy(states
->stateTable
, oldStateTable
, states
->countStates
*1024);
759 uprv_free(oldStateTable
);
762 for(i
=0; i
<sum
; ++i
) {
763 (*pUnicodeCodeUnits
)[i
]=0xfffe;
766 /* copy the code units for all assigned characters */
768 * The old state table has the same lead _and_ trail states for assigned characters!
769 * The differences are in the offsets, and in the trail states for some unassigned characters.
770 * For each character with an assigned state in the new table, it was assigned in the old one.
771 * Only still-assigned characters are copied.
772 * Note that fallback mappings need to get their offset values adjusted.
775 /* for each initial state */
776 for(leadState
=0; leadState
<states
->countStates
; ++leadState
) {
777 if((states
->stateFlags
[leadState
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
778 /* for each lead byte from there */
779 for(i
=0; i
<256; ++i
) {
780 entry
=states
->stateTable
[leadState
][i
];
781 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
782 trailState
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
783 /* the new state does not have assigned states */
784 if(trailState
!=newState
) {
785 trailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
786 oldTrailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable
[leadState
][i
]);
787 /* for each trail byte */
788 for(j
=0; j
<256; ++j
) {
789 entry
=states
->stateTable
[trailState
][j
];
790 /* copy assigned-character code units and adjust fallback offsets */
791 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
792 case MBCS_STATE_VALID_16
:
793 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
794 /* find the old offset according to the old state table */
795 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
796 unit
=(*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
797 if(unit
==0xfffe && (fallback
=ucm_findFallback(toUFallbacks
, countToUFallbacks
, oldOffset
))>=0) {
798 toUFallbacks
[fallback
].offset
=0x80000000|offset
;
801 case MBCS_STATE_VALID_16_PAIR
:
802 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
803 /* find the old offset according to the old state table */
804 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
805 (*pUnicodeCodeUnits
)[offset
++]=oldUnicodeCodeUnits
[oldOffset
++];
806 (*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
818 /* remove temporary flags from fallback offsets that protected them from being modified twice */
819 for(i
=0; i
<countToUFallbacks
; ++i
) {
820 toUFallbacks
[i
].offset
&=0x7fffffff;
823 /* free temporary memory */
824 uprv_free(oldUnicodeCodeUnits
);
825 uprv_free(oldStateTable
);
829 * recursive sub-function of compactToUnicodeHelper()
831 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
832 * if all sequences from this state are unassigned, returns the
833 * <0 there are assignments in unicodeCodeUnits[]
834 * 0 no use of unicodeCodeUnits[]
837 findUnassigned(UCMStates
*states
,
838 uint16_t *unicodeCodeUnits
,
839 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
840 int32_t state
, int32_t offset
, uint32_t b
) {
841 int32_t i
, entry
, savings
, localSavings
, belowSavings
;
844 localSavings
=belowSavings
=0;
846 for(i
=0; i
<256; ++i
) {
847 entry
=states
->stateTable
[state
][i
];
848 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
849 savings
=findUnassigned(states
,
851 toUFallbacks
, countToUFallbacks
,
852 MBCS_ENTRY_TRANSITION_STATE(entry
),
853 offset
+MBCS_ENTRY_TRANSITION_OFFSET(entry
),
857 } else if(savings
>0) {
858 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
859 (unsigned long)((b
<<8)|i
), (long)state
, (long)savings
);
860 belowSavings
+=savings
;
862 } else if(!haveAssigned
) {
863 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
864 case MBCS_STATE_VALID_16
:
865 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
866 if(unicodeCodeUnits
[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
872 case MBCS_STATE_VALID_16_PAIR
:
873 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
874 if(unicodeCodeUnits
[entry
]==0xfffe) {
888 return localSavings
+belowSavings
;
892 /* helper function for finding compaction opportunities */
894 compactToUnicodeHelper(UCMStates
*states
,
895 uint16_t *unicodeCodeUnits
,
896 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
) {
897 int32_t state
, savings
;
899 /* for each initial state */
900 for(state
=0; state
<states
->countStates
; ++state
) {
901 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
902 savings
=findUnassigned(states
,
904 toUFallbacks
, countToUFallbacks
,
907 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
908 (long)state
, (long)savings
);
915 compareFallbacks(const void *context
, const void *fb1
, const void *fb2
) {
916 return ((const _MBCSToUFallback
*)fb1
)->offset
-((const _MBCSToUFallback
*)fb2
)->offset
;
919 U_CAPI
void U_EXPORT2
920 ucm_optimizeStates(UCMStates
*states
,
921 uint16_t **pUnicodeCodeUnits
,
922 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
924 UErrorCode errorCode
;
925 int32_t state
, cell
, entry
;
927 /* test each state table entry */
928 for(state
=0; state
<states
->countStates
; ++state
) {
929 for(cell
=0; cell
<256; ++cell
) {
930 entry
=states
->stateTable
[state
][cell
];
932 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
933 * and the code point is "unassigned" (0xfffe), then change it to
934 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
936 if(MBCS_ENTRY_SET_STATE(entry
, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, 0xfffe)) {
937 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_UNASSIGNED
);
942 /* try to compact the toUnicode tables */
943 if(states
->maxCharLength
==2) {
944 compactToUnicode2(states
, pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
, verbose
);
945 } else if(states
->maxCharLength
>2) {
947 compactToUnicodeHelper(states
, *pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
);
951 /* sort toUFallbacks */
953 * It should be safe to sort them before compactToUnicode2() is called,
954 * because it should not change the relative order of the offset values
955 * that it adjusts, but they need to be sorted at some point, and
958 if(countToUFallbacks
>0) {
959 errorCode
=U_ZERO_ERROR
; /* nothing bad will happen... */
960 uprv_sortArray(toUFallbacks
, countToUFallbacks
,
961 sizeof(_MBCSToUFallback
),
962 compareFallbacks
, NULL
, FALSE
, &errorCode
);
966 /* use a complete state table ----------------------------------------------- */
968 U_CAPI
int32_t U_EXPORT2
969 ucm_countChars(UCMStates
*states
,
970 const uint8_t *bytes
, int32_t length
) {
972 int32_t i
, entry
, count
;
979 if(states
->countStates
==0) {
980 fprintf(stderr
, "ucm error: there is no state information!\n");
984 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
985 if(length
==2 && states
->outputType
==MBCS_OUTPUT_2_SISO
) {
990 * Walk down the state table like in conversion,
991 * much like getNextUChar().
992 * We assume that c<=0x10ffff.
994 for(i
=0; i
<length
; ++i
) {
995 entry
=states
->stateTable
[state
][bytes
[i
]];
996 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
997 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
998 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
1000 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
1001 case MBCS_STATE_ILLEGAL
:
1002 fprintf(stderr
, "ucm error: byte sequence ends in illegal state\n");
1004 case MBCS_STATE_CHANGE_ONLY
:
1005 fprintf(stderr
, "ucm error: byte sequence ends in state-change-only\n");
1007 case MBCS_STATE_UNASSIGNED
:
1008 case MBCS_STATE_FALLBACK_DIRECT_16
:
1009 case MBCS_STATE_VALID_DIRECT_16
:
1010 case MBCS_STATE_FALLBACK_DIRECT_20
:
1011 case MBCS_STATE_VALID_DIRECT_20
:
1012 case MBCS_STATE_VALID_16
:
1013 case MBCS_STATE_VALID_16_PAIR
:
1014 /* count a complete character and prepare for a new one */
1016 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
);
1020 /* reserved, must never occur */
1021 fprintf(stderr
, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry
);
1028 fprintf(stderr
, "ucm error: byte sequence too short, ends in non-final state %hu\n", state
);
1033 * for SI/SO (like EBCDIC-stateful), multiple-character results
1034 * must consist of only double-byte sequences
1036 if(count
>1 && states
->outputType
==MBCS_OUTPUT_2_SISO
&& length
!=2*count
) {
1037 fprintf(stderr
, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count
);