1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2003-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ucmstate.c
12 * tab size: 8 (not used)
15 * created on: 2003oct09
16 * created by: Markus W. Scherer
18 * This file handles ICU .ucm file state information as part of the ucm module.
19 * Most of this code used to be in makeconv.c.
22 #include "unicode/utypes.h"
32 #if !UCONFIG_NO_CONVERSION
34 /* MBCS state handling ------------------------------------------------------ */
37 * state table row grammar (ebnf-style):
38 * (whitespace is allowed between all tokens)
40 * row=[[firstentry ','] entry (',' entry)*]
41 * firstentry="initial" | "surrogates"
42 * (initial state (default for state 0), output is all surrogate pairs)
43 * entry=range [':' nextstate] ['.' action]
44 * range=number ['-' number]
47 * action='u' | 's' | 'p' | 'i'
48 * (unassigned, state change only, surrogate pair, illegal)
49 * number=(1- or 2-digit hexadecimal number)
52 parseState(const char *s
, int32_t state
[256], uint32_t *pFlags
) {
54 uint32_t start
, end
, i
;
57 /* initialize the state: all illegal with U+ffff */
58 for(i
=0; i
<256; ++i
) {
59 state
[i
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL
, 0xffff);
62 /* skip leading white space */
63 s
=u_skipWhitespace(s
);
65 /* is there an "initial" or "surrogates" directive? */
66 if(uprv_strncmp("initial", s
, 7)==0) {
67 *pFlags
=MBCS_STATE_FLAG_DIRECT
;
68 s
=u_skipWhitespace(s
+7);
72 } else if(*pFlags
==0 && uprv_strncmp("surrogates", s
, 10)==0) {
73 *pFlags
=MBCS_STATE_FLAG_SURROGATES
;
74 s
=u_skipWhitespace(s
+10);
79 /* empty state row: all-illegal */
84 /* read an entry, the start of the range first */
85 s
=u_skipWhitespace(s
);
86 start
=uprv_strtoul(s
, (char **)&t
, 16);
87 if(s
==t
|| 0xff<start
) {
90 s
=u_skipWhitespace(t
);
92 /* read the end of the range if there is one */
94 s
=u_skipWhitespace(s
+1);
95 end
=uprv_strtoul(s
, (char **)&t
, 16);
96 if(s
==t
|| end
<start
|| 0xff<end
) {
99 s
=u_skipWhitespace(t
);
104 /* determine the state entries for this range */
105 if(*s
!=':' && *s
!='.') {
106 /* the default is: final state with valid entries */
107 entry
=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16
, 0);
109 entry
=MBCS_ENTRY_TRANSITION(0, 0);
111 /* get the next state, default to 0 */
112 s
=u_skipWhitespace(s
+1);
113 i
=uprv_strtoul(s
, (char **)&t
, 16);
118 s
=u_skipWhitespace(t
);
119 entry
=MBCS_ENTRY_SET_STATE(entry
, i
);
123 /* get the state action, default to valid */
125 /* this is a final state */
126 entry
=MBCS_ENTRY_SET_FINAL(entry
);
128 s
=u_skipWhitespace(s
+1);
130 /* unassigned set U+fffe */
131 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
132 s
=u_skipWhitespace(s
+1);
134 if(*pFlags
!=MBCS_STATE_FLAG_DIRECT
) {
135 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16_PAIR
);
137 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
139 s
=u_skipWhitespace(s
+1);
141 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_CHANGE_ONLY
);
142 s
=u_skipWhitespace(s
+1);
144 /* illegal set U+ffff */
145 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_ILLEGAL
, 0xffff);
146 s
=u_skipWhitespace(s
+1);
148 /* default to valid */
149 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
152 /* this is an intermediate state, nothing to do */
156 /* adjust "final valid" states according to the state flags */
157 if(MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
) {
162 case MBCS_STATE_FLAG_DIRECT
:
163 /* set the valid-direct code point to "unassigned"==0xfffe */
164 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_DIRECT_16
, 0xfffe);
166 case MBCS_STATE_FLAG_SURROGATES
:
167 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_16_PAIR
, 0);
174 /* set this entry for the range */
175 for(i
=start
; i
<=end
; ++i
) {
182 return *s
==0 ? NULL
: s
;
187 U_CAPI
void U_EXPORT2
188 ucm_addState(UCMStates
*states
, const char *s
) {
191 if(states
->countStates
==MBCS_MAX_STATE_COUNT
) {
192 fprintf(stderr
, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT
);
193 exit(U_INVALID_TABLE_FORMAT
);
196 error
=parseState(s
, states
->stateTable
[states
->countStates
],
197 &states
->stateFlags
[states
->countStates
]);
199 fprintf(stderr
, "ucm error: parse error in state definition at '%s'\n", error
);
200 exit(U_INVALID_TABLE_FORMAT
);
203 ++states
->countStates
;
206 U_CAPI UBool U_EXPORT2
207 ucm_parseHeaderLine(UCMFile
*ucm
,
208 char *line
, char **pKey
, char **pValue
) {
215 /* remove comments and trailing CR and LF and remove whitespace from the end */
216 for(end
=line
; (c
=*end
)!=0; ++end
) {
217 if(c
=='#' || c
=='\r' || c
=='\n') {
221 while(end
>line
&& (*(end
-1)==' ' || *(end
-1)=='\t')) {
226 /* skip leading white space and ignore empty lines */
227 s
=(char *)u_skipWhitespace(line
);
232 /* stop at the beginning of the mapping section */
233 if(uprv_memcmp(s
, "CHARMAP", 7)==0) {
237 /* get the key name, bracketed in <> */
239 fprintf(stderr
, "ucm error: no header field <key> in line \"%s\"\n", line
);
240 exit(U_INVALID_TABLE_FORMAT
);
245 fprintf(stderr
, "ucm error: incomplete header field <key> in line \"%s\"\n", line
);
246 exit(U_INVALID_TABLE_FORMAT
);
252 /* get the value string, possibly quoted */
253 s
=(char *)u_skipWhitespace(s
+1);
257 /* remove the quotes */
259 if(end
>*pValue
&& *(end
-1)=='"') {
264 /* collect the information from the header field, ignore unknown keys */
265 if(uprv_strcmp(*pKey
, "uconv_class")==0) {
266 if(uprv_strcmp(*pValue
, "DBCS")==0) {
267 states
->conversionType
=UCNV_DBCS
;
268 } else if(uprv_strcmp(*pValue
, "SBCS")==0) {
269 states
->conversionType
= UCNV_SBCS
;
270 } else if(uprv_strcmp(*pValue
, "MBCS")==0) {
271 states
->conversionType
= UCNV_MBCS
;
272 } else if(uprv_strcmp(*pValue
, "EBCDIC_STATEFUL")==0) {
273 states
->conversionType
= UCNV_EBCDIC_STATEFUL
;
275 fprintf(stderr
, "ucm error: unknown <uconv_class> %s\n", *pValue
);
276 exit(U_INVALID_TABLE_FORMAT
);
279 } else if(uprv_strcmp(*pKey
, "mb_cur_max")==0) {
281 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
282 states
->maxCharLength
=(int8_t)(c
-'0');
283 states
->outputType
=(int8_t)(states
->maxCharLength
-1);
285 fprintf(stderr
, "ucm error: illegal <mb_cur_max> %s\n", *pValue
);
286 exit(U_INVALID_TABLE_FORMAT
);
289 } else if(uprv_strcmp(*pKey
, "mb_cur_min")==0) {
291 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
292 states
->minCharLength
=(int8_t)(c
-'0');
294 fprintf(stderr
, "ucm error: illegal <mb_cur_min> %s\n", *pValue
);
295 exit(U_INVALID_TABLE_FORMAT
);
298 } else if(uprv_strcmp(*pKey
, "icu:state")==0) {
299 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
300 switch(states
->conversionType
) {
303 case UCNV_EBCDIC_STATEFUL
:
304 states
->conversionType
=UCNV_MBCS
;
309 fprintf(stderr
, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
310 exit(U_INVALID_TABLE_FORMAT
);
313 if(states
->maxCharLength
==0) {
314 fprintf(stderr
, "ucm error: <icu:state> before the <mb_cur_max> line\n");
315 exit(U_INVALID_TABLE_FORMAT
);
317 ucm_addState(states
, *pValue
);
319 } else if(uprv_strcmp(*pKey
, "icu:base")==0) {
321 fprintf(stderr
, "ucm error: <icu:base> without a base table name\n");
322 exit(U_INVALID_TABLE_FORMAT
);
324 uprv_strcpy(ucm
->baseName
, *pValue
);
331 /* post-processing ---------------------------------------------------------- */
334 sumUpStates(UCMStates
*states
) {
335 int32_t entry
, sum
, state
, cell
, count
;
336 UBool allStatesReady
;
339 * Sum up the offsets for all states.
340 * In each final state (where there are only final entries),
341 * the offsets add up directly.
342 * In all other state table rows, for each transition entry to another state,
343 * the offsets sum of that state needs to be added.
344 * This is achieved in at most countStates iterations.
346 allStatesReady
=FALSE
;
347 for(count
=states
->countStates
; !allStatesReady
&& count
>=0; --count
) {
349 for(state
=states
->countStates
-1; state
>=0; --state
) {
350 if(!(states
->stateFlags
[state
]&MBCS_STATE_FLAG_READY
)) {
351 allStatesReady
=FALSE
;
354 /* at first, add up only the final delta offsets to keep them <512 */
355 for(cell
=0; cell
<256; ++cell
) {
356 entry
=states
->stateTable
[state
][cell
];
357 if(MBCS_ENTRY_IS_FINAL(entry
)) {
358 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
359 case MBCS_STATE_VALID_16
:
360 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
363 case MBCS_STATE_VALID_16_PAIR
:
364 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
374 /* now, add up the delta offsets for the transitional entries */
375 for(cell
=0; cell
<256; ++cell
) {
376 entry
=states
->stateTable
[state
][cell
];
377 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
378 if(states
->stateFlags
[MBCS_ENTRY_TRANSITION_STATE(entry
)]&MBCS_STATE_FLAG_READY
) {
379 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry
, sum
);
380 sum
+=states
->stateOffsetSum
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
382 /* that next state does not have a sum yet, we cannot finish the one for this state */
390 states
->stateOffsetSum
[state
]=sum
;
391 states
->stateFlags
[state
]|=MBCS_STATE_FLAG_READY
;
397 if(!allStatesReady
) {
398 fprintf(stderr
, "ucm error: the state table contains loops\n");
399 exit(U_INVALID_TABLE_FORMAT
);
403 * For all "direct" (i.e., initial) states>0,
404 * the offsets need to be increased by the sum of
405 * the previous initial states.
407 sum
=states
->stateOffsetSum
[0];
408 for(state
=1; state
<states
->countStates
; ++state
) {
409 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
411 sum
+=states
->stateOffsetSum
[state
];
412 for(cell
=0; cell
<256; ++cell
) {
413 entry
=states
->stateTable
[state
][cell
];
414 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
415 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry
, sum2
);
421 /* round up to the next even number to have the following data 32-bit-aligned */
422 return states
->countToUCodeUnits
=(sum
+1)&~1;
425 U_CAPI
void U_EXPORT2
426 ucm_processStates(UCMStates
*states
, UBool ignoreSISOCheck
) {
427 int32_t entry
, state
, cell
, count
;
429 if(states
->conversionType
==UCNV_UNSUPPORTED_CONVERTER
) {
430 fprintf(stderr
, "ucm error: missing conversion type (<uconv_class>)\n");
431 exit(U_INVALID_TABLE_FORMAT
);
434 if(states
->countStates
==0) {
435 switch(states
->conversionType
) {
437 /* SBCS: use MBCS data structure with a default state table */
438 if(states
->maxCharLength
!=1) {
439 fprintf(stderr
, "error: SBCS codepage with max B/char!=1\n");
440 exit(U_INVALID_TABLE_FORMAT
);
442 states
->conversionType
=UCNV_MBCS
;
443 ucm_addState(states
, "0-ff");
446 fprintf(stderr
, "ucm error: missing state table information (<icu:state>) for MBCS\n");
447 exit(U_INVALID_TABLE_FORMAT
);
449 case UCNV_EBCDIC_STATEFUL
:
450 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
451 if(states
->minCharLength
!=1 || states
->maxCharLength
!=2) {
452 fprintf(stderr
, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
453 exit(U_INVALID_TABLE_FORMAT
);
455 states
->conversionType
=UCNV_MBCS
;
456 ucm_addState(states
, "0-ff, e:1.s, f:0.s");
457 ucm_addState(states
, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
458 ucm_addState(states
, "0-40:1.i, 41-fe:1., ff:1.i");
459 ucm_addState(states
, "0-ff:1.i, 40:1.");
460 ucm_addState(states
, "0-ff:1.i");
463 /* DBCS: use MBCS data structure with a default state table */
464 if(states
->minCharLength
!=2 || states
->maxCharLength
!=2) {
465 fprintf(stderr
, "error: DBCS codepage with min or max B/char!=2\n");
466 exit(U_INVALID_TABLE_FORMAT
);
468 states
->conversionType
= UCNV_MBCS
;
469 ucm_addState(states
, "0-3f:3, 40:2, 41-fe:1, ff:3");
470 ucm_addState(states
, "41-fe");
471 ucm_addState(states
, "40");
472 ucm_addState(states
, "");
475 fprintf(stderr
, "ucm error: unknown charset structure\n");
476 exit(U_INVALID_TABLE_FORMAT
);
482 * check that the min/max character lengths are reasonable;
483 * to do this right, all paths through the state table would have to be
484 * recursively walked while keeping track of the sequence lengths,
485 * but these simple checks cover most state tables in practice
487 if(states
->maxCharLength
<states
->minCharLength
) {
488 fprintf(stderr
, "ucm error: max B/char < min B/char\n");
489 exit(U_INVALID_TABLE_FORMAT
);
492 /* count non-direct states and compare with max B/char */
494 for(state
=0; state
<states
->countStates
; ++state
) {
495 if((states
->stateFlags
[state
]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
499 if(states
->maxCharLength
>count
+1) {
500 fprintf(stderr
, "ucm error: max B/char too large\n");
501 exit(U_INVALID_TABLE_FORMAT
);
504 if(states
->minCharLength
==1) {
508 * if there are single-byte characters,
509 * then the initial state must have direct result states
511 for(cell
=0; cell
<256; ++cell
) {
512 entry
=states
->stateTable
[0][cell
];
513 if( MBCS_ENTRY_IS_FINAL(entry
) &&
514 ((action
=MBCS_ENTRY_FINAL_ACTION(entry
))==MBCS_STATE_VALID_DIRECT_16
||
515 action
==MBCS_STATE_UNASSIGNED
)
522 fprintf(stderr
, "ucm warning: min B/char too small\n");
527 * make sure that all "next state" values are within limits
528 * and that all next states after final ones have the "direct"
529 * flag of initial states
531 for(state
=states
->countStates
-1; state
>=0; --state
) {
532 for(cell
=0; cell
<256; ++cell
) {
533 entry
=states
->stateTable
[state
][cell
];
534 if((uint8_t)MBCS_ENTRY_STATE(entry
)>=states
->countStates
) {
535 fprintf(stderr
, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
536 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
537 exit(U_INVALID_TABLE_FORMAT
);
539 if(MBCS_ENTRY_IS_FINAL(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
540 fprintf(stderr
, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
541 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
542 exit(U_INVALID_TABLE_FORMAT
);
543 } else if(MBCS_ENTRY_IS_TRANSITION(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
544 fprintf(stderr
, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
545 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
546 exit(U_INVALID_TABLE_FORMAT
);
551 /* is this an SI/SO (like EBCDIC-stateful) state table? */
552 if(states
->countStates
>=2 && (states
->stateFlags
[1]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
553 if(states
->maxCharLength
!=2) {
554 fprintf(stderr
, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states
->maxCharLength
);
555 exit(U_INVALID_TABLE_FORMAT
);
557 if(states
->countStates
<3) {
558 fprintf(stderr
, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states
->countStates
);
559 exit(U_INVALID_TABLE_FORMAT
);
561 /* are the SI/SO all in the right places? */
562 if( ignoreSISOCheck
||
563 (states
->stateTable
[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
564 states
->stateTable
[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0) &&
565 states
->stateTable
[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
566 states
->stateTable
[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0))
568 states
->outputType
=MBCS_OUTPUT_2_SISO
;
570 fprintf(stderr
, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
571 exit(U_INVALID_TABLE_FORMAT
);
578 /* check that no unexpected state is a "direct" one */
579 while(state
<states
->countStates
) {
580 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
581 fprintf(stderr
, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state
);
582 exit(U_INVALID_TABLE_FORMAT
);
590 /* find a fallback for this offset; return the index or -1 if not found */
591 U_CAPI
int32_t U_EXPORT2
592 ucm_findFallback(_MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
596 if(countToUFallbacks
==0) {
597 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
601 /* do a linear search for the fallback mapping (the table is not yet sorted) */
602 for(i
=0; i
<countToUFallbacks
; ++i
) {
603 if(offset
==toUFallbacks
[i
].offset
) {
611 * This function tries to compact toUnicode tables for 2-byte codepages
612 * by finding lead bytes with all-unassigned trail bytes and adding another state
616 compactToUnicode2(UCMStates
*states
,
617 uint16_t **pUnicodeCodeUnits
,
618 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
620 int32_t (*oldStateTable
)[256];
622 uint16_t *oldUnicodeCodeUnits
;
623 int32_t entry
, offset
, oldOffset
, trailOffset
, oldTrailOffset
, savings
, sum
;
624 int32_t i
, j
, leadState
, trailState
, newState
, fallback
;
627 /* find the lead state */
628 if(states
->outputType
==MBCS_OUTPUT_2_SISO
) {
629 /* use the DBCS lead state for SI/SO codepages */
635 /* find the main trail state: the most used target state */
636 uprv_memset(count
, 0, sizeof(count
));
637 for(i
=0; i
<256; ++i
) {
638 entry
=states
->stateTable
[leadState
][i
];
639 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
640 ++count
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
644 for(i
=1; i
<states
->countStates
; ++i
) {
645 if(count
[i
]>count
[trailState
]) {
650 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
651 uprv_memset(count
, 0, sizeof(count
));
653 /* for each lead byte */
654 for(i
=0; i
<256; ++i
) {
655 entry
=states
->stateTable
[leadState
][i
];
656 if(MBCS_ENTRY_IS_TRANSITION(entry
) && (MBCS_ENTRY_TRANSITION_STATE(entry
))==trailState
) {
657 /* the offset is different for each lead byte */
658 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
659 /* for each trail byte for this lead byte */
660 for(j
=0; j
<256; ++j
) {
661 entry
=states
->stateTable
[trailState
][j
];
662 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
663 case MBCS_STATE_VALID_16
:
664 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
665 if((*pUnicodeCodeUnits
)[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
668 j
=999; /* do not count for this lead byte because there are assignments */
671 case MBCS_STATE_VALID_16_PAIR
:
672 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
673 if((*pUnicodeCodeUnits
)[entry
]==0xfffe) {
676 j
=999; /* do not count for this lead byte because there are assignments */
684 /* all trail bytes for this lead byte are unassigned */
691 /* subtract from the possible savings the cost of an additional state */
692 savings
=savings
*2-1024; /* count bytes, not 16-bit words */
697 printf("compacting toUnicode data saves %ld bytes\n", (long)savings
);
699 if(states
->countStates
>=MBCS_MAX_STATE_COUNT
) {
700 fprintf(stderr
, "cannot compact toUnicode because the maximum number of states is reached\n");
704 /* make a copy of the state table */
705 oldStateTable
=(int32_t (*)[256])uprv_malloc(states
->countStates
*1024);
706 if(oldStateTable
==NULL
) {
707 fprintf(stderr
, "cannot compact toUnicode: out of memory\n");
710 uprv_memcpy(oldStateTable
, states
->stateTable
, states
->countStates
*1024);
712 /* add the new state */
714 * this function does not catch the degenerate case where all lead bytes
715 * have all-unassigned trail bytes and the lead state could be removed
717 newState
=states
->countStates
++;
718 states
->stateFlags
[newState
]=0;
719 /* copy the old trail state, turning all assigned states into unassigned ones */
720 for(i
=0; i
<256; ++i
) {
721 entry
=states
->stateTable
[trailState
][i
];
722 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
723 case MBCS_STATE_VALID_16
:
724 case MBCS_STATE_VALID_16_PAIR
:
725 states
->stateTable
[newState
][i
]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
728 states
->stateTable
[newState
][i
]=entry
;
733 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
734 for(i
=0; i
<256; ++i
) {
736 states
->stateTable
[leadState
][i
]=MBCS_ENTRY_SET_STATE(states
->stateTable
[leadState
][i
], newState
);
740 /* sum up the new state table */
741 for(i
=0; i
<states
->countStates
; ++i
) {
742 states
->stateFlags
[i
]&=~MBCS_STATE_FLAG_READY
;
744 sum
=sumUpStates(states
);
746 /* allocate a new, smaller code units array */
747 oldUnicodeCodeUnits
=*pUnicodeCodeUnits
;
749 *pUnicodeCodeUnits
=NULL
;
750 if(oldUnicodeCodeUnits
!=NULL
) {
751 uprv_free(oldUnicodeCodeUnits
);
753 uprv_free(oldStateTable
);
756 *pUnicodeCodeUnits
=(uint16_t *)uprv_malloc(sum
*sizeof(uint16_t));
757 if(*pUnicodeCodeUnits
==NULL
) {
758 fprintf(stderr
, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
760 /* revert to the old state table */
761 *pUnicodeCodeUnits
=oldUnicodeCodeUnits
;
762 --states
->countStates
;
763 uprv_memcpy(states
->stateTable
, oldStateTable
, states
->countStates
*1024);
764 uprv_free(oldStateTable
);
767 for(i
=0; i
<sum
; ++i
) {
768 (*pUnicodeCodeUnits
)[i
]=0xfffe;
771 /* copy the code units for all assigned characters */
773 * The old state table has the same lead _and_ trail states for assigned characters!
774 * The differences are in the offsets, and in the trail states for some unassigned characters.
775 * For each character with an assigned state in the new table, it was assigned in the old one.
776 * Only still-assigned characters are copied.
777 * Note that fallback mappings need to get their offset values adjusted.
780 /* for each initial state */
781 for(leadState
=0; leadState
<states
->countStates
; ++leadState
) {
782 if((states
->stateFlags
[leadState
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
783 /* for each lead byte from there */
784 for(i
=0; i
<256; ++i
) {
785 entry
=states
->stateTable
[leadState
][i
];
786 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
787 trailState
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
788 /* the new state does not have assigned states */
789 if(trailState
!=newState
) {
790 trailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
791 oldTrailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable
[leadState
][i
]);
792 /* for each trail byte */
793 for(j
=0; j
<256; ++j
) {
794 entry
=states
->stateTable
[trailState
][j
];
795 /* copy assigned-character code units and adjust fallback offsets */
796 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
797 case MBCS_STATE_VALID_16
:
798 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
799 /* find the old offset according to the old state table */
800 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
801 unit
=(*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
802 if(unit
==0xfffe && (fallback
=ucm_findFallback(toUFallbacks
, countToUFallbacks
, oldOffset
))>=0) {
803 toUFallbacks
[fallback
].offset
=0x80000000|offset
;
806 case MBCS_STATE_VALID_16_PAIR
:
807 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
808 /* find the old offset according to the old state table */
809 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
810 (*pUnicodeCodeUnits
)[offset
++]=oldUnicodeCodeUnits
[oldOffset
++];
811 (*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
823 /* remove temporary flags from fallback offsets that protected them from being modified twice */
824 for(i
=0; i
<countToUFallbacks
; ++i
) {
825 toUFallbacks
[i
].offset
&=0x7fffffff;
828 /* free temporary memory */
829 uprv_free(oldUnicodeCodeUnits
);
830 uprv_free(oldStateTable
);
834 * recursive sub-function of compactToUnicodeHelper()
836 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
837 * if all sequences from this state are unassigned
838 * <0 there are assignments in unicodeCodeUnits[]
839 * 0 no use of unicodeCodeUnits[]
842 findUnassigned(UCMStates
*states
,
843 uint16_t *unicodeCodeUnits
,
844 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
845 int32_t state
, int32_t offset
, uint32_t b
) {
846 int32_t i
, entry
, savings
, localSavings
, belowSavings
;
849 localSavings
=belowSavings
=0;
851 for(i
=0; i
<256; ++i
) {
852 entry
=states
->stateTable
[state
][i
];
853 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
854 savings
=findUnassigned(states
,
856 toUFallbacks
, countToUFallbacks
,
857 MBCS_ENTRY_TRANSITION_STATE(entry
),
858 offset
+MBCS_ENTRY_TRANSITION_OFFSET(entry
),
862 } else if(savings
>0) {
863 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
864 (unsigned long)((b
<<8)|i
), (long)state
, (long)savings
);
865 belowSavings
+=savings
;
867 } else if(!haveAssigned
) {
868 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
869 case MBCS_STATE_VALID_16
:
870 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
871 if(unicodeCodeUnits
[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
877 case MBCS_STATE_VALID_16_PAIR
:
878 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
879 if(unicodeCodeUnits
[entry
]==0xfffe) {
893 return localSavings
+belowSavings
;
897 /* helper function for finding compaction opportunities */
899 compactToUnicodeHelper(UCMStates
*states
,
900 uint16_t *unicodeCodeUnits
,
901 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
) {
902 int32_t state
, savings
;
904 /* for each initial state */
905 for(state
=0; state
<states
->countStates
; ++state
) {
906 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
907 savings
=findUnassigned(states
,
909 toUFallbacks
, countToUFallbacks
,
912 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
913 (long)state
, (long)savings
);
920 static int32_t U_CALLCONV
921 compareFallbacks(const void *context
, const void *fb1
, const void *fb2
) {
923 return ((const _MBCSToUFallback
*)fb1
)->offset
-((const _MBCSToUFallback
*)fb2
)->offset
;
927 U_CAPI
void U_EXPORT2
928 ucm_optimizeStates(UCMStates
*states
,
929 uint16_t **pUnicodeCodeUnits
,
930 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
932 UErrorCode errorCode
;
933 int32_t state
, cell
, entry
;
935 /* test each state table entry */
936 for(state
=0; state
<states
->countStates
; ++state
) {
937 for(cell
=0; cell
<256; ++cell
) {
938 entry
=states
->stateTable
[state
][cell
];
940 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
941 * and the code point is "unassigned" (0xfffe), then change it to
942 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
944 if(MBCS_ENTRY_SET_STATE(entry
, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, 0xfffe)) {
945 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_UNASSIGNED
);
950 /* try to compact the toUnicode tables */
951 if(states
->maxCharLength
==2) {
952 compactToUnicode2(states
, pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
, verbose
);
953 } else if(states
->maxCharLength
>2) {
955 compactToUnicodeHelper(states
, *pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
);
959 /* sort toUFallbacks */
961 * It should be safe to sort them before compactToUnicode2() is called,
962 * because it should not change the relative order of the offset values
963 * that it adjusts, but they need to be sorted at some point, and
966 if(countToUFallbacks
>0) {
967 errorCode
=U_ZERO_ERROR
; /* nothing bad will happen... */
968 uprv_sortArray(toUFallbacks
, countToUFallbacks
,
969 sizeof(_MBCSToUFallback
),
970 compareFallbacks
, NULL
, FALSE
, &errorCode
);
974 /* use a complete state table ----------------------------------------------- */
976 U_CAPI
int32_t U_EXPORT2
977 ucm_countChars(UCMStates
*states
,
978 const uint8_t *bytes
, int32_t length
) {
980 int32_t i
, entry
, count
;
987 if(states
->countStates
==0) {
988 fprintf(stderr
, "ucm error: there is no state information!\n");
992 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
993 if(length
==2 && states
->outputType
==MBCS_OUTPUT_2_SISO
) {
998 * Walk down the state table like in conversion,
999 * much like getNextUChar().
1000 * We assume that c<=0x10ffff.
1002 for(i
=0; i
<length
; ++i
) {
1003 entry
=states
->stateTable
[state
][bytes
[i
]];
1004 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
1005 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
1006 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
1008 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
1009 case MBCS_STATE_ILLEGAL
:
1010 fprintf(stderr
, "ucm error: byte sequence ends in illegal state\n");
1012 case MBCS_STATE_CHANGE_ONLY
:
1013 fprintf(stderr
, "ucm error: byte sequence ends in state-change-only\n");
1015 case MBCS_STATE_UNASSIGNED
:
1016 case MBCS_STATE_FALLBACK_DIRECT_16
:
1017 case MBCS_STATE_VALID_DIRECT_16
:
1018 case MBCS_STATE_FALLBACK_DIRECT_20
:
1019 case MBCS_STATE_VALID_DIRECT_20
:
1020 case MBCS_STATE_VALID_16
:
1021 case MBCS_STATE_VALID_16_PAIR
:
1022 /* count a complete character and prepare for a new one */
1024 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
);
1028 /* reserved, must never occur */
1029 fprintf(stderr
, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry
);
1036 fprintf(stderr
, "ucm error: byte sequence too short, ends in non-final state %u\n", state
);
1041 * for SI/SO (like EBCDIC-stateful), multiple-character results
1042 * must consist of only double-byte sequences
1044 if(count
>1 && states
->outputType
==MBCS_OUTPUT_2_SISO
&& length
!=2*count
) {
1045 fprintf(stderr
, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count
);