1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2003-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ucmstate.c
12 * tab size: 8 (not used)
15 * created on: 2003oct09
16 * created by: Markus W. Scherer
18 * This file handles ICU .ucm file state information as part of the ucm module.
19 * Most of this code used to be in makeconv.c.
22 #include "unicode/utypes.h"
32 #if !UCONFIG_NO_CONVERSION
34 /* MBCS state handling ------------------------------------------------------ */
37 * state table row grammar (ebnf-style):
38 * (whitespace is allowed between all tokens)
40 * row=[[firstentry ','] entry (',' entry)*]
41 * firstentry="initial" | "surrogates"
42 * (initial state (default for state 0), output is all surrogate pairs)
43 * entry=range [':' nextstate] ['.' action]
44 * range=number ['-' number]
47 * action='u' | 's' | 'p' | 'i'
48 * (unassigned, state change only, surrogate pair, illegal)
49 * number=(1- or 2-digit hexadecimal number)
52 parseState(const char *s
, int32_t state
[256], uint32_t *pFlags
) {
54 uint32_t start
, end
, i
;
57 /* initialize the state: all illegal with U+ffff */
58 for(i
=0; i
<256; ++i
) {
59 state
[i
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL
, 0xffff);
62 /* skip leading white space */
63 s
=u_skipWhitespace(s
);
65 /* is there an "initial" or "surrogates" directive? */
66 if(uprv_strncmp("initial", s
, 7)==0) {
67 *pFlags
=MBCS_STATE_FLAG_DIRECT
;
68 s
=u_skipWhitespace(s
+7);
72 } else if(*pFlags
==0 && uprv_strncmp("surrogates", s
, 10)==0) {
73 *pFlags
=MBCS_STATE_FLAG_SURROGATES
;
74 s
=u_skipWhitespace(s
+10);
79 /* empty state row: all-illegal */
84 /* read an entry, the start of the range first */
85 s
=u_skipWhitespace(s
);
86 start
=uprv_strtoul(s
, (char **)&t
, 16);
87 if(s
==t
|| 0xff<start
) {
90 s
=u_skipWhitespace(t
);
92 /* read the end of the range if there is one */
94 s
=u_skipWhitespace(s
+1);
95 end
=uprv_strtoul(s
, (char **)&t
, 16);
96 if(s
==t
|| end
<start
|| 0xff<end
) {
99 s
=u_skipWhitespace(t
);
104 /* determine the state entrys for this range */
105 if(*s
!=':' && *s
!='.') {
106 /* the default is: final state with valid entries */
107 entry
=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16
, 0);
109 entry
=MBCS_ENTRY_TRANSITION(0, 0);
111 /* get the next state, default to 0 */
112 s
=u_skipWhitespace(s
+1);
113 i
=uprv_strtoul(s
, (char **)&t
, 16);
118 s
=u_skipWhitespace(t
);
119 entry
=MBCS_ENTRY_SET_STATE(entry
, i
);
123 /* get the state action, default to valid */
125 /* this is a final state */
126 entry
=MBCS_ENTRY_SET_FINAL(entry
);
128 s
=u_skipWhitespace(s
+1);
130 /* unassigned set U+fffe */
131 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
132 s
=u_skipWhitespace(s
+1);
134 if(*pFlags
!=MBCS_STATE_FLAG_DIRECT
) {
135 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16_PAIR
);
137 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
139 s
=u_skipWhitespace(s
+1);
141 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_CHANGE_ONLY
);
142 s
=u_skipWhitespace(s
+1);
144 /* illegal set U+ffff */
145 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_ILLEGAL
, 0xffff);
146 s
=u_skipWhitespace(s
+1);
148 /* default to valid */
149 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
152 /* this is an intermediate state, nothing to do */
156 /* adjust "final valid" states according to the state flags */
157 if(MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
) {
162 case MBCS_STATE_FLAG_DIRECT
:
163 /* set the valid-direct code point to "unassigned"==0xfffe */
164 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_DIRECT_16
, 0xfffe);
166 case MBCS_STATE_FLAG_SURROGATES
:
167 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_16_PAIR
, 0);
174 /* set this entry for the range */
175 for(i
=start
; i
<=end
; ++i
) {
182 return *s
==0 ? NULL
: s
;
187 U_CAPI
void U_EXPORT2
188 ucm_addState(UCMStates
*states
, const char *s
) {
191 if(states
->countStates
==MBCS_MAX_STATE_COUNT
) {
192 fprintf(stderr
, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT
);
193 exit(U_INVALID_TABLE_FORMAT
);
196 error
=parseState(s
, states
->stateTable
[states
->countStates
],
197 &states
->stateFlags
[states
->countStates
]);
199 fprintf(stderr
, "ucm error: parse error in state definition at '%s'\n", error
);
200 exit(U_INVALID_TABLE_FORMAT
);
203 ++states
->countStates
;
206 U_CAPI UBool U_EXPORT2
207 ucm_parseHeaderLine(UCMFile
*ucm
,
208 char *line
, char **pKey
, char **pValue
) {
215 /* remove comments and trailing CR and LF and remove whitespace from the end */
216 for(end
=line
; (c
=*end
)!=0; ++end
) {
217 if(c
=='#' || c
=='\r' || c
=='\n') {
221 while(end
>line
&& (*(end
-1)==' ' || *(end
-1)=='\t')) {
226 /* skip leading white space and ignore empty lines */
227 s
=(char *)u_skipWhitespace(line
);
232 /* stop at the beginning of the mapping section */
233 if(uprv_memcmp(s
, "CHARMAP", 7)==0) {
237 /* get the key name, bracketed in <> */
239 fprintf(stderr
, "ucm error: no header field <key> in line \"%s\"\n", line
);
240 exit(U_INVALID_TABLE_FORMAT
);
245 fprintf(stderr
, "ucm error: incomplete header field <key> in line \"%s\"\n", line
);
246 exit(U_INVALID_TABLE_FORMAT
);
252 /* get the value string, possibly quoted */
253 s
=(char *)u_skipWhitespace(s
+1);
257 /* remove the quotes */
259 if(end
>*pValue
&& *(end
-1)=='"') {
264 /* collect the information from the header field, ignore unknown keys */
265 if(uprv_strcmp(*pKey
, "uconv_class")==0) {
266 if(uprv_strcmp(*pValue
, "DBCS")==0) {
267 states
->conversionType
=UCNV_DBCS
;
268 } else if(uprv_strcmp(*pValue
, "SBCS")==0) {
269 states
->conversionType
= UCNV_SBCS
;
270 } else if(uprv_strcmp(*pValue
, "MBCS")==0) {
271 states
->conversionType
= UCNV_MBCS
;
272 } else if(uprv_strcmp(*pValue
, "EBCDIC_STATEFUL")==0) {
273 states
->conversionType
= UCNV_EBCDIC_STATEFUL
;
275 fprintf(stderr
, "ucm error: unknown <uconv_class> %s\n", *pValue
);
276 exit(U_INVALID_TABLE_FORMAT
);
279 } else if(uprv_strcmp(*pKey
, "mb_cur_max")==0) {
281 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
282 states
->maxCharLength
=(int8_t)(c
-'0');
283 states
->outputType
=(int8_t)(states
->maxCharLength
-1);
285 fprintf(stderr
, "ucm error: illegal <mb_cur_max> %s\n", *pValue
);
286 exit(U_INVALID_TABLE_FORMAT
);
289 } else if(uprv_strcmp(*pKey
, "mb_cur_min")==0) {
291 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
292 states
->minCharLength
=(int8_t)(c
-'0');
294 fprintf(stderr
, "ucm error: illegal <mb_cur_min> %s\n", *pValue
);
295 exit(U_INVALID_TABLE_FORMAT
);
298 } else if(uprv_strcmp(*pKey
, "icu:state")==0) {
299 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
300 switch(states
->conversionType
) {
303 case UCNV_EBCDIC_STATEFUL
:
304 states
->conversionType
=UCNV_MBCS
;
309 fprintf(stderr
, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
310 exit(U_INVALID_TABLE_FORMAT
);
313 if(states
->maxCharLength
==0) {
314 fprintf(stderr
, "ucm error: <icu:state> before the <mb_cur_max> line\n");
315 exit(U_INVALID_TABLE_FORMAT
);
317 ucm_addState(states
, *pValue
);
319 } else if(uprv_strcmp(*pKey
, "icu:base")==0) {
321 fprintf(stderr
, "ucm error: <icu:base> without a base table name\n");
322 exit(U_INVALID_TABLE_FORMAT
);
324 uprv_strcpy(ucm
->baseName
, *pValue
);
331 /* post-processing ---------------------------------------------------------- */
334 sumUpStates(UCMStates
*states
) {
335 int32_t entry
, sum
, state
, cell
, count
;
336 UBool allStatesReady
;
339 * Sum up the offsets for all states.
340 * In each final state (where there are only final entries),
341 * the offsets add up directly.
342 * In all other state table rows, for each transition entry to another state,
343 * the offsets sum of that state needs to be added.
344 * This is achieved in at most countStates iterations.
346 allStatesReady
=FALSE
;
347 for(count
=states
->countStates
; !allStatesReady
&& count
>=0; --count
) {
349 for(state
=states
->countStates
-1; state
>=0; --state
) {
350 if(!(states
->stateFlags
[state
]&MBCS_STATE_FLAG_READY
)) {
351 allStatesReady
=FALSE
;
354 /* at first, add up only the final delta offsets to keep them <512 */
355 for(cell
=0; cell
<256; ++cell
) {
356 entry
=states
->stateTable
[state
][cell
];
357 if(MBCS_ENTRY_IS_FINAL(entry
)) {
358 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
359 case MBCS_STATE_VALID_16
:
360 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
363 case MBCS_STATE_VALID_16_PAIR
:
364 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
374 /* now, add up the delta offsets for the transitional entries */
375 for(cell
=0; cell
<256; ++cell
) {
376 entry
=states
->stateTable
[state
][cell
];
377 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
378 if(states
->stateFlags
[MBCS_ENTRY_TRANSITION_STATE(entry
)]&MBCS_STATE_FLAG_READY
) {
379 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry
, sum
);
380 sum
+=states
->stateOffsetSum
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
382 /* that next state does not have a sum yet, we cannot finish the one for this state */
390 states
->stateOffsetSum
[state
]=sum
;
391 states
->stateFlags
[state
]|=MBCS_STATE_FLAG_READY
;
397 if(!allStatesReady
) {
398 fprintf(stderr
, "ucm error: the state table contains loops\n");
399 exit(U_INVALID_TABLE_FORMAT
);
403 * For all "direct" (i.e., initial) states>0,
404 * the offsets need to be increased by the sum of
405 * the previous initial states.
407 sum
=states
->stateOffsetSum
[0];
408 for(state
=1; state
<states
->countStates
; ++state
) {
409 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
411 sum
+=states
->stateOffsetSum
[state
];
412 for(cell
=0; cell
<256; ++cell
) {
413 entry
=states
->stateTable
[state
][cell
];
414 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
415 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry
, sum2
);
421 /* round up to the next even number to have the following data 32-bit-aligned */
422 return states
->countToUCodeUnits
=(sum
+1)&~1;
425 U_CAPI
void U_EXPORT2
426 ucm_processStates(UCMStates
*states
, UBool ignoreSISOCheck
) {
427 int32_t entry
, state
, cell
, count
;
429 if(states
->conversionType
==UCNV_UNSUPPORTED_CONVERTER
) {
430 fprintf(stderr
, "ucm error: missing conversion type (<uconv_class>)\n");
431 exit(U_INVALID_TABLE_FORMAT
);
434 if(states
->countStates
==0) {
435 switch(states
->conversionType
) {
437 /* SBCS: use MBCS data structure with a default state table */
438 if(states
->maxCharLength
!=1) {
439 fprintf(stderr
, "error: SBCS codepage with max B/char!=1\n");
440 exit(U_INVALID_TABLE_FORMAT
);
442 states
->conversionType
=UCNV_MBCS
;
443 ucm_addState(states
, "0-ff");
446 fprintf(stderr
, "ucm error: missing state table information (<icu:state>) for MBCS\n");
447 exit(U_INVALID_TABLE_FORMAT
);
449 case UCNV_EBCDIC_STATEFUL
:
450 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
451 if(states
->minCharLength
!=1 || states
->maxCharLength
!=2) {
452 fprintf(stderr
, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
453 exit(U_INVALID_TABLE_FORMAT
);
455 states
->conversionType
=UCNV_MBCS
;
456 ucm_addState(states
, "0-ff, e:1.s, f:0.s");
457 ucm_addState(states
, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
458 ucm_addState(states
, "0-40:1.i, 41-fe:1., ff:1.i");
459 ucm_addState(states
, "0-ff:1.i, 40:1.");
460 ucm_addState(states
, "0-ff:1.i");
463 /* DBCS: use MBCS data structure with a default state table */
464 if(states
->minCharLength
!=2 || states
->maxCharLength
!=2) {
465 fprintf(stderr
, "error: DBCS codepage with min or max B/char!=2\n");
466 exit(U_INVALID_TABLE_FORMAT
);
468 states
->conversionType
= UCNV_MBCS
;
469 ucm_addState(states
, "0-3f:3, 40:2, 41-fe:1, ff:3");
470 ucm_addState(states
, "41-fe");
471 ucm_addState(states
, "40");
472 ucm_addState(states
, "");
475 fprintf(stderr
, "ucm error: unknown charset structure\n");
476 exit(U_INVALID_TABLE_FORMAT
);
482 * check that the min/max character lengths are reasonable;
483 * to do this right, all paths through the state table would have to be
484 * recursively walked while keeping track of the sequence lengths,
485 * but these simple checks cover most state tables in practice
487 if(states
->maxCharLength
<states
->minCharLength
) {
488 fprintf(stderr
, "ucm error: max B/char < min B/char\n");
489 exit(U_INVALID_TABLE_FORMAT
);
492 /* count non-direct states and compare with max B/char */
494 for(state
=0; state
<states
->countStates
; ++state
) {
495 if((states
->stateFlags
[state
]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
499 if(states
->maxCharLength
>count
+1) {
500 fprintf(stderr
, "ucm error: max B/char too large\n");
501 exit(U_INVALID_TABLE_FORMAT
);
504 if(states
->minCharLength
==1) {
508 * if there are single-byte characters,
509 * then the initial state must have direct result states
511 for(cell
=0; cell
<256; ++cell
) {
512 entry
=states
->stateTable
[0][cell
];
513 if( MBCS_ENTRY_IS_FINAL(entry
) &&
514 ((action
=MBCS_ENTRY_FINAL_ACTION(entry
))==MBCS_STATE_VALID_DIRECT_16
||
515 action
==MBCS_STATE_UNASSIGNED
)
522 fprintf(stderr
, "ucm warning: min B/char too small\n");
527 * make sure that all "next state" values are within limits
528 * and that all next states after final ones have the "direct"
529 * flag of initial states
531 for(state
=states
->countStates
-1; state
>=0; --state
) {
532 for(cell
=0; cell
<256; ++cell
) {
533 entry
=states
->stateTable
[state
][cell
];
534 if((uint8_t)MBCS_ENTRY_STATE(entry
)>=states
->countStates
) {
535 fprintf(stderr
, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
536 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
537 exit(U_INVALID_TABLE_FORMAT
);
539 if(MBCS_ENTRY_IS_FINAL(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
540 fprintf(stderr
, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
541 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
542 exit(U_INVALID_TABLE_FORMAT
);
543 } else if(MBCS_ENTRY_IS_TRANSITION(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
544 fprintf(stderr
, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
545 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
546 exit(U_INVALID_TABLE_FORMAT
);
551 /* is this an SI/SO (like EBCDIC-stateful) state table? */
552 if(states
->countStates
>=2 && (states
->stateFlags
[1]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
553 if(states
->maxCharLength
!=2) {
554 fprintf(stderr
, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states
->maxCharLength
);
555 exit(U_INVALID_TABLE_FORMAT
);
557 if(states
->countStates
<3) {
558 fprintf(stderr
, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states
->countStates
);
559 exit(U_INVALID_TABLE_FORMAT
);
561 /* are the SI/SO all in the right places? */
562 if( ignoreSISOCheck
||
563 (states
->stateTable
[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
564 states
->stateTable
[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0) &&
565 states
->stateTable
[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
566 states
->stateTable
[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0))
568 states
->outputType
=MBCS_OUTPUT_2_SISO
;
570 fprintf(stderr
, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
571 exit(U_INVALID_TABLE_FORMAT
);
578 /* check that no unexpected state is a "direct" one */
579 while(state
<states
->countStates
) {
580 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
581 fprintf(stderr
, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state
);
582 exit(U_INVALID_TABLE_FORMAT
);
590 /* find a fallback for this offset; return the index or -1 if not found */
591 U_CAPI
int32_t U_EXPORT2
592 ucm_findFallback(_MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
596 if(countToUFallbacks
==0) {
597 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
601 /* do a linear search for the fallback mapping (the table is not yet sorted) */
602 for(i
=0; i
<countToUFallbacks
; ++i
) {
603 if(offset
==toUFallbacks
[i
].offset
) {
611 * This function tries to compact toUnicode tables for 2-byte codepages
612 * by finding lead bytes with all-unassigned trail bytes and adding another state
616 compactToUnicode2(UCMStates
*states
,
617 uint16_t **pUnicodeCodeUnits
,
618 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
620 int32_t (*oldStateTable
)[256];
622 uint16_t *oldUnicodeCodeUnits
;
623 int32_t entry
, offset
, oldOffset
, trailOffset
, oldTrailOffset
, savings
, sum
;
624 int32_t i
, j
, leadState
, trailState
, newState
, fallback
;
627 /* find the lead state */
628 if(states
->outputType
==MBCS_OUTPUT_2_SISO
) {
629 /* use the DBCS lead state for SI/SO codepages */
635 /* find the main trail state: the most used target state */
636 uprv_memset(count
, 0, sizeof(count
));
637 for(i
=0; i
<256; ++i
) {
638 entry
=states
->stateTable
[leadState
][i
];
639 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
640 ++count
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
644 for(i
=1; i
<states
->countStates
; ++i
) {
645 if(count
[i
]>count
[trailState
]) {
650 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
651 uprv_memset(count
, 0, sizeof(count
));
653 /* for each lead byte */
654 for(i
=0; i
<256; ++i
) {
655 entry
=states
->stateTable
[leadState
][i
];
656 if(MBCS_ENTRY_IS_TRANSITION(entry
) &&
657 (MBCS_ENTRY_TRANSITION_STATE(entry
))==static_cast<uint32_t>(trailState
)) {
658 /* the offset is different for each lead byte */
659 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
660 /* for each trail byte for this lead byte */
661 for(j
=0; j
<256; ++j
) {
662 entry
=states
->stateTable
[trailState
][j
];
663 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
664 case MBCS_STATE_VALID_16
:
665 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
666 if((*pUnicodeCodeUnits
)[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
669 j
=999; /* do not count for this lead byte because there are assignments */
672 case MBCS_STATE_VALID_16_PAIR
:
673 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
674 if((*pUnicodeCodeUnits
)[entry
]==0xfffe) {
677 j
=999; /* do not count for this lead byte because there are assignments */
685 /* all trail bytes for this lead byte are unassigned */
692 /* subtract from the possible savings the cost of an additional state */
693 savings
=savings
*2-1024; /* count bytes, not 16-bit words */
698 printf("compacting toUnicode data saves %ld bytes\n", (long)savings
);
700 if(states
->countStates
>=MBCS_MAX_STATE_COUNT
) {
701 fprintf(stderr
, "cannot compact toUnicode because the maximum number of states is reached\n");
705 /* make a copy of the state table */
706 oldStateTable
=(int32_t (*)[256])uprv_malloc(states
->countStates
*1024);
707 if(oldStateTable
==NULL
) {
708 fprintf(stderr
, "cannot compact toUnicode: out of memory\n");
711 uprv_memcpy(oldStateTable
, states
->stateTable
, states
->countStates
*1024);
713 /* add the new state */
715 * this function does not catch the degenerate case where all lead bytes
716 * have all-unassigned trail bytes and the lead state could be removed
718 newState
=states
->countStates
++;
719 states
->stateFlags
[newState
]=0;
720 /* copy the old trail state, turning all assigned states into unassigned ones */
721 for(i
=0; i
<256; ++i
) {
722 entry
=states
->stateTable
[trailState
][i
];
723 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
724 case MBCS_STATE_VALID_16
:
725 case MBCS_STATE_VALID_16_PAIR
:
726 states
->stateTable
[newState
][i
]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
729 states
->stateTable
[newState
][i
]=entry
;
734 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
735 for(i
=0; i
<256; ++i
) {
737 states
->stateTable
[leadState
][i
]=MBCS_ENTRY_SET_STATE(states
->stateTable
[leadState
][i
], newState
);
741 /* sum up the new state table */
742 for(i
=0; i
<states
->countStates
; ++i
) {
743 states
->stateFlags
[i
]&=~MBCS_STATE_FLAG_READY
;
745 sum
=sumUpStates(states
);
747 /* allocate a new, smaller code units array */
748 oldUnicodeCodeUnits
=*pUnicodeCodeUnits
;
750 *pUnicodeCodeUnits
=NULL
;
751 if(oldUnicodeCodeUnits
!=NULL
) {
752 uprv_free(oldUnicodeCodeUnits
);
754 uprv_free(oldStateTable
);
757 *pUnicodeCodeUnits
=(uint16_t *)uprv_malloc(sum
*sizeof(uint16_t));
758 if(*pUnicodeCodeUnits
==NULL
) {
759 fprintf(stderr
, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
761 /* revert to the old state table */
762 *pUnicodeCodeUnits
=oldUnicodeCodeUnits
;
763 --states
->countStates
;
764 uprv_memcpy(states
->stateTable
, oldStateTable
, states
->countStates
*1024);
765 uprv_free(oldStateTable
);
768 for(i
=0; i
<sum
; ++i
) {
769 (*pUnicodeCodeUnits
)[i
]=0xfffe;
772 /* copy the code units for all assigned characters */
774 * The old state table has the same lead _and_ trail states for assigned characters!
775 * The differences are in the offsets, and in the trail states for some unassigned characters.
776 * For each character with an assigned state in the new table, it was assigned in the old one.
777 * Only still-assigned characters are copied.
778 * Note that fallback mappings need to get their offset values adjusted.
781 /* for each initial state */
782 for(leadState
=0; leadState
<states
->countStates
; ++leadState
) {
783 if((states
->stateFlags
[leadState
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
784 /* for each lead byte from there */
785 for(i
=0; i
<256; ++i
) {
786 entry
=states
->stateTable
[leadState
][i
];
787 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
788 trailState
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
789 /* the new state does not have assigned states */
790 if(trailState
!=newState
) {
791 trailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
792 oldTrailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable
[leadState
][i
]);
793 /* for each trail byte */
794 for(j
=0; j
<256; ++j
) {
795 entry
=states
->stateTable
[trailState
][j
];
796 /* copy assigned-character code units and adjust fallback offsets */
797 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
798 case MBCS_STATE_VALID_16
:
799 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
800 /* find the old offset according to the old state table */
801 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
802 unit
=(*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
803 if(unit
==0xfffe && (fallback
=ucm_findFallback(toUFallbacks
, countToUFallbacks
, oldOffset
))>=0) {
804 toUFallbacks
[fallback
].offset
=0x80000000|offset
;
807 case MBCS_STATE_VALID_16_PAIR
:
808 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
809 /* find the old offset according to the old state table */
810 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
811 (*pUnicodeCodeUnits
)[offset
++]=oldUnicodeCodeUnits
[oldOffset
++];
812 (*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
824 /* remove temporary flags from fallback offsets that protected them from being modified twice */
825 for(i
=0; i
<countToUFallbacks
; ++i
) {
826 toUFallbacks
[i
].offset
&=0x7fffffff;
829 /* free temporary memory */
830 uprv_free(oldUnicodeCodeUnits
);
831 uprv_free(oldStateTable
);
835 * recursive sub-function of compactToUnicodeHelper()
837 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
838 * if all sequences from this state are unassigned, returns the
839 * <0 there are assignments in unicodeCodeUnits[]
840 * 0 no use of unicodeCodeUnits[]
843 findUnassigned(UCMStates
*states
,
844 uint16_t *unicodeCodeUnits
,
845 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
846 int32_t state
, int32_t offset
, uint32_t b
) {
847 int32_t i
, entry
, savings
, localSavings
, belowSavings
;
850 localSavings
=belowSavings
=0;
852 for(i
=0; i
<256; ++i
) {
853 entry
=states
->stateTable
[state
][i
];
854 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
855 savings
=findUnassigned(states
,
857 toUFallbacks
, countToUFallbacks
,
858 MBCS_ENTRY_TRANSITION_STATE(entry
),
859 offset
+MBCS_ENTRY_TRANSITION_OFFSET(entry
),
863 } else if(savings
>0) {
864 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
865 (unsigned long)((b
<<8)|i
), (long)state
, (long)savings
);
866 belowSavings
+=savings
;
868 } else if(!haveAssigned
) {
869 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
870 case MBCS_STATE_VALID_16
:
871 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
872 if(unicodeCodeUnits
[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
878 case MBCS_STATE_VALID_16_PAIR
:
879 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
880 if(unicodeCodeUnits
[entry
]==0xfffe) {
894 return localSavings
+belowSavings
;
898 /* helper function for finding compaction opportunities */
900 compactToUnicodeHelper(UCMStates
*states
,
901 uint16_t *unicodeCodeUnits
,
902 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
) {
903 int32_t state
, savings
;
905 /* for each initial state */
906 for(state
=0; state
<states
->countStates
; ++state
) {
907 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
908 savings
=findUnassigned(states
,
910 toUFallbacks
, countToUFallbacks
,
913 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
914 (long)state
, (long)savings
);
921 static int32_t U_CALLCONV
922 compareFallbacks(const void *context
, const void *fb1
, const void *fb2
) {
924 return ((const _MBCSToUFallback
*)fb1
)->offset
-((const _MBCSToUFallback
*)fb2
)->offset
;
928 U_CAPI
void U_EXPORT2
929 ucm_optimizeStates(UCMStates
*states
,
930 uint16_t **pUnicodeCodeUnits
,
931 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
933 UErrorCode errorCode
;
934 int32_t state
, cell
, entry
;
936 /* test each state table entry */
937 for(state
=0; state
<states
->countStates
; ++state
) {
938 for(cell
=0; cell
<256; ++cell
) {
939 entry
=states
->stateTable
[state
][cell
];
941 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
942 * and the code point is "unassigned" (0xfffe), then change it to
943 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
945 if(MBCS_ENTRY_SET_STATE(entry
, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, 0xfffe)) {
946 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_UNASSIGNED
);
951 /* try to compact the toUnicode tables */
952 if(states
->maxCharLength
==2) {
953 compactToUnicode2(states
, pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
, verbose
);
954 } else if(states
->maxCharLength
>2) {
956 compactToUnicodeHelper(states
, *pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
);
960 /* sort toUFallbacks */
962 * It should be safe to sort them before compactToUnicode2() is called,
963 * because it should not change the relative order of the offset values
964 * that it adjusts, but they need to be sorted at some point, and
967 if(countToUFallbacks
>0) {
968 errorCode
=U_ZERO_ERROR
; /* nothing bad will happen... */
969 uprv_sortArray(toUFallbacks
, countToUFallbacks
,
970 sizeof(_MBCSToUFallback
),
971 compareFallbacks
, NULL
, FALSE
, &errorCode
);
975 /* use a complete state table ----------------------------------------------- */
977 U_CAPI
int32_t U_EXPORT2
978 ucm_countChars(UCMStates
*states
,
979 const uint8_t *bytes
, int32_t length
) {
981 int32_t i
, entry
, count
;
988 if(states
->countStates
==0) {
989 fprintf(stderr
, "ucm error: there is no state information!\n");
993 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
994 if(length
==2 && states
->outputType
==MBCS_OUTPUT_2_SISO
) {
999 * Walk down the state table like in conversion,
1000 * much like getNextUChar().
1001 * We assume that c<=0x10ffff.
1003 for(i
=0; i
<length
; ++i
) {
1004 entry
=states
->stateTable
[state
][bytes
[i
]];
1005 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
1006 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
1007 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
1009 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
1010 case MBCS_STATE_ILLEGAL
:
1011 fprintf(stderr
, "ucm error: byte sequence ends in illegal state\n");
1013 case MBCS_STATE_CHANGE_ONLY
:
1014 fprintf(stderr
, "ucm error: byte sequence ends in state-change-only\n");
1016 case MBCS_STATE_UNASSIGNED
:
1017 case MBCS_STATE_FALLBACK_DIRECT_16
:
1018 case MBCS_STATE_VALID_DIRECT_16
:
1019 case MBCS_STATE_FALLBACK_DIRECT_20
:
1020 case MBCS_STATE_VALID_DIRECT_20
:
1021 case MBCS_STATE_VALID_16
:
1022 case MBCS_STATE_VALID_16_PAIR
:
1023 /* count a complete character and prepare for a new one */
1025 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
);
1029 /* reserved, must never occur */
1030 fprintf(stderr
, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry
);
1037 fprintf(stderr
, "ucm error: byte sequence too short, ends in non-final state %u\n", state
);
1042 * for SI/SO (like EBCDIC-stateful), multiple-character results
1043 * must consist of only double-byte sequences
1045 if(count
>1 && states
->outputType
==MBCS_OUTPUT_2_SISO
&& length
!=2*count
) {
1046 fprintf(stderr
, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count
);