2 *******************************************************************************
4 * Copyright (C) 2003-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucmstate.c
10 * tab size: 8 (not used)
13 * created on: 2003oct09
14 * created by: Markus W. Scherer
16 * This file handles ICU .ucm file state information as part of the ucm module.
17 * Most of this code used to be in makeconv.c.
20 #include "unicode/utypes.h"
30 #if !UCONFIG_NO_CONVERSION
32 /* MBCS state handling ------------------------------------------------------ */
35 * state table row grammar (ebnf-style):
36 * (whitespace is allowed between all tokens)
38 * row=[[firstentry ','] entry (',' entry)*]
39 * firstentry="initial" | "surrogates"
40 * (initial state (default for state 0), output is all surrogate pairs)
41 * entry=range [':' nextstate] ['.' action]
42 * range=number ['-' number]
45 * action='u' | 's' | 'p' | 'i'
46 * (unassigned, state change only, surrogate pair, illegal)
47 * number=(1- or 2-digit hexadecimal number)
50 parseState(const char *s
, int32_t state
[256], uint32_t *pFlags
) {
52 uint32_t start
, end
, i
;
55 /* initialize the state: all illegal with U+ffff */
56 for(i
=0; i
<256; ++i
) {
57 state
[i
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL
, 0xffff);
60 /* skip leading white space */
61 s
=u_skipWhitespace(s
);
63 /* is there an "initial" or "surrogates" directive? */
64 if(uprv_strncmp("initial", s
, 7)==0) {
65 *pFlags
=MBCS_STATE_FLAG_DIRECT
;
66 s
=u_skipWhitespace(s
+7);
70 } else if(*pFlags
==0 && uprv_strncmp("surrogates", s
, 10)==0) {
71 *pFlags
=MBCS_STATE_FLAG_SURROGATES
;
72 s
=u_skipWhitespace(s
+10);
77 /* empty state row: all-illegal */
82 /* read an entry, the start of the range first */
83 s
=u_skipWhitespace(s
);
84 start
=uprv_strtoul(s
, (char **)&t
, 16);
85 if(s
==t
|| 0xff<start
) {
88 s
=u_skipWhitespace(t
);
90 /* read the end of the range if there is one */
92 s
=u_skipWhitespace(s
+1);
93 end
=uprv_strtoul(s
, (char **)&t
, 16);
94 if(s
==t
|| end
<start
|| 0xff<end
) {
97 s
=u_skipWhitespace(t
);
102 /* determine the state entrys for this range */
103 if(*s
!=':' && *s
!='.') {
104 /* the default is: final state with valid entries */
105 entry
=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16
, 0);
107 entry
=MBCS_ENTRY_TRANSITION(0, 0);
109 /* get the next state, default to 0 */
110 s
=u_skipWhitespace(s
+1);
111 i
=uprv_strtoul(s
, (char **)&t
, 16);
116 s
=u_skipWhitespace(t
);
117 entry
=MBCS_ENTRY_SET_STATE(entry
, i
);
121 /* get the state action, default to valid */
123 /* this is a final state */
124 entry
=MBCS_ENTRY_SET_FINAL(entry
);
126 s
=u_skipWhitespace(s
+1);
128 /* unassigned set U+fffe */
129 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
130 s
=u_skipWhitespace(s
+1);
132 if(*pFlags
!=MBCS_STATE_FLAG_DIRECT
) {
133 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16_PAIR
);
135 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
137 s
=u_skipWhitespace(s
+1);
139 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_CHANGE_ONLY
);
140 s
=u_skipWhitespace(s
+1);
142 /* illegal set U+ffff */
143 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_ILLEGAL
, 0xffff);
144 s
=u_skipWhitespace(s
+1);
146 /* default to valid */
147 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
150 /* this is an intermediate state, nothing to do */
154 /* adjust "final valid" states according to the state flags */
155 if(MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
) {
160 case MBCS_STATE_FLAG_DIRECT
:
161 /* set the valid-direct code point to "unassigned"==0xfffe */
162 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_DIRECT_16
, 0xfffe);
164 case MBCS_STATE_FLAG_SURROGATES
:
165 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_16_PAIR
, 0);
172 /* set this entry for the range */
173 for(i
=start
; i
<=end
; ++i
) {
180 return *s
==0 ? NULL
: s
;
185 U_CAPI
void U_EXPORT2
186 ucm_addState(UCMStates
*states
, const char *s
) {
189 if(states
->countStates
==MBCS_MAX_STATE_COUNT
) {
190 fprintf(stderr
, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT
);
191 exit(U_INVALID_TABLE_FORMAT
);
194 error
=parseState(s
, states
->stateTable
[states
->countStates
],
195 &states
->stateFlags
[states
->countStates
]);
197 fprintf(stderr
, "ucm error: parse error in state definition at '%s'\n", error
);
198 exit(U_INVALID_TABLE_FORMAT
);
201 ++states
->countStates
;
204 U_CAPI UBool U_EXPORT2
205 ucm_parseHeaderLine(UCMFile
*ucm
,
206 char *line
, char **pKey
, char **pValue
) {
213 /* remove comments and trailing CR and LF and remove whitespace from the end */
214 for(end
=line
; (c
=*end
)!=0; ++end
) {
215 if(c
=='#' || c
=='\r' || c
=='\n') {
219 while(end
>line
&& (*(end
-1)==' ' || *(end
-1)=='\t')) {
224 /* skip leading white space and ignore empty lines */
225 s
=(char *)u_skipWhitespace(line
);
230 /* stop at the beginning of the mapping section */
231 if(uprv_memcmp(s
, "CHARMAP", 7)==0) {
235 /* get the key name, bracketed in <> */
237 fprintf(stderr
, "ucm error: no header field <key> in line \"%s\"\n", line
);
238 exit(U_INVALID_TABLE_FORMAT
);
243 fprintf(stderr
, "ucm error: incomplete header field <key> in line \"%s\"\n", line
);
244 exit(U_INVALID_TABLE_FORMAT
);
250 /* get the value string, possibly quoted */
251 s
=(char *)u_skipWhitespace(s
+1);
255 /* remove the quotes */
257 if(end
>*pValue
&& *(end
-1)=='"') {
262 /* collect the information from the header field, ignore unknown keys */
263 if(uprv_strcmp(*pKey
, "uconv_class")==0) {
264 if(uprv_strcmp(*pValue
, "DBCS")==0) {
265 states
->conversionType
=UCNV_DBCS
;
266 } else if(uprv_strcmp(*pValue
, "SBCS")==0) {
267 states
->conversionType
= UCNV_SBCS
;
268 } else if(uprv_strcmp(*pValue
, "MBCS")==0) {
269 states
->conversionType
= UCNV_MBCS
;
270 } else if(uprv_strcmp(*pValue
, "EBCDIC_STATEFUL")==0) {
271 states
->conversionType
= UCNV_EBCDIC_STATEFUL
;
273 fprintf(stderr
, "ucm error: unknown <uconv_class> %s\n", *pValue
);
274 exit(U_INVALID_TABLE_FORMAT
);
277 } else if(uprv_strcmp(*pKey
, "mb_cur_max")==0) {
279 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
280 states
->maxCharLength
=(int8_t)(c
-'0');
281 states
->outputType
=(int8_t)(states
->maxCharLength
-1);
283 fprintf(stderr
, "ucm error: illegal <mb_cur_max> %s\n", *pValue
);
284 exit(U_INVALID_TABLE_FORMAT
);
287 } else if(uprv_strcmp(*pKey
, "mb_cur_min")==0) {
289 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
290 states
->minCharLength
=(int8_t)(c
-'0');
292 fprintf(stderr
, "ucm error: illegal <mb_cur_min> %s\n", *pValue
);
293 exit(U_INVALID_TABLE_FORMAT
);
296 } else if(uprv_strcmp(*pKey
, "icu:state")==0) {
297 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
298 switch(states
->conversionType
) {
301 case UCNV_EBCDIC_STATEFUL
:
302 states
->conversionType
=UCNV_MBCS
;
307 fprintf(stderr
, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
308 exit(U_INVALID_TABLE_FORMAT
);
311 if(states
->maxCharLength
==0) {
312 fprintf(stderr
, "ucm error: <icu:state> before the <mb_cur_max> line\n");
313 exit(U_INVALID_TABLE_FORMAT
);
315 ucm_addState(states
, *pValue
);
317 } else if(uprv_strcmp(*pKey
, "icu:base")==0) {
319 fprintf(stderr
, "ucm error: <icu:base> without a base table name\n");
320 exit(U_INVALID_TABLE_FORMAT
);
322 uprv_strcpy(ucm
->baseName
, *pValue
);
329 /* post-processing ---------------------------------------------------------- */
332 sumUpStates(UCMStates
*states
) {
333 int32_t entry
, sum
, state
, cell
, count
;
334 UBool allStatesReady
;
337 * Sum up the offsets for all states.
338 * In each final state (where there are only final entries),
339 * the offsets add up directly.
340 * In all other state table rows, for each transition entry to another state,
341 * the offsets sum of that state needs to be added.
342 * This is achieved in at most countStates iterations.
344 allStatesReady
=FALSE
;
345 for(count
=states
->countStates
; !allStatesReady
&& count
>=0; --count
) {
347 for(state
=states
->countStates
-1; state
>=0; --state
) {
348 if(!(states
->stateFlags
[state
]&MBCS_STATE_FLAG_READY
)) {
349 allStatesReady
=FALSE
;
352 /* at first, add up only the final delta offsets to keep them <512 */
353 for(cell
=0; cell
<256; ++cell
) {
354 entry
=states
->stateTable
[state
][cell
];
355 if(MBCS_ENTRY_IS_FINAL(entry
)) {
356 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
357 case MBCS_STATE_VALID_16
:
358 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
361 case MBCS_STATE_VALID_16_PAIR
:
362 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
372 /* now, add up the delta offsets for the transitional entries */
373 for(cell
=0; cell
<256; ++cell
) {
374 entry
=states
->stateTable
[state
][cell
];
375 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
376 if(states
->stateFlags
[MBCS_ENTRY_TRANSITION_STATE(entry
)]&MBCS_STATE_FLAG_READY
) {
377 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry
, sum
);
378 sum
+=states
->stateOffsetSum
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
380 /* that next state does not have a sum yet, we cannot finish the one for this state */
388 states
->stateOffsetSum
[state
]=sum
;
389 states
->stateFlags
[state
]|=MBCS_STATE_FLAG_READY
;
395 if(!allStatesReady
) {
396 fprintf(stderr
, "ucm error: the state table contains loops\n");
397 exit(U_INVALID_TABLE_FORMAT
);
401 * For all "direct" (i.e., initial) states>0,
402 * the offsets need to be increased by the sum of
403 * the previous initial states.
405 sum
=states
->stateOffsetSum
[0];
406 for(state
=1; state
<states
->countStates
; ++state
) {
407 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
409 sum
+=states
->stateOffsetSum
[state
];
410 for(cell
=0; cell
<256; ++cell
) {
411 entry
=states
->stateTable
[state
][cell
];
412 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
413 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry
, sum2
);
419 /* round up to the next even number to have the following data 32-bit-aligned */
420 return states
->countToUCodeUnits
=(sum
+1)&~1;
423 U_CAPI
void U_EXPORT2
424 ucm_processStates(UCMStates
*states
, UBool ignoreSISOCheck
) {
425 int32_t entry
, state
, cell
, count
;
427 if(states
->conversionType
==UCNV_UNSUPPORTED_CONVERTER
) {
428 fprintf(stderr
, "ucm error: missing conversion type (<uconv_class>)\n");
429 exit(U_INVALID_TABLE_FORMAT
);
432 if(states
->countStates
==0) {
433 switch(states
->conversionType
) {
435 /* SBCS: use MBCS data structure with a default state table */
436 if(states
->maxCharLength
!=1) {
437 fprintf(stderr
, "error: SBCS codepage with max B/char!=1\n");
438 exit(U_INVALID_TABLE_FORMAT
);
440 states
->conversionType
=UCNV_MBCS
;
441 ucm_addState(states
, "0-ff");
444 fprintf(stderr
, "ucm error: missing state table information (<icu:state>) for MBCS\n");
445 exit(U_INVALID_TABLE_FORMAT
);
447 case UCNV_EBCDIC_STATEFUL
:
448 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
449 if(states
->minCharLength
!=1 || states
->maxCharLength
!=2) {
450 fprintf(stderr
, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
451 exit(U_INVALID_TABLE_FORMAT
);
453 states
->conversionType
=UCNV_MBCS
;
454 ucm_addState(states
, "0-ff, e:1.s, f:0.s");
455 ucm_addState(states
, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
456 ucm_addState(states
, "0-40:1.i, 41-fe:1., ff:1.i");
457 ucm_addState(states
, "0-ff:1.i, 40:1.");
458 ucm_addState(states
, "0-ff:1.i");
461 /* DBCS: use MBCS data structure with a default state table */
462 if(states
->minCharLength
!=2 || states
->maxCharLength
!=2) {
463 fprintf(stderr
, "error: DBCS codepage with min or max B/char!=2\n");
464 exit(U_INVALID_TABLE_FORMAT
);
466 states
->conversionType
= UCNV_MBCS
;
467 ucm_addState(states
, "0-3f:3, 40:2, 41-fe:1, ff:3");
468 ucm_addState(states
, "41-fe");
469 ucm_addState(states
, "40");
470 ucm_addState(states
, "");
473 fprintf(stderr
, "ucm error: unknown charset structure\n");
474 exit(U_INVALID_TABLE_FORMAT
);
480 * check that the min/max character lengths are reasonable;
481 * to do this right, all paths through the state table would have to be
482 * recursively walked while keeping track of the sequence lengths,
483 * but these simple checks cover most state tables in practice
485 if(states
->maxCharLength
<states
->minCharLength
) {
486 fprintf(stderr
, "ucm error: max B/char < min B/char\n");
487 exit(U_INVALID_TABLE_FORMAT
);
490 /* count non-direct states and compare with max B/char */
492 for(state
=0; state
<states
->countStates
; ++state
) {
493 if((states
->stateFlags
[state
]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
497 if(states
->maxCharLength
>count
+1) {
498 fprintf(stderr
, "ucm error: max B/char too large\n");
499 exit(U_INVALID_TABLE_FORMAT
);
502 if(states
->minCharLength
==1) {
506 * if there are single-byte characters,
507 * then the initial state must have direct result states
509 for(cell
=0; cell
<256; ++cell
) {
510 entry
=states
->stateTable
[0][cell
];
511 if( MBCS_ENTRY_IS_FINAL(entry
) &&
512 ((action
=MBCS_ENTRY_FINAL_ACTION(entry
))==MBCS_STATE_VALID_DIRECT_16
||
513 action
==MBCS_STATE_UNASSIGNED
)
520 fprintf(stderr
, "ucm warning: min B/char too small\n");
525 * make sure that all "next state" values are within limits
526 * and that all next states after final ones have the "direct"
527 * flag of initial states
529 for(state
=states
->countStates
-1; state
>=0; --state
) {
530 for(cell
=0; cell
<256; ++cell
) {
531 entry
=states
->stateTable
[state
][cell
];
532 if((uint8_t)MBCS_ENTRY_STATE(entry
)>=states
->countStates
) {
533 fprintf(stderr
, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
534 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
535 exit(U_INVALID_TABLE_FORMAT
);
537 if(MBCS_ENTRY_IS_FINAL(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
538 fprintf(stderr
, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
539 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
540 exit(U_INVALID_TABLE_FORMAT
);
541 } else if(MBCS_ENTRY_IS_TRANSITION(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
542 fprintf(stderr
, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
543 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
544 exit(U_INVALID_TABLE_FORMAT
);
549 /* is this an SI/SO (like EBCDIC-stateful) state table? */
550 if(states
->countStates
>=2 && (states
->stateFlags
[1]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
551 if(states
->maxCharLength
!=2) {
552 fprintf(stderr
, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states
->maxCharLength
);
553 exit(U_INVALID_TABLE_FORMAT
);
555 if(states
->countStates
<3) {
556 fprintf(stderr
, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states
->countStates
);
557 exit(U_INVALID_TABLE_FORMAT
);
559 /* are the SI/SO all in the right places? */
560 if( ignoreSISOCheck
||
561 (states
->stateTable
[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
562 states
->stateTable
[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0) &&
563 states
->stateTable
[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
564 states
->stateTable
[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0))
566 states
->outputType
=MBCS_OUTPUT_2_SISO
;
568 fprintf(stderr
, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
569 exit(U_INVALID_TABLE_FORMAT
);
576 /* check that no unexpected state is a "direct" one */
577 while(state
<states
->countStates
) {
578 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
579 fprintf(stderr
, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state
);
580 exit(U_INVALID_TABLE_FORMAT
);
588 /* find a fallback for this offset; return the index or -1 if not found */
589 U_CAPI
int32_t U_EXPORT2
590 ucm_findFallback(_MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
594 if(countToUFallbacks
==0) {
595 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
599 /* do a linear search for the fallback mapping (the table is not yet sorted) */
600 for(i
=0; i
<countToUFallbacks
; ++i
) {
601 if(offset
==toUFallbacks
[i
].offset
) {
609 * This function tries to compact toUnicode tables for 2-byte codepages
610 * by finding lead bytes with all-unassigned trail bytes and adding another state
614 compactToUnicode2(UCMStates
*states
,
615 uint16_t **pUnicodeCodeUnits
,
616 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
618 int32_t (*oldStateTable
)[256];
620 uint16_t *oldUnicodeCodeUnits
;
621 int32_t entry
, offset
, oldOffset
, trailOffset
, oldTrailOffset
, savings
, sum
;
622 int32_t i
, j
, leadState
, trailState
, newState
, fallback
;
625 /* find the lead state */
626 if(states
->outputType
==MBCS_OUTPUT_2_SISO
) {
627 /* use the DBCS lead state for SI/SO codepages */
633 /* find the main trail state: the most used target state */
634 uprv_memset(count
, 0, sizeof(count
));
635 for(i
=0; i
<256; ++i
) {
636 entry
=states
->stateTable
[leadState
][i
];
637 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
638 ++count
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
642 for(i
=1; i
<states
->countStates
; ++i
) {
643 if(count
[i
]>count
[trailState
]) {
648 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
649 uprv_memset(count
, 0, sizeof(count
));
651 /* for each lead byte */
652 for(i
=0; i
<256; ++i
) {
653 entry
=states
->stateTable
[leadState
][i
];
654 if(MBCS_ENTRY_IS_TRANSITION(entry
) && (MBCS_ENTRY_TRANSITION_STATE(entry
))==trailState
) {
655 /* the offset is different for each lead byte */
656 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
657 /* for each trail byte for this lead byte */
658 for(j
=0; j
<256; ++j
) {
659 entry
=states
->stateTable
[trailState
][j
];
660 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
661 case MBCS_STATE_VALID_16
:
662 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
663 if((*pUnicodeCodeUnits
)[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
666 j
=999; /* do not count for this lead byte because there are assignments */
669 case MBCS_STATE_VALID_16_PAIR
:
670 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
671 if((*pUnicodeCodeUnits
)[entry
]==0xfffe) {
674 j
=999; /* do not count for this lead byte because there are assignments */
682 /* all trail bytes for this lead byte are unassigned */
689 /* subtract from the possible savings the cost of an additional state */
690 savings
=savings
*2-1024; /* count bytes, not 16-bit words */
695 printf("compacting toUnicode data saves %ld bytes\n", (long)savings
);
697 if(states
->countStates
>=MBCS_MAX_STATE_COUNT
) {
698 fprintf(stderr
, "cannot compact toUnicode because the maximum number of states is reached\n");
702 /* make a copy of the state table */
703 oldStateTable
=(int32_t (*)[256])uprv_malloc(states
->countStates
*1024);
704 if(oldStateTable
==NULL
) {
705 fprintf(stderr
, "cannot compact toUnicode: out of memory\n");
708 uprv_memcpy(oldStateTable
, states
->stateTable
, states
->countStates
*1024);
710 /* add the new state */
712 * this function does not catch the degenerate case where all lead bytes
713 * have all-unassigned trail bytes and the lead state could be removed
715 newState
=states
->countStates
++;
716 states
->stateFlags
[newState
]=0;
717 /* copy the old trail state, turning all assigned states into unassigned ones */
718 for(i
=0; i
<256; ++i
) {
719 entry
=states
->stateTable
[trailState
][i
];
720 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
721 case MBCS_STATE_VALID_16
:
722 case MBCS_STATE_VALID_16_PAIR
:
723 states
->stateTable
[newState
][i
]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
726 states
->stateTable
[newState
][i
]=entry
;
731 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
732 for(i
=0; i
<256; ++i
) {
734 states
->stateTable
[leadState
][i
]=MBCS_ENTRY_SET_STATE(states
->stateTable
[leadState
][i
], newState
);
738 /* sum up the new state table */
739 for(i
=0; i
<states
->countStates
; ++i
) {
740 states
->stateFlags
[i
]&=~MBCS_STATE_FLAG_READY
;
742 sum
=sumUpStates(states
);
744 /* allocate a new, smaller code units array */
745 oldUnicodeCodeUnits
=*pUnicodeCodeUnits
;
747 *pUnicodeCodeUnits
=NULL
;
748 if(oldUnicodeCodeUnits
!=NULL
) {
749 uprv_free(oldUnicodeCodeUnits
);
751 uprv_free(oldStateTable
);
754 *pUnicodeCodeUnits
=(uint16_t *)uprv_malloc(sum
*sizeof(uint16_t));
755 if(*pUnicodeCodeUnits
==NULL
) {
756 fprintf(stderr
, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
758 /* revert to the old state table */
759 *pUnicodeCodeUnits
=oldUnicodeCodeUnits
;
760 --states
->countStates
;
761 uprv_memcpy(states
->stateTable
, oldStateTable
, states
->countStates
*1024);
762 uprv_free(oldStateTable
);
765 for(i
=0; i
<sum
; ++i
) {
766 (*pUnicodeCodeUnits
)[i
]=0xfffe;
769 /* copy the code units for all assigned characters */
771 * The old state table has the same lead _and_ trail states for assigned characters!
772 * The differences are in the offsets, and in the trail states for some unassigned characters.
773 * For each character with an assigned state in the new table, it was assigned in the old one.
774 * Only still-assigned characters are copied.
775 * Note that fallback mappings need to get their offset values adjusted.
778 /* for each initial state */
779 for(leadState
=0; leadState
<states
->countStates
; ++leadState
) {
780 if((states
->stateFlags
[leadState
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
781 /* for each lead byte from there */
782 for(i
=0; i
<256; ++i
) {
783 entry
=states
->stateTable
[leadState
][i
];
784 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
785 trailState
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
786 /* the new state does not have assigned states */
787 if(trailState
!=newState
) {
788 trailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
789 oldTrailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable
[leadState
][i
]);
790 /* for each trail byte */
791 for(j
=0; j
<256; ++j
) {
792 entry
=states
->stateTable
[trailState
][j
];
793 /* copy assigned-character code units and adjust fallback offsets */
794 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
795 case MBCS_STATE_VALID_16
:
796 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
797 /* find the old offset according to the old state table */
798 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
799 unit
=(*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
800 if(unit
==0xfffe && (fallback
=ucm_findFallback(toUFallbacks
, countToUFallbacks
, oldOffset
))>=0) {
801 toUFallbacks
[fallback
].offset
=0x80000000|offset
;
804 case MBCS_STATE_VALID_16_PAIR
:
805 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
806 /* find the old offset according to the old state table */
807 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
808 (*pUnicodeCodeUnits
)[offset
++]=oldUnicodeCodeUnits
[oldOffset
++];
809 (*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
821 /* remove temporary flags from fallback offsets that protected them from being modified twice */
822 for(i
=0; i
<countToUFallbacks
; ++i
) {
823 toUFallbacks
[i
].offset
&=0x7fffffff;
826 /* free temporary memory */
827 uprv_free(oldUnicodeCodeUnits
);
828 uprv_free(oldStateTable
);
832 * recursive sub-function of compactToUnicodeHelper()
834 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
835 * if all sequences from this state are unassigned, returns the
836 * <0 there are assignments in unicodeCodeUnits[]
837 * 0 no use of unicodeCodeUnits[]
840 findUnassigned(UCMStates
*states
,
841 uint16_t *unicodeCodeUnits
,
842 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
843 int32_t state
, int32_t offset
, uint32_t b
) {
844 int32_t i
, entry
, savings
, localSavings
, belowSavings
;
847 localSavings
=belowSavings
=0;
849 for(i
=0; i
<256; ++i
) {
850 entry
=states
->stateTable
[state
][i
];
851 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
852 savings
=findUnassigned(states
,
854 toUFallbacks
, countToUFallbacks
,
855 MBCS_ENTRY_TRANSITION_STATE(entry
),
856 offset
+MBCS_ENTRY_TRANSITION_OFFSET(entry
),
860 } else if(savings
>0) {
861 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
862 (unsigned long)((b
<<8)|i
), (long)state
, (long)savings
);
863 belowSavings
+=savings
;
865 } else if(!haveAssigned
) {
866 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
867 case MBCS_STATE_VALID_16
:
868 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
869 if(unicodeCodeUnits
[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
875 case MBCS_STATE_VALID_16_PAIR
:
876 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
877 if(unicodeCodeUnits
[entry
]==0xfffe) {
891 return localSavings
+belowSavings
;
895 /* helper function for finding compaction opportunities */
897 compactToUnicodeHelper(UCMStates
*states
,
898 uint16_t *unicodeCodeUnits
,
899 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
) {
900 int32_t state
, savings
;
902 /* for each initial state */
903 for(state
=0; state
<states
->countStates
; ++state
) {
904 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
905 savings
=findUnassigned(states
,
907 toUFallbacks
, countToUFallbacks
,
910 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
911 (long)state
, (long)savings
);
918 compareFallbacks(const void *context
, const void *fb1
, const void *fb2
) {
919 return ((const _MBCSToUFallback
*)fb1
)->offset
-((const _MBCSToUFallback
*)fb2
)->offset
;
922 U_CAPI
void U_EXPORT2
923 ucm_optimizeStates(UCMStates
*states
,
924 uint16_t **pUnicodeCodeUnits
,
925 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
927 UErrorCode errorCode
;
928 int32_t state
, cell
, entry
;
930 /* test each state table entry */
931 for(state
=0; state
<states
->countStates
; ++state
) {
932 for(cell
=0; cell
<256; ++cell
) {
933 entry
=states
->stateTable
[state
][cell
];
935 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
936 * and the code point is "unassigned" (0xfffe), then change it to
937 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
939 if(MBCS_ENTRY_SET_STATE(entry
, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, 0xfffe)) {
940 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_UNASSIGNED
);
945 /* try to compact the toUnicode tables */
946 if(states
->maxCharLength
==2) {
947 compactToUnicode2(states
, pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
, verbose
);
948 } else if(states
->maxCharLength
>2) {
950 compactToUnicodeHelper(states
, *pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
);
954 /* sort toUFallbacks */
956 * It should be safe to sort them before compactToUnicode2() is called,
957 * because it should not change the relative order of the offset values
958 * that it adjusts, but they need to be sorted at some point, and
961 if(countToUFallbacks
>0) {
962 errorCode
=U_ZERO_ERROR
; /* nothing bad will happen... */
963 uprv_sortArray(toUFallbacks
, countToUFallbacks
,
964 sizeof(_MBCSToUFallback
),
965 compareFallbacks
, NULL
, FALSE
, &errorCode
);
969 /* use a complete state table ----------------------------------------------- */
971 U_CAPI
int32_t U_EXPORT2
972 ucm_countChars(UCMStates
*states
,
973 const uint8_t *bytes
, int32_t length
) {
975 int32_t i
, entry
, count
;
982 if(states
->countStates
==0) {
983 fprintf(stderr
, "ucm error: there is no state information!\n");
987 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
988 if(length
==2 && states
->outputType
==MBCS_OUTPUT_2_SISO
) {
993 * Walk down the state table like in conversion,
994 * much like getNextUChar().
995 * We assume that c<=0x10ffff.
997 for(i
=0; i
<length
; ++i
) {
998 entry
=states
->stateTable
[state
][bytes
[i
]];
999 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
1000 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
1001 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
1003 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
1004 case MBCS_STATE_ILLEGAL
:
1005 fprintf(stderr
, "ucm error: byte sequence ends in illegal state\n");
1007 case MBCS_STATE_CHANGE_ONLY
:
1008 fprintf(stderr
, "ucm error: byte sequence ends in state-change-only\n");
1010 case MBCS_STATE_UNASSIGNED
:
1011 case MBCS_STATE_FALLBACK_DIRECT_16
:
1012 case MBCS_STATE_VALID_DIRECT_16
:
1013 case MBCS_STATE_FALLBACK_DIRECT_20
:
1014 case MBCS_STATE_VALID_DIRECT_20
:
1015 case MBCS_STATE_VALID_16
:
1016 case MBCS_STATE_VALID_16_PAIR
:
1017 /* count a complete character and prepare for a new one */
1019 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
);
1023 /* reserved, must never occur */
1024 fprintf(stderr
, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry
);
1031 fprintf(stderr
, "ucm error: byte sequence too short, ends in non-final state %u\n", state
);
1036 * for SI/SO (like EBCDIC-stateful), multiple-character results
1037 * must consist of only double-byte sequences
1039 if(count
>1 && states
->outputType
==MBCS_OUTPUT_2_SISO
&& length
!=2*count
) {
1040 fprintf(stderr
, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count
);