2 *******************************************************************************
4 * Copyright (C) 2003-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucmstate.c
10 * tab size: 8 (not used)
13 * created on: 2003oct09
14 * created by: Markus W. Scherer
16 * This file handles ICU .ucm file state information as part of the ucm module.
17 * Most of this code used to be in makeconv.c.
20 #include "unicode/utypes.h"
30 #if !UCONFIG_NO_CONVERSION
32 /* MBCS state handling ------------------------------------------------------ */
35 * state table row grammar (ebnf-style):
36 * (whitespace is allowed between all tokens)
38 * row=[[firstentry ','] entry (',' entry)*]
39 * firstentry="initial" | "surrogates"
40 * (initial state (default for state 0), output is all surrogate pairs)
41 * entry=range [':' nextstate] ['.' action]
42 * range=number ['-' number]
45 * action='u' | 's' | 'p' | 'i'
46 * (unassigned, state change only, surrogate pair, illegal)
47 * number=(1- or 2-digit hexadecimal number)
50 parseState(const char *s
, int32_t state
[256], uint32_t *pFlags
) {
52 uint32_t start
, end
, i
;
55 /* initialize the state: all illegal with U+ffff */
56 for(i
=0; i
<256; ++i
) {
57 state
[i
]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL
, 0xffff);
60 /* skip leading white space */
61 s
=u_skipWhitespace(s
);
63 /* is there an "initial" or "surrogates" directive? */
64 if(uprv_strncmp("initial", s
, 7)==0) {
65 *pFlags
=MBCS_STATE_FLAG_DIRECT
;
66 s
=u_skipWhitespace(s
+7);
70 } else if(*pFlags
==0 && uprv_strncmp("surrogates", s
, 10)==0) {
71 *pFlags
=MBCS_STATE_FLAG_SURROGATES
;
72 s
=u_skipWhitespace(s
+10);
77 /* empty state row: all-illegal */
82 /* read an entry, the start of the range first */
83 s
=u_skipWhitespace(s
);
84 start
=uprv_strtoul(s
, (char **)&t
, 16);
85 if(s
==t
|| 0xff<start
) {
88 s
=u_skipWhitespace(t
);
90 /* read the end of the range if there is one */
92 s
=u_skipWhitespace(s
+1);
93 end
=uprv_strtoul(s
, (char **)&t
, 16);
94 if(s
==t
|| end
<start
|| 0xff<end
) {
97 s
=u_skipWhitespace(t
);
102 /* determine the state entries for this range */
103 if(*s
!=':' && *s
!='.') {
104 /* the default is: final state with valid entries */
105 entry
=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16
, 0);
107 entry
=MBCS_ENTRY_TRANSITION(0, 0);
109 /* get the next state, default to 0 */
110 s
=u_skipWhitespace(s
+1);
111 i
=uprv_strtoul(s
, (char **)&t
, 16);
116 s
=u_skipWhitespace(t
);
117 entry
=MBCS_ENTRY_SET_STATE(entry
, i
);
121 /* get the state action, default to valid */
123 /* this is a final state */
124 entry
=MBCS_ENTRY_SET_FINAL(entry
);
126 s
=u_skipWhitespace(s
+1);
128 /* unassigned set U+fffe */
129 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
130 s
=u_skipWhitespace(s
+1);
132 if(*pFlags
!=MBCS_STATE_FLAG_DIRECT
) {
133 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16_PAIR
);
135 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
137 s
=u_skipWhitespace(s
+1);
139 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_CHANGE_ONLY
);
140 s
=u_skipWhitespace(s
+1);
142 /* illegal set U+ffff */
143 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_ILLEGAL
, 0xffff);
144 s
=u_skipWhitespace(s
+1);
146 /* default to valid */
147 entry
=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_VALID_16
);
150 /* this is an intermediate state, nothing to do */
154 /* adjust "final valid" states according to the state flags */
155 if(MBCS_ENTRY_FINAL_ACTION(entry
)==MBCS_STATE_VALID_16
) {
160 case MBCS_STATE_FLAG_DIRECT
:
161 /* set the valid-direct code point to "unassigned"==0xfffe */
162 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_DIRECT_16
, 0xfffe);
164 case MBCS_STATE_FLAG_SURROGATES
:
165 entry
=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_VALID_16_PAIR
, 0);
172 /* set this entry for the range */
173 for(i
=start
; i
<=end
; ++i
) {
180 return *s
==0 ? NULL
: s
;
185 U_CAPI
void U_EXPORT2
186 ucm_addState(UCMStates
*states
, const char *s
) {
189 if(states
->countStates
==MBCS_MAX_STATE_COUNT
) {
190 fprintf(stderr
, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT
);
191 exit(U_INVALID_TABLE_FORMAT
);
194 error
=parseState(s
, states
->stateTable
[states
->countStates
],
195 &states
->stateFlags
[states
->countStates
]);
197 fprintf(stderr
, "ucm error: parse error in state definition at '%s'\n", error
);
198 exit(U_INVALID_TABLE_FORMAT
);
201 ++states
->countStates
;
204 U_CAPI UBool U_EXPORT2
205 ucm_parseHeaderLine(UCMFile
*ucm
,
206 char *line
, char **pKey
, char **pValue
) {
213 /* remove comments and trailing CR and LF and remove whitespace from the end */
214 for(end
=line
; (c
=*end
)!=0; ++end
) {
215 if(c
=='#' || c
=='\r' || c
=='\n') {
219 while(end
>line
&& (*(end
-1)==' ' || *(end
-1)=='\t')) {
224 /* skip leading white space and ignore empty lines */
225 s
=(char *)u_skipWhitespace(line
);
230 /* stop at the beginning of the mapping section */
231 if(uprv_memcmp(s
, "CHARMAP", 7)==0) {
235 /* get the key name, bracketed in <> */
237 fprintf(stderr
, "ucm error: no header field <key> in line \"%s\"\n", line
);
238 exit(U_INVALID_TABLE_FORMAT
);
243 fprintf(stderr
, "ucm error: incomplete header field <key> in line \"%s\"\n", line
);
244 exit(U_INVALID_TABLE_FORMAT
);
250 /* get the value string, possibly quoted */
251 s
=(char *)u_skipWhitespace(s
+1);
255 /* remove the quotes */
257 if(end
>*pValue
&& *(end
-1)=='"') {
262 /* collect the information from the header field, ignore unknown keys */
263 if(uprv_strcmp(*pKey
, "uconv_class")==0) {
264 if(uprv_strcmp(*pValue
, "DBCS")==0) {
265 states
->conversionType
=UCNV_DBCS
;
266 } else if(uprv_strcmp(*pValue
, "SBCS")==0) {
267 states
->conversionType
= UCNV_SBCS
;
268 } else if(uprv_strcmp(*pValue
, "MBCS")==0) {
269 states
->conversionType
= UCNV_MBCS
;
270 } else if(uprv_strcmp(*pValue
, "EBCDIC_STATEFUL")==0) {
271 states
->conversionType
= UCNV_EBCDIC_STATEFUL
;
273 fprintf(stderr
, "ucm error: unknown <uconv_class> %s\n", *pValue
);
274 exit(U_INVALID_TABLE_FORMAT
);
277 } else if(uprv_strcmp(*pKey
, "mb_cur_max")==0) {
279 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
280 states
->maxCharLength
=(int8_t)(c
-'0');
281 states
->outputType
=(int8_t)(states
->maxCharLength
-1);
283 fprintf(stderr
, "ucm error: illegal <mb_cur_max> %s\n", *pValue
);
284 exit(U_INVALID_TABLE_FORMAT
);
287 } else if(uprv_strcmp(*pKey
, "mb_cur_min")==0) {
289 if('1'<=c
&& c
<='4' && (*pValue
)[1]==0) {
290 states
->minCharLength
=(int8_t)(c
-'0');
292 fprintf(stderr
, "ucm error: illegal <mb_cur_min> %s\n", *pValue
);
293 exit(U_INVALID_TABLE_FORMAT
);
296 } else if(uprv_strcmp(*pKey
, "icu:state")==0) {
297 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
298 switch(states
->conversionType
) {
301 case UCNV_EBCDIC_STATEFUL
:
302 states
->conversionType
=UCNV_MBCS
;
307 fprintf(stderr
, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
308 exit(U_INVALID_TABLE_FORMAT
);
311 if(states
->maxCharLength
==0) {
312 fprintf(stderr
, "ucm error: <icu:state> before the <mb_cur_max> line\n");
313 exit(U_INVALID_TABLE_FORMAT
);
315 ucm_addState(states
, *pValue
);
317 } else if(uprv_strcmp(*pKey
, "icu:base")==0) {
319 fprintf(stderr
, "ucm error: <icu:base> without a base table name\n");
320 exit(U_INVALID_TABLE_FORMAT
);
322 uprv_strcpy(ucm
->baseName
, *pValue
);
329 /* post-processing ---------------------------------------------------------- */
332 sumUpStates(UCMStates
*states
) {
333 int32_t entry
, sum
, state
, cell
, count
;
334 UBool allStatesReady
;
337 * Sum up the offsets for all states.
338 * In each final state (where there are only final entries),
339 * the offsets add up directly.
340 * In all other state table rows, for each transition entry to another state,
341 * the offsets sum of that state needs to be added.
342 * This is achieved in at most countStates iterations.
344 allStatesReady
=FALSE
;
345 for(count
=states
->countStates
; !allStatesReady
&& count
>=0; --count
) {
347 for(state
=states
->countStates
-1; state
>=0; --state
) {
348 if(!(states
->stateFlags
[state
]&MBCS_STATE_FLAG_READY
)) {
349 allStatesReady
=FALSE
;
352 /* at first, add up only the final delta offsets to keep them <512 */
353 for(cell
=0; cell
<256; ++cell
) {
354 entry
=states
->stateTable
[state
][cell
];
355 if(MBCS_ENTRY_IS_FINAL(entry
)) {
356 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
357 case MBCS_STATE_VALID_16
:
358 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
361 case MBCS_STATE_VALID_16_PAIR
:
362 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_VALUE(entry
, sum
);
372 /* now, add up the delta offsets for the transitional entries */
373 for(cell
=0; cell
<256; ++cell
) {
374 entry
=states
->stateTable
[state
][cell
];
375 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
376 if(states
->stateFlags
[MBCS_ENTRY_TRANSITION_STATE(entry
)]&MBCS_STATE_FLAG_READY
) {
377 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry
, sum
);
378 sum
+=states
->stateOffsetSum
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
380 /* that next state does not have a sum yet, we cannot finish the one for this state */
388 states
->stateOffsetSum
[state
]=sum
;
389 states
->stateFlags
[state
]|=MBCS_STATE_FLAG_READY
;
395 if(!allStatesReady
) {
396 fprintf(stderr
, "ucm error: the state table contains loops\n");
397 exit(U_INVALID_TABLE_FORMAT
);
401 * For all "direct" (i.e., initial) states>0,
402 * the offsets need to be increased by the sum of
403 * the previous initial states.
405 sum
=states
->stateOffsetSum
[0];
406 for(state
=1; state
<states
->countStates
; ++state
) {
407 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
409 sum
+=states
->stateOffsetSum
[state
];
410 for(cell
=0; cell
<256; ++cell
) {
411 entry
=states
->stateTable
[state
][cell
];
412 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
413 states
->stateTable
[state
][cell
]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry
, sum2
);
419 /* round up to the next even number to have the following data 32-bit-aligned */
420 return states
->countToUCodeUnits
=(sum
+1)&~1;
423 U_CAPI
void U_EXPORT2
424 ucm_processStates(UCMStates
*states
) {
425 int32_t entry
, state
, cell
, count
;
427 if(states
->conversionType
==UCNV_UNSUPPORTED_CONVERTER
) {
428 fprintf(stderr
, "ucm error: missing conversion type (<uconv_class>)\n");
429 exit(U_INVALID_TABLE_FORMAT
);
432 if(states
->countStates
==0) {
433 switch(states
->conversionType
) {
435 /* SBCS: use MBCS data structure with a default state table */
436 if(states
->maxCharLength
!=1) {
437 fprintf(stderr
, "error: SBCS codepage with max B/char!=1\n");
438 exit(U_INVALID_TABLE_FORMAT
);
440 states
->conversionType
=UCNV_MBCS
;
441 ucm_addState(states
, "0-ff");
444 fprintf(stderr
, "ucm error: missing state table information (<icu:state>) for MBCS\n");
445 exit(U_INVALID_TABLE_FORMAT
);
447 case UCNV_EBCDIC_STATEFUL
:
448 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
449 if(states
->minCharLength
!=1 || states
->maxCharLength
!=2) {
450 fprintf(stderr
, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
451 exit(U_INVALID_TABLE_FORMAT
);
453 states
->conversionType
=UCNV_MBCS
;
454 ucm_addState(states
, "0-ff, e:1.s, f:0.s");
455 ucm_addState(states
, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
456 ucm_addState(states
, "0-40:1.i, 41-fe:1., ff:1.i");
457 ucm_addState(states
, "0-ff:1.i, 40:1.");
458 ucm_addState(states
, "0-ff:1.i");
461 /* DBCS: use MBCS data structure with a default state table */
462 if(states
->minCharLength
!=2 || states
->maxCharLength
!=2) {
463 fprintf(stderr
, "error: DBCS codepage with min or max B/char!=2\n");
464 exit(U_INVALID_TABLE_FORMAT
);
466 states
->conversionType
= UCNV_MBCS
;
467 ucm_addState(states
, "0-3f:3, 40:2, 41-fe:1, ff:3");
468 ucm_addState(states
, "41-fe");
469 ucm_addState(states
, "40");
470 ucm_addState(states
, "");
473 fprintf(stderr
, "ucm error: unknown charset structure\n");
474 exit(U_INVALID_TABLE_FORMAT
);
480 * check that the min/max character lengths are reasonable;
481 * to do this right, all paths through the state table would have to be
482 * recursively walked while keeping track of the sequence lengths,
483 * but these simple checks cover most state tables in practice
485 if(states
->maxCharLength
<states
->minCharLength
) {
486 fprintf(stderr
, "ucm error: max B/char < min B/char\n");
487 exit(U_INVALID_TABLE_FORMAT
);
490 /* count non-direct states and compare with max B/char */
492 for(state
=0; state
<states
->countStates
; ++state
) {
493 if((states
->stateFlags
[state
]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
497 if(states
->maxCharLength
>count
+1) {
498 fprintf(stderr
, "ucm error: max B/char too large\n");
499 exit(U_INVALID_TABLE_FORMAT
);
502 if(states
->minCharLength
==1) {
506 * if there are single-byte characters,
507 * then the initial state must have direct result states
509 for(cell
=0; cell
<256; ++cell
) {
510 entry
=states
->stateTable
[0][cell
];
511 if( MBCS_ENTRY_IS_FINAL(entry
) &&
512 ((action
=MBCS_ENTRY_FINAL_ACTION(entry
))==MBCS_STATE_VALID_DIRECT_16
||
513 action
==MBCS_STATE_UNASSIGNED
)
520 fprintf(stderr
, "ucm warning: min B/char too small\n");
525 * make sure that all "next state" values are within limits
526 * and that all next states after final ones have the "direct"
527 * flag of initial states
529 for(state
=states
->countStates
-1; state
>=0; --state
) {
530 for(cell
=0; cell
<256; ++cell
) {
531 entry
=states
->stateTable
[state
][cell
];
532 if((uint8_t)MBCS_ENTRY_STATE(entry
)>=states
->countStates
) {
533 fprintf(stderr
, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
534 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
535 exit(U_INVALID_TABLE_FORMAT
);
537 if(MBCS_ENTRY_IS_FINAL(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)!=MBCS_STATE_FLAG_DIRECT
) {
538 fprintf(stderr
, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
539 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
540 exit(U_INVALID_TABLE_FORMAT
);
541 } else if(MBCS_ENTRY_IS_TRANSITION(entry
) && (states
->stateFlags
[MBCS_ENTRY_STATE(entry
)]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
542 fprintf(stderr
, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
543 (int)state
, (int)cell
, (int)MBCS_ENTRY_STATE(entry
));
544 exit(U_INVALID_TABLE_FORMAT
);
549 /* is this an SI/SO (like EBCDIC-stateful) state table? */
550 if(states
->countStates
>=2 && (states
->stateFlags
[1]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
551 if(states
->maxCharLength
!=2) {
552 fprintf(stderr
, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states
->maxCharLength
);
553 exit(U_INVALID_TABLE_FORMAT
);
555 if(states
->countStates
<3) {
556 fprintf(stderr
, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states
->countStates
);
557 exit(U_INVALID_TABLE_FORMAT
);
559 /* are the SI/SO all in the right places? */
560 if( states
->stateTable
[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
561 states
->stateTable
[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0) &&
562 states
->stateTable
[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY
, 0) &&
563 states
->stateTable
[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY
, 0)
565 states
->outputType
=MBCS_OUTPUT_2_SISO
;
567 fprintf(stderr
, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
568 exit(U_INVALID_TABLE_FORMAT
);
575 /* check that no unexpected state is a "direct" one */
576 while(state
<states
->countStates
) {
577 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
578 fprintf(stderr
, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state
);
579 exit(U_INVALID_TABLE_FORMAT
);
587 /* find a fallback for this offset; return the index or -1 if not found */
588 U_CAPI
int32_t U_EXPORT2
589 ucm_findFallback(_MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
593 if(countToUFallbacks
==0) {
594 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
598 /* do a linear search for the fallback mapping (the table is not yet sorted) */
599 for(i
=0; i
<countToUFallbacks
; ++i
) {
600 if(offset
==toUFallbacks
[i
].offset
) {
608 * This function tries to compact toUnicode tables for 2-byte codepages
609 * by finding lead bytes with all-unassigned trail bytes and adding another state
613 compactToUnicode2(UCMStates
*states
,
614 uint16_t **pUnicodeCodeUnits
,
615 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
617 int32_t (*oldStateTable
)[256];
619 uint16_t *oldUnicodeCodeUnits
;
620 int32_t entry
, offset
, oldOffset
, trailOffset
, oldTrailOffset
, savings
, sum
;
621 int32_t i
, j
, leadState
, trailState
, newState
, fallback
;
624 /* find the lead state */
625 if(states
->outputType
==MBCS_OUTPUT_2_SISO
) {
626 /* use the DBCS lead state for SI/SO codepages */
632 /* find the main trail state: the most used target state */
633 uprv_memset(count
, 0, sizeof(count
));
634 for(i
=0; i
<256; ++i
) {
635 entry
=states
->stateTable
[leadState
][i
];
636 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
637 ++count
[MBCS_ENTRY_TRANSITION_STATE(entry
)];
641 for(i
=1; i
<states
->countStates
; ++i
) {
642 if(count
[i
]>count
[trailState
]) {
647 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
648 uprv_memset(count
, 0, sizeof(count
));
650 /* for each lead byte */
651 for(i
=0; i
<256; ++i
) {
652 entry
=states
->stateTable
[leadState
][i
];
653 if(MBCS_ENTRY_IS_TRANSITION(entry
) && (MBCS_ENTRY_TRANSITION_STATE(entry
))==trailState
) {
654 /* the offset is different for each lead byte */
655 offset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
656 /* for each trail byte for this lead byte */
657 for(j
=0; j
<256; ++j
) {
658 entry
=states
->stateTable
[trailState
][j
];
659 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
660 case MBCS_STATE_VALID_16
:
661 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
662 if((*pUnicodeCodeUnits
)[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
665 j
=999; /* do not count for this lead byte because there are assignments */
668 case MBCS_STATE_VALID_16_PAIR
:
669 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
670 if((*pUnicodeCodeUnits
)[entry
]==0xfffe) {
673 j
=999; /* do not count for this lead byte because there are assignments */
681 /* all trail bytes for this lead byte are unassigned */
688 /* subtract from the possible savings the cost of an additional state */
689 savings
=savings
*2-1024; /* count bytes, not 16-bit words */
694 printf("compacting toUnicode data saves %ld bytes\n", (long)savings
);
696 if(states
->countStates
>=MBCS_MAX_STATE_COUNT
) {
697 fprintf(stderr
, "cannot compact toUnicode because the maximum number of states is reached\n");
701 /* make a copy of the state table */
702 oldStateTable
=(int32_t (*)[256])uprv_malloc(states
->countStates
*1024);
703 if(oldStateTable
==NULL
) {
704 fprintf(stderr
, "cannot compact toUnicode: out of memory\n");
707 uprv_memcpy(oldStateTable
, states
->stateTable
, states
->countStates
*1024);
709 /* add the new state */
711 * this function does not catch the degenerate case where all lead bytes
712 * have all-unassigned trail bytes and the lead state could be removed
714 newState
=states
->countStates
++;
715 states
->stateFlags
[newState
]=0;
716 /* copy the old trail state, turning all assigned states into unassigned ones */
717 for(i
=0; i
<256; ++i
) {
718 entry
=states
->stateTable
[trailState
][i
];
719 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
720 case MBCS_STATE_VALID_16
:
721 case MBCS_STATE_VALID_16_PAIR
:
722 states
->stateTable
[newState
][i
]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry
, MBCS_STATE_UNASSIGNED
, 0xfffe);
725 states
->stateTable
[newState
][i
]=entry
;
730 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
731 for(i
=0; i
<256; ++i
) {
733 states
->stateTable
[leadState
][i
]=MBCS_ENTRY_SET_STATE(states
->stateTable
[leadState
][i
], newState
);
737 /* sum up the new state table */
738 for(i
=0; i
<states
->countStates
; ++i
) {
739 states
->stateFlags
[i
]&=~MBCS_STATE_FLAG_READY
;
741 sum
=sumUpStates(states
);
743 /* allocate a new, smaller code units array */
744 oldUnicodeCodeUnits
=*pUnicodeCodeUnits
;
746 *pUnicodeCodeUnits
=NULL
;
747 if(oldUnicodeCodeUnits
!=NULL
) {
748 uprv_free(oldUnicodeCodeUnits
);
750 uprv_free(oldStateTable
);
753 *pUnicodeCodeUnits
=(uint16_t *)uprv_malloc(sum
*sizeof(uint16_t));
754 if(*pUnicodeCodeUnits
==NULL
) {
755 fprintf(stderr
, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
757 /* revert to the old state table */
758 *pUnicodeCodeUnits
=oldUnicodeCodeUnits
;
759 --states
->countStates
;
760 uprv_memcpy(states
->stateTable
, oldStateTable
, states
->countStates
*1024);
761 uprv_free(oldStateTable
);
764 for(i
=0; i
<sum
; ++i
) {
765 (*pUnicodeCodeUnits
)[i
]=0xfffe;
768 /* copy the code units for all assigned characters */
770 * The old state table has the same lead _and_ trail states for assigned characters!
771 * The differences are in the offsets, and in the trail states for some unassigned characters.
772 * For each character with an assigned state in the new table, it was assigned in the old one.
773 * Only still-assigned characters are copied.
774 * Note that fallback mappings need to get their offset values adjusted.
777 /* for each initial state */
778 for(leadState
=0; leadState
<states
->countStates
; ++leadState
) {
779 if((states
->stateFlags
[leadState
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
780 /* for each lead byte from there */
781 for(i
=0; i
<256; ++i
) {
782 entry
=states
->stateTable
[leadState
][i
];
783 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
784 trailState
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
785 /* the new state does not have assigned states */
786 if(trailState
!=newState
) {
787 trailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
788 oldTrailOffset
=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable
[leadState
][i
]);
789 /* for each trail byte */
790 for(j
=0; j
<256; ++j
) {
791 entry
=states
->stateTable
[trailState
][j
];
792 /* copy assigned-character code units and adjust fallback offsets */
793 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
794 case MBCS_STATE_VALID_16
:
795 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
796 /* find the old offset according to the old state table */
797 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
798 unit
=(*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
799 if(unit
==0xfffe && (fallback
=ucm_findFallback(toUFallbacks
, countToUFallbacks
, oldOffset
))>=0) {
800 toUFallbacks
[fallback
].offset
=0x80000000|offset
;
803 case MBCS_STATE_VALID_16_PAIR
:
804 offset
=trailOffset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
805 /* find the old offset according to the old state table */
806 oldOffset
=oldTrailOffset
+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable
[trailState
][j
]);
807 (*pUnicodeCodeUnits
)[offset
++]=oldUnicodeCodeUnits
[oldOffset
++];
808 (*pUnicodeCodeUnits
)[offset
]=oldUnicodeCodeUnits
[oldOffset
];
820 /* remove temporary flags from fallback offsets that protected them from being modified twice */
821 for(i
=0; i
<countToUFallbacks
; ++i
) {
822 toUFallbacks
[i
].offset
&=0x7fffffff;
825 /* free temporary memory */
826 uprv_free(oldUnicodeCodeUnits
);
827 uprv_free(oldStateTable
);
831 * recursive sub-function of compactToUnicodeHelper()
833 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
834 * if all sequences from this state are unassigned
835 * <0 there are assignments in unicodeCodeUnits[]
836 * 0 no use of unicodeCodeUnits[]
839 findUnassigned(UCMStates
*states
,
840 uint16_t *unicodeCodeUnits
,
841 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
842 int32_t state
, int32_t offset
, uint32_t b
) {
843 int32_t i
, entry
, savings
, localSavings
, belowSavings
;
846 localSavings
=belowSavings
=0;
848 for(i
=0; i
<256; ++i
) {
849 entry
=states
->stateTable
[state
][i
];
850 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
851 savings
=findUnassigned(states
,
853 toUFallbacks
, countToUFallbacks
,
854 MBCS_ENTRY_TRANSITION_STATE(entry
),
855 offset
+MBCS_ENTRY_TRANSITION_OFFSET(entry
),
859 } else if(savings
>0) {
860 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
861 (unsigned long)((b
<<8)|i
), (long)state
, (long)savings
);
862 belowSavings
+=savings
;
864 } else if(!haveAssigned
) {
865 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
866 case MBCS_STATE_VALID_16
:
867 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
868 if(unicodeCodeUnits
[entry
]==0xfffe && ucm_findFallback(toUFallbacks
, countToUFallbacks
, entry
)<0) {
874 case MBCS_STATE_VALID_16_PAIR
:
875 entry
=offset
+MBCS_ENTRY_FINAL_VALUE_16(entry
);
876 if(unicodeCodeUnits
[entry
]==0xfffe) {
890 return localSavings
+belowSavings
;
894 /* helper function for finding compaction opportunities */
896 compactToUnicodeHelper(UCMStates
*states
,
897 uint16_t *unicodeCodeUnits
,
898 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
) {
899 int32_t state
, savings
;
901 /* for each initial state */
902 for(state
=0; state
<states
->countStates
; ++state
) {
903 if((states
->stateFlags
[state
]&0xf)==MBCS_STATE_FLAG_DIRECT
) {
904 savings
=findUnassigned(states
,
906 toUFallbacks
, countToUFallbacks
,
909 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
910 (long)state
, (long)savings
);
917 compareFallbacks(const void *context
, const void *fb1
, const void *fb2
) {
918 return ((const _MBCSToUFallback
*)fb1
)->offset
-((const _MBCSToUFallback
*)fb2
)->offset
;
921 U_CAPI
void U_EXPORT2
922 ucm_optimizeStates(UCMStates
*states
,
923 uint16_t **pUnicodeCodeUnits
,
924 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
926 UErrorCode errorCode
;
927 int32_t state
, cell
, entry
;
929 /* test each state table entry */
930 for(state
=0; state
<states
->countStates
; ++state
) {
931 for(cell
=0; cell
<256; ++cell
) {
932 entry
=states
->stateTable
[state
][cell
];
934 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
935 * and the code point is "unassigned" (0xfffe), then change it to
936 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
938 if(MBCS_ENTRY_SET_STATE(entry
, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16
, 0xfffe)) {
939 states
->stateTable
[state
][cell
]=MBCS_ENTRY_FINAL_SET_ACTION(entry
, MBCS_STATE_UNASSIGNED
);
944 /* try to compact the toUnicode tables */
945 if(states
->maxCharLength
==2) {
946 compactToUnicode2(states
, pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
, verbose
);
947 } else if(states
->maxCharLength
>2) {
949 compactToUnicodeHelper(states
, *pUnicodeCodeUnits
, toUFallbacks
, countToUFallbacks
);
953 /* sort toUFallbacks */
955 * It should be safe to sort them before compactToUnicode2() is called,
956 * because it should not change the relative order of the offset values
957 * that it adjusts, but they need to be sorted at some point, and
960 if(countToUFallbacks
>0) {
961 errorCode
=U_ZERO_ERROR
; /* nothing bad will happen... */
962 uprv_sortArray(toUFallbacks
, countToUFallbacks
,
963 sizeof(_MBCSToUFallback
),
964 compareFallbacks
, NULL
, FALSE
, &errorCode
);
968 /* use a complete state table ----------------------------------------------- */
970 U_CAPI
int32_t U_EXPORT2
971 ucm_countChars(UCMStates
*states
,
972 const uint8_t *bytes
, int32_t length
) {
974 int32_t i
, entry
, count
;
981 if(states
->countStates
==0) {
982 fprintf(stderr
, "ucm error: there is no state information!\n");
986 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
987 if(length
==2 && states
->outputType
==MBCS_OUTPUT_2_SISO
) {
992 * Walk down the state table like in conversion,
993 * much like getNextUChar().
994 * We assume that c<=0x10ffff.
996 for(i
=0; i
<length
; ++i
) {
997 entry
=states
->stateTable
[state
][bytes
[i
]];
998 if(MBCS_ENTRY_IS_TRANSITION(entry
)) {
999 state
=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry
);
1000 offset
+=MBCS_ENTRY_TRANSITION_OFFSET(entry
);
1002 switch(MBCS_ENTRY_FINAL_ACTION(entry
)) {
1003 case MBCS_STATE_ILLEGAL
:
1004 fprintf(stderr
, "ucm error: byte sequence ends in illegal state\n");
1006 case MBCS_STATE_CHANGE_ONLY
:
1007 fprintf(stderr
, "ucm error: byte sequence ends in state-change-only\n");
1009 case MBCS_STATE_UNASSIGNED
:
1010 case MBCS_STATE_FALLBACK_DIRECT_16
:
1011 case MBCS_STATE_VALID_DIRECT_16
:
1012 case MBCS_STATE_FALLBACK_DIRECT_20
:
1013 case MBCS_STATE_VALID_DIRECT_20
:
1014 case MBCS_STATE_VALID_16
:
1015 case MBCS_STATE_VALID_16_PAIR
:
1016 /* count a complete character and prepare for a new one */
1018 state
=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry
);
1022 /* reserved, must never occur */
1023 fprintf(stderr
, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry
);
1030 fprintf(stderr
, "ucm error: byte sequence too short, ends in non-final state %hu\n", state
);
1035 * for SI/SO (like EBCDIC-stateful), multiple-character results
1036 * must consist of only double-byte sequences
1038 if(count
>1 && states
->outputType
==MBCS_OUTPUT_2_SISO
&& length
!=2*count
) {
1039 fprintf(stderr
, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count
);