/*
*******************************************************************************
*
-* Copyright (C) 2000-2003, International Business Machines
+* Copyright (C) 2000-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
#include "unewdata.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
+#include "ucm.h"
#include "makeconv.h"
#include "genmbcs.h"
-enum {
- MBCS_STATE_FLAG_DIRECT=1,
- MBCS_STATE_FLAG_SURROGATES,
-
- MBCS_STATE_FLAG_READY=16
-};
-
-enum {
- MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */
- MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
- MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */
- MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */
- MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
- MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
-
- MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
- MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
-
- MBCS_MAX_STATE_COUNT=128,
- MBCS_MAX_FALLBACK_COUNT=8192
-};
-
typedef struct MBCSData {
NewConverter newConverter;
- /* toUnicode */
- int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
- uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
- stateOffsetSum[MBCS_MAX_STATE_COUNT];
+ UCMFile *ucm;
+
+ /* toUnicode (state table in ucm->states) */
_MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
+ int32_t countToUFallbacks;
uint16_t *unicodeCodeUnits;
- _MBCSHeader header;
- int32_t countToUCodeUnits;
/* fromUnicode */
uint16_t stage1[MBCS_STAGE_1_SIZE];
uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
uint8_t *fromUBytes;
- uint32_t stage2Top, stage3Top, maxCharLength;
+ uint32_t stage2Top, stage3Top;
} MBCSData;
/* prototypes */
MBCSClose(NewConverter *cnvData);
static UBool
-MBCSProcessStates(NewConverter *cnvData);
+MBCSStartMappings(MBCSData *mbcsData);
static UBool
-MBCSAddToUnicode(NewConverter *cnvData,
+MBCSAddToUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
+ UChar32 c,
+ int8_t flag);
static UBool
MBCSIsValid(NewConverter *cnvData,
- const uint8_t *bytes, int32_t length,
- uint32_t b);
+ const uint8_t *bytes, int32_t length);
static UBool
-MBCSSingleAddFromUnicode(NewConverter *cnvData,
+MBCSSingleAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
+ UChar32 c,
+ int8_t flag);
static UBool
-MBCSAddFromUnicode(NewConverter *cnvData,
+MBCSAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
+ UChar32 c,
+ int8_t flag);
static void
-MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData);
+MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData);
+
+static UBool
+MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
static uint32_t
-MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData);
+MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
+ UNewDataMemory *pData, int32_t tableType);
+
+/* helper ------------------------------------------------------------------- */
+
+static U_INLINE char
+hexDigit(uint8_t digit) {
+ return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
+}
+
+static U_INLINE char *
+printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
+ char *s=buffer;
+ while(length>0) {
+ *s++=hexDigit((uint8_t)(*bytes>>4));
+ *s++=hexDigit((uint8_t)(*bytes&0xf));
+ ++bytes;
+ --length;
+ }
+
+ *s=0;
+ return buffer;
+}
/* implementation ----------------------------------------------------------- */
static void
-MBCSInit(MBCSData *mbcsData, uint8_t maxCharLength) {
- int i;
+MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
+ int32_t i, maxCharLength;
uprv_memset(mbcsData, 0, sizeof(MBCSData));
+ maxCharLength=ucm->states.maxCharLength;
+
+ mbcsData->ucm=ucm; /* aliased, not owned */
+
mbcsData->newConverter.close=MBCSClose;
- mbcsData->newConverter.startMappings=MBCSProcessStates;
mbcsData->newConverter.isValid=MBCSIsValid;
- mbcsData->newConverter.addToUnicode=MBCSAddToUnicode;
- if(maxCharLength==1) {
- mbcsData->newConverter.addFromUnicode=MBCSSingleAddFromUnicode;
- } else {
- mbcsData->newConverter.addFromUnicode=MBCSAddFromUnicode;
- }
- mbcsData->newConverter.finishMappings=MBCSPostprocess;
+ mbcsData->newConverter.addTable=MBCSAddTable;
mbcsData->newConverter.write=MBCSWrite;
- mbcsData->header.version[0]=4;
- mbcsData->header.version[1]=1;
- mbcsData->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */
mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */
- mbcsData->maxCharLength=maxCharLength;
- mbcsData->header.flags=maxCharLength-1; /* outputType */
/* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
}
NewConverter *
-MBCSOpen(uint8_t maxCharLength) {
+MBCSOpen(UCMFile *ucm) {
MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData));
if(mbcsData!=NULL) {
- MBCSInit(mbcsData, maxCharLength);
+ MBCSInit(mbcsData, ucm);
}
return &mbcsData->newConverter;
}
MBCSClose(NewConverter *cnvData) {
MBCSData *mbcsData=(MBCSData *)cnvData;
if(mbcsData!=NULL) {
- if(mbcsData->unicodeCodeUnits!=NULL) {
- uprv_free(mbcsData->unicodeCodeUnits);
- }
- if(mbcsData->fromUBytes!=NULL) {
- uprv_free(mbcsData->fromUBytes);
- }
+ uprv_free(mbcsData->unicodeCodeUnits);
+ uprv_free(mbcsData->fromUBytes);
uprv_free(mbcsData);
}
}
-static const char *
-skipWhitespace(const char *s) {
- while(*s==' ' || *s=='\t') {
- ++s;
- }
- return s;
-}
-
-/*
- * state table row grammar (ebnf-style):
- * (whitespace is allowed between all tokens)
- *
- * row=[[firstentry ','] entry (',' entry)*]
- * firstentry="initial" | "surrogates"
- * (initial state (default for state 0), output is all surrogate pairs)
- * entry=range [':' nextstate] ['.' action]
- * range=number ['-' number]
- * nextstate=number
- * (0..7f)
- * action='u' | 's' | 'p' | 'i'
- * (unassigned, state change only, surrogate pair, illegal)
- * number=(1- or 2-digit hexadecimal number)
- */
-static const char *
-parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
- const char *t;
- uint32_t start, end, i;
- int32_t entry;
-
- /* initialize the state: all illegal with U+ffff */
- for(i=0; i<256; ++i) {
- state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
- }
-
- /* skip leading white space */
- s=skipWhitespace(s);
-
- /* is there an "initial" or "surrogates" directive? */
- if(uprv_strncmp("initial", s, 7)==0) {
- *pFlags=MBCS_STATE_FLAG_DIRECT;
- s=skipWhitespace(s+7);
- if(*s++!=',') {
- return s-1;
- }
- } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
- *pFlags=MBCS_STATE_FLAG_SURROGATES;
- s=skipWhitespace(s+10);
- if(*s++!=',') {
- return s-1;
- }
- } else if(*s==0) {
- /* empty state row: all-illegal */
- return NULL;
- }
-
- for(;;) {
- /* read an entry, the start of the range first */
- s=skipWhitespace(s);
- start=uprv_strtoul(s, (char **)&t, 16);
- if(s==t || 0xff<start) {
- return s;
- }
- s=skipWhitespace(t);
-
- /* read the end of the range if there is one */
- if(*s=='-') {
- s=skipWhitespace(s+1);
- end=uprv_strtoul(s, (char **)&t, 16);
- if(s==t || end<start || 0xff<end) {
- return s;
- }
- s=skipWhitespace(t);
- } else {
- end=start;
- }
-
- /* determine the state entrys for this range */
- if(*s!=':' && *s!='.') {
- /* the default is: final state with valid entries */
- entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
- } else {
- entry=MBCS_ENTRY_TRANSITION(0, 0);
- if(*s==':') {
- /* get the next state, default to 0 */
- s=skipWhitespace(s+1);
- i=uprv_strtoul(s, (char **)&t, 16);
- if(s!=t) {
- if(0x7f<i) {
- return s;
- }
- s=skipWhitespace(t);
- entry=MBCS_ENTRY_SET_STATE(entry, i);
- }
- }
-
- /* get the state action, default to valid */
- if(*s=='.') {
- /* this is a final state */
- entry=MBCS_ENTRY_SET_FINAL(entry);
-
- s=skipWhitespace(s+1);
- if(*s=='u') {
- /* unassigned set U+fffe */
- entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
- s=skipWhitespace(s+1);
- } else if(*s=='p') {
- if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
- entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
- } else {
- entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
- }
- s=skipWhitespace(s+1);
- } else if(*s=='s') {
- entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
- s=skipWhitespace(s+1);
- } else if(*s=='i') {
- /* illegal set U+ffff */
- entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
- s=skipWhitespace(s+1);
- } else {
- /* default to valid */
- entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
- }
- } else {
- /* this is an intermediate state, nothing to do */
- }
- }
-
- /* adjust "final valid" states according to the state flags */
- if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
- switch(*pFlags) {
- case 0:
- /* no adjustment */
- break;
- case MBCS_STATE_FLAG_DIRECT:
- /* set the valid-direct code point to "unassigned"==0xfffe */
- entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
- break;
- case MBCS_STATE_FLAG_SURROGATES:
- entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
- break;
- default:
- break;
- }
- }
-
- /* set this entry for the range */
- for(i=start; i<=end; ++i) {
- state[i]=entry;
- }
-
- if(*s==',') {
- ++s;
- } else {
- return *s==0 ? NULL : s;
- }
- }
-}
-
-UBool
-MBCSAddState(NewConverter *cnvData, const char *s) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
- const char *error;
-
- if(mbcsData->header.countStates==MBCS_MAX_STATE_COUNT) {
- fprintf(stderr, "error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
- return FALSE;
- }
-
- error=parseState(s, mbcsData->stateTable[mbcsData->header.countStates],
- &mbcsData->stateFlags[mbcsData->header.countStates]);
- if(error!=NULL) {
- fprintf(stderr, "parse error in state definition at '%s'\n", error);
- return FALSE;
- }
-
- ++mbcsData->header.countStates;
- return TRUE;
-}
-
-static int32_t
-sumUpStates(MBCSData *mbcsData) {
- int32_t entry, sum;
- int state, cell, count;
- UBool allStatesReady;
-
- /*
- * Sum up the offsets for all states.
- * In each final state (where there are only final entries),
- * the offsets add up directly.
- * In all other state table rows, for each transition entry to another state,
- * the offsets sum of that state needs to be added.
- * This is achieved in at most countStates iterations.
- */
- allStatesReady=FALSE;
- for(count=mbcsData->header.countStates; !allStatesReady && count>=0; --count) {
- allStatesReady=TRUE;
- for(state=mbcsData->header.countStates-1; state>=0; --state) {
- if(!(mbcsData->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
- allStatesReady=FALSE;
- sum=0;
-
- /* at first, add up only the final delta offsets to keep them <512 */
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if(MBCS_ENTRY_IS_FINAL(entry)) {
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
- sum+=1;
- break;
- case MBCS_STATE_VALID_16_PAIR:
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
- sum+=2;
- break;
- default:
- /* no addition */
- break;
- }
- }
- }
-
- /* now, add up the delta offsets for the transitional entries */
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- if(mbcsData->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
- sum+=mbcsData->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
- } else {
- /* that next state does not have a sum yet, we cannot finish the one for this state */
- sum=-1;
- break;
- }
- }
- }
-
- if(sum!=-1) {
- mbcsData->stateOffsetSum[state]=sum;
- mbcsData->stateFlags[state]|=MBCS_STATE_FLAG_READY;
- }
- }
- }
- }
-
- if(!allStatesReady) {
- fprintf(stderr, "error: the state table contains loops\n");
- return -1;
- }
-
- /*
- * For all "direct" (i.e., initial) states>0,
- * the offsets need to be increased by the sum of
- * the previous initial states.
- */
- sum=mbcsData->stateOffsetSum[0];
- for(state=1; state<(int)mbcsData->header.countStates; ++state) {
- if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- int32_t sum2=sum;
- sum+=mbcsData->stateOffsetSum[state];
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
- }
- }
- }
- }
- if(VERBOSE) {
- printf("the total number of offsets is 0x%lx=%ld\n",
- (unsigned long)sum, (long)sum);
- }
-
- /* round up to the next even number to have the following data 32-bit-aligned */
- sum=(sum+1)&~1;
- return mbcsData->countToUCodeUnits=sum;
-}
-
static UBool
-MBCSProcessStates(NewConverter *cnvData) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
- int32_t i, entry, sum;
- int state, cell;
-
- /*
- * first make sure that all "next state" values are within limits
- * and that all next states after final ones have the "direct"
- * flag of initial states
- */
- for(state=mbcsData->header.countStates-1; state>=0; --state) {
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if((uint8_t)MBCS_ENTRY_STATE(entry)>=mbcsData->header.countStates) {
- fprintf(stderr, "error: state table entry [%x][%x] has a next state of %x that is too high\n",
- state, cell, MBCS_ENTRY_STATE(entry));
- return FALSE;
- }
- if(MBCS_ENTRY_IS_FINAL(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
- fprintf(stderr, "error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
- state, cell, MBCS_ENTRY_STATE(entry));
- return FALSE;
- } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- fprintf(stderr, "error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
- state, cell, MBCS_ENTRY_STATE(entry));
- return FALSE;
- }
- }
- }
+MBCSStartMappings(MBCSData *mbcsData) {
+ int32_t i, sum;
- /* is this an SI/SO (like EBCDIC-stateful) state table? */
- if(mbcsData->header.countStates>=2 && (mbcsData->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- if(mbcsData->maxCharLength!=2) {
- fprintf(stderr, "error: SI/SO codepages must have max 2 bytes/char (not %x)\n", mbcsData->maxCharLength);
- return FALSE;
- }
- if(mbcsData->header.countStates<3) {
- fprintf(stderr, "error: SI/SO codepages must have at least 3 states (not %x)\n", mbcsData->header.countStates);
- return FALSE;
- }
- /* are the SI/SO all in the right places? */
- if( mbcsData->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
- mbcsData->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
- mbcsData->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
- mbcsData->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)
- ) {
- mbcsData->header.flags=MBCS_OUTPUT_2_SISO;
- } else {
- fprintf(stderr, "error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
- return FALSE;
- }
- state=2;
- } else {
- state=1;
- }
-
- /* check that no unexpected state is a "direct" one */
- while(state<(int)mbcsData->header.countStates) {
- if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- fprintf(stderr, "error: state %d is 'initial' - not supported except for SI/SO codepages\n", state);
- return FALSE;
- }
- ++state;
- }
-
- sum=sumUpStates(mbcsData);
- if(sum<0) {
- return FALSE;
+ /* allocate the code unit array and prefill it with "unassigned" values */
+ sum=mbcsData->ucm->states.countToUCodeUnits;
+ if(VERBOSE) {
+ printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum);
}
- /* allocate the code unit array and prefill it with "unassigned" values */
if(sum>0) {
mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
if(mbcsData->unicodeCodeUnits==NULL) {
}
/* allocate the codepage mappings and preset the first 16 characters to 0 */
- if(mbcsData->maxCharLength==1) {
+ if(mbcsData->ucm->states.maxCharLength==1) {
/* allocate 64k 16-bit results for single-byte codepages */
sum=0x20000;
} else {
/* allocate 1M * maxCharLength bytes for at most 1M mappings */
- sum=0x100000*mbcsData->maxCharLength;
+ sum=0x100000*mbcsData->ucm->states.maxCharLength;
}
mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
if(mbcsData->fromUBytes==NULL) {
- fprintf(stderr, "error: out of memory allocating %ldMB for target mappings\n",
- (long)sum);
+ fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum);
return FALSE;
}
/* initialize the all-unassigned first stage 3 block */
return TRUE;
}
-/* find a fallback for this offset; return the index or -1 if not found */
-static int32_t
-findFallback(MBCSData *mbcsData, uint32_t offset) {
- _MBCSToUFallback *toUFallbacks;
- int32_t i, limit;
-
- limit=mbcsData->header.countToUFallbacks;
- if(limit==0) {
- /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
- return -1;
- }
-
- /* do a linear search for the fallback mapping (the table is not yet sorted) */
- toUFallbacks=mbcsData->toUFallbacks;
- for(i=0; i<limit; ++i) {
- if(offset==toUFallbacks[i].offset) {
- return i;
- }
- }
- return -1;
-}
-
/* return TRUE for success */
static UBool
setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
- int32_t i=findFallback(mbcsData, offset);
+ int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
if(i>=0) {
/* if there is already a fallback for this offset, then overwrite it */
mbcsData->toUFallbacks[i].codePoint=c;
return TRUE;
} else {
/* if there is no fallback for this offset, then add one */
- i=mbcsData->header.countToUFallbacks;
+ i=mbcsData->countToUFallbacks;
if(i>=MBCS_MAX_FALLBACK_COUNT) {
- fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", c);
+ fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c);
return FALSE;
} else {
mbcsData->toUFallbacks[i].offset=offset;
mbcsData->toUFallbacks[i].codePoint=c;
- mbcsData->header.countToUFallbacks=i+1;
+ mbcsData->countToUFallbacks=i+1;
return TRUE;
}
}
/* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
static int32_t
removeFallback(MBCSData *mbcsData, uint32_t offset) {
- int32_t i=findFallback(mbcsData, offset);
+ int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
if(i>=0) {
_MBCSToUFallback *toUFallbacks;
int32_t limit, old;
toUFallbacks=mbcsData->toUFallbacks;
- limit=mbcsData->header.countToUFallbacks;
+ limit=mbcsData->countToUFallbacks;
old=(int32_t)toUFallbacks[i].codePoint;
/* copy the last fallback entry here to keep the list contiguous */
toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
- mbcsData->header.countToUFallbacks=limit-1;
+ mbcsData->countToUFallbacks=limit-1;
return old;
} else {
return -1;
* -1 the precision of this mapping is not specified
*/
static UBool
-MBCSAddToUnicode(NewConverter *cnvData,
+MBCSAddToUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
+ UChar32 c,
+ int8_t flag) {
+ char buffer[10];
uint32_t offset=0;
int32_t i=0, entry, old;
uint8_t state=0;
- if(mbcsData->header.countStates==0) {
+ if(mbcsData->ucm->states.countStates==0) {
fprintf(stderr, "error: there is no state information!\n");
return FALSE;
}
/* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
- if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
+ if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) {
state=1;
}
* We assume that c<=0x10ffff.
*/
for(i=0;;) {
- entry=mbcsData->stateTable[state][bytes[i++]];
+ entry=mbcsData->ucm->states.stateTable[state][bytes[i++]];
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
if(i==length) {
- fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx (U+%x)\n",
- state, (unsigned long)b, c);
+ fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
+ (short)state, printBytes(buffer, bytes, length), (int)c);
return FALSE;
}
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
} else {
if(i<length) {
- fprintf(stderr, "error: byte sequence too long by %d bytes, final state %hu: 0x%02lx (U+%x)\n",
- (length-i), state, (unsigned long)b, c);
+ fprintf(stderr, "error: byte sequence too long by %d bytes, final state %hu: 0x%s (U+%x)\n",
+ (int)(length-i), state, printBytes(buffer, bytes, length), (int)c);
return FALSE;
}
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
case MBCS_STATE_ILLEGAL:
- fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
+ (int)c, printBytes(buffer, bytes, length));
return FALSE;
case MBCS_STATE_CHANGE_ONLY:
- fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
+ (int)c, printBytes(buffer, bytes, length));
return FALSE;
case MBCS_STATE_UNASSIGNED:
- fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
+ (int)c, printBytes(buffer, bytes, length));
return FALSE;
case MBCS_STATE_FALLBACK_DIRECT_16:
case MBCS_STATE_VALID_DIRECT_16:
} else {
old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
}
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)old);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)old);
}
/*
* Continue after the above warning
*/
}
/* reassign the correct action code */
- entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(isFallback>0 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
+ entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
/* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
if(c<=0xffff) {
} else {
entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
}
- mbcsData->stateTable[state][bytes[i-1]]=entry;
+ mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry;
break;
case MBCS_STATE_VALID_16:
/* bits 26..16 are not used, 0 */
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
/* check that this byte sequence is still unassigned */
if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)old);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)old);
}
}
if(c>=0x10000) {
- fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
+ (int)c, printBytes(buffer, bytes, length));
return FALSE;
}
- if(isFallback>0) {
+ if(flag>0) {
/* assign only if there is no precise mapping */
if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
return setFallback(mbcsData, offset, c);
} else /* old<=0xe001 */ {
real=mbcsData->unicodeCodeUnits[offset+1];
}
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)real);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)real);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)real);
+ fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)real);
}
}
- if(isFallback>0) {
+ if(flag>0) {
/* assign only if there is no precise mapping */
if(old<=0xdbff || old==0xe000) {
/* do nothing */
break;
default:
/* reserved, must never occur */
- fprintf(stderr, "internal error: byte sequence reached reserved action code, entry0x%02lx: 0x%02lx (U+%x)\n",
- (unsigned long)entry, (unsigned long)b, c);
+ fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
+ (int)entry, printBytes(buffer, bytes, length), (int)c);
return FALSE;
}
/* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
static UBool
MBCSIsValid(NewConverter *cnvData,
- const uint8_t *bytes, int32_t length,
- uint32_t b) {
+ const uint8_t *bytes, int32_t length) {
MBCSData *mbcsData=(MBCSData *)cnvData;
- uint32_t offset=0;
- int32_t i=0, entry;
- uint8_t state=0;
-
- if(mbcsData->header.countStates==0) {
- fprintf(stderr, "error: there is no state information!\n");
- return FALSE;
- }
-
- /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
- if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
- state=1;
- }
- /*
- * Walk down the state table like in conversion,
- * much like getNextUChar().
- * We assume that c<=0x10ffff.
- */
- for(i=0;;) {
- entry=mbcsData->stateTable[state][bytes[i++]];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- if(i==length) {
- fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx\n",
- state, (unsigned long)b);
- return FALSE;
- }
- state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
- offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
- } else {
- if(i<length) {
- fprintf(stderr, "error: byte sequence too long by %d bytes, final state %hu: 0x%02lx\n",
- (length-i), state, (unsigned long)b);
- return FALSE;
- }
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_ILLEGAL:
- fprintf(stderr, "error: byte sequence ends in illegal state: 0x%02lx\n",
- (unsigned long)b);
- return FALSE;
- case MBCS_STATE_CHANGE_ONLY:
- fprintf(stderr, "error: byte sequence ends in state-change-only: 0x%02lx\n",
- (unsigned long)b);
- return FALSE;
- case MBCS_STATE_UNASSIGNED:
- fprintf(stderr, "error: byte sequence ends in unassigned state: 0x%02lx\n",
- (unsigned long)b);
- return FALSE;
- case MBCS_STATE_FALLBACK_DIRECT_16:
- case MBCS_STATE_VALID_DIRECT_16:
- case MBCS_STATE_FALLBACK_DIRECT_20:
- case MBCS_STATE_VALID_DIRECT_20:
- case MBCS_STATE_VALID_16:
- case MBCS_STATE_VALID_16_PAIR:
- return TRUE;
- default:
- /* reserved, must never occur */
- fprintf(stderr, "internal error: byte sequence reached reserved action code, entry0x%02lx: 0x%02lx\n",
- (long)entry, (unsigned long)b);
- return FALSE;
- }
- }
- }
+ return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length));
}
static UBool
-MBCSSingleAddFromUnicode(NewConverter *cnvData,
+MBCSSingleAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
+ UChar32 c,
+ int8_t flag) {
uint16_t *p;
uint32_t index;
uint16_t old;
+ uint8_t b;
+
+ /* ignore |2 SUB mappings */
+ if(flag==2) {
+ return TRUE;
+ }
/*
* Walk down the triple-stage compact array ("trie") and
* Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
* We assume that length<=maxCharLength and that c<=0x10ffff.
*/
+ b=*bytes;
/* inspect stage 1 */
index=c>>10;
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
/* allocate another block in stage 2 */
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
- fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b);
return FALSE;
}
if(mbcsData->stage2Single[index]==0) {
/* allocate another block in stage 3 */
if(mbcsData->stage3Top>=0x10000) {
- fprintf(stderr, "error: too many code points at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b);
return FALSE;
}
/* each block has 16 uint16_t entries */
/* write the codepage entry into stage 3 and get the previous entry */
p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf);
old=*p;
- if(isFallback<=0) {
+ if(flag<=0) {
*p=(uint16_t)(0xf00|b);
} else if(IS_PRIVATE_USE(c)) {
*p=(uint16_t)(0xc00|b);
/* check that this Unicode code point was still unassigned */
if(old>=0x100) {
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02x\n",
- c, (unsigned long)b, old&0xff);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
+ (int)c, b, old&0xff);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02x\n",
- c, (unsigned long)b, old&0xff);
+ fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
+ (int)c, b, old&0xff);
}
/* continue after the above warning if the precision of the mapping is unspecified */
}
}
static UBool
-MBCSAddFromUnicode(NewConverter *cnvData,
+MBCSAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
+ UChar32 c,
+ int8_t flag) {
+ char buffer[10];
+ const uint8_t *pb;
uint8_t *p;
- uint32_t index, old;
+ uint32_t index, b, old;
+ int32_t maxCharLength;
+
+ /* ignore |2 SUB mappings */
+ if(flag==2) {
+ return TRUE;
+ }
+
+ maxCharLength=mbcsData->ucm->states.maxCharLength;
+
+ if(maxCharLength==1) {
+ return MBCSSingleAddFromUnicode(mbcsData, bytes, length, c, flag);
+ }
- if( (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO &&
+ if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
(*bytes==0xe || *bytes==0xf)
) {
- fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
+ (int)c, printBytes(buffer, bytes, length));
return FALSE;
}
+
+ if(flag==1 && length==1 && *bytes==0) {
+ fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
+ (int)c, *bytes);
+ return FALSE;
+ }
+
/*
* Walk down the triple-stage compact array ("trie") and
* allocate parts as necessary.
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
/* allocate another block in stage 2 */
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
- fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
+ (int)c, printBytes(buffer, bytes, length));
return FALSE;
}
index=mbcsData->stage1[index]+((c>>4)&0x3f);
if(mbcsData->stage2[index]==0) {
/* allocate another block in stage 3 */
- if(mbcsData->stage3Top>=0x100000*mbcsData->maxCharLength) {
- fprintf(stderr, "error: too many code points at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ if(mbcsData->stage3Top>=0x100000*(uint32_t)maxCharLength) {
+ fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
+ (int)c, printBytes(buffer, bytes, length));
return FALSE;
}
/* each block has 16*maxCharLength bytes */
- mbcsData->stage2[index]=(mbcsData->stage3Top/16)/mbcsData->maxCharLength;
- uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*mbcsData->maxCharLength);
- mbcsData->stage3Top+=16*mbcsData->maxCharLength;
+ mbcsData->stage2[index]=(mbcsData->stage3Top/16)/maxCharLength;
+ uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*maxCharLength);
+ mbcsData->stage3Top+=16*maxCharLength;
}
/* write the codepage bytes into stage 3 and get the previous bytes */
+
+ /* assemble the bytes into a single integer */
+ pb=bytes;
+ b=0;
+ switch(length) {
+ case 4:
+ b=*pb++;
+ case 3:
+ b=(b<<8)|*pb++;
+ case 2:
+ b=(b<<8)|*pb++;
+ case 1:
+ default:
+ b=(b<<8)|*pb++;
+ break;
+ }
+
old=0;
- p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*mbcsData->maxCharLength;
- switch(mbcsData->maxCharLength) {
+ p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*maxCharLength;
+ switch(maxCharLength) {
case 2:
old=*(uint16_t *)p;
*(uint16_t *)p=(uint16_t)b;
/* check that this Unicode code point was still unassigned */
if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02lx\n",
- c, (unsigned long)b, (unsigned long)old);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)old);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02lx\n",
- c, (unsigned long)b, (unsigned long)old);
+ fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
+ (int)c, printBytes(buffer, bytes, length), (int)old);
}
/* continue after the above warning if the precision of the mapping is
unspecified */
}
- if(isFallback<=0) {
- /* set the "assigned" flag */
+ if(flag<=0) {
+ /* set the roundtrip flag */
mbcsData->stage2[index]|=(1UL<<(16+(c&0xf)));
}
return TRUE;
}
-static int
-compareFallbacks(const void *fb1, const void *fb2) {
- return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
-}
-
-/*
- * This function tries to compact toUnicode tables for 2-byte codepages
- * by finding lead bytes with all-unassigned trail bytes and adding another state
- * for them.
- */
-static void
-compactToUnicode2(MBCSData *mbcsData) {
- int32_t (*oldStateTable)[256];
- uint16_t count[256];
- uint16_t *oldUnicodeCodeUnits;
- int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
- int32_t i, j, leadState, trailState, newState, fallback;
- uint16_t unit;
-
- /* find the lead state */
- if((mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
- /* use the DBCS lead state for SI/SO codepages */
- leadState=1;
- } else {
- leadState=0;
- }
-
- /* find the main trail state: the most used target state */
- uprv_memset(count, 0, sizeof(count));
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[leadState][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
- }
- }
- trailState=0;
- for(i=1; i<(int)mbcsData->header.countStates; ++i) {
- if(count[i]>count[trailState]) {
- trailState=i;
- }
- }
-
- /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
- uprv_memset(count, 0, sizeof(count));
- savings=0;
- /* for each lead byte */
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[leadState][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
- /* the offset is different for each lead byte */
- offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
- /* for each trail byte for this lead byte */
- for(j=0; j<256; ++j) {
- entry=mbcsData->stateTable[trailState][j];
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) {
- ++count[i];
- } else {
- j=999; /* do not count for this lead byte because there are assignments */
- }
- break;
- case MBCS_STATE_VALID_16_PAIR:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe) {
- count[i]+=2;
- } else {
- j=999; /* do not count for this lead byte because there are assignments */
- }
- break;
- default:
- break;
- }
- }
- if(j==256) {
- /* all trail bytes for this lead byte are unassigned */
- savings+=count[i];
- } else {
- count[i]=0;
- }
- }
- }
- /* subtract from the possible savings the cost of an additional state */
- savings=savings*2-1024; /* count bytes, not 16-bit words */
- if(savings<=0) {
- return;
- }
- if(VERBOSE) {
- printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
- }
- if(mbcsData->header.countStates>=MBCS_MAX_STATE_COUNT) {
- fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
- return;
- }
-
- /* make a copy of the state table */
- oldStateTable=(int32_t (*)[256])uprv_malloc(mbcsData->header.countStates*1024);
- if(oldStateTable==NULL) {
- fprintf(stderr, "cannot compact toUnicode: out of memory\n");
- return;
- }
- uprv_memcpy(oldStateTable, mbcsData->stateTable, mbcsData->header.countStates*1024);
-
- /* add the new state */
- /*
- * this function does not catch the degenerate case where all lead bytes
- * have all-unassigned trail bytes and the lead state could be removed
- */
- newState=mbcsData->header.countStates++;
- mbcsData->stateFlags[newState]=0;
- /* copy the old trail state, turning all assigned states into unassigned ones */
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[trailState][i];
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- case MBCS_STATE_VALID_16_PAIR:
- mbcsData->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
- break;
- default:
- mbcsData->stateTable[newState][i]=entry;
- break;
- }
- }
-
- /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
- for(i=0; i<256; ++i) {
- if(count[i]>0) {
- mbcsData->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(mbcsData->stateTable[leadState][i], newState);
- }
+/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
+static UBool
+MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
+ MBCSData *mbcsData;
+ UCMapping *m;
+ UChar32 c;
+ int32_t i;
+ UBool isOK;
+
+ staticData->unicodeMask=table->unicodeMask;
+ if(staticData->unicodeMask==3) {
+ fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n");
+ return FALSE;
}
- /* sum up the new state table */
- for(i=0; i<(int)mbcsData->header.countStates; ++i) {
- mbcsData->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
- }
- sum=sumUpStates(mbcsData);
-
- /* allocate a new, smaller code units array */
- oldUnicodeCodeUnits=mbcsData->unicodeCodeUnits;
- if(sum==0) {
- mbcsData->unicodeCodeUnits=NULL;
- if(oldUnicodeCodeUnits!=NULL) {
- uprv_free(oldUnicodeCodeUnits);
- }
- uprv_free(oldStateTable);
- return;
- }
- mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
- if(mbcsData->unicodeCodeUnits==NULL) {
- fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
- (long)sum);
- /* revert to the old state table */
- mbcsData->unicodeCodeUnits=oldUnicodeCodeUnits;
- --mbcsData->header.countStates;
- uprv_memcpy(mbcsData->stateTable, oldStateTable, mbcsData->header.countStates*1024);
- uprv_free(oldStateTable);
- return;
- }
- for(i=0; i<sum; ++i) {
- mbcsData->unicodeCodeUnits[i]=0xfffe;
- }
+ staticData->conversionType=UCNV_MBCS;
- /* copy the code units for all assigned characters */
- /*
- * The old state table has the same lead _and_ trail states for assigned characters!
- * The differences are in the offsets, and in the trail states for some unassigned characters.
- * For each character with an assigned state in the new table, it was assigned in the old one.
- * Only still-assigned characters are copied.
- * Note that fallback mappings need to get their offset values adjusted.
- */
+ mbcsData=(MBCSData *)cnvData;
- /* for each initial state */
- for(leadState=0; leadState<(int)mbcsData->header.countStates; ++leadState) {
- if((mbcsData->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- /* for each lead byte from there */
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[leadState][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
- /* the new state does not have assigned states */
- if(trailState!=newState) {
- trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
- oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
- /* for each trail byte */
- for(j=0; j<256; ++j) {
- entry=mbcsData->stateTable[trailState][j];
- /* copy assigned-character code units and adjust fallback offsets */
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- /* find the old offset according to the old state table */
- oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
- unit=mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset];
- if(unit==0xfffe && (fallback=findFallback(mbcsData, oldOffset))>=0) {
- mbcsData->toUFallbacks[fallback].offset=0x80000000|offset;
- }
- break;
- case MBCS_STATE_VALID_16_PAIR:
- offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- /* find the old offset according to the old state table */
- oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
- mbcsData->unicodeCodeUnits[offset++]=oldUnicodeCodeUnits[oldOffset++];
- mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset];
- break;
- default:
- break;
- }
- }
- }
- }
- }
- }
+ if(!MBCSStartMappings(mbcsData)) {
+ return FALSE;
}
- /* remove temporary flags from fallback offsets that protected them from being modified twice */
- sum=mbcsData->header.countToUFallbacks;
- for(i=0; i<sum; ++i) {
- mbcsData->toUFallbacks[i].offset&=0x7fffffff;
- }
+ isOK=TRUE;
- /* free temporary memory */
- uprv_free(oldUnicodeCodeUnits);
- uprv_free(oldStateTable);
-}
+ m=table->mappings;
+ for(i=0; i<table->mappingsLength; ++m, ++i) {
+ c=m->u;
-/*
- * recursive sub-function of compactToUnicodeHelper()
- * returns:
- * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
- * if all sequences from this state are unassigned, returns the
- * <0 there are assignments in unicodeCodeUnits[]
- * 0 no use of unicodeCodeUnits[]
- */
-static int32_t
-findUnassigned(MBCSData *mbcsData, int32_t state, int32_t offset, uint32_t b) {
- int32_t i, entry, savings, localSavings, belowSavings;
- UBool haveAssigned;
-
- localSavings=belowSavings=0;
- haveAssigned=FALSE;
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[state][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- savings=findUnassigned(mbcsData, MBCS_ENTRY_TRANSITION_STATE(entry), offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), (b<<8)|(uint32_t)i);
- if(savings<0) {
- haveAssigned=TRUE;
- } else if(savings>0) {
- printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
- (unsigned long)((b<<8)|i), (long)state, (long)savings);
- belowSavings+=savings;
- }
- } else if(!haveAssigned) {
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) {
- localSavings+=2;
- } else {
- haveAssigned=TRUE;
- }
- break;
- case MBCS_STATE_VALID_16_PAIR:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe) {
- localSavings+=4;
- } else {
- haveAssigned=TRUE;
- }
- break;
- default:
- break;
- }
+ switch(m->f) {
+ case -1:
+ /* there was no precision/fallback indicator */
+ /* fall through to set the mappings */
+ case 0:
+ /* set roundtrip mappings */
+ isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f) &&
+ MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
+ break;
+ case 1:
+ /* set only a fallback mapping from Unicode to codepage */
+ staticData->hasFromUnicodeFallback=TRUE;
+ isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
+ break;
+ case 2:
+ /* ignore |2 SUB mappings */
+ break;
+ case 3:
+ /* set only a fallback mapping from codepage to Unicode */
+ staticData->hasToUnicodeFallback=TRUE;
+ isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
+ break;
+ default:
+ /* will not occur because the parser checked it already */
+ fprintf(stderr, "error: illegal fallback indicator %d\n", m->f);
+ return FALSE;
}
}
- if(haveAssigned) {
- return -1;
- } else {
- return localSavings+belowSavings;
- }
-}
-
-/* helper function for finding compaction opportunities */
-static void
-compactToUnicodeHelper(MBCSData *mbcsData) {
- int32_t state, savings;
- if(!VERBOSE) {
- return;
- }
+ MBCSPostprocess(mbcsData, staticData);
- /* for each initial state */
- for(state=0; state<(int)mbcsData->header.countStates; ++state) {
- if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- savings=findUnassigned(mbcsData, state, 0, 0);
- if(savings>0) {
- printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
- (long)state, (long)savings);
- }
- }
- }
+ return isOK;
}
static UBool
transformEUC(MBCSData *mbcsData) {
uint8_t *p8;
- uint32_t i, value, oldLength=mbcsData->maxCharLength, old3Top=mbcsData->stage3Top, new3Top;
+ uint32_t i, value, oldLength, old3Top, new3Top;
uint8_t b;
+ oldLength=mbcsData->ucm->states.maxCharLength;
if(oldLength<3) {
return FALSE;
}
+ old3Top=mbcsData->stage3Top;
+
/* careful: 2-byte and 4-byte codes are stored in platform endianness! */
/* test if all first bytes are in {0, 0x8e, 0x8f} */
p8=mbcsData->fromUBytes;
/* modify outputType and adjust stage3Top */
- mbcsData->header.flags=MBCS_OUTPUT_3_EUC+oldLength-3;
+ mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3);
mbcsData->stage3Top=new3Top=(old3Top*(oldLength-1))/oldLength;
/*
}
static void
-MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
- int32_t entry;
- int state, cell;
+MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) {
+ UCMStates *states;
+ int32_t maxCharLength;
+
+ states=&mbcsData->ucm->states;
+ maxCharLength=states->maxCharLength;
/* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
if(VERBOSE) {
printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
- (unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength,
- (unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength);
- }
-
- /* test each state table entry */
- for(state=0; state<(int)mbcsData->header.countStates; ++state) {
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- /*
- * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
- * and the code point is "unassigned" (0xfffe), then change it to
- * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
- */
- if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
- }
- }
- }
-
- /* try to compact the toUnicode tables */
- if(mbcsData->maxCharLength==2) {
- compactToUnicode2(mbcsData);
- } else if(mbcsData->maxCharLength>2) {
- compactToUnicodeHelper(mbcsData);
+ (unsigned long)mbcsData->stage3Top/maxCharLength,
+ (unsigned long)mbcsData->stage3Top/maxCharLength);
}
- /* sort toUFallbacks */
- /*
- * It should be safe to sort them before compactToUnicode2() is called,
- * because it should not change the relative order of the offset values
- * that it adjusts, but they need to be sorted at some point, and
- * it is safest here.
- */
- if(mbcsData->header.countToUFallbacks>0) {
- qsort(mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks, sizeof(_MBCSToUFallback), compareFallbacks);
- }
+ ucm_optimizeStates(states,
+ &mbcsData->unicodeCodeUnits,
+ mbcsData->toUFallbacks, mbcsData->countToUFallbacks,
+ VERBOSE);
/* try to compact the fromUnicode tables */
transformEUC(mbcsData);
- if(mbcsData->maxCharLength==1) {
+ if(maxCharLength==1) {
singleCompactStage3(mbcsData);
singleCompactStage2(mbcsData);
} else {
}
static uint32_t
-MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData) {
+MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
+ UNewDataMemory *pData, int32_t tableType) {
MBCSData *mbcsData=(MBCSData *)cnvData;
+ uint32_t top;
int32_t i, stage1Top;
+ _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
+
/* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
- if(mbcsData->maxCharLength==1) {
+ if(mbcsData->ucm->states.maxCharLength==1) {
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
} else {
mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
/* fill the header */
- mbcsData->header.offsetToUCodeUnits=
+ header.version[0]=4;
+ header.version[1]=2;
+ header.countStates=mbcsData->ucm->states.countStates;
+ header.countToUFallbacks=mbcsData->countToUFallbacks;
+
+ header.offsetToUCodeUnits=
sizeof(_MBCSHeader)+
- mbcsData->header.countStates*1024+
- mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback);
- mbcsData->header.offsetFromUTable=
- mbcsData->header.offsetToUCodeUnits+
- mbcsData->countToUCodeUnits*2;
- mbcsData->header.offsetFromUBytes=
- mbcsData->header.offsetFromUTable+
+ mbcsData->ucm->states.countStates*1024+
+ mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
+ header.offsetFromUTable=
+ header.offsetToUCodeUnits+
+ mbcsData->ucm->states.countToUCodeUnits*2;
+ header.offsetFromUBytes=
+ header.offsetFromUTable+
stage1Top*2+
mbcsData->stage2Top;
- mbcsData->header.fromUBytesLength=mbcsData->stage3Top;
+ header.fromUBytesLength=mbcsData->stage3Top;
+
+ top=header.offsetFromUBytes+header.fromUBytesLength;
+
+ header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
+
+ if(tableType&TABLE_EXT) {
+ if(top>0xffffff) {
+ fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top);
+ return 0;
+ }
+
+ header.flags|=top<<8;
+ }
/* write the MBCS data */
- udata_writeBlock(pData, &mbcsData->header, sizeof(_MBCSHeader));
- udata_writeBlock(pData, mbcsData->stateTable, mbcsData->header.countStates*1024);
- udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback));
- udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->countToUCodeUnits*2);
+ udata_writeBlock(pData, &header, sizeof(_MBCSHeader));
+ udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
+ udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
+ udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
- if(mbcsData->maxCharLength==1) {
+ if(mbcsData->ucm->states.maxCharLength==1) {
udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top);
} else {
udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top);
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
/* return the number of bytes that should have been written */
- return mbcsData->header.offsetFromUBytes+mbcsData->header.fromUBytesLength;
+ return header.offsetFromUBytes+header.fromUBytesLength;
}