X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..9d88c94317aeac5dd26c1dbe8c2112dbe855d2b5:/icuSources/tools/gennorm/store.c diff --git a/icuSources/tools/gennorm/store.c b/icuSources/tools/gennorm/store.c index dfb8d101..5dc0162a 100644 --- a/icuSources/tools/gennorm/store.c +++ b/icuSources/tools/gennorm/store.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2003, International Business Machines +* Copyright (C) 1999-2004, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -20,12 +20,14 @@ #include #include "unicode/utypes.h" #include "unicode/uchar.h" +#include "unicode/ustring.h" #include "cmemory.h" #include "cstring.h" #include "filestrm.h" #include "unicode/udata.h" #include "utrie.h" #include "unicode/uset.h" +#include "toolutil.h" #include "unewdata.h" #include "unormimp.h" #include "gennorm.h" @@ -35,6 +37,8 @@ #define DO_DEBUG_OUT 0 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + /* * The new implementation of the normalization code loads its data from * unorm.icu, which is generated with this gennorm tool. @@ -73,7 +77,7 @@ static UDataInfo dataInfo={ 0, { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */ - { 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ + { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ }; @@ -86,96 +90,15 @@ setUnicodeVersion(const char *v) { static int32_t indexes[_NORM_INDEX_TOP]={ 0 }; -/* tool memory helper ------------------------------------------------------- */ - -/* - * UToolMemory is used for generic, custom memory management. - * It is allocated with enough space for count*size bytes starting - * at array. - * The array is declared with a union of large data types so - * that its base address is aligned for any types. - * If size is a multiple of a data type size, then such items - * can be safely allocated inside the array, at offsets that - * are themselves multiples of size. - */ -typedef struct UToolMemory { - char name[64]; - uint32_t count, size, index; - union { - uint32_t u; - double d; - void *p; - } array[1]; -} UToolMemory; - -static UToolMemory * -utm_open(const char *name, uint32_t count, uint32_t size) { - UToolMemory *mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+count*size); - if(mem==NULL) { - fprintf(stderr, "error: %s - out of memory\n", name); - exit(U_MEMORY_ALLOCATION_ERROR); - } - uprv_strcpy(mem->name, name); - mem->count=count; - mem->size=size; - mem->index=0; - return mem; -} - -static void -utm_close(UToolMemory *mem) { - if(mem!=NULL) { - uprv_free(mem); - } -} - - - -static void * -utm_getStart(UToolMemory *mem) { - return (char *)mem->array; -} - -static int32_t -utm_countItems(UToolMemory *mem) { - return mem->index; -} - -static void * -utm_alloc(UToolMemory *mem) { - char *p=(char *)mem->array+mem->index*mem->size; - if(++mem->index<=mem->count) { - uprv_memset(p, 0, mem->size); - return p; - } else { - fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", - mem->name, (long)mem->count); - exit(U_MEMORY_ALLOCATION_ERROR); - } -} - -static void * -utm_allocN(UToolMemory *mem, int32_t n) { - char *p=(char *)mem->array+mem->index*mem->size; - if((mem->index+=(uint32_t)n)<=mem->count) { - uprv_memset(p, 0, n*mem->size); - return p; - } else { - fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", - mem->name, (long)mem->count); - exit(U_MEMORY_ALLOCATION_ERROR); - } -} - /* builder data ------------------------------------------------------------- */ typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm); static UNewTrie - normTrie={ {0},0,0,0,0,0,0,0,0,{0} }, - norm32Trie={ {0},0,0,0,0,0,0,0,0,{0} }, - fcdTrie={ {0},0,0,0,0,0,0,0,0,{0} }, - auxTrie={ {0},0,0,0,0,0,0,0,0,{0} }; + *normTrie, + *norm32Trie, + *fcdTrie, + *auxTrie; static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem; @@ -187,6 +110,9 @@ static Norm *norms; */ static uint32_t haveSeenFlags[256]; +/* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */ +static USet *nfdQCNoSet; + /* see addCombiningCP() for details */ static uint32_t combiningCPs[2000]; @@ -220,7 +146,8 @@ static uint16_t combiningTable[0x8000]; static uint16_t combiningTableTop=0; #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000 -static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH]; +static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH + +10000]; /* +10000 for exclusion sets */ static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP; static int32_t canonSetsCount=0; @@ -228,30 +155,42 @@ extern void init() { uint16_t *p16; + normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); + uprv_memset(normTrie, 0, sizeof(UNewTrie)); + norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); + uprv_memset(norm32Trie, 0, sizeof(UNewTrie)); + fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); + uprv_memset(fcdTrie, 0, sizeof(UNewTrie)); + auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); + uprv_memset(auxTrie, 0, sizeof(UNewTrie)); + /* initialize the two tries */ - if(NULL==utrie_open(&normTrie, NULL, 30000, 0, FALSE)) { + if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) { fprintf(stderr, "error: failed to initialize tries\n"); exit(U_MEMORY_ALLOCATION_ERROR); } /* allocate Norm structures and reset the first one */ - normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm)); + normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm)); norms=utm_alloc(normMem); /* allocate UTF-32 string memory */ - utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 4); + utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4); /* reset all "have seen" flags */ uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags)); + /* open an empty set */ + nfdQCNoSet=uset_open(1, 0); + /* allocate extra data memory for UTF-16 decomposition strings and other values */ - extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2); + extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2); /* initialize the extraMem counter for the top of FNC strings */ p16=(uint16_t *)utm_alloc(extraMem); *p16=1; /* allocate temporary memory for combining triples */ - combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple)); + combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple)); /* set the minimum code points for no/maybe quick check values to the end of the BMP */ indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff; @@ -272,13 +211,13 @@ createNorm(uint32_t code) { Norm *p; uint32_t i; - i=utrie_get32(&normTrie, (UChar32)code, NULL); + i=utrie_get32(normTrie, (UChar32)code, NULL); if(i!=0) { p=norms+i; } else { /* allocate Norm */ p=(Norm *)utm_alloc(normMem); - if(!utrie_set32(&normTrie, (UChar32)code, (uint32_t)(p-norms))) { + if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) { fprintf(stderr, "error: too many normalization entries\n"); exit(U_BUFFER_OVERFLOW_ERROR); } @@ -291,7 +230,7 @@ static Norm * getNorm(uint32_t code) { uint32_t i; - i=utrie_get32(&normTrie, (UChar32)code, NULL); + i=utrie_get32(normTrie, (UChar32)code, NULL); if(i==0) { return NULL; } @@ -321,7 +260,7 @@ enumTrie(EnumTrieFn *fn, void *context) { count=0; for(code=0; code<=0x10ffff;) { - i=utrie_get32(&normTrie, code, &isInBlockZero); + i=utrie_get32(normTrie, code, &isInBlockZero); if(isInBlockZero) { code+=UTRIE_DATA_BLOCK_LENGTH; } else { @@ -499,7 +438,7 @@ processCombining() { triples=utm_getStart(combiningTriplesMem); /* add lead and trail indexes to the triples for sorting */ - count=(uint16_t)combiningTriplesMem->index; + count=(uint16_t)utm_countItems(combiningTriplesMem); for(i=0; ispecialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL; norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD; - if(!utrie_setRange32(&normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) { + if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) { fprintf(stderr, "error: too many normalization entries (setting Hangul)\n"); exit(U_BUFFER_OVERFLOW_ERROR); } @@ -1108,7 +1051,7 @@ postParseFn(void *context, uint32_t code, Norm *norm) { } else { uset_add(otherNorm->canonStart, code); if(!uset_contains(otherNorm->canonStart, code)) { - fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", c, code); + fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code); exit(U_INTERNAL_PROGRAM_ERROR); } } @@ -1256,12 +1199,12 @@ makeAll32() { uint32_t n; int32_t i, normLength, count; - count=(int32_t)normMem->index; + count=(int32_t)utm_countItems(normMem); for(i=0; iindex; + count=utm_countItems(normMem); for(i=0; i%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), canonStartSetsTop); + fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop); exit(errorCode); } if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) { @@ -1391,7 +1334,7 @@ combine(uint32_t lead, uint32_t trail) { /* search for all triples with c as lead code point */ triples=utm_getStart(combiningTriplesMem); - count=combiningTriplesMem->index; + count=utm_countItems(combiningTriplesMem); /* triples are not sorted by code point but for each lead CP there is one contiguous block */ for(i=0; i[%ld], U+%04lx, %u)\n", - s[0], s[1], (long)length, (long)c, cc); + fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n", + (int)s[0], (int)s[1], (int)length, (int)c, cc); exit(U_INTERNAL_PROGRAM_ERROR); } } @@ -1503,7 +1446,7 @@ canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) { /* search for all triples with c as lead code point */ triples=utm_getStart(combiningTriplesMem); - count=combiningTriplesMem->index; + count=utm_countItems(combiningTriplesMem); c=s[0]; /* triples are not sorted by code point but for each lead CP there is one contiguous block */ @@ -1600,7 +1543,7 @@ makeAux() { uint32_t *pData; int32_t i, length; - pData=utrie_getData(&auxTrie, &length); + pData=utrie_getData(auxTrie, &length); for(i=0; iindex+combiningTableTop)&1) { + if((utm_countItems(extraMem)+combiningTableTop)&1) { combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */ } @@ -1841,30 +1827,31 @@ generateData(const char *dataDir) { size= _NORM_INDEX_TOP*4+ normTrieSize+ - extraMem->index*2+ + utm_countItems(extraMem)*2+ combiningTableTop*2+ fcdTrieSize+ auxTrieSize+ canonStartSetsTop*2; if(beVerbose) { - printf("size of normalization trie %5u bytes\n", normTrieSize); - printf("size of 16-bit extra memory %5u UChars/uint16_t\n", extraMem->index); + printf("size of normalization trie %5u bytes\n", (int)normTrieSize); + printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem)); printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]); printf("size of combining table %5u uint16_t\n", combiningTableTop); - printf("size of FCD trie %5u bytes\n", fcdTrieSize); - printf("size of auxiliary trie %5u bytes\n", auxTrieSize); - printf("size of canonStartSets[] %5u uint16_t\n", canonStartSetsTop); + printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize); + printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize); + printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop); printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP); printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP); - printf(" number of sets %5d\n", canonSetsCount); + printf(" number of sets %5d\n", (int)canonSetsCount); printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]); printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]); + printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]); printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size); } indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize; - indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index; + indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem); indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop; indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop; @@ -1880,7 +1867,7 @@ generateData(const char *dataDir) { #endif /* write the data */ - pData=udata_create(dataDir, DATA_TYPE, U_ICUDATA_NAME "_" DATA_NAME, &dataInfo, + pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode); @@ -1891,7 +1878,7 @@ generateData(const char *dataDir) { udata_writeBlock(pData, indexes, sizeof(indexes)); udata_writeBlock(pData, normTrieBlock, normTrieSize); - udata_writeBlock(pData, utm_getStart(extraMem), extraMem->index*2); + udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2); udata_writeBlock(pData, combiningTable, combiningTableTop*2); udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize); udata_writeBlock(pData, auxTrieBlock, auxTrieSize); @@ -1919,7 +1906,7 @@ extern void cleanUpData(void) { int32_t i, count; - count=(int32_t)normMem->index; + count=utm_countItems(normMem); for(i=0; i