/*
******************************************************************************
*
-* Copyright (C) 2001-2004, International Business Machines
+* Copyright (C) 2001-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
#endif
#include "unicode/utypes.h"
-#include "udataswp.h"
#include "cmemory.h"
#include "utrie.h"
+/* miscellaneous ------------------------------------------------------------ */
+
#undef ABS
#define ABS(x) ((x)>=0 ? (x) : -(x))
+static U_INLINE UBool
+equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
+ while(length>0 && *s==*t) {
+ ++s;
+ ++t;
+ --length;
+ }
+ return (UBool)(length==0);
+}
+
/* Building a trie ----------------------------------------------------------*/
U_CAPI UNewTrie * U_EXPORT2
}
static int32_t
-_findSameIndexBlock(const int32_t *index, int32_t indexLength,
+_findSameIndexBlock(const int32_t *idx, int32_t indexLength,
int32_t otherBlock) {
int32_t block, i;
for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
- if(index[block+i]!=index[otherBlock+i]) {
+ if(idx[block+i]!=idx[otherBlock+i]) {
break;
}
}
static void
utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
- int32_t *index;
+ int32_t *idx;
uint32_t value;
UChar32 c;
int32_t indexLength, block;
+#ifdef UTRIE_DEBUG
+ int countLeadCUWithData=0;
+#endif
- index=trie->index;
+ idx=trie->index;
/* copy the lead surrogate indexes into a temporary array */
- uprv_memcpy(leadIndexes, index+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
+ uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
/*
* set all values for lead surrogate code *units* to leadUnitValue
/* search for any index (stage 1) entries for supplementary code points */
for(c=0x10000; c<0x110000;) {
- if(index[c>>UTRIE_SHIFT]!=0) {
+ if(idx[c>>UTRIE_SHIFT]!=0) {
/* there is data, treat the full block for a lead surrogate */
c&=~0x3ff;
#ifdef UTRIE_DEBUG
- printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10)));
+ ++countLeadCUWithData;
+ /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */
#endif
/* is there an identical index block? */
- block=_findSameIndexBlock(index, indexLength, c>>UTRIE_SHIFT);
+ block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT);
/*
* get a folded value for [c..c+0x400[ and,
/* if we did not find an identical index block... */
if(block==indexLength) {
/* move the actual index (stage 1) entries from the supplementary position to the new one */
- uprv_memmove(index+indexLength,
- index+(c>>UTRIE_SHIFT),
+ uprv_memmove(idx+indexLength,
+ idx+(c>>UTRIE_SHIFT),
4*UTRIE_SURROGATE_BLOCK_COUNT);
indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
}
c+=UTRIE_DATA_BLOCK_LENGTH;
}
}
+#ifdef UTRIE_DEBUG
+ if(countLeadCUWithData>0) {
+ printf("supplementary data for %d lead surrogates\n", countLeadCUWithData);
+ }
+#endif
/*
* index array overflow?
* make space for the lead surrogate index block and
* insert it between the BMP indexes and the folded ones
*/
- uprv_memmove(index+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
- index+UTRIE_BMP_INDEX_LENGTH,
+ uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
+ idx+UTRIE_BMP_INDEX_LENGTH,
4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
- uprv_memcpy(index+UTRIE_BMP_INDEX_LENGTH,
+ uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH,
leadIndexes,
4*UTRIE_SURROGATE_BLOCK_COUNT);
indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
static int32_t
_findSameDataBlock(const uint32_t *data, int32_t dataLength,
int32_t otherBlock, int32_t step) {
- int32_t block, i;
+ int32_t block;
/* ensure that we do not even partially get past dataLength */
dataLength-=UTRIE_DATA_BLOCK_LENGTH;
for(block=0; block<=dataLength; block+=step) {
- for(i=0; i<UTRIE_DATA_BLOCK_LENGTH; ++i) {
- if(data[block+i]!=data[otherBlock+i]) {
- break;
- }
- }
- if(i==UTRIE_DATA_BLOCK_LENGTH) {
+ if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
return block;
}
}
* - removes blocks that are identical with earlier ones
* - overlaps adjacent blocks as much as possible (if overlap==TRUE)
* - moves blocks in steps of the data granularity
+ * - moves and overlaps blocks that overlap with multiple values in the overlap region
*
* It does not
* - try to move and overlap blocks that are not already adjacent
- * - try to move and overlap blocks that overlap with multiple values in the overlap region
*/
static void
utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
- uint32_t x;
- int32_t i, start, prevEnd, newStart, overlapStart;
+ int32_t i, start, newStart, overlapStart;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
newStart=UTRIE_DATA_BLOCK_LENGTH;
- prevEnd=newStart-1;
for(start=newStart; start<trie->dataLength;) {
/*
* start: index of first entry of current block
- * prevEnd: index to last entry of previous block
* newStart: index where the current block is to be moved
+ * (right after current end of already-compacted data)
*/
/* skip blocks that are not used */
/* advance start to the next block */
start+=UTRIE_DATA_BLOCK_LENGTH;
- /* leave prevEnd and newStart with the previous block! */
+ /* leave newStart with the previous block! */
continue;
}
/* advance start to the next block */
start+=UTRIE_DATA_BLOCK_LENGTH;
- /* leave prevEnd and newStart with the previous block! */
+ /* leave newStart with the previous block! */
continue;
}
/* see if the beginning of this block can be overlapped with the end of the previous block */
- /* x: first value in the current block */
- x=trie->data[start];
- if(x==trie->data[prevEnd] && overlap && start>=overlapStart) {
- /* overlap by at least one */
- for(i=1; i<UTRIE_DATA_BLOCK_LENGTH && x==trie->data[start+i] && x==trie->data[prevEnd-i]; ++i) {}
-
- /* overlap by i, rounded down for the data block granularity */
- i&=~(UTRIE_DATA_GRANULARITY-1);
+ if(overlap && start>=overlapStart) {
+ /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
+ for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
+ i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
+ i-=UTRIE_DATA_GRANULARITY) {}
} else {
i=0;
}
newStart+=UTRIE_DATA_BLOCK_LENGTH;
start=newStart;
}
-
- prevEnd=newStart-1;
}
/* now adjust the index (stage 1) table */
/* serialization ------------------------------------------------------------ */
-/**
- * Trie data structure in serialized form:
- *
- * UTrieHeader header;
- * uint16_t index[header.indexLength];
- * uint16_t data[header.dataLength];
- */
-struct UTrieHeader {
- /** "Trie" in big-endian US-ASCII (0x54726965) */
- uint32_t signature;
-
- /**
- * options bit field:
- * 9 1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
- * 8 0=16-bit data, 1=32-bit data
- * 7..4 UTRIE_INDEX_SHIFT // 0..UTRIE_SHIFT
- * 3..0 UTRIE_SHIFT // 1..9
- */
- uint32_t options;
-
- /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
- int32_t indexLength;
-
- /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
- int32_t dataLength;
-};
-
-typedef struct UTrieHeader UTrieHeader;
-
-/**
- * Constants for use with UTrieHeader.options.
- */
-enum {
- /** Mask to get the UTRIE_SHIFT value from options. */
- UTRIE_OPTIONS_SHIFT_MASK=0xf,
-
- /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
- UTRIE_OPTIONS_INDEX_SHIFT=4,
-
- /** If set, then the data (stage 2) array is 32 bits wide. */
- UTRIE_OPTIONS_DATA_IS_32_BIT=0x100,
-
- /**
- * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array
- * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
- */
- UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200
-};
-
/*
* Default function for the folding value:
* Just store the offset (16 bits) if there is any non-initial-value entry.
return length; /* preflighting */
}
+#ifdef UTRIE_DEBUG
+ printf("**UTrieLengths(serialize)** index:%6ld data:%6ld serialized:%6ld\n",
+ (long)trie->indexLength, (long)trie->dataLength, (long)length);
+#endif
+
/* set the header fields */
header=(UTrieHeader *)data;
data+=sizeof(UTrieHeader);
}
/* inverse to defaultGetFoldedValue() */
-static int32_t U_CALLCONV
-defaultGetFoldingOffset(uint32_t data) {
+U_CAPI int32_t U_EXPORT2
+utrie_defaultGetFoldingOffset(uint32_t data) {
return (int32_t)data;
}
U_CAPI int32_t U_EXPORT2
utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
- UTrieHeader *header;
- uint16_t *p16;
+ const UTrieHeader *header;
+ const uint16_t *p16;
uint32_t options;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
}
/* check the signature */
- header=(UTrieHeader *)data;
+ header=(const UTrieHeader *)data;
if(header->signature!=0x54726965) {
*pErrorCode=U_INVALID_FORMAT_ERROR;
return -1;
*pErrorCode=U_INVALID_FORMAT_ERROR;
return -1;
}
- p16=(uint16_t *)(header+1);
+ p16=(const uint16_t *)(header+1);
trie->index=p16;
p16+=trie->indexLength;
length-=2*trie->indexLength;
length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
}
- trie->getFoldingOffset=defaultGetFoldingOffset;
+ trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
return length;
}
-/* swapping ----------------------------------------------------------------- */
-
U_CAPI int32_t U_EXPORT2
-utrie_swap(const UDataSwapper *ds,
- const void *inData, int32_t length, void *outData,
- UErrorCode *pErrorCode) {
- const UTrieHeader *inTrie;
- UTrieHeader trie;
- int32_t size;
- UBool dataIs32;
+utrie_unserializeDummy(UTrie *trie,
+ void *data, int32_t length,
+ uint32_t initialValue, uint32_t leadUnitValue,
+ UBool make16BitTrie,
+ UErrorCode *pErrorCode) {
+ uint16_t *p16;
+ int32_t actualLength, latin1Length, i, limit;
+ uint16_t block;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return 0;
+ return -1;
}
- if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
+
+ /* calculate the actual size of the dummy trie data */
+
+ /* max(Latin-1, block 0) */
+ latin1Length= UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;
+
+ trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
+ trie->dataLength=latin1Length;
+ if(leadUnitValue!=initialValue) {
+ trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
}
- /* setup and swapping */
- if(length>=0 && length<sizeof(UTrieHeader)) {
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
+ actualLength=trie->indexLength*2;
+ if(make16BitTrie) {
+ actualLength+=trie->dataLength*2;
+ } else {
+ actualLength+=trie->dataLength*4;
}
- inTrie=(const UTrieHeader *)inData;
- trie.signature=ds->readUInt32(inTrie->signature);
- trie.options=ds->readUInt32(inTrie->options);
- trie.indexLength=udata_readInt32(ds, inTrie->indexLength);
- trie.dataLength=udata_readInt32(ds, inTrie->dataLength);
-
- if( trie.signature!=0x54726965 ||
- (trie.options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
- ((trie.options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT ||
- trie.indexLength<UTRIE_BMP_INDEX_LENGTH ||
- (trie.indexLength&(UTRIE_SURROGATE_BLOCK_COUNT-1))!=0 ||
- trie.dataLength<UTRIE_DATA_BLOCK_LENGTH ||
- (trie.dataLength&(UTRIE_DATA_GRANULARITY-1))!=0 ||
- ((trie.options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0 && trie.dataLength<(UTRIE_DATA_BLOCK_LENGTH+0x100))
- ) {
- *pErrorCode=U_INVALID_FORMAT_ERROR; /* not a UTrie */
- return 0;
+ /* enough space for the dummy trie? */
+ if(length<actualLength) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ return actualLength;
}
- dataIs32=(UBool)((trie.options&UTRIE_OPTIONS_DATA_IS_32_BIT)!=0);
- size=sizeof(UTrieHeader)+trie.indexLength*2+trie.dataLength*(dataIs32?4:2);
+ trie->isLatin1Linear=TRUE;
+ trie->initialValue=initialValue;
- if(length>=0) {
- UTrieHeader *outTrie;
+ /* fill the index and data arrays */
+ p16=(uint16_t *)data;
+ trie->index=p16;
- if(length<size) {
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
+ if(make16BitTrie) {
+ /* indexes to block 0 */
+ block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
+ limit=trie->indexLength;
+ for(i=0; i<limit; ++i) {
+ p16[i]=block;
}
- outTrie=(UTrieHeader *)outData;
+ if(leadUnitValue!=initialValue) {
+ /* indexes for lead surrogate code units to the block after Latin-1 */
+ block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
+ i=0xd800>>UTRIE_SHIFT;
+ limit=0xdc00>>UTRIE_SHIFT;
+ for(; i<limit; ++i) {
+ p16[i]=block;
+ }
+ }
- /* swap the header */
- ds->swapArray32(ds, inTrie, sizeof(UTrieHeader), outTrie, pErrorCode);
+ trie->data32=NULL;
- /* swap the index and the data */
- if(dataIs32) {
- ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode);
- ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, trie.dataLength*4,
- (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode);
- } else {
- ds->swapArray16(ds, inTrie+1, (trie.indexLength+trie.dataLength)*2, outTrie+1, pErrorCode);
+ /* Latin-1 data */
+ p16+=trie->indexLength;
+ for(i=0; i<latin1Length; ++i) {
+ p16[i]=(uint16_t)initialValue;
+ }
+
+ /* data for lead surrogate code units */
+ if(leadUnitValue!=initialValue) {
+ limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
+ for(/* i=latin1Length */; i<limit; ++i) {
+ p16[i]=(uint16_t)leadUnitValue;
+ }
+ }
+ } else {
+ uint32_t *p32;
+
+ /* indexes to block 0 */
+ uprv_memset(p16, 0, trie->indexLength*2);
+
+ if(leadUnitValue!=initialValue) {
+ /* indexes for lead surrogate code units to the block after Latin-1 */
+ block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
+ i=0xd800>>UTRIE_SHIFT;
+ limit=0xdc00>>UTRIE_SHIFT;
+ for(; i<limit; ++i) {
+ p16[i]=block;
+ }
+ }
+
+ trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
+
+ /* Latin-1 data */
+ for(i=0; i<latin1Length; ++i) {
+ p32[i]=initialValue;
+ }
+
+ /* data for lead surrogate code units */
+ if(leadUnitValue!=initialValue) {
+ limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
+ for(/* i=latin1Length */; i<limit; ++i) {
+ p32[i]=leadUnitValue;
+ }
}
}
- return size;
+ trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
+
+ return actualLength;
}
/* enumeration -------------------------------------------------------------- */
utrie_enum(const UTrie *trie,
UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
const uint32_t *data32;
- const uint16_t *index;
+ const uint16_t *idx;
uint32_t value, prevValue, initialValue;
UChar32 c, prev;
- int32_t l, i, j, block, prevBlock, offset;
+ int32_t l, i, j, block, prevBlock, nullBlock, offset;
/* check arguments */
if(trie==NULL || trie->index==NULL || enumRange==NULL) {
enumValue=enumSameValue;
}
- index=trie->index;
+ idx=trie->index;
data32=trie->data32;
/* get the enumeration value that corresponds to an initial-value trie data entry */
initialValue=enumValue(context, trie->initialValue);
+ if(data32==NULL) {
+ nullBlock=trie->indexLength;
+ } else {
+ nullBlock=0;
+ }
+
/* set variables for previous range */
- prevBlock=0;
+ prevBlock=nullBlock;
prev=0;
prevValue=initialValue;
i=c>>UTRIE_SHIFT;
}
- block=index[i]<<UTRIE_INDEX_SHIFT;
+ block=idx[i]<<UTRIE_INDEX_SHIFT;
if(block==prevBlock) {
/* the block is the same as the previous one, and filled with value */
c+=UTRIE_DATA_BLOCK_LENGTH;
- } else if(block==0) {
+ } else if(block==nullBlock) {
/* this is the all-initial-value block */
if(prevValue!=initialValue) {
if(prev<c) {
return;
}
}
- prevBlock=0;
+ prevBlock=nullBlock;
prev=c;
prevValue=initialValue;
}
} else {
prevBlock=block;
for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
- value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
+ value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
if(value!=prevValue) {
if(prev<c) {
if(!enumRange(context, prev, c, prevValue)) {
}
}
if(j>0) {
+ /* the block is not filled with all the same value */
prevBlock=-1;
}
prev=c;
/* enumerate supplementary code points */
for(l=0xd800; l<0xdc00;) {
/* lead surrogate access */
- offset=index[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
- if(offset==(data32!=NULL ? 0 : trie->indexLength)) {
+ offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
+ if(offset==nullBlock) {
/* no entries for a whole block of lead surrogates */
if(prevValue!=initialValue) {
if(prev<c) {
return;
}
}
- prevBlock=0;
+ prevBlock=nullBlock;
prev=c;
prevValue=initialValue;
}
continue;
}
- value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : index[offset+(l&UTRIE_MASK)];
+ value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)];
/* enumerate trail surrogates for this lead surrogate */
offset=trie->getFoldingOffset(value);
return;
}
}
- prevBlock=0;
+ prevBlock=nullBlock;
prev=c;
prevValue=initialValue;
}
offset+=UTRIE_SURROGATE_BLOCK_COUNT;
do {
/* copy of most of the body of the BMP loop */
- block=index[i]<<UTRIE_INDEX_SHIFT;
+ block=idx[i]<<UTRIE_INDEX_SHIFT;
if(block==prevBlock) {
/* the block is the same as the previous one, and filled with value */
c+=UTRIE_DATA_BLOCK_LENGTH;
- } else if(block==0) {
+ } else if(block==nullBlock) {
/* this is the all-initial-value block */
if(prevValue!=initialValue) {
if(prev<c) {
return;
}
}
- prevBlock=0;
+ prevBlock=nullBlock;
prev=c;
prevValue=initialValue;
}
} else {
prevBlock=block;
for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
- value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
+ value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
if(value!=prevValue) {
if(prev<c) {
if(!enumRange(context, prev, c, prevValue)) {
}
}
if(j>0) {
+ /* the block is not filled with all the same value */
prevBlock=-1;
}
prev=c;