/*
*******************************************************************************
*
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*/
#include <stdio.h>
-#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "cmemory.h"
#include "cstring.h"
-#include "filestrm.h"
#include "utrie.h"
#include "unicode/udata.h"
#include "unewdata.h"
+#include "writesrc.h"
#include "uprops.h"
#include "genprops.h"
precedes the actual data. It contains platform properties values and the
file format version.
-The following is a description of format version 3 .
+The following is a description of format version 5 .
+
+The format changes between version 3 and 4 because the properties related to
+case mappings and bidi/shaping are pulled out into separate files
+for modularization.
+In order to reduce the need for code changes, some of the previous data
+structures are omitted, rather than rearranging everything.
+
+For details see "Changes in format version 4" below.
+
+Format version 5 became necessary because the bit field for script codes
+overflowed. Several bit fields got rearranged, and three (Script, Block,
+Word_Break) got widened by one bit each.
Data contents:
const int32_t indexes[16] with values i0..i15:
+ i0 indicates the length of the main trie.
+ i0..i3 all have the same value in format version 4.0;
+ the related props32[] and exceptions[] and uchars[] were used in format version 3
+
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
i7..i9 reservedIndexes; -- reserved values; 0 for now
- i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+)
- i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2)
+ i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
+ i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
i12..i15 reservedIndexes; -- reserved values; 0 for now
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
+ P, E, and U are not used (empty) in format version 4
+
P const uint32_t props32[i1-i0];
E const uint32_t exceptions[i2-i1];
U const UChar uchars[2*(i3-i2)];
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie implementation.
-The trie lookup result is a 16-bit index in the props32[] table where the
-actual 32-bit properties word is stored. This is done to save space.
-
-(There are thousands of 16-bit entries in the trie data table, but
-only a few hundred unique 32-bit properties words.
-If the trie data table contained 32-bit words directly, then that would be
-larger because the length of the table would be the same as now but the
-width would be 32 bits instead of 16. This saves more than 10kB.)
+The trie lookup result is a 16-bit properties word.
With a given Unicode code point
and 0<=c<0x110000, the lookup is done like this:
- uint16_t i;
- UTRIE_GET16(c, i);
- uint32_t props=p32[i];
-
-For some characters, not all of the properties can be efficiently encoded
-using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
-array:
-
- if(props&EXCEPTION_BIT)) {
- uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
- ...
- }
-
-The exception values are a variable number of uint32_t starting at
-
- const uint32_t *pe=p32+exceptionsIndex+e;
-
-The first uint32_t there contains flags about what values actually follow it.
-Some of the exception values are UChar32 code points for the case mappings,
-others are numeric values etc.
+ uint16_t props;
+ UTRIE_GET16(trie, c, props);
-32-bit properties sets:
-
-Each 32-bit properties word contains:
+Each 16-bit properties word contains:
0.. 4 general category
- 5 has exception values
- 6..10 BiDi category
-11 is mirrored
-12..14 numericType:
- 0 no numeric value
- 1 decimal digit value
- 2 digit value
- 3 numeric value
- ### TODO: type 4 for Han digits & numbers?!
-15..19 reserved
-20..31 value according to bits 0..5:
- if(has exception) {
- exception index;
- } else switch(general category) {
- case Ll: delta to uppercase; -- same as titlecase
- case Lu: -delta to lowercase; -- titlecase is same as c
- case Lt: -delta to lowercase; -- uppercase is same as c
- default:
- if(is mirrored) {
- delta to mirror;
- } else if(numericType!=0) {
- numericValue;
- } else {
- 0;
- };
- }
-
-Exception values:
-
-In the first uint32_t exception word for a code point,
-bits
-31..16 reserved
-15..0 flags that indicate which values follow:
-
-bit
- 0 has uppercase mapping
- 1 has lowercase mapping
- 2 has titlecase mapping
- 3 unused
- 4 has numeric value (numerator)
- if numericValue=0x7fffff00+x then numericValue=10^x
- 5 has denominator value
- 6 has a mirror-image Unicode code point
- 7 has SpecialCasing.txt entries
- 8 has CaseFolding.txt entries
-
-According to the flags in this word, one or more uint32_t words follow it
-in the sequence of the bit flags in the flags word; if a flag is not set,
-then the value is missing or 0:
-
-For the case mappings and the mirror-image Unicode code point,
-one uint32_t or UChar32 each is the code point.
-If the titlecase mapping is missing, then it is the same as the uppercase mapping.
-
-For the digit values, bits 31..16 contain the decimal digit value, and
-bits 15..0 contain the digit value. A value of -1 indicates that
-this value is missing.
-
-For the numeric/numerator value, an int32_t word contains the value directly,
-except for when there is no numerator but a denominator, then the numerator
-is implicitly 1. This means:
- numerator denominator result
- none none none
- x none x
- none y 1/y
- x y x/y
-
-If the numerator value is 0x7fffff00+x then it is replaced with 10^x.
-
-For the denominator value, a uint32_t word contains the value directly.
-
-For special casing mappings, the 32-bit exception word contains:
-31 if set, this character has complex, conditional mappings
- that are not stored;
- otherwise, the mappings are stored according to the following bits
-30..24 number of UChars used for mappings
-23..16 reserved
-15.. 0 UChar offset from the beginning of the UChars array where the
- UChars for the special case mappings are stored in the following format:
-
-Format of special casing UChars:
-One UChar value with lengths as follows:
-14..10 number of UChars for titlecase mapping
- 9.. 5 number of UChars for uppercase mapping
- 4.. 0 number of UChars for lowercase mapping
-
-Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
-
-For case folding mappings, the 32-bit exception word contains:
-31..24 number of UChars used for the full mapping
-23..16 reserved
-15.. 0 UChar offset from the beginning of the UChars array where the
- UChars for the special case mappings are stored in the following format:
-
-Format of case folding UChars:
-Two UChars contain the simple mapping as follows:
- 0, 0 no simple mapping
- BMP,0 a simple mapping to a BMP code point
- s1, s2 a simple mapping to a supplementary code point stored as two surrogates
-This is followed by the UChars for the full case folding mappings.
-
-Example:
-U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
-mapping and a numeric value.
-Its exception values would be stored as 3 uint32_t words:
-
-- flags=0x0a (see above) with combining class 0
-- lowercase mapping 0x2170
-- numeric value=1
+ 5.. 7 numeric type
+ non-digit numbers are stored with multiple types and pseudo-types
+ in order to facilitate compact encoding:
+ 0 no numeric value (0)
+ 1 decimal digit value (0..9)
+ 2 digit value (0..9)
+ 3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
+ 4 (internal type UPROPS_NT_FRACTION) fraction
+ 5 (internal type UPROPS_NT_LARGE) large number >0xff
+ 6..7 reserved
+
+ when returning the numeric type from a public API,
+ internal types must be turned into U_NT_NUMERIC
+
+ 8..15 numeric value
+ encoding of fractions and large numbers see below
+
+Fractions:
+ // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
+ int32_t num, den;
+ num=n>>3; // num=0..31
+ den=(n&7)+2; // den=2..9
+ if(num==0) {
+ num=-1; // num=-1 or 1..31
+ }
+ double result=(double)num/(double)den;
+
+Large numbers:
+ // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
+ int32_t m, e;
+ m=n>>4; // m=0..15
+ e=(n&0xf);
+ if(m==0) {
+ m=1; // for large powers of 10
+ e+=18; // e=18..33
+ } else {
+ e+=2; // e=2..17
+ } // m==10..15 are reserved
+ double result=(double)m*10^e;
--- Additional properties (new in format version 2.1) ---
- i10 also contains U_LB_COUNT and U_EA_COUNT.
- i11 contains maxValues2 for vector word 2.
+--- Changes in format version 4 ---
+
+The format changes between version 3 and 4 because the properties related to
+case mappings and bidi/shaping are pulled out into separate files
+for modularization.
+In order to reduce the need for code changes, some of the previous data
+structures are omitted, rather than rearranging everything.
+
+(The change to format version 4 is for ICU 3.4. The last CVS revision of
+genprops/store.c for format version 3.2 is 1.48.)
+
+The main trie's data is significantly simplified:
+- The trie's 16-bit data word is used directly instead of as an index
+ into props32[].
+- The trie uses the default trie folding functions instead of custom ones.
+- Numeric values are stored directly in the trie data word, with special
+ encodings.
+- No more exception data (the data that needed it was pulled out, or, in the
+ case of numeric values, encoded differently).
+- No more string data (pulled out - was for case mappings).
+
+Also, some of the previously used properties vector bits are reserved again.
+
+The indexes[] values for the omitted structures are still filled in
+(indicating zero-length arrays) so that the swapper code remains unchanged.
+
+--- Changes in format version 5 ---
+
+Rearranged bit fields in the second trie (AT) because the script code field
+overflowed. Old code would have seen nonsensically low values for new, higher
+script codes.
+Modified bit fields in icu/source/common/uprops.h
+
----------------------------------------------------------------------------- */
/* UDataInfo cf. udata.h */
0,
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
- { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
- { 4, 0, 0, 0 } /* dataVersion */
-};
-
-/* definitions of expected data size limits */
-enum {
- MAX_PROPS_COUNT=25000,
- MAX_UCHAR_COUNT=10000
+ { 5, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
+ { 5, 1, 0, 0 } /* dataVersion */
};
static UNewTrie *pTrie=NULL;
-/* props32[] contains unique properties words after compacting the array of properties */
-static uint32_t props32[MAX_PROPS_COUNT];
-
-/* context pointer for compareProps() - temporarily holds a pointer to the trie data */
-static uint32_t *props;
-
-/* length of props32[] after compaction */
-static int32_t propsTop;
-
-/* exceptions values */
-static uint32_t exceptions[UPROPS_MAX_EXCEPTIONS_COUNT+20];
-static uint16_t exceptionsTop=0;
-
-/* Unicode characters, e.g. for special casing or decomposition */
-static UChar uchars[MAX_UCHAR_COUNT+20];
-static uint32_t ucharsTop=0;
-
-/* statistics */
-static uint16_t exceptionsCount=0;
-
-/* prototypes --------------------------------------------------------------- */
-
-static int
-compareProps(const void *l, const void *r);
-
-static uint32_t
-addUChars(const UChar *s, uint32_t length);
-
/* -------------------------------------------------------------------------- */
extern void
extern void
initStore() {
- pTrie=utrie_open(NULL, NULL, MAX_PROPS_COUNT, 0, TRUE);
+ pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
if(pTrie==NULL) {
fprintf(stderr, "error: unable to create a UNewTrie\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
- uprv_memset(props32, 0, sizeof(props32));
initAdditionalProperties();
}
+extern void
+exitStore() {
+ utrie_close(pTrie);
+ exitAdditionalProperties();
+}
+
+static uint32_t printNumericTypeValueError(Props *p) {
+ fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n",
+ (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent);
+ exit(U_ILLEGAL_ARGUMENT_ERROR);
+ return 0;
+}
+
/* store a character's properties ------------------------------------------- */
extern uint32_t
makeProps(Props *p) {
- uint32_t x;
- int32_t value;
- uint16_t count;
- UBool isNumber;
-
- /*
- * Simple ideas for reducing the number of bits for one character's
- * properties:
- *
- * Some fields are only used for characters of certain
- * general categories:
- * - casing fields for letters and others, not for
- * numbers & Mn
- * + uppercase not for uppercase letters
- * + lowercase not for lowercase letters
- * + titlecase not for titlecase letters
- *
- * * most of the time, uppercase=titlecase
- * - numeric fields for various digit & other types
- * - canonical combining classes for non-spacing marks (Mn)
- * * the above is not always true, for all three cases
- *
- * Using the same bits for alternate fields saves some space.
- *
- * For the canonical categories, there are only few actually used
- * most of the time.
- * They can be stored using 5 bits.
- *
- * In the BiDi categories, the 5 explicit codes are only ever
- * assigned 1:1 to 5 well-known code points. Storing only one
- * value for all "explicit codes" gets this down to 4 bits.
- * Client code then needs to check for this special value
- * and replace it by the real one using a 5-element table.
- *
- * The general categories Mn & Me, non-spacing & enclosing marks,
- * are always NSM, and NSM are always of those categories.
- *
- * Digit values can often be derived from the code point value
- * itself in a simple way.
- *
- */
-
- /* count the case mappings and other values competing for the value bit field */
- x=0;
- value=0;
- count=0;
- isNumber= (UBool)(genCategoryNames[p->generalCategory][0]=='N');
-
- if(p->upperCase!=0) {
- /* verify that no numbers and no Mn have case mappings */
- if(p->generalCategory==U_LOWERCASE_LETTER) {
- value=(int32_t)p->code-(int32_t)p->upperCase;
- } else {
- x=UPROPS_EXCEPTION_BIT;
+ uint32_t den;
+ int32_t type, value, exp;
+
+ /* encode numeric type & value */
+ type=p->numericType;
+ value=p->numericValue;
+ den=p->denominator;
+ exp=p->exponent;
+
+ if(den!=0) {
+ /* fraction */
+ if( type!=U_NT_NUMERIC ||
+ value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM ||
+ den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den ||
+ exp!=0
+ ) {
+ return printNumericTypeValueError(p);
}
- ++count;
- }
- if(p->lowerCase!=0) {
- /* verify that no numbers and no Mn have case mappings */
- if(p->generalCategory==U_UPPERCASE_LETTER || p->generalCategory==U_TITLECASE_LETTER) {
- value=(int32_t)p->lowerCase-(int32_t)p->code;
- } else {
- x=UPROPS_EXCEPTION_BIT;
+ type=UPROPS_NT_FRACTION;
+
+ if(value==-1) {
+ value=0;
}
- ++count;
- }
- if(p->upperCase!=p->titleCase) {
- x=UPROPS_EXCEPTION_BIT;
- ++count;
- }
- if(p->numericType!=0) {
- value=p->numericValue;
- ++count;
- }
- if(p->denominator!=0) {
- x=UPROPS_EXCEPTION_BIT;
- ++count;
- }
- if(p->isMirrored) {
- if(p->mirrorMapping!=0) {
- value=(int32_t)p->mirrorMapping-(int32_t)p->code;
+ den-=UPROPS_FRACTION_DEN_OFFSET;
+ value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den;
+ } else if(exp!=0) {
+ /* very large value */
+ if( type!=U_NT_NUMERIC ||
+ value<1 || 9<value ||
+ exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp
+ ) {
+ return printNumericTypeValueError(p);
}
- ++count;
- }
- if(p->specialCasing!=NULL) {
- x=UPROPS_EXCEPTION_BIT;
- ++count;
- }
- if(p->caseFolding!=NULL) {
- x=UPROPS_EXCEPTION_BIT;
- ++count;
- }
+ type=UPROPS_NT_LARGE;
- /* handle exceptions */
- if(count>1 || x!=0 || value<UPROPS_MIN_VALUE || UPROPS_MAX_VALUE<value) {
- /* this code point needs exception values */
- if(beVerbose) {
- if(x!=0) {
- /* do not print - many code points because of SpecialCasing & CaseFolding
- printf("*** code 0x%06x needs an exception because it is irregular\n", p->code);
- */
- } else if(value<UPROPS_MIN_VALUE || UPROPS_MAX_VALUE<value) {
- printf("*** U+%04x needs an exception because its value is out-of-bounds at %ld (not [%ld..%ld]\n",
- p->code, (long)value, (long)UPROPS_MIN_VALUE, (long)UPROPS_MAX_VALUE);
- } else {
- printf("*** U+%04x needs an exception because it has %u values\n", p->code, count);
+ if(exp<=UPROPS_LARGE_MAX_EXP) {
+ /* 1..9 * 10^(2..17) */
+ exp-=UPROPS_LARGE_EXP_OFFSET;
+ } else {
+ /* 1 * 10^(18..33) */
+ if(value!=1) {
+ return printNumericTypeValueError(p);
}
+ value=0;
+ exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA;
+ }
+ value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
+ } else if(value>UPROPS_MAX_SMALL_NUMBER) {
+ /* large value */
+ if(type!=U_NT_NUMERIC) {
+ return printNumericTypeValueError(p);
}
+ type=UPROPS_NT_LARGE;
- ++exceptionsCount;
- x=UPROPS_EXCEPTION_BIT;
+ /* split the value into mantissa and exponent, base 10 */
+ while((value%10)==0) {
+ value/=10;
+ ++exp;
+ }
+ if(value>9) {
+ return printNumericTypeValueError(p);
+ }
- /* allocate and create exception values */
- value=exceptionsTop;
- if(value>=UPROPS_MAX_EXCEPTIONS_COUNT) {
- fprintf(stderr, "genprops: out of exceptions memory at U+%06x. (%d exceeds allocated space)\n",
- p->code, value);
- exit(U_MEMORY_ALLOCATION_ERROR);
- } else {
- uint32_t first=0;
- uint16_t length=1;
+ exp-=UPROPS_LARGE_EXP_OFFSET;
+ value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
+ } else if(value<0) {
+ /* unable to encode negative values, other than fractions -1/x */
+ return printNumericTypeValueError(p);
- if(p->upperCase!=0) {
- first|=1;
- exceptions[value+length++]=p->upperCase;
- }
- if(p->lowerCase!=0) {
- first|=2;
- exceptions[value+length++]=p->lowerCase;
- }
- if(p->upperCase!=p->titleCase) {
- first|=4;
- if(p->titleCase!=0) {
- exceptions[value+length++]=p->titleCase;
- } else {
- exceptions[value+length++]=p->code;
- }
- }
- if(p->numericType!=0) {
- if(p->denominator==0) {
- first|=0x10;
- exceptions[value+length++]=(uint32_t)p->numericValue;
- } else {
- if(p->numericValue!=1) {
- first|=0x10;
- exceptions[value+length++]=(uint32_t)p->numericValue;
- }
- first|=0x20;
- exceptions[value+length++]=p->denominator;
- }
- }
- if(p->isMirrored) {
- first|=0x40;
- exceptions[value+length++]=p->mirrorMapping;
- }
- if(p->specialCasing!=NULL) {
- first|=0x80;
- if(p->specialCasing->isComplex) {
- /* complex special casing */
- exceptions[value+length++]=0x80000000;
- } else {
- /* unconditional special casing */
- UChar u[128];
- uint32_t i;
- uint16_t j, entry;
-
- i=1;
- entry=0;
- j=p->specialCasing->lowerCase[0];
- if(j>0) {
- uprv_memcpy(u+1, p->specialCasing->lowerCase+1, 2*j);
- i+=j;
- entry=j;
- }
- j=p->specialCasing->upperCase[0];
- if(j>0) {
- uprv_memcpy(u+i, p->specialCasing->upperCase+1, 2*j);
- i+=j;
- entry|=j<<5;
- }
- j=p->specialCasing->titleCase[0];
- if(j>0) {
- uprv_memcpy(u+i, p->specialCasing->titleCase+1, 2*j);
- i+=j;
- entry|=j<<10;
- }
- u[0]=entry;
-
- exceptions[value+length++]=(i<<24)|addUChars(u, i);
- }
- }
- if(p->caseFolding!=NULL) {
- first|=0x100;
- if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
- /* special case folding, store only a marker */
- exceptions[value+length++]=0;
- } else {
- /* normal case folding with a simple and a full mapping */
- UChar u[128];
- uint16_t i;
-
- /* store the simple mapping into the first two UChars */
- i=0;
- u[1]=0;
- UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple);
-
- /* store the full mapping after that */
- i=p->caseFolding->full[0];
- if(i>0) {
- uprv_memcpy(u+2, p->caseFolding->full+1, 2*i);
- }
-
- exceptions[value+length++]=(i<<24)|addUChars(u, 2+i);
- }
- }
- exceptions[value]=first;
- exceptionsTop+=length;
- }
+ /* } else normal value=0..0xff { */
}
- /* put together the 32-bit word of encoded properties */
- x|=
+ /* encode the properties */
+ return
(uint32_t)p->generalCategory |
- (uint32_t)p->bidi<<UPROPS_BIDI_SHIFT |
- (uint32_t)p->isMirrored<<UPROPS_MIRROR_SHIFT |
- (uint32_t)p->numericType<<UPROPS_NUMERIC_TYPE_SHIFT |
- (uint32_t)value<<UPROPS_VALUE_SHIFT;
-
- return x;
-
- /*
- * "Higher-hanging fruit" (not implemented):
- *
- * For some sets of fields, there are fewer sets of values
- * than the product of the numbers of values per field.
- * This means that storing one single value for more than
- * one field and later looking up both field values in a table
- * saves space.
- * Examples:
- * - general category & BiDi
- *
- * There are only few common displacements between a code point
- * and its case mappings. Store deltas. Store codes for few
- * occuring deltas.
- */
+ ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) |
+ ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT);
}
extern void
}
}
-extern void
-addCaseSensitive(UChar32 first, UChar32 last) {
- uint32_t x, cs;
-
- cs=U_MASK(UPROPS_CASE_SENSITIVE_SHIFT);
- while(first<=last) {
- x=utrie_get32(pTrie, first, NULL);
- if(!utrie_set32(pTrie, first, x|cs)) {
- fprintf(stderr, "error: too many entries for the properties trie\n");
- exit(U_BUFFER_OVERFLOW_ERROR);
- }
- ++first;
- }
-}
-
extern uint32_t
getProps(uint32_t c) {
return utrie_get32(pTrie, (UChar32)c, NULL);
}
}
-/* compacting --------------------------------------------------------------- */
-
-static void
-compactProps(void) {
- /*
- * At this point, all the propsTop properties are in props[], but they
- * are not all unique.
- * Now we sort them, reduce them to unique ones in props32[], and
- * build an index in stage3[] from the old to the new indexes.
- * (The quick sort averages at N*log(N) with N=propsTop. The inverting
- * yields linear performance.)
- */
-
- /*
- * We are going to sort only an index table in map[] because we need this
- * index table anyway and qsort() does not allow to sort two tables together
- * directly. This will thus also reduce the amount of data moved around.
- */
- uint32_t x;
- int32_t i, oldIndex, newIndex;
-
- static uint16_t map[MAX_PROPS_COUNT];
-
-#if DO_DEBUG_OUT
- {
- /* debug output */
- uint16_t i1, i2, i3;
- uint32_t c;
- for(c=0; c<0xffff; c+=307) {
- printf("properties(0x%06x)=0x%06x\n", c, getProps(c, &i1, &i2, &i3));
- }
- }
-#endif
-
- props=utrie_getData(pTrie, &propsTop);
-
- /* build the index table */
- for(i=propsTop; i>0;) {
- --i;
- map[i]=(uint16_t)i;
- }
-
- /* reorder */
- qsort(map, propsTop, 2, compareProps);
-
- /*
- * Now invert the reordered table and compact it in the same step.
- * The result will be props32[] having only unique properties words
- * and stage3[] having indexes to them.
- */
- newIndex=0;
- for(i=0; i<propsTop;) {
- /* set the first of a possible series of the same properties */
- oldIndex=map[i];
- props32[newIndex]=x=props[oldIndex];
- props[oldIndex]=newIndex;
-
- /* set the following same properties only in stage3 */
- while(++i<propsTop && x==props[map[i]]) {
- props[map[i]]=newIndex;
- }
-
- ++newIndex;
- }
-
- /* we saved some space */
- if(beVerbose) {
- printf("compactProps() reduced propsTop from %u to %u\n", propsTop, newIndex);
- }
- propsTop=newIndex;
-
-#if DO_DEBUG_OUT
- {
- /* debug output */
- uint16_t i1, i2, i3, i4;
- uint32_t c;
- for(c=0; c<0xffff; c+=307) {
- printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
- }
- }
-#endif
-}
-
-static int
-compareProps(const void *l, const void *r) {
- uint32_t left=props[*(const uint16_t *)l], right=props[*(const uint16_t *)r];
-
- /* compare general categories first */
- int rc=(int)(left&0x1f)-(int)(right&0x1f);
- if(rc==0 && left!=right) {
- rc= left<right ? -1 : 1;
- }
- return rc;
-}
-
/* generate output data ----------------------------------------------------- */
-/* folding value: just store the offset (16 bits) if there is any non-0 entry */
-U_CAPI uint32_t U_EXPORT2
-getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
- uint32_t value;
- UChar32 limit;
- UBool inBlockZero;
-
- limit=start+0x400;
- while(start<limit) {
- value=utrie_get32(trie, start, &inBlockZero);
- if(inBlockZero) {
- start+=UTRIE_DATA_BLOCK_LENGTH;
- } else if(value!=0) {
- return (uint32_t)(offset|0x8000);
- } else {
- ++start;
- }
- }
- return 0;
-}
-
extern void
-generateData(const char *dataDir) {
+generateData(const char *dataDir, UBool csource) {
static int32_t indexes[UPROPS_INDEX_COUNT]={
0, 0, 0, 0,
0, 0, 0, 0,
UNewDataMemory *pData;
UErrorCode errorCode=U_ZERO_ERROR;
- uint32_t size;
+ uint32_t size = 0;
int32_t trieSize, additionalPropsSize, offset;
long dataLength;
- compactProps();
-
- trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
+ trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
exit(errorCode);
offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
- /* round up trie size to 4-alignement */
+ /* round up trie size to 4-alignment */
trieSize=(trieSize+3)&~3;
offset+=trieSize>>2;
- indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */
-
- offset+=propsTop;
- indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */
-
- offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
- indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset;
-
- /* round up UChar count to 4-alignement */
- ucharsTop=(ucharsTop+1)&~1;
- offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
+ indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
+ indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
+ indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
if(beVerbose) {
- printf("trie size in bytes: %5u\n", trieSize);
- printf("number of unique properties values: %5u\n", propsTop);
- printf("number of code points with exceptions: %5u\n", exceptionsCount);
- printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
- printf("number of UChars for special mappings: %5u\n", ucharsTop);
+ printf("trie size in bytes: %5u\n", (int)trieSize);
}
- additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes);
+ if(csource) {
+ /* write .c file for hardcoded data */
+ UTrie trie={ NULL };
+ FILE *f;
+
+ utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ fprintf(
+ stderr,
+ "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
+ u_errorName(errorCode));
+ return;
+ }
- size=4*offset+additionalPropsSize; /* total size of data */
- if(beVerbose) {
- printf("data size: %6lu\n", (unsigned long)size);
- }
+ f=usrc_create(dataDir, "uchar_props_data.c");
+ if(f!=NULL) {
+ usrc_writeArray(f,
+ "static const UVersionInfo formatVersion={",
+ dataInfo.formatVersion, 8, 4,
+ "};\n\n");
+ usrc_writeArray(f,
+ "static const UVersionInfo dataVersion={",
+ dataInfo.dataVersion, 8, 4,
+ "};\n\n");
+ usrc_writeUTrieArrays(f,
+ "static const uint16_t propsTrie_index[%ld]={\n", NULL,
+ &trie,
+ "\n};\n\n");
+ usrc_writeUTrieStruct(f,
+ "static const UTrie propsTrie={\n",
+ &trie, "propsTrie_index", NULL, NULL,
+ "};\n\n");
+
+ additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
+ size=4*offset+additionalPropsSize; /* total size of data */
+
+ usrc_writeArray(f,
+ "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
+ indexes, 32, UPROPS_INDEX_COUNT,
+ "};\n\n");
+ fclose(f);
+ }
+ } else {
+ /* write the data */
+ pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
+ haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
+ exit(errorCode);
+ }
- /* write the data */
- pData=udata_create(dataDir, DATA_TYPE, U_ICUDATA_NAME "_" DATA_NAME, &dataInfo,
- haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
- if(U_FAILURE(errorCode)) {
- fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
- exit(errorCode);
- }
+ additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
+ size=4*offset+additionalPropsSize; /* total size of data */
- udata_writeBlock(pData, indexes, sizeof(indexes));
- udata_writeBlock(pData, trieBlock, trieSize);
- udata_writeBlock(pData, props32, 4*propsTop);
- udata_writeBlock(pData, exceptions, 4*exceptionsTop);
- udata_writeBlock(pData, uchars, 2*ucharsTop);
- udata_writeBlock(pData, additionalProps, additionalPropsSize);
+ udata_writeBlock(pData, indexes, sizeof(indexes));
+ udata_writeBlock(pData, trieBlock, trieSize);
+ udata_writeBlock(pData, additionalProps, additionalPropsSize);
- /* finish up */
- dataLength=udata_finish(pData, &errorCode);
- if(U_FAILURE(errorCode)) {
- fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
- exit(errorCode);
- }
+ /* finish up */
+ dataLength=udata_finish(pData, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
+ exit(errorCode);
+ }
- if(dataLength!=(long)size) {
- fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
- dataLength, (unsigned long)size);
- exit(U_INTERNAL_PROGRAM_ERROR);
+ if(dataLength!=(long)size) {
+ fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
+ dataLength, (unsigned long)size);
+ exit(U_INTERNAL_PROGRAM_ERROR);
+ }
}
- utrie_close(pTrie);
-}
-
-/* helpers ------------------------------------------------------------------ */
-
-static uint32_t
-addUChars(const UChar *s, uint32_t length) {
- uint32_t top=(uint16_t)(ucharsTop+length);
- UChar *p;
-
- if(top>=MAX_UCHAR_COUNT) {
- fprintf(stderr, "genprops: out of UChars memory\n");
- exit(U_MEMORY_ALLOCATION_ERROR);
+ if(beVerbose) {
+ printf("data size: %6lu\n", (unsigned long)size);
}
- p=uchars+ucharsTop;
- uprv_memcpy(p, s, 2*length);
- ucharsTop=top;
- return (uint32_t)(p-uchars);
}
/*