/*
*******************************************************************************
*
-* Copyright (C) 2002-2004, International Business Machines
+* Copyright (C) 2002-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
#include "uprops.h"
#include "propsvec.h"
#include "uparse.h"
+#include "writesrc.h"
#include "genprops.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
-static void U_CALLCONV
-bidiClassLineFn(void *context,
- char *fields[][2], int32_t fieldCount,
- UErrorCode *pErrorCode);
-
/* parse files with single enumerated properties ---------------------------- */
struct SingleEnum {
0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
};
+static const SingleEnum graphemeClusterBreakSingleEnum={
+ "GraphemeBreakProperty", "Grapheme_Cluster_Break",
+ UCHAR_GRAPHEME_CLUSTER_BREAK,
+ 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
+};
+
+static const SingleEnum wordBreakSingleEnum={
+ "WordBreakProperty", "Word_Break",
+ UCHAR_WORD_BREAK,
+ 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
+};
+
+static const SingleEnum sentenceBreakSingleEnum={
+ "SentenceBreakProperty", "Sentence_Break",
+ UCHAR_SENTENCE_BREAK,
+ 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
+};
+
static const SingleEnum lineBreakSingleEnum={
"LineBreak", "line break",
UCHAR_LINE_BREAK,
0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
};
-static const SingleEnum jtSingleEnum={
- "DerivedJoiningType", "joining type",
- UCHAR_JOINING_TYPE,
- 2, UPROPS_JT_SHIFT, UPROPS_JT_MASK
-};
-
-static const SingleEnum jgSingleEnum={
- "DerivedJoiningGroup", "joining group",
- UCHAR_JOINING_GROUP,
- 2, UPROPS_JG_SHIFT, UPROPS_JG_MASK
-};
-
static void U_CALLCONV
singleEnumLineFn(void *context,
char *fields[][2], int32_t fieldCount,
static const Binary
propListNames[]={
{ "White_Space", 1, UPROPS_WHITE_SPACE },
- { "Bidi_Control", 1, UPROPS_BIDI_CONTROL },
- { "Join_Control", 1, UPROPS_JOIN_CONTROL },
{ "Dash", 1, UPROPS_DASH },
{ "Hyphen", 1, UPROPS_HYPHEN },
{ "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
{ "Radical", 1, UPROPS_RADICAL },
{ "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
{ "Deprecated", 1, UPROPS_DEPRECATED },
- { "Soft_Dotted", 1, UPROPS_SOFT_DOTTED },
{ "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
/* new properties in Unicode 4.0.1 */
{ "STerm", 2, UPROPS_V2_S_TERM },
- { "Variation_Selector", 2, UPROPS_V2_VARIATION_SELECTOR }
+ { "Variation_Selector", 2, UPROPS_V2_VARIATION_SELECTOR },
+
+ /* new properties in Unicode 4.1 */
+ { "Pattern_Syntax", 2, UPROPS_V2_PATTERN_SYNTAX },
+ { "Pattern_White_Space", 2, UPROPS_V2_PATTERN_WHITE_SPACE }
};
static const Binaries
/* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
{ "Math", 1, UPROPS_MATH },
{ "Alphabetic", 1, UPROPS_ALPHABETIC },
- { "Lowercase", 1, UPROPS_LOWERCASE },
- { "Uppercase", 1, UPROPS_UPPERCASE },
{ "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
{ "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
/* new properties bits in ICU 2.6/format version 3.2 */
{ "ID_Start", 1, UPROPS_ID_START },
{ "ID_Continue", 1, UPROPS_ID_CONTINUE },
- { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE }
+ { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE },
+
+ /*
+ * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
+ * to DerivedCoreProperties.txt and deprecates it.
+ */
+ { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK }
};
static const Binaries
for(i=0;; ++i) {
if(i==bin->binariesCount) {
/* ignore unrecognized properties */
- addIgnoredProp(s, fields[1][1]);
+ if(beVerbose) {
+ addIgnoredProp(s, fields[1][1]);
+ }
return;
}
if(isToken(bin->binaries[i].propName, s)) {
fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
}
- for(i=0; i<ignoredPropsCount; ++i) {
- printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
+ if(beVerbose) {
+ for(i=0; i<ignoredPropsCount; ++i) {
+ printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
+ }
}
}
pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
}
+U_CFUNC void
+exitAdditionalProperties() {
+ utrie_close(trie);
+ upvec_close(pv);
+}
+
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
char *basename;
/* add Han numeric types & values */
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
- /* set proper bidi class for unassigned code points (Cn) */
- parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode);
-
parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
/*
parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
+ parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
+
+ parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
+
+ parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);
+
/*
* LineBreak-4.0.0.txt:
* - All code points, assigned and unassigned, that are not listed
*/
parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
- parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, pErrorCode);
-
- parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, pErrorCode);
-
/*
* Preset East Asian Width defaults:
*
return;
}
- pvCount=upvec_toTrie(pv, trie, pErrorCode);
+ pvCount=upvec_compact(pv, upvec_compactToTrieHandler, trie, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
exit(*pErrorCode);
}
++limit;
- /* parse version number */
+ /* ignore "unassigned" (the default is already set to 0.0) */
s=(char *)u_skipWhitespace(fields[1][0]);
+ if(0==uprv_strncmp(s, "unassigned", 10)) {
+ return;
+ }
+
+ /* parse version number */
value=(uint32_t)uprv_strtoul(s, &end, 10);
if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
numericLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
- Props newProps;
+ Props newProps={ 0 };
char *s, *end;
uint32_t start, limit, value, oldProps32;
int32_t oldType;
/* try large powers of 10 first, may otherwise overflow strtoul() */
if(0==uprv_strncmp(s, "10000000000", 11)) {
/* large powers of 10 are encoded in a special way, see store.c */
- value=0x7fffff00;
+ uint8_t exp=0;
+
end=s;
while(*(++end)=='0') {
- ++value;
+ ++exp;
}
+ value=1;
+ newProps.exponent=exp;
} else {
/* normal number parsing */
value=(uint32_t)uprv_strtoul(s, &end, 10);
* specific properties for single characters.
*/
+ /* set the new numeric type and value */
+ newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
+ newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
+ /* the exponent may have been set above */
+ value=makeProps(&newProps);
+
for(; start<limit; ++start) {
oldProps32=getProps(start);
oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
- if(oldType!=0) {
- /* this code point was already listed with its numeric value in UnicodeData.txt */
- continue;
+
+ if(isFraction) {
+ if(oldType!=0) {
+ /* this code point was already listed with its numeric value in UnicodeData.txt */
+ continue;
+ } else {
+ fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
+ exit(U_PARSE_ERROR);
+ }
}
/*
- * Do not set a numeric value for code points that have other
- * values or exceptions because the code below is not prepared
- * to maintain such values and exceptions.
- *
- * Check store.c (e.g., file format description and makeProps())
- * for details of what code points get their value field interpreted.
- * For example, case mappings for Ll/Lt/Lu and mirror mappings for mirrored characters.
- *
* For simplicity, and because we only expect to set numeric values for Han characters,
* for now we only allow to set these values for Lo characters.
*/
- if(GET_UNSIGNED_VALUE(oldProps32)!=0 || PROPS_VALUE_IS_EXCEPTION(oldProps32) || GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
- fprintf(stderr, "genprops error: new numeric value for a character with some other value in DerivedNumericValues.txt at %s\n", fields[0][0]);
+ if(oldType==0 && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
+ fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
exit(U_PARSE_ERROR);
}
- if(isFraction) {
- fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
- exit(U_PARSE_ERROR);
- }
-
- if(beVerbose) {
- printf("adding U+%04x numeric type %d value %u\n", (int)start, U_NT_NUMERIC, (int)value);
- }
-
- /* reconstruct the properties and set the new numeric type and value */
- uprv_memset(&newProps, 0, sizeof(newProps));
- newProps.code=start;
- newProps.generalCategory=(uint8_t)GET_CATEGORY(oldProps32);
- newProps.bidi=(uint8_t)GET_BIDI_CLASS(oldProps32);
- newProps.isMirrored=(uint8_t)(oldProps32&(1UL<<UPROPS_MIRROR_SHIFT) ? TRUE : FALSE);
- newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
- newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
- addProps(start, makeProps(&newProps));
- }
-}
-
-/* DerivedBidiClass.txt ----------------------------------------------------- */
-
-static void U_CALLCONV
-bidiClassLineFn(void *context,
- char *fields[][2], int32_t fieldCount,
- UErrorCode *pErrorCode) {
- char *s;
- uint32_t oldStart, start, limit, value, props32;
- UBool didSet;
-
- /* get the code point range */
- u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- fprintf(stderr, "genprops: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields[0][0]);
- exit(*pErrorCode);
- }
- ++limit;
-
- /* parse bidi class */
- s=trimTerminateField(fields[1][0], fields[1][1]);
- value=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, s);
- if((int32_t)value<0) {
- fprintf(stderr, "genprops error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
- exit(U_PARSE_ERROR);
- }
-
- didSet=FALSE;
- oldStart=start;
- for(; start<limit; ++start) {
- props32=getProps(start);
-
- /* ignore if this bidi class is already set */
- if(value==GET_BIDI_CLASS(props32)) {
- continue;
- }
+ /* verify that we do not change an existing value (fractions were excluded above) */
+ if(oldType!=0) {
+ /* the code point already has a value stored */
+ if((oldProps32&0xff00)!=(value&0xff00)) {
+ fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
+ exit(U_PARSE_ERROR);
+ }
+ /* same value, continue */
+ } else {
+ /* the code point is getting a new numeric value */
+ if(beVerbose) {
+ printf("adding U+%04x numeric type %d value 0x%04x from %s\n", (int)start, U_NT_NUMERIC, (int)value, fields[0][0]);
+ }
- /* ignore old bidi class, set only for unassigned code points (Cn) */
- if(GET_CATEGORY(props32)!=0) {
- /* error if this one contradicts what we parsed from UnicodeData.txt */
- fprintf(stderr, "genprops error: different bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
- exit(U_PARSE_ERROR);
+ addProps(start, value|GET_CATEGORY(oldProps32));
}
-
- /* remove whatever bidi class was set before */
- props32&=~(0x1f<<UPROPS_BIDI_SHIFT);
-
- /* set bidi class for Cn according to DerivedBidiClass.txt */
- props32|=value<<UPROPS_BIDI_SHIFT;
-
- /* set the modified properties */
- addProps(start, props32);
- didSet=TRUE;
- }
-
- if(didSet && beVerbose) {
- printf("setting U+%04x..U+%04x bidi class %d\n", (int)oldStart, (int)limit-1, (int)value);
}
}
/* data serialization ------------------------------------------------------- */
U_CFUNC int32_t
-writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
+writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
int32_t length;
UErrorCode errorCode;
errorCode=U_ZERO_ERROR;
- length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
+ length=utrie_serialize(trie, p, capacity, NULL, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
exit(errorCode);
}
if(p!=NULL) {
- p+=length;
- capacity-=length;
if(beVerbose) {
printf("size in bytes of additional props trie:%5u\n", (int)length);
}
+ if(f!=NULL) {
+ UTrie trie2={ NULL };
+ utrie_unserialize(&trie2, p, length, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ fprintf(
+ stderr,
+ "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n",
+ u_errorName(errorCode));
+ exit(errorCode);
+ }
+ usrc_writeUTrieArrays(f,
+ "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
+ &trie2,
+ "\n};\n\n");
+ usrc_writeUTrieStruct(f,
+ "static const UTrie propsVectorsTrie={\n",
+ &trie2, "propsVectorsTrie_index", NULL, NULL,
+ "};\n\n");
+ }
+
+ p+=length;
+ capacity-=length;
/* set indexes */
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
((int32_t)USCRIPT_CODE_LIMIT-1);
indexes[UPROPS_MAX_VALUES_2_INDEX]=
- (((int32_t)U_JT_COUNT-1)<<UPROPS_JT_SHIFT)|
- (((int32_t)U_JG_COUNT-1)<<UPROPS_JG_SHIFT)|
+ (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
+ (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
+ (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
((int32_t)U_DT_COUNT-1);
}
if(p!=NULL && (pvCount*4)<=capacity) {
- uprv_memcpy(p, pv, pvCount*4);
+ if(f!=NULL) {
+ usrc_writeArray(f,
+ "static const uint32_t propsVectors[%ld]={\n",
+ pv, 32, pvCount,
+ "};\n\n");
+ fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
+ fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
+ } else {
+ uprv_memcpy(p, pv, pvCount*4);
+ }
if(beVerbose) {
printf("number of additional props vectors: %5u\n", (int)pvCount/UPROPS_VECTOR_WORDS);
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
}
length+=pvCount*4;
- if(p!=NULL) {
- utrie_close(trie);
- upvec_close(pv);
- }
return length;
}