X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..d5d484b0fbe924d3663b177965538d517ee412c1:/icuSources/tools/genprops/genprops.c diff --git a/icuSources/tools/genprops/genprops.c b/icuSources/tools/genprops/genprops.c index 95091d52..bb388126 100644 --- a/icuSources/tools/genprops/genprops.c +++ b/icuSources/tools/genprops/genprops.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2003, International Business Machines +* Copyright (C) 1999-2005, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -24,8 +24,8 @@ #include #include "unicode/utypes.h" #include "unicode/uchar.h" -#include "unicode/uset.h" #include "unicode/putil.h" +#include "unicode/uclean.h" #include "cmemory.h" #include "cstring.h" #include "unewdata.h" @@ -42,30 +42,27 @@ U_CDECL_END UBool beVerbose=FALSE, haveCopyright=TRUE; -/* - * Unicode set collecting the case-sensitive characters; - * see uchar.h UCHAR_CASE_SENSITIVE. - * Add code points from case mappings/foldings in - * the root locale and with default options. - */ -static USet *caseSensitive; - /* prototypes --------------------------------------------------------------- */ -static void -parseBidiMirroring(const char *filename, UErrorCode *pErrorCode); - -static void -parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); - -static void -parseCaseFolding(const char *filename, UErrorCode *pErrorCode); - static void parseDB(const char *filename, UErrorCode *pErrorCode); /* -------------------------------------------------------------------------- */ +enum +{ + HELP_H, + HELP_QUESTION_MARK, + VERBOSE, + COPYRIGHT, + DESTDIR, + SOURCEDIR, + UNICODE_VERSION, + ICUDATADIR, + CSOURCE +}; + +/* Keep these values in sync with the above enums */ static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, @@ -73,7 +70,9 @@ static UOption options[]={ UOPTION_COPYRIGHT, UOPTION_DESTDIR, UOPTION_SOURCEDIR, - { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } + UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), + UOPTION_ICUDATADIR, + UOPTION_DEF("csource", 'C', UOPT_NO_ARG) }; extern int @@ -86,9 +85,10 @@ main(int argc, char* argv[]) { U_MAIN_INIT_ARGS(argc, argv); /* preset then read command line options */ - options[4].value=u_getDataDirectory(); - options[5].value=""; - options[6].value=""; + options[DESTDIR].value=u_getDataDirectory(); + options[SOURCEDIR].value=""; + options[UNICODE_VERSION].value=""; + options[ICUDATADIR].value=u_getDataDirectory(); argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); /* error handling, printing usage message */ @@ -97,7 +97,7 @@ main(int argc, char* argv[]) { "error in command line argument \"%s\"\n", argv[-argc]); } - if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { /* * Broken into chucks because the C89 standard says the minimum * required supported string length is 509 bytes. @@ -114,21 +114,25 @@ main(int argc, char* argv[]) { "\t-h or -? or --help this usage text\n" "\t-v or --verbose verbose output\n" "\t-c or --copyright include a copyright notice\n" - "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); + "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" + "\t-C or --csource generate a .c source file rather than the .icu binary\n"); fprintf(stderr, "\t-d or --destdir destination directory, followed by the path\n" "\t-s or --sourcedir source directory, followed by the path\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to %s\n" "\tsuffix suffix that is to be appended with a '-'\n" "\t to the source file basenames before opening;\n" - "\t 'genprops new' will read UnicodeData-new.txt etc.\n"); + "\t 'genprops new' will read UnicodeData-new.txt etc.\n", + u_getDataDirectory()); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } /* get the options values */ - beVerbose=options[2].doesOccur; - haveCopyright=options[3].doesOccur; - srcDir=options[5].value; - destDir=options[4].value; + beVerbose=options[VERBOSE].doesOccur; + haveCopyright=options[COPYRIGHT].doesOccur; + srcDir=options[SOURCEDIR].value; + destDir=options[DESTDIR].value; if(argc>=2) { suffix=argv[1]; @@ -136,11 +140,15 @@ main(int argc, char* argv[]) { suffix=NULL; } - if(options[6].doesOccur) { - setUnicodeVersion(options[6].value); + if(options[UNICODE_VERSION].doesOccur) { + setUnicodeVersion(options[UNICODE_VERSION].value); } /* else use the default dataVersion in store.c */ + if (options[ICUDATADIR].doesOccur) { + u_setDataDirectory(options[ICUDATADIR].value); + } + /* prepare the filename beginning with the source dir */ uprv_strcpy(filename, srcDir); basename=filename+uprv_strlen(filename); @@ -150,19 +158,6 @@ main(int argc, char* argv[]) { /* initialize */ initStore(); - caseSensitive=uset_open(1, 0); /* empty set (start>end) */ - - /* process BidiMirroring.txt */ - writeUCDFilename(basename, "BidiMirroring", suffix); - parseBidiMirroring(filename, &errorCode); - - /* process SpecialCasing.txt */ - writeUCDFilename(basename, "SpecialCasing", suffix); - parseSpecialCasing(filename, &errorCode); - - /* process CaseFolding.txt */ - writeUCDFilename(basename, "CaseFolding", suffix); - parseCaseFolding(filename, &errorCode); /* process UnicodeData.txt */ writeUCDFilename(basename, "UnicodeData", suffix); @@ -175,20 +170,22 @@ main(int argc, char* argv[]) { /* process parsed data */ if(U_SUCCESS(errorCode)) { /* write the properties data file */ - generateData(destDir); + generateData(destDir, options[CSOURCE].doesOccur); } + exitStore(); + u_cleanup(); return errorCode; } U_CFUNC void writeUCDFilename(char *basename, const char *filename, const char *suffix) { - int32_t length=uprv_strlen(filename); + int32_t length=(int32_t)uprv_strlen(filename); uprv_strcpy(basename, filename); if(suffix!=NULL) { basename[length++]='-'; uprv_strcpy(basename+length, suffix); - length+=uprv_strlen(suffix); + length+=(int32_t)uprv_strlen(suffix); } uprv_strcpy(basename+length, ".txt"); } @@ -245,301 +242,6 @@ getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { return -1; } -static void -_set_addAll(USet *set, const UChar *s, int32_t length) { - UChar32 c; - int32_t i; - - /* needs length>=0 */ - for(i=0; i0) { - fprintf(stderr, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)mirrorMappings[mirrorCount][0], - (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - prevCode=mirrorMappings[mirrorCount][0]; - - if(++mirrorCount==MAX_MIRROR_COUNT) { - fprintf(stderr, "genprops: too many mirror mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static void -parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) { - char *fields[2][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode); -} - -/* parser for SpecialCasing.txt --------------------------------------------- */ - -#define MAX_SPECIAL_CASING_COUNT 500 - -static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT]; -static int32_t specialCasingCount=0; - -static void U_CALLCONV -specialCasingLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - char *end; - - /* get code point */ - specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); - end=(char *)u_skipWhitespace(end); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* is this a complex mapping? */ - if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { - /* there is some condition text in the fifth field */ - specialCasings[specialCasingCount].isComplex=TRUE; - - /* do not store any actual mappings for this */ - specialCasings[specialCasingCount].lowerCase[0]=0; - specialCasings[specialCasingCount].upperCase[0]=0; - specialCasings[specialCasingCount].titleCase[0]=0; - } else { - /* just set the "complex" flag and get the case mappings */ - specialCasings[specialCasingCount].isComplex=FALSE; - specialCasings[specialCasingCount].lowerCase[0]= - (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); - specialCasings[specialCasingCount].upperCase[0]= - (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); - specialCasings[specialCasingCount].titleCase[0]= - (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]); - exit(*pErrorCode); - } - - uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); - } - - if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { - fprintf(stderr, "genprops: too many special casing mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static int -compareSpecialCasings(const void *left, const void *right) { - return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; -} - -static void -parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { - char *fields[5][2]; - int32_t i, j; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); - - /* sort the special casing entries by code point */ - if(specialCasingCount>0) { - qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings); - } - - /* replace multiple entries for any code point by one "complex" one */ - j=0; - for(i=1; i0) { - qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings); - specialCasingCount-=j; - } - - /* - * Add one complex mapping to caseSensitive that was filtered out above: - * Greek final Sigma has a conditional mapping but not locale-sensitive, - * and it is taken when lowercasing just U+03A3 alone. - * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA - */ - uset_add(caseSensitive, 0x3c2); -} - -/* parser for CaseFolding.txt ----------------------------------------------- */ - -#define MAX_CASE_FOLDING_COUNT 2000 - -static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; -static int32_t caseFoldingCount=0; - -static void U_CALLCONV -caseFoldingLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - char *end; - static uint32_t prevCode=0; - int32_t count; - char status; - - /* get code point */ - caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); - end=(char *)u_skipWhitespace(end); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* get the status of this mapping */ - caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); - if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { - fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ - if(status=='L') { - return; - } - - /* get the mapping */ - count=caseFoldings[caseFoldingCount].full[0]= - (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, &caseFoldings[caseFoldingCount].simple, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); - exit(*pErrorCode); - } - - /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ - if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { - caseFoldings[caseFoldingCount].simple=0; - } - - /* update the case-sensitive set */ - if(status!='T') { - uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); - _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); - } - - /* check the status */ - if(status=='S') { - /* check if there was a full mapping for this code point before */ - if( caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && - caseFoldings[caseFoldingCount-1].status=='F' - ) { - /* merge the two entries */ - caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; - return; - } - } else if(status=='F') { - /* check if there was a simple mapping for this code point before */ - if( caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && - caseFoldings[caseFoldingCount-1].status=='S' - ) { - /* merge the two entries */ - uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); - return; - } - } else if(status=='I' || status=='T') { - /* check if there was a default mapping for this code point before (remove it) */ - while(caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code - ) { - prevCode=0; - --caseFoldingCount; - } - /* store only a marker for special handling for cases like dotless i */ - caseFoldings[caseFoldingCount].simple=0; - caseFoldings[caseFoldingCount].full[0]=0; - } - - /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ - if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { - fprintf(stderr, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)caseFoldings[caseFoldingCount].code, - (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - prevCode=caseFoldings[caseFoldingCount].code; - - if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { - fprintf(stderr, "genprops: too many case folding mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static void -parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { - char *fields[3][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); -} - /* parser for UnicodeData.txt ----------------------------------------------- */ /* general categories */ @@ -555,12 +257,6 @@ genCategoryNames[U_CHAR_CATEGORY_COUNT]={ "Pi", "Pf" }; -const char *const -bidiNames[U_CHAR_DIRECTION_COUNT]={ - "L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S", - "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN" -}; - const char *const decompositionTypeNames[U_DT_COUNT]={ NULL, @@ -588,7 +284,7 @@ static struct { char name[80]; } unicodeAreas[32]; -static int32_t unicodeAreaIndex=0, mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0; +static int32_t unicodeAreaIndex=0; static void U_CALLCONV unicodeDataLineFn(void *context, @@ -622,17 +318,6 @@ unicodeDataLineFn(void *context, exit(U_PARSE_ERROR); } - /* get BiDi category, field 4 */ - i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]); - if(i>=0) { - p.bidi=(uint8_t)i; - } else { - fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", - fields[4][0], (unsigned long)p.code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - /* get decomposition type, field 5 */ if(fields[5][0]status=='C' && - p.caseFolding->simple==p.lowerCase - ) { - p.caseFolding=NULL; - } - } else { - p.caseFolding=NULL; - } - value=makeProps(&p); if(*fields[1][0]=='<') { @@ -941,41 +552,12 @@ repeatAreaProps() { static void parseDB(const char *filename, UErrorCode *pErrorCode) { - /* default Bidi classes for unassigned code points */ - static const uint32_t defaultBidi[][2]={ /* { limit, class } */ - { 0x0590, U_LEFT_TO_RIGHT }, - { 0x0600, U_RIGHT_TO_LEFT }, - { 0x07C0, U_RIGHT_TO_LEFT_ARABIC }, - { 0xFB1D, U_LEFT_TO_RIGHT }, - { 0xFB50, U_RIGHT_TO_LEFT }, - { 0xFE00, U_RIGHT_TO_LEFT_ARABIC }, - { 0xFE70, U_LEFT_TO_RIGHT }, - { 0xFF00, U_RIGHT_TO_LEFT_ARABIC }, - { 0x110000, U_LEFT_TO_RIGHT } - }; - char *fields[15][2]; - UChar32 start, end; - uint32_t prev; - int32_t i; if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return; } - /* - * Set default Bidi classes for unassigned code points. - * See table 3-7 "Bidirectional Character Types" in UAX #9. - * http://www.unicode.org/reports/tr9/ - */ - prev=0; - for(i=0; i