X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..d5d484b0fbe924d3663b177965538d517ee412c1:/icuSources/tools/genuca/genuca.cpp diff --git a/icuSources/tools/genuca/genuca.cpp b/icuSources/tools/genuca/genuca.cpp index cecd040d..8a0f8718 100644 --- a/icuSources/tools/genuca/genuca.cpp +++ b/icuSources/tools/genuca/genuca.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000-2003, International Business Machines +* Copyright (C) 2000-2006, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -23,9 +23,10 @@ * 05/09/2001 weiv Case bits are now in the CEs, not in front */ -#include #include "unicode/utypes.h" +#include "unicode/putil.h" #include "unicode/udata.h" +#include "unicode/uclean.h" #include "ucol_imp.h" #include "genuca.h" #include "uoptions.h" @@ -34,6 +35,8 @@ #include "cstring.h" #include "cmemory.h" +#include + /* * Global - verbosity */ @@ -60,6 +63,42 @@ static UDataInfo dummyDataInfo = { #else +static const UDataInfo ucaDataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(UChar), + 0, + + {UCA_DATA_FORMAT_0, UCA_DATA_FORMAT_1, UCA_DATA_FORMAT_2, UCA_DATA_FORMAT_3}, /* dataFormat="UCol" */ + /* 03/26/2002 bumped up version since format has changed */ + /* 09/16/2002 bumped up version since we went from UColAttributeValue */ + /* to int32_t in UColOptionSet */ + /* 05/13/2003 This one also updated since we added UCA and UCD versions */ + /* to header */ + /* 09/11/2003 Adding information required by data swapper */ + {UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1, UCA_FORMAT_VERSION_2, UCA_FORMAT_VERSION_3}, /* formatVersion */ + {0, 0, 0, 0} /* dataVersion = Unicode Version*/ +}; + +static const UDataInfo invUcaDataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(UChar), + 0, + + {INVUCA_DATA_FORMAT_0, INVUCA_DATA_FORMAT_1, INVUCA_DATA_FORMAT_2, INVUCA_DATA_FORMAT_3}, /* dataFormat="InvC" */ + /* 03/26/2002 bumped up version since format has changed */ + /* 04/29/2003 2.1 format - we have added UCA version to header */ + {INVUCA_FORMAT_VERSION_0, INVUCA_FORMAT_VERSION_1, INVUCA_FORMAT_VERSION_2, INVUCA_FORMAT_VERSION_3}, /* formatVersion */ + {0, 0, 0, 0} /* dataVersion = Unicode Version*/ +}; + UCAElements le; int32_t readElement(char **from, char *to, char separator, UErrorCode *status) { @@ -217,10 +256,58 @@ static void addToExistingInverse(UCAElements *element, uint32_t position, UError } } +/* + * Takes two CEs (lead and continuation) and + * compares them as CEs should be compared: + * primary vs. primary, secondary vs. secondary + * tertiary vs. tertiary + */ +static int32_t compareCEs(uint32_t *source, uint32_t *target) { + uint32_t s1 = source[0], s2, t1 = target[0], t2; + if(isContinuation(source[1])) { + s2 = source[1]; + } else { + s2 = 0; + } + if(isContinuation(target[1])) { + t2 = target[1]; + } else { + t2 = 0; + } + + uint32_t s = 0, t = 0; + if(s1 == t1 && s2 == t2) { + return 0; + } + s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); + t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); + if(s < t) { + return -1; + } else if(s > t) { + return 1; + } else { + s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; + t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; + if(s < t) { + return -1; + } else if(s > t) { + return 1; + } else { + s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); + t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); + if(s < t) { + return -1; + } else { + return 1; + } + } + } +} + static uint32_t addToInverse(UCAElements *element, UErrorCode *status) { - uint32_t comp = 0; uint32_t position = inversePos; uint32_t saveElement = element->CEs[0]; + int32_t compResult = 0; element->CEs[0] &= 0xFFFFFF3F; if(element->noOfCEs == 1) { element->CEs[1] = 0; @@ -228,42 +315,17 @@ static uint32_t addToInverse(UCAElements *element, UErrorCode *status) { if(inversePos == 0) { inverseTable[0][0] = inverseTable[0][1] = inverseTable[0][2] = 0; addNewInverse(element, status); - } else if(inverseTable[inversePos][0] > element->CEs[0]) { - while(inverseTable[--position][0] > element->CEs[0]) {} - if(VERBOSE) { fprintf(stdout, "p:%i ", position); } - if(inverseTable[position][0] == element->CEs[0]) { - if(isContinuation(element->CEs[1])) { - comp = element->CEs[1]; - } else { - comp = 0; - } - if(inverseTable[position][1] > comp) { - while(inverseTable[--position][1] > comp) {} - } - if(inverseTable[position][1] == comp) { - addToExistingInverse(element, position, status); - } else { - insertInverse(element, position+1, status); - } - } else { - if(VERBOSE) { fprintf(stdout, "ins"); } + } else if(compareCEs(inverseTable[inversePos], element->CEs) > 0) { + while((compResult = compareCEs(inverseTable[--position], element->CEs)) > 0); + if(VERBOSE) { fprintf(stdout, "p:%u ", (int)position); } + if(compResult == 0) { + addToExistingInverse(element, position, status); + } else { insertInverse(element, position+1, status); } - } else if(inverseTable[inversePos][0] == element->CEs[0]) { - if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) { - comp = element->CEs[1]; - if(inverseTable[position][1] > comp) { - while(inverseTable[--position][1] > comp) {} - } - if(inverseTable[position][1] == comp) { - addToExistingInverse(element, position, status); - } else { - insertInverse(element, position+1, status); - } - } else { - addToExistingInverse(element, inversePos, status); - } - } else { + } else if(compareCEs(inverseTable[inversePos], element->CEs) == 0) { + addToExistingInverse(element, inversePos, status); + } else { addNewInverse(element, status); } element->CEs[0] = saveElement; @@ -280,6 +342,7 @@ static InverseUCATableHeader *assembleInverseTable(UErrorCode *status) uint32_t i = 0; result = (InverseUCATableHeader *)uprv_malloc(headerByteSize + inverseTableByteSize + contsByteSize); + uprv_memset(result, 0, headerByteSize + inverseTableByteSize + contsByteSize); if(result != NULL) { result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize; @@ -290,10 +353,10 @@ static InverseUCATableHeader *assembleInverseTable(UErrorCode *status) inversePos++; for(i = 2; i inverseTable[i][0]) { - fprintf(stderr, "Error at %i: %08X & %08X\n", i, inverseTable[i-1][0], inverseTable[i][0]); + if(compareCEs(inverseTable[i-1], inverseTable[i]) > 0) { + fprintf(stderr, "Error at %i: %08X & %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i][0]); } else if(inverseTable[i-1][0] == inverseTable[i][0] && !(inverseTable[i-1][1] < inverseTable[i][1])) { - fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", i, inverseTable[i-1][0], inverseTable[i-1][1], inverseTable[i][0], inverseTable[i][1]); + fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i-1][1], (int)inverseTable[i][0], (int)inverseTable[i][1]); } } @@ -328,18 +391,18 @@ static void writeOutInverseData(InverseUCATableHeader *data, uprv_memcpy(&invUcaInfo, &invUcaDataInfo, sizeof(UDataInfo)); u_getUnicodeVersion(invUcaInfo.dataVersion); - pData=udata_create(outputDir, INVC_DATA_TYPE, U_ICUDATA_NAME "_" INVC_DATA_NAME, &invUcaInfo, + pData=udata_create(outputDir, INVC_DATA_TYPE, INVC_DATA_NAME, &invUcaInfo, copyright, status); if(U_FAILURE(*status)) { - fprintf(stderr, "Error: unable to create data memory, error %d\n", *status); + fprintf(stderr, "Error: unable to create %s"INVC_DATA_NAME", error %s\n", outputDir, u_errorName(*status)); return; } /* write the data to the file */ if (VERBOSE) { fprintf(stdout, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir, U_FILE_SEP_CHAR, - U_ICUDATA_NAME "_" INVC_DATA_NAME, + INVC_DATA_NAME, INVC_DATA_TYPE); } udata_writeBlock(pData, data, data->byteSize); @@ -377,7 +440,7 @@ UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UE char *endCodePoint = NULL; char *spacePointer = NULL; char *result = fgets(buffer, 2048, data); - int32_t buflen = uprv_strlen(buffer); + int32_t buflen = (int32_t)uprv_strlen(buffer); if(U_FAILURE(*status)) { return 0; } @@ -615,12 +678,13 @@ UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UE i++; } element->noOfCEs = CEindex; - +#if 0 element->isThai = UCOL_ISTHAIPREVOWEL(element->cPoints[0]); - +#endif // we don't want any strange stuff after useful data! while(pointer < commentStart) { - if(*pointer != ' ') { + if(*pointer != ' ' && *pointer != '\t') + { *status=U_INVALID_FORMAT_ERROR; break; } @@ -628,7 +692,7 @@ UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UE } if(U_FAILURE(*status)) { - fprintf(stderr, "problem putting stuff in hash table\n"); + fprintf(stderr, "problem putting stuff in hash table %s\n", u_errorName(*status)); *status = U_INTERNAL_PROGRAM_ERROR; return NULL; } @@ -651,6 +715,9 @@ void writeOutData(UCATableHeader *data, uint32_t size = data->size; + data->UCAConsts = data->size; + data->size += paddedsize(sizeof(UCAConstants)); + if(noOfcontractions != 0) { contractions[noOfcontractions][0] = 0; contractions[noOfcontractions][1] = 0; @@ -658,9 +725,9 @@ void writeOutData(UCATableHeader *data, noOfcontractions++; - data->UCAConsts = data->size; - data->size += paddedsize(sizeof(UCAConstants)); data->contractionUCACombos = data->size; + data->contractionUCACombosWidth = 3; + data->contractionUCACombosSize = noOfcontractions; data->size += paddedsize((noOfcontractions*3*sizeof(UChar))); } @@ -671,11 +738,11 @@ void writeOutData(UCATableHeader *data, uprv_memcpy(&ucaInfo, &ucaDataInfo, sizeof(UDataInfo)); u_getUnicodeVersion(ucaInfo.dataVersion); - pData=udata_create(outputDir, UCA_DATA_TYPE, U_ICUDATA_NAME "_" UCA_DATA_NAME, &ucaInfo, + pData=udata_create(outputDir, UCA_DATA_TYPE, UCA_DATA_NAME, &ucaInfo, copyright, status); if(U_FAILURE(*status)) { - fprintf(stderr, "Error: unable to create data memory, error %d\n", *status); + fprintf(stderr, "Error: unable to create %s"UCA_DATA_NAME", error %s\n", outputDir, u_errorName(*status)); return; } @@ -711,27 +778,35 @@ write_uca_table(const char *filename, UErrorCode *status) { FILE *data = fopen(filename, "r"); + if(data == NULL) { + fprintf(stderr, "Couldn't open file: %s\n", filename); + return -1; + } uint32_t line = 0; UCAElements *element = NULL; UChar variableTopValue = 0; UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); /* test for NULL */ - if(myD == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - fclose(data); - return 0; - } + if(myD == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + fclose(data); + return 0; + } + uprv_memset(myD, 0, sizeof(UCATableHeader)); UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); /* test for NULL */ - if(opts == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - uprv_free(myD); - fclose(data); - return 0; - } - UChar contractionCEs[256][3]; + if(opts == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + uprv_free(myD); + fclose(data); + return 0; + } + uprv_memset(opts, 0, sizeof(UColOptionSet)); + UChar contractionCEs[512][3]; + uprv_memset(contractionCEs, 0, 512*3*sizeof(UChar)); uint32_t noOfContractions = 0; UCAConstants consts; + uprv_memset(&consts, 0, sizeof(consts)); #if 0 UCAConstants consts = { UCOL_RESET_TOP_VALUE, @@ -765,12 +840,7 @@ write_uca_table(const char *filename, #endif - if(data == NULL) { - fprintf(stderr, "Couldn't open file: %s\n", filename); - return -1; - } - - memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF); + uprv_memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF); opts->variableTopValue = variableTopValue; opts->strength = UCOL_TERTIARY; @@ -780,12 +850,16 @@ write_uca_table(const char *filename, opts->caseLevel = UCOL_OFF; /* do we have an extra case level */ opts->normalizationMode = UCOL_OFF; /* attribute for normalization */ opts->hiraganaQ = UCOL_OFF; /* attribute for JIS X 4061, used only in Japanese */ + opts->numericCollation = UCOL_OFF; myD->jamoSpecial = FALSE; - tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, status); + tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, LEAD_SURROGATE_TAG, status); if(U_FAILURE(*status)) { fprintf(stderr, "Failed to init UCA temp table: %s\n", u_errorName(*status)); + uprv_free(opts); + uprv_free(myD); + fclose(data); return -1; } @@ -816,7 +890,7 @@ struct { {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/ #endif {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/ - {0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/ + //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/ {0xDC00, 0xE000, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) }, //2 TRAIL_SURROGATE DC00-DFFF // Now directly handled in the collation code by the swapCJK function. //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/ @@ -836,15 +910,15 @@ struct { int32_t surrogateCount = 0; while(!feof(data)) { if(U_FAILURE(*status)) { - fprintf(stderr, "Something returned an error %i (%s) while processing line %i of %s. Exiting...\n", - *status, u_errorName(*status), line, filename); + fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n", + *status, u_errorName(*status), (int)line, filename); exit(*status); } element = readAnElement(data, t, &consts, status); line++; if(VERBOSE) { - fprintf(stdout, "%i ", line); + fprintf(stdout, "%u ", (int)line); } if(element != NULL) { // we have read the line, now do something sensible with the read data! @@ -880,29 +954,35 @@ struct { } if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) { - fprintf(stderr, "UCA version not specified. Cannot create data file!\n"); - return -1; + fprintf(stderr, "UCA version not specified. Cannot create data file!\n"); + uprv_uca_closeTempTable(t); + uprv_free(opts); + uprv_free(myD); + fclose(data); + return -1; } - +/* { + uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL); + }*/ if (VERBOSE) { - fprintf(stdout, "\nLines read: %i\n", line); - fprintf(stdout, "Surrogate count: %i\n", surrogateCount); + fprintf(stdout, "\nLines read: %u\n", (int)line); + fprintf(stdout, "Surrogate count: %i\n", (int)surrogateCount); fprintf(stdout, "Raw data breakdown:\n"); /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/ - fprintf(stdout, "Number of contractions: %i\n", noOfContractions); - fprintf(stdout, "Contraction image size: %i\n", t->image->contractionSize); - fprintf(stdout, "Expansions size: %i\n", t->expansions->position); + fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions); + fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize); + fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position); } /* produce canonical closure for table */ /* first set up constants for implicit calculation */ - uprv_uca_initImplicitConstants(consts.UCA_PRIMARY_IMPLICIT_MIN); + uprv_uca_initImplicitConstants(consts.UCA_PRIMARY_IMPLICIT_MIN, consts.UCA_PRIMARY_IMPLICIT_MAX, status); /* do the closure */ int32_t noOfClosures = uprv_uca_canonicalClosure(t, status); if(noOfClosures != 0) { - fprintf(stderr, "Warning: %i canonical closures occured!\n", noOfClosures); + fprintf(stderr, "Warning: %i canonical closures occured!\n", (int)noOfClosures); } /* test */ @@ -911,9 +991,18 @@ struct { if (VERBOSE) { fprintf(stdout, "Compacted data breakdown:\n"); /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/ - fprintf(stdout, "Number of contractions: %i\n", noOfContractions); - fprintf(stdout, "Contraction image size: %i\n", t->image->contractionSize); - fprintf(stdout, "Expansions size: %i\n", t->expansions->position); + fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions); + fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize); + fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position); + } + + if(U_FAILURE(*status)) { + fprintf(stderr, "Error creating table: %s\n", u_errorName(*status)); + uprv_uca_closeTempTable(t); + uprv_free(opts); + uprv_free(myD); + fclose(data); + return -1; } /* populate the version info struct with version info*/ @@ -934,7 +1023,7 @@ struct { uprv_memcpy(inverse->UCAVersion, UCAVersion, sizeof(UVersionInfo)); writeOutInverseData(inverse, outputDir, copyright, status); - uprv_uca_closeTempTable(t); + uprv_uca_closeTempTable(t); uprv_free(myD); uprv_free(opts); @@ -1000,17 +1089,15 @@ int main(int argc, char* argv[]) { argv[0], u_getDataDirectory()); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } - if(options[3].doesOccur) { - fprintf(stdout, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n", + fprintf(stdout, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n", #if UCONFIG_NO_COLLATION 0, 0 #else - ucaDataInfo.formatVersion[0], ucaDataInfo.formatVersion[1] + UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1 #endif ); - fprintf(stdout, "Copyright (C) 2000-2001, International Business Machines\n"); - fprintf(stdout, "Corporation and others. All Rights Reserved.\n"); + fprintf(stdout, U_COPYRIGHT_STRING"\n"); exit(0); } @@ -1026,6 +1113,15 @@ int main(int argc, char* argv[]) { if (options[7].doesOccur) { u_setDataDirectory(options[7].value); } + /* Initialize ICU */ + u_init(&status); + if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { + fprintf(stderr, "%s: can not initialize ICU. status = %s\n", + argv[0], u_errorName(status)); + exit(1); + } + status = U_ZERO_ERROR; + /* prepare the filename beginning with the source dir */ uprv_strcpy(filename, srcDir); @@ -1055,16 +1151,16 @@ int main(int argc, char* argv[]) { UNewDataMemory *pData; const char *msg; - msg = "genuca writes dummy " U_ICUDATA_NAME "_" UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; + msg = "genuca writes dummy " UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; fprintf(stderr, "%s\n", msg); - pData = udata_create(destdir, UCA_DATA_TYPE, U_ICUDATA_NAME "_" UCA_DATA_NAME, &dummyDataInfo, + pData = udata_create(destdir, UCA_DATA_TYPE, UCA_DATA_NAME, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status); - msg = "genuca writes dummy " U_ICUDATA_NAME "_" INVC_DATA_NAME "." INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; + msg = "genuca writes dummy " INVC_DATA_NAME "." INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; fprintf(stderr, "%s\n", msg); - pData = udata_create(destdir, INVC_DATA_TYPE, U_ICUDATA_NAME "_" INVC_DATA_NAME, &dummyDataInfo, + pData = udata_create(destdir, INVC_DATA_TYPE, INVC_DATA_NAME, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status);