icuSources/tools/gennames/gennames.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gennames.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 1999sep30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads the Unicode character database text file,
  17 *   parses it, and extracts the character code,
  18 *   the "modern" character name, and optionally the
  19 *   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
  20 *   It then tokenizes and compresses the names and builds
  21 *   compact binary tables for random-access lookup
  22 *   in a u_charName() API function.
  23 *
  24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
  25 * (all data is static const)
  26 *
  27 * UDataInfo fields:
  28 *   dataFormat "unam"
  29 *   formatVersion 1.0
*   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.2
  31 *
  32 * -- data-based names
  33 * uint32_t tokenStringOffset,
  34 *          groupsOffset,
  35 *          groupStringOffset,
  36 *          algNamesOffset;
  37 *
  38 * uint16_t tokenCount;
  39 * uint16_t tokenTable[tokenCount];
  40 *
  41 * char     tokenStrings[]; -- padded to even count
  42 *
  43 * -- strings (groupStrings) are tokenized as follows:
  44 *   for each character c
  45 *       if(c>=tokenCount) write that character c directly
  46 *   else
  47 *       token=tokenTable[c];
  48 *       if(token==0xfffe) -- lead byte of double-byte token
  49 *           token=tokenTable[c<<8|next character];
  50 *       if(token==-1)
  51 *           write c directly
  52 *       else
  53 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
  54 *           append zero-terminated tokenString;
  55 *
  56 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
  57 *    are separated by ';'.
  58 *
  59 * uint16_t groupCount;
  60 * struct {
  61 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
  62 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
  63 *   uint16_t offsetLow;
  64 * } groupTable[groupCount];
  65 *
  66 * char     groupStrings[]; -- padded to 4-count
  67 *
  68 * -- The actual, tokenized group strings are not zero-terminated because
  69 *   that would take up too much space.
*   Instead, they are preceded by their length, written in a variable-length sequence:
  71 *   For each of the 32 group strings, one or two nibbles are stored for its length.
  72 *   Nibbles (4-bit values, half-bytes) are read MSB first.
  73 *   A nibble with a value of 0..11 directly indicates the length of the name string.
  74 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
  75 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
  76 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
  77 *   For the de-tokenizing, see token description above; the strings immediately follow the
  78 *   32 lengths.
  79 *
  80 * -- algorithmic names
  81 *
  82 * typedef struct AlgorithmicRange {
  83 *     uint32_t rangeStart, rangeEnd;
  84 *     uint8_t algorithmType, algorithmVariant;
  85 *     uint16_t rangeSize;
  86 * } AlgorithmicRange;
  87 *
  88 * uint32_t algRangesCount; -- number of data blocks for ranges of
  89 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
  90 *
  91 * struct {
  92 *     AlgorithmicRange algRange;
  93 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
* } algRanges[algRangesCount];
  95 * -- not a real array because each part has a different size
  96 *    of algRange.rangeSize (including AlgorithmicRange)
  97 *
  98 * -- algorithmic range types:
  99 *
 100 * 0 Names are formed from a string prefix that is stored in
 101 *   the algRangeData (zero-terminated), followed by the Unicode code point
 102 *   of the character in hexadecimal digits;
 103 *   algRange.algorithmVariant digits are written
 104 *
 105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
 106 *   algRange.algorithmVariant is the count of modulo factors
 107 *   algRangeData contains
 108 *       uint16_t factors[algRange.algorithmVariant];
 109 *       char strings[];
 110 *   the first zero-terminated string is written as the prefix; then:
 111 *
 112 *   The rangeStart is subtracted; with the difference, here "code":
 113 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
 114 *       index[i]=code%factor[i];
 115 *       code/=factor[i];
 116 *
 117 *   The strings after the prefix are short pieces that are then appended to the result
 118 *   according to index[0..algRange.algorithmVariant-1].
 119 */
 120
 121 #include <stdio.h>
 122 #include "unicode/utypes.h"
 123 #include "unicode/putil.h"
 124 #include "unicode/uclean.h"
 125 #include "unicode/udata.h"
 126 #include "cmemory.h"
 127 #include "cstring.h"
 128 #include "uarrsort.h"
 129 #include "unewdata.h"
 130 #include "uoptions.h"
 131 #include "uparse.h"
 132
/* capacity in bytes of stringStore[]; also the initial value of wordBottom */
#define STRING_STORE_SIZE 1000000
/* capacity in bytes of groupStore[], the staging buffer for one compressed group */
#define GROUP_STORE_SIZE 5000

/* names are grouped by code point>>GROUP_SHIFT, i.e. 32 consecutive code points per group */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
/* mask for the position of a code point inside its group */
#define GROUP_MASK (LINES_PER_GROUP-1)

/* capacities of the lines[] and words[] arrays */
#define MAX_LINE_COUNT 50000
#define MAX_WORD_COUNT 20000
/* NOTE(review): not referenced in the visible part of this file - confirm where it is used */
#define MAX_GROUP_COUNT 5000

/* name and type of the output data file, passed to udata_create() and shown in the usage text */
#define DATA_NAME "unames"
#define DATA_TYPE "icu"
/* NOTE(review): not referenced in the visible part of this file; same letters as dataInfo.dataFormat */
#define VERSION_STRING "unam"
/* separates the different strings stored for one code point; never used as a second token byte */
#define NAME_SEPARATOR_CHAR ';'
 148
/*
 * Unicode version constants.
 * NOTE(review): not referenced in the visible part of this file;
 * presumably used by the algorithmic-names code further down - confirm.
 */
static const UVersionInfo
unicode_3_0={ 3, 0, 0, 0 },
unicode_3_1={ 3, 1, 0, 0 };
 152
/*
 * UDataInfo cf. udata.h
 * Header description handed to udata_create() in generateData().
 * main() overwrites dataVersion with the version parsed from the -u option
 * before the data is written.
 * NOTE(review): the two literal 0 entries look like reserved fields - confirm against udata.h.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
    {1, 0, 0, 0},                 /* formatVersion */
    {3, 0, 0, 0}                  /* dataVersion */
};
 167
/*
 * Command line flags; main() assigns all three from the parsed options,
 * so the initializers here are only placeholders.
 */
static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;

/*
 * stringStore: byte pool that Line.s pointers point into; generateData() writes
 *              stringStore[0..groupTop) as the group strings and
 *              stringStore[groupTop..lineTop) as the token strings.
 * groupStore:  staging buffer that compressLine() fills for the current group
 *              before compressLines() hands it to addGroup().
 * lineLengths: one buffer per group for the line lengths;
 *              presumably filled by appendLineLength() (defined outside this view) - confirm.
 */
static uint8_t stringStore[STRING_STORE_SIZE],
               groupStore[GROUP_STORE_SIZE],
               lineLengths[LINES_PER_GROUP];

/*
 * lineTop:        number of bytes used at the start of stringStore
 *                 (printed by parseDB(), reset to 0 by compressLines()).
 * wordBottom:     starts at the end of stringStore; presumably word strings are
 *                 allocated downwards from there by allocWord() - confirm.
 * lineLengthsTop: reset to 0 by compressLines() whenever a new group starts.
 */
static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
 175
/*
 * One stored line: the strings recorded for a single code point.
 * After compressLines() the same array entries are reused for groups;
 * generateData() then reads code as the group MSB and s as the start of
 * the group's data inside stringStore.
 */
typedef struct {
    uint32_t code;      /* code point (later: group MSB) */
    int16_t length;     /* number of bytes at s */
    uint8_t *s;         /* points into stringStore */
} Line;
 181
/*
 * One dictionary word found in the names.
 * compress() sorts the words by descending weight and drops those with weight<1.
 */
typedef struct {
    int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
    int16_t count;  /* subtracted from weight in compress() for words that need a double-byte token */
    int16_t length; /* number of bytes at s */
    uint8_t *s;     /* the word's characters; printed with "%.*s" and length, so not relied on to be NUL-terminated */
} Word;
 188
/* all stored lines (later: groups) and all dictionary words */
static Line lines[MAX_LINE_COUNT];
static Word words[MAX_WORD_COUNT];

/* number of used entries in lines[] and words[] */
static uint32_t lineCount=0, wordCount=0;

/* number of lead bytes for double-byte tokens; computed in compress(), 0 if single bytes suffice */
static int16_t leadByteCount;

/* upper bound (exclusive) for lead byte values, which limits the token table to LEADBYTE_LIMIT*256 entries */
#define LEADBYTE_LIMIT 16

/*
 * Token table, indexed by the token byte value(s).
 *   -1: the byte is not a token and is written directly
 *   -2: the byte is the lead byte of a double-byte token
 *   otherwise: in compress() the index of the word for this token;
 *              generateData() replaces it with the offset of the token string.
 */
static int16_t tokens[LEADBYTE_LIMIT*256];
/* number of entries of tokens[] that are written to the data file */
static uint32_t tokenCount;
 200
/* prototypes --------------------------------------------------------------- */

/*
 * Forward declarations for the file-local functions.
 * getName(), lineFn() and isWordChar() have no prototype here because
 * they are defined before their first use.
 * Several of the functions declared here are defined further down in the file.
 */

static void
init(void);

static void
parseDB(const char *filename, UBool store10Names);

static void
parseName(char *name, int16_t length);

static int16_t
skipNoise(char *line, int16_t start, int16_t limit);

static int16_t
getWord(char *line, int16_t start, int16_t limit);

static void
compress(void);

static void
compressLines(void);

static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);

static int32_t
compareWords(const void *context, const void *word1, const void *word2);

static void
generateData(const char *dataDir);

static uint32_t
generateAlgorithmicData(UNewDataMemory *pData);

static int16_t
findToken(uint8_t *s, int16_t length);

static Word *
findWord(char *s, int16_t length);

static Word *
addWord(char *s, int16_t length);

static void
countWord(Word *word);

static void
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);

static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);

static uint32_t
addToken(uint8_t *s, int16_t length);

static void
appendLineLength(int16_t length);

static void
appendLineLengthNibble(uint8_t nibble);

static uint8_t *
allocLine(int32_t length);

static uint8_t *
allocWord(uint32_t length);
 268
 269 /* -------------------------------------------------------------------------- */
 270
/*
 * Command line options.
 * main() addresses the entries by their position in this array,
 * so the order here must not change without adjusting main().
 */
static UOption options[]={
    UOPTION_HELP_H,             /* [0] -h */
    UOPTION_HELP_QUESTION_MARK, /* [1] -? */
    UOPTION_VERBOSE,            /* [2] sets beVerbose */
    UOPTION_QUIET,              /* [3] sets beQuiet */
    UOPTION_COPYRIGHT,          /* [4] sets haveCopyright */
    UOPTION_DESTDIR,            /* [5] destination directory, preset to u_getDataDirectory() */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },     /* [6] Unicode version, preset to "3.2" */
    { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }     /* [7] store Unicode 1.0 names */
};
 281
 282 extern int
 283 main(int argc, char* argv[]) {
 284     UVersionInfo version;
 285     UBool store10Names=FALSE;
 286     UErrorCode errorCode = U_ZERO_ERROR;
 287
 288     U_MAIN_INIT_ARGS(argc, argv);
 289
 290     /* Initialize ICU */
 291     u_init(&errorCode);
 292     if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
 293         /* Note: u_init() will try to open ICU property data.
 294          *       failures here are expected when building ICU from scratch.
 295          *       ignore them.
 296          */
 297         fprintf(stderr, "%s: can not initialize ICU.  errorCode = %s\n",
 298             argv[0], u_errorName(errorCode));
 299         exit(1);
 300     }
 301
 302     /* preset then read command line options */
 303     options[5].value=u_getDataDirectory();
 304     options[6].value="3.2";
 305     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
 306
 307     /* error handling, printing usage message */
 308     if(argc<0) {
 309         fprintf(stderr,
 310             "error in command line argument \"%s\"\n",
 311             argv[-argc]);
 312     } else if(argc<2) {
 313         argc=-1;
 314     }
 315     if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
 316         /*
 317          * Broken into chucks because the C89 standard says the minimum
 318          * required supported string length is 509 bytes.
 319          */
 320         fprintf(stderr,
 321             "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
 322             "\n"
 323             "Read the UnicodeData.txt file and \n"
 324             "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
 325             "\n"
 326             "\tfilename  absolute path/filename for the Unicode database text file\n"
 327             "\t\t(default: standard input)\n"
 328             "\n",
 329             argv[0]);
 330         fprintf(stderr,
 331             "Options:\n"
 332             "\t-h or -? or --help  this usage text\n"
 333             "\t-v or --verbose     verbose output\n"
 334             "\t-q or --quiet       no output\n"
 335             "\t-c or --copyright   include a copyright notice\n"
 336             "\t-d or --destdir     destination directory, followed by the path\n"
 337             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
 338             "\t-1 or --unicode1-names  store Unicode 1.0 character names\n");
 339         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 340     }
 341
 342     /* get the options values */
 343     beVerbose=options[2].doesOccur;
 344     beQuiet=options[3].doesOccur;
 345     haveCopyright=options[4].doesOccur;
 346     store10Names=options[7].doesOccur;
 347
 348     /* set the Unicode version */
 349     u_versionFromString(version, options[6].value);
 350     uprv_memcpy(dataInfo.dataVersion, version, 4);
 351
 352     init();
 353     parseDB(argc>=2 ? argv[1] : "-", store10Names);
 354     compress();
 355     generateData(options[5].value);
 356
 357     u_cleanup();
 358     return 0;
 359 }
 360
 361 static void
 362 init() {
 363     int i;
 364
 365     for(i=0; i<256; ++i) {
 366         tokens[i]=0;
 367     }
 368 }
 369
 370 /* parsing ------------------------------------------------------------------ */
 371
 372 /* get a name, strip leading and trailing whitespace */
 373 static int16_t
 374 getName(char **pStart, char *limit) {
 375     /* strip leading whitespace */
 376     char *start=(char *)u_skipWhitespace(*pStart);
 377
 378     /* strip trailing whitespace */
 379     while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
 380         --limit;
 381     }
 382
 383     /* return results */
 384     *pStart=start;
 385     return (int16_t)(limit-start);
 386 }
 387
 388 static void U_CALLCONV
 389 lineFn(void *context,
 390        char *fields[][2], int32_t fieldCount,
 391        UErrorCode *pErrorCode) {
 392     char *names[3];
 393     int16_t lengths[3];
 394     static uint32_t prevCode=0;
 395     uint32_t code=0;
 396
 397     if(U_FAILURE(*pErrorCode)) {
 398         return;
 399     }
 400     /* get the character code */
 401     code=uprv_strtoul(fields[0][0], NULL, 16);
 402
 403     /* get the character name */
 404     names[0]=fields[1][0];
 405     lengths[0]=getName(names+0, fields[1][1]);
 406     if(names[0][0]=='<') {
 407         /* do not store pseudo-names in <> brackets */
 408         lengths[0]=0;
 409     }
 410
 411     /* store 1.0 names */
 412     /* get the second character name, the one from Unicode 1.0 */
 413     /* do not store pseudo-names in <> brackets */
 414     names[1]=fields[10][0];
 415     lengths[1]=getName(names+1, fields[10][1]);
 416     if(*(UBool *)context && names[1][0]!='<') {
 417         /* keep the name */
 418     } else {
 419         lengths[1]=0;
 420     }
 421
 422     /* get the ISO 10646 comment */
 423     names[2]=fields[11][0];
 424     lengths[2]=getName(names+2, fields[11][1]);
 425
 426     if(lengths[0]+lengths[1]+lengths[2]==0) {
 427         return;
 428     }
 429
 430     /* check for non-character code points */
 431     if(!UTF_IS_UNICODE_CHAR(code)) {
 432         fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
 433                 (unsigned long)code);
 434         *pErrorCode=U_PARSE_ERROR;
 435         exit(U_PARSE_ERROR);
 436     }
 437
 438     /* check that the code points (code) are in ascending order */
 439     if(code<=prevCode && code>0) {
 440         fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 441                 (unsigned long)code, (unsigned long)prevCode);
 442         *pErrorCode=U_PARSE_ERROR;
 443         exit(U_PARSE_ERROR);
 444     }
 445     prevCode=code;
 446
 447     parseName(names[0], lengths[0]);
 448     parseName(names[1], lengths[1]);
 449     parseName(names[2], lengths[2]);
 450
 451     /*
 452      * set the count argument to
 453      * 1: only store regular names
 454      * 2: store regular and 1.0 names
 455      * 3: store names and ISO 10646 comment
 456      */
 457     addLine(code, names, lengths, 3);
 458 }
 459
 460 static void
 461 parseDB(const char *filename, UBool store10Names) {
 462     char *fields[15][2];
 463     UErrorCode errorCode=U_ZERO_ERROR;
 464
 465     u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
 466     if(U_FAILURE(errorCode)) {
 467         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
 468         exit(errorCode);
 469     }
 470
 471     if(!beQuiet) {
 472         printf("size of all names in the database: %lu\n",
 473             (unsigned long)lineTop);
 474         printf("number of named Unicode characters: %lu\n",
 475             (unsigned long)lineCount);
 476         printf("number of words in the dictionary from these names: %lu\n",
 477             (unsigned long)wordCount);
 478     }
 479 }
 480
 481 static void
 482 parseName(char *name, int16_t length) {
 483     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
 484     Word *word;
 485
 486     while(start<length) {
 487         /* skip any "noise" characters */
 488         limit=skipNoise(name, start, length);
 489         if(start<limit) {
 490             /*prevStart=-1;*/
 491             start=limit;
 492         }
 493         if(start==length) {
 494             break;
 495         }
 496
 497         /* get a word and add it if it is longer than 1 */
 498         limit=getWord(name, start, length);
 499         wordLength=(int16_t)(limit-start);
 500         if(wordLength>1) {
 501             word=findWord(name+start, wordLength);
 502             if(word==NULL) {
 503                 word=addWord(name+start, wordLength);
 504             }
 505             countWord(word);
 506         }
 507
 508 #if 0
 509         /*
 510          * if there was a word before this
 511          * (with no noise in between), then add the pair of words, too
 512          */
 513         if(prevStart!=-1) {
 514             wordLength=limit-prevStart;
 515             word=findWord(name+prevStart, wordLength);
 516             if(word==NULL) {
 517                 word=addWord(name+prevStart, wordLength);
 518             }
 519             countWord(word);
 520         }
 521 #endif
 522
 523         /*prevStart=start;*/
 524         start=limit;
 525     }
 526 }
 527
 528 static UBool U_INLINE
 529 isWordChar(char c) {
 530     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
 531            ('J'<=c && c<='R') ||
 532            ('S'<=c && c<='Z') ||
 533
 534            ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
 535            ('j'<=c && c<='r') ||
 536            ('s'<=c && c<='z') ||
 537
 538            ('0'<=c && c<='9');
 539 }
 540
 541 static int16_t
 542 skipNoise(char *line, int16_t start, int16_t limit) {
 543     /* skip anything that is not part of a word in this sense */
 544     while(start<limit && !isWordChar(line[start])) {
 545         ++start;
 546     }
 547
 548     return start;
 549 }
 550
 551 static int16_t
 552 getWord(char *line, int16_t start, int16_t limit) {
 553     char c=0; /* initialize to avoid a compiler warning although the code was safe */
 554
 555     /* a unicode character name word consists of A-Z0-9 */
 556     while(start<limit && isWordChar(line[start])) {
 557         ++start;
 558     }
 559
 560     /* include a following space or dash */
 561     if(start<limit && ((c=line[start])==' ' || c=='-')) {
 562         ++start;
 563     }
 564
 565     return start;
 566 }
 567
 568 /* compressing -------------------------------------------------------------- */
 569
 570 static void
 571 compress() {
 572     uint32_t i, letterCount;
 573     int16_t wordNumber;
 574     UErrorCode errorCode;
 575
 576     /* sort the words in reverse order by weight */
 577     errorCode=U_ZERO_ERROR;
 578     uprv_sortArray(words, wordCount, sizeof(Word),
 579                     compareWords, NULL, FALSE, &errorCode);
 580
 581     /* remove the words that do not save anything */
 582     while(wordCount>0 && words[wordCount-1].weight<1) {
 583         --wordCount;
 584     }
 585
 586     /* count the letters in the token range */
 587     letterCount=0;
 588     for(i=LEADBYTE_LIMIT; i<256; ++i) {
 589         if(tokens[i]==-1) {
 590             ++letterCount;
 591         }
 592     }
 593     if(!beQuiet) {
 594         printf("number of letters used in the names: %d\n", (int)letterCount);
 595     }
 596
 597     /* do we need double-byte tokens? */
 598     if(wordCount+letterCount<=256) {
 599         /* no, single-byte tokens are enough */
 600         leadByteCount=0;
 601         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
 602             if(tokens[i]!=-1) {
 603                 tokens[i]=wordNumber;
 604                 if(beVerbose) {
 605                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 606                             (int)i, (long)words[wordNumber].weight,
 607                             words[wordNumber].length, words[wordNumber].s);
 608                 }
 609                 ++wordNumber;
 610             }
 611         }
 612         tokenCount=i;
 613     } else {
 614         /*
 615          * The tokens that need two token bytes
 616          * get their weight reduced by their count
 617          * because they save less.
 618          */
 619         tokenCount=256-letterCount;
 620         for(i=tokenCount; i<wordCount; ++i) {
 621             words[i].weight-=words[i].count;
 622         }
 623
 624         /* sort these words in reverse order by weight */
 625         errorCode=U_ZERO_ERROR;
 626         uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
 627                         compareWords, NULL, FALSE, &errorCode);
 628
 629         /* remove the words that do not save anything */
 630         while(wordCount>0 && words[wordCount-1].weight<1) {
 631             --wordCount;
 632         }
 633
 634         /* how many tokens and lead bytes do we have now? */
 635         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
 636         /*
 637          * adjust upwards to take into account that
 638          * double-byte tokens must not
 639          * use NAME_SEPARATOR_CHAR as a second byte
 640          */
 641         tokenCount+=(tokenCount-256+254)/255;
 642
 643         leadByteCount=(int16_t)(tokenCount>>8);
 644         if(leadByteCount<LEADBYTE_LIMIT) {
 645             /* adjust for the real number of lead bytes */
 646             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
 647         } else {
 648             /* limit the number of lead bytes */
 649             leadByteCount=LEADBYTE_LIMIT-1;
 650             tokenCount=LEADBYTE_LIMIT*256;
 651             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
 652             /* adjust again to skip double-byte tokens with ';' */
 653             wordCount-=(tokenCount-256+254)/255;
 654         }
 655
 656         /* set token 0 to word 0 */
 657         tokens[0]=0;
 658         if(beVerbose) {
 659             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
 660                     (long)words[0].weight,
 661                     words[0].length, words[0].s);
 662         }
 663         wordNumber=1;
 664
 665         /* set the lead byte tokens */
 666         for(i=1; (int16_t)i<=leadByteCount; ++i) {
 667             tokens[i]=-2;
 668         }
 669
 670         /* set the tokens */
 671         for(; i<256; ++i) {
 672             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
 673             if(tokens[i]!=-1) {
 674                 tokens[i]=wordNumber;
 675                 if(beVerbose) {
 676                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 677                             (int)i, (long)words[wordNumber].weight,
 678                             words[wordNumber].length, words[wordNumber].s);
 679                 }
 680                 ++wordNumber;
 681             }
 682         }
 683
 684         /* continue above 255 where there are no letters */
 685         for(; (uint32_t)wordNumber<wordCount; ++i) {
 686             if((i&0xff)==NAME_SEPARATOR_CHAR) {
 687                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
 688             } else {
 689                 tokens[i]=wordNumber;
 690                 if(beVerbose) {
 691                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 692                             (int)i, (long)words[wordNumber].weight,
 693                             words[wordNumber].length, words[wordNumber].s);
 694                 }
 695                 ++wordNumber;
 696             }
 697         }
 698         tokenCount=i; /* should be already tokenCount={i or i+1} */
 699     }
 700
 701     if(!beQuiet) {
 702         printf("number of lead bytes: %d\n", leadByteCount);
 703         printf("number of single-byte tokens: %lu\n",
 704             (unsigned long)256-letterCount-leadByteCount);
 705         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
 706     }
 707
 708     compressLines();
 709 }
 710
/*
 * Compress all stored lines into groups of LINES_PER_GROUP code points.
 *
 * Walks lines[0..lineCount) in order. For every line the compressed bytes are
 * appended to groupStore by compressLine() and the resulting length is
 * recorded with appendLineLength(); code points without a stored line get a
 * length of 0. A finished group is handed to addGroup().
 * lineTop and lineCount are reset first, so that afterwards lineCount is
 * printed as the number of groups.
 * Terminates the program when a group does not fit.
 */
static void
compressLines() {
    Line *line=NULL;
    /*
     * outLine is the code point of the line length written last;
     * it starts at -1 so that the first pre-increment below yields 0.
     * groupMSB==0xffff means that no group has been started yet.
     */
    uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
             groupMSB=0xffff, lineCount2;
    int16_t groupTop=0;

    /* store the groups like lines, reusing the lines' memory */
    lineTop=0;
    lineCount2=lineCount;
    lineCount=0;

    /* loop over all lines */
    while(i<lineCount2) {
        line=lines+i++;
        inLine=line->code;

        /* segment the lines to groups of 32 */
        if(inLine>>GROUP_SHIFT!=groupMSB) {
            /* finish the current group with empty lines */
            while((++outLine&GROUP_MASK)!=0) {
                appendLineLength(0);
            }

            /* store the group like a line */
            if(groupTop>0) {
                /*
                 * NOTE(review): this is a post-hoc check; it only detects an overflow
                 * after the group's bytes were stored into groupStore.
                 */
                if(groupTop>GROUP_STORE_SIZE) {
                    fprintf(stderr, "gennames: group store overflow\n");
                    exit(U_BUFFER_OVERFLOW_ERROR);
                }
                addGroup(groupMSB, groupStore, groupTop);
                /* the stored groups must not have grown past the string of the line that is read next */
                if(lineTop>(uint32_t)(line->s-stringStore)) {
                    fprintf(stderr, "gennames: group store runs into string store\n");
                    exit(U_INTERNAL_PROGRAM_ERROR);
                }
            }

            /* start the new group */
            lineLengthsTop=0;
            groupTop=0;
            groupMSB=inLine>>GROUP_SHIFT;
            /* one before the first code point of the new group, for the pre-increment below */
            outLine=(inLine&~GROUP_MASK)-1;
        }

        /* write empty lines between the previous line in the group and this one */
        while(++outLine<inLine) {
            appendLineLength(0);
        }

        /* write characters and tokens for this line */
        appendLineLength(compressLine(line->s, line->length, &groupTop));
    }

    /* finish and store the last group */
    if(line && groupMSB!=0xffff) {
        /* finish the current group with empty lines */
        while((++outLine&GROUP_MASK)!=0) {
            appendLineLength(0);
        }

        /* store the group like a line */
        if(groupTop>0) {
            /* NOTE(review): post-hoc check, see the same check in the loop above */
            if(groupTop>GROUP_STORE_SIZE) {
                fprintf(stderr, "gennames: group store overflow\n");
                exit(U_BUFFER_OVERFLOW_ERROR);
            }
            addGroup(groupMSB, groupStore, groupTop);
            /* line still points to the last line that was read */
            if(lineTop>(uint32_t)(line->s-stringStore)) {
                fprintf(stderr, "gennames: group store runs into string store\n");
                exit(U_INTERNAL_PROGRAM_ERROR);
            }
        }
    }

    if(!beQuiet) {
        printf("number of groups: %lu\n", (unsigned long)lineCount);
    }
}
 789
 790 static int16_t
 791 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
 792     int16_t start, limit, token, groupTop=*pGroupTop;
 793
 794     start=0;
 795     do {
 796         /* write any "noise" characters */
 797         limit=skipNoise((char *)s, start, length);
 798         while(start<limit) {
 799             groupStore[groupTop++]=s[start++];
 800         }
 801
 802         if(start==length) {
 803             break;
 804         }
 805
 806         /* write a word, as token or directly */
 807         limit=getWord((char *)s, start, length);
 808         if(limit-start==1) {
 809             groupStore[groupTop++]=s[start++];
 810         } else {
 811             token=findToken(s+start, (int16_t)(limit-start));
 812             if(token!=-1) {
 813                 if(token>0xff) {
 814                     groupStore[groupTop++]=(uint8_t)(token>>8);
 815                 }
 816                 groupStore[groupTop++]=(uint8_t)token;
 817                 start=limit;
 818             } else {
 819                 while(start<limit) {
 820                     groupStore[groupTop++]=s[start++];
 821                 }
 822             }
 823         }
 824     } while(start<length);
 825
 826     length=(int16_t)(groupTop-*pGroupTop);
 827     *pGroupTop=groupTop;
 828     return length;
 829 }
 830
 831 static int32_t
 832 compareWords(const void *context, const void *word1, const void *word2) {
 833     /* reverse sort by word weight */
 834     return ((Word *)word2)->weight-((Word *)word1)->weight;
 835 }
 836
 837 /* generate output data ----------------------------------------------------- */
 838
/*
 * Write the output data file into the directory dataDir.
 *
 * At this point stringStore[0..lineTop) holds the group strings and
 * lines[0..lineCount) describes the groups. The token strings are appended
 * behind the group strings with addToken(), and tokens[] is converted from
 * word indexes to offsets of the token strings.
 * The file is then written in the order: four offsets, token table,
 * token strings, group table, group strings, algorithmic names data.
 * Terminates the program if the file cannot be created or written, or
 * if the number of bytes written differs from the calculated size.
 */
static void
generateData(const char *dataDir) {
    UNewDataMemory *pData;
    UErrorCode errorCode=U_ZERO_ERROR;
    uint16_t groupWords[3];
    /* groupTop remembers where the group strings end, before token strings are added */
    uint32_t i, groupTop=lineTop, offset, size,
             tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
    long dataLength;
    int16_t token;

    pData=udata_create(dataDir, DATA_TYPE,DATA_NAME, &dataInfo,
                       haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
        exit(errorCode);
    }

    /* first, see how much space we need, and prepare the token strings */
    for(i=0; i<tokenCount; ++i) {
        token=tokens[i];
        /* -1 (no token) and -2 (lead byte) are written to the file as they are */
        if(token!=-1 && token!=-2) {
            /* replace the word index with the offset relative to the first token string */
            tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
        }
    }

    /*
     * Calculate the total size in bytes of the data including:
     * - the offset to the token strings, uint32_t (4)
     * - the offset to the group table, uint32_t (4)
     * - the offset to the group strings, uint32_t (4)
     * - the offset to the algorithmic names, uint32_t (4)
     *
     * - the number of tokens, uint16_t (2)
     * - the token table, uint16_t[tokenCount] (2*tokenCount)
     *
     * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
     *
     * - the number of groups, uint16_t (2)
     * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[groupCount]
     *   (6*groupCount bytes)
     *
     * - the group strings (groupTop), padded so that the algorithmic names start on a 4-byte boundary
     *
     * - the size of the data for the algorithmic names
     */
    tokenStringOffset=4+4+4+4+2+2*tokenCount;
    groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
    groupStringOffset=groupsOffset+2+6*lineCount;
    algNamesOffset=(groupStringOffset+groupTop+3)&~3;

    /* called with NULL first; the return value is used as the size of the algorithmic names data */
    offset=generateAlgorithmicData(NULL);
    size=algNamesOffset+offset;

    if(!beQuiet) {
        printf("size of the Unicode Names data:\n"
               "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
                (unsigned long)size, (unsigned long)(lineTop-groupTop),
                (unsigned long)groupTop, (unsigned long)offset);
    }

    /* write the data to the file */
    /* offsets */
    udata_write32(pData, tokenStringOffset);
    udata_write32(pData, groupsOffset);
    udata_write32(pData, groupStringOffset);
    udata_write32(pData, algNamesOffset);

    /* token table */
    udata_write16(pData, (uint16_t)tokenCount);
    udata_writeBlock(pData, tokens, 2*tokenCount);

    /* token strings */
    udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
    if((lineTop-groupTop)&1) {
        /* 2-padding */
        udata_writePadding(pData, 1);
    }

    /* group table */
    udata_write16(pData, (uint16_t)lineCount);
    for(i=0; i<lineCount; ++i) {
        /* groupMSB */
        groupWords[0]=(uint16_t)lines[i].code;

        /* offset of the group's data from the start of the group strings, split into two 16-bit halves */
        offset = (uint32_t)(lines[i].s - stringStore);
        groupWords[1]=(uint16_t)(offset>>16);
        groupWords[2]=(uint16_t)(offset);
        udata_writeBlock(pData, groupWords, 6);
    }

    /* group strings */
    udata_writeBlock(pData, stringStore, groupTop);

    /* 4-align the algorithmic names data */
    udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));

    generateAlgorithmicData(pData);

    /* finish up */
    dataLength=udata_finish(pData, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
        exit(errorCode);
    }

    /* cross-check the size calculation above against what was really written */
    if(dataLength!=(long)size) {
        fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
dataLength, (unsigned long)size);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}
 950
 951 /* the structure for algorithmic names needs to be 4-aligned */
 952 typedef struct AlgorithmicRange {
 953     uint32_t rangeStart, rangeEnd;
 954     uint8_t algorithmType, algorithmVariant;
 955     uint16_t rangeSize;
 956 } AlgorithmicRange;
 957
 958 static uint32_t
 959 generateAlgorithmicData(UNewDataMemory *pData) {
 960     static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
 961 #   define PREFIX_LENGTH 23
 962 #   define PREFIX_LENGTH_4 24
 963     uint32_t countAlgRanges;
 964
 965     static AlgorithmicRange cjkExtA={
 966         0x3400, 0x4db5,
 967         0, 4,
 968         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
 969     };
 970     static AlgorithmicRange cjk={
 971         0x4e00, 0x9fa5,
 972         0, 4,
 973         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
 974     };
 975     static AlgorithmicRange cjkExtB={
 976         0x20000, 0x2a6d6,
 977         0, 5,
 978         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
 979     };
 980
 981     static char jamo[]=
 982         "HANGUL SYLLABLE \0"
 983
 984         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
 985         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
 986
 987         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
 988         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
 989         "YU\0EU\0YI\0I\0"
 990
 991         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
 992         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
 993         "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
 994     ;
 995
 996     static AlgorithmicRange hangul={
 997         0xac00, 0xd7a3,
 998         1, 3,
 999         sizeof(AlgorithmicRange)+6+sizeof(jamo)
1000     };
1001
1002     /* modulo factors, maximum 8 */
1003     /* 3 factors: 19, 21, 28, most-to-least-significant */
1004     static uint16_t hangulFactors[3]={
1005         19, 21, 28
1006     };
1007
1008     uint32_t size;
1009
1010     size=0;
1011
1012     /* number of ranges of algorithmic names */
1013     if(uprv_memcmp(dataInfo.dataVersion, unicode_3_1, sizeof(UVersionInfo))>=0) {
1014         /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1015         countAlgRanges=4;
1016     } else if(uprv_memcmp(dataInfo.dataVersion, unicode_3_0, sizeof(UVersionInfo))>=0) {
1017         /* Unicode 3.0 has 3 ranges including CJK Extension A */
1018         countAlgRanges=3;
1019     } else {
1020         /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1021         countAlgRanges=2;
1022     }
1023
1024     if(pData!=NULL) {
1025         udata_write32(pData, countAlgRanges);
1026     } else {
1027         size+=4;
1028     }
1029
1030     /*
1031      * each range:
1032      * uint32_t rangeStart
1033      * uint32_t rangeEnd
1034      * uint8_t algorithmType
1035      * uint8_t algorithmVariant
1036      * uint16_t size of range data
1037      * uint8_t[size] data
1038      */
1039
1040     /* range 0: cjk extension a */
1041     if(countAlgRanges>=3) {
1042         if(pData!=NULL) {
1043             udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1044             udata_writeString(pData, prefix, PREFIX_LENGTH);
1045             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1046                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1047             }
1048         } else {
1049             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1050         }
1051     }
1052
1053     /* range 1: cjk */
1054     if(pData!=NULL) {
1055         udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1056         udata_writeString(pData, prefix, PREFIX_LENGTH);
1057         if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1058             udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1059         }
1060     } else {
1061         size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1062     }
1063
1064     /* range 2: hangul syllables */
1065     if(pData!=NULL) {
1066         udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1067         udata_writeBlock(pData, hangulFactors, 6);
1068         udata_writeString(pData, jamo, sizeof(jamo));
1069     } else {
1070         size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1071     }
1072
1073     /* range 3: cjk extension b */
1074     if(countAlgRanges>=4) {
1075         if(pData!=NULL) {
1076             udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1077             udata_writeString(pData, prefix, PREFIX_LENGTH);
1078             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1079                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1080             }
1081         } else {
1082             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1083         }
1084     }
1085
1086     return size;
1087 }
1088
1089 /* helpers ------------------------------------------------------------------ */
1090
1091 static int16_t
1092 findToken(uint8_t *s, int16_t length) {
1093     int16_t i, token;
1094
1095     for(i=0; i<(int16_t)tokenCount; ++i) {
1096         token=tokens[i];
1097         if(token!=-1 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1098             return i;
1099         }
1100     }
1101
1102     return -1;
1103 }
1104
1105 static Word *
1106 findWord(char *s, int16_t length) {
1107     uint32_t i;
1108
1109     for(i=0; i<wordCount; ++i) {
1110         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1111             return words+i;
1112         }
1113     }
1114
1115     return NULL;
1116 }
1117
1118 static Word *
1119 addWord(char *s, int16_t length) {
1120     uint8_t *stringStart;
1121     Word *word;
1122
1123     if(wordCount==MAX_WORD_COUNT) {
1124         fprintf(stderr, "gennames: too many words\n");
1125         exit(U_BUFFER_OVERFLOW_ERROR);
1126     }
1127
1128     stringStart=allocWord(length);
1129     uprv_memcpy(stringStart, s, length);
1130
1131     word=words+wordCount;
1132
1133     /*
1134      * Initialize the weight with the costs for this token:
1135      * a zero-terminated string and a 16-bit offset.
1136      */
1137     word->weight=-(length+1+2);
1138     word->count=0;
1139     word->length=length;
1140     word->s=stringStart;
1141
1142     ++wordCount;
1143
1144     return word;
1145 }
1146
1147 static void
1148 countWord(Word *word) {
1149     /* add to the weight the savings: the length of the word minus 1 byte for the token */
1150     word->weight+=word->length-1;
1151     ++word->count;
1152 }
1153
1154 static void
1155 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1156     uint8_t *stringStart;
1157     Line *line;
1158     int16_t i, length;
1159
1160     if(lineCount==MAX_LINE_COUNT) {
1161         fprintf(stderr, "gennames: too many lines\n");
1162         exit(U_BUFFER_OVERFLOW_ERROR);
1163     }
1164
1165     /* find the last non-empty name */
1166     while(count>0 && lengths[count-1]==0) {
1167         --count;
1168     }
1169     if(count==0) {
1170         return; /* should not occur: caller should not have called */
1171     }
1172
1173     /* there will be (count-1) separator characters */
1174     i=count;
1175     length=count-1;
1176
1177     /* add lengths of strings */
1178     while(i>0) {
1179         length+=lengths[--i];
1180     }
1181
1182     /* allocate line memory */
1183     stringStart=allocLine(length);
1184
1185     /* copy all strings into the line memory */
1186     length=0; /* number of chars copied so far */
1187     for(i=0; i<count; ++i) {
1188         if(i>0) {
1189             stringStart[length++]=NAME_SEPARATOR_CHAR;
1190         }
1191         if(lengths[i]>0) {
1192             uprv_memcpy(stringStart+length, names[i], lengths[i]);
1193             length+=lengths[i];
1194         }
1195     }
1196
1197     line=lines+lineCount;
1198
1199     line->code=code;
1200     line->length=length;
1201     line->s=stringStart;
1202
1203     ++lineCount;
1204
1205     /* prevent a character value that is actually in a name from becoming a token */
1206     while(length>0) {
1207         tokens[stringStart[--length]]=-1;
1208     }
1209 }
1210
1211 static void
1212 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1213     uint8_t *stringStart;
1214     Line *line;
1215
1216     if(lineCount==MAX_LINE_COUNT) {
1217         fprintf(stderr, "gennames: too many groups\n");
1218         exit(U_BUFFER_OVERFLOW_ERROR);
1219     }
1220
1221     /* store the line lengths first, then the strings */
1222     lineLengthsTop=(lineLengthsTop+1)/2;
1223     stringStart=allocLine(lineLengthsTop+length);
1224     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1225     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1226
1227     line=lines+lineCount;
1228
1229     line->code=groupMSB;
1230     line->length=length;
1231     line->s=stringStart;
1232
1233     ++lineCount;
1234 }
1235
1236 static uint32_t
1237 addToken(uint8_t *s, int16_t length) {
1238     uint8_t *stringStart;
1239
1240     stringStart=allocLine(length+1);
1241     uprv_memcpy(stringStart, s, length);
1242     stringStart[length]=0;
1243
1244     return (uint32_t)(stringStart - stringStore);
1245 }
1246
1247 static void
1248 appendLineLength(int16_t length) {
1249     if(length>=76) {
1250         fprintf(stderr, "gennames: compressed line too long\n");
1251         exit(U_BUFFER_OVERFLOW_ERROR);
1252     }
1253     if(length>=12) {
1254         length-=12;
1255         appendLineLengthNibble((uint8_t)((length>>4)|12));
1256     }
1257     appendLineLengthNibble((uint8_t)length);
1258 }
1259
1260 static void
1261 appendLineLengthNibble(uint8_t nibble) {
1262     if((lineLengthsTop&1)==0) {
1263         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1264     } else {
1265         lineLengths[lineLengthsTop/2]|=nibble&0xf;
1266     }
1267     ++lineLengthsTop;
1268 }
1269
1270 static uint8_t *
1271 allocLine(int32_t length) {
1272     uint32_t top=lineTop+length;
1273     uint8_t *p;
1274
1275     if(top>wordBottom) {
1276         fprintf(stderr, "gennames: out of memory\n");
1277         exit(U_MEMORY_ALLOCATION_ERROR);
1278     }
1279     p=stringStore+lineTop;
1280     lineTop=top;
1281     return p;
1282 }
1283
1284 static uint8_t *
1285 allocWord(uint32_t length) {
1286     uint32_t bottom=wordBottom-length;
1287
1288     if(lineTop>bottom) {
1289         fprintf(stderr, "gennames: out of memory\n");
1290         exit(U_MEMORY_ALLOCATION_ERROR);
1291     }
1292     wordBottom=bottom;
1293     return stringStore+bottom;
1294 }
1295
1296 /*
1297  * Hey, Emacs, please set the following:
1298  *
1299  * Local Variables:
1300  * indent-tabs-mode: nil
1301  * End:
1302  *
1303  */