icuSources/tools/gennames/gennames.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2001, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gennames.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 1999sep30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads the Unicode character database text file,
  17 *   parses it, and extracts the character code,
  18 *   the "modern" character name, and optionally the
  19 *   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
  20 *   It then tokenizes and compresses the names and builds
  21 *   compact binary tables for random-access lookup
  22 *   in a u_charName() API function.
  23 *
  24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
  25 * (all data is static const)
  26 *
  27 * UDataInfo fields:
  28 *   dataFormat "unam"
  29 *   formatVersion 1.0
   30 *   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.2
  31 *
  32 * -- data-based names
  33 * uint32_t tokenStringOffset,
  34 *          groupsOffset,
  35 *          groupStringOffset,
  36 *          algNamesOffset;
  37 *
  38 * uint16_t tokenCount;
  39 * uint16_t tokenTable[tokenCount];
  40 *
  41 * char     tokenStrings[]; -- padded to even count
  42 *
  43 * -- strings (groupStrings) are tokenized as follows:
  44 *   for each character c
  45 *       if(c>=tokenCount) write that character c directly
  46 *   else
  47 *       token=tokenTable[c];
  48 *       if(token==0xfffe) -- lead byte of double-byte token
  49 *           token=tokenTable[c<<8|next character];
  50 *       if(token==-1)
  51 *           write c directly
  52 *       else
  53 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
  54 *           append zero-terminated tokenString;
  55 *
  56 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
  57 *    are separated by ';'.
  58 *
  59 * uint16_t groupCount;
  60 * struct {
  61 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
  62 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
  63 *   uint16_t offsetLow;
  64 * } groupTable[groupCount];
  65 *
  66 * char     groupStrings[]; -- padded to 4-count
  67 *
  68 * -- The actual, tokenized group strings are not zero-terminated because
  69 *   that would take up too much space.
   70 *   Instead, they are preceded by their length, written in a variable-length sequence:
  71 *   For each of the 32 group strings, one or two nibbles are stored for its length.
  72 *   Nibbles (4-bit values, half-bytes) are read MSB first.
  73 *   A nibble with a value of 0..11 directly indicates the length of the name string.
  74 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
  75 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
  76 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
  77 *   For the de-tokenizing, see token description above; the strings immediately follow the
  78 *   32 lengths.
  79 *
  80 * -- algorithmic names
  81 *
  82 * typedef struct AlgorithmicRange {
  83 *     uint32_t rangeStart, rangeEnd;
  84 *     uint8_t algorithmType, algorithmVariant;
  85 *     uint16_t rangeSize;
  86 * } AlgorithmicRange;
  87 *
  88 * uint32_t algRangesCount; -- number of data blocks for ranges of
  89 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
  90 *
  91 * struct {
  92 *     AlgorithmicRange algRange;
  93 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
   94 * } algRanges[algRangesCount];
  95 * -- not a real array because each part has a different size
  96 *    of algRange.rangeSize (including AlgorithmicRange)
  97 *
  98 * -- algorithmic range types:
  99 *
 100 * 0 Names are formed from a string prefix that is stored in
 101 *   the algRangeData (zero-terminated), followed by the Unicode code point
 102 *   of the character in hexadecimal digits;
 103 *   algRange.algorithmVariant digits are written
 104 *
 105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
 106 *   algRange.algorithmVariant is the count of modulo factors
 107 *   algRangeData contains
 108 *       uint16_t factors[algRange.algorithmVariant];
 109 *       char strings[];
 110 *   the first zero-terminated string is written as the prefix; then:
 111 *
 112 *   The rangeStart is subtracted; with the difference, here "code":
 113 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
 114 *       index[i]=code%factor[i];
 115 *       code/=factor[i];
 116 *
 117 *   The strings after the prefix are short pieces that are then appended to the result
 118 *   according to index[0..algRange.algorithmVariant-1].
 119 */
 120
 121 #include <stdio.h>
 122 #include <stdlib.h>
 123 #include "unicode/utypes.h"
 124 #include "unicode/putil.h"
 125 #include "cmemory.h"
 126 #include "cstring.h"
 127 #include "unicode/udata.h"
 128 #include "unewdata.h"
 129 #include "uoptions.h"
 130 #include "uparse.h"
 131
 132 #define STRING_STORE_SIZE 1000000
 133 #define GROUP_STORE_SIZE 5000
 134
 135 #define GROUP_SHIFT 5
 136 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
 137 #define GROUP_MASK (LINES_PER_GROUP-1)
 138
 139 #define MAX_LINE_COUNT 50000
 140 #define MAX_WORD_COUNT 20000
 141 #define MAX_GROUP_COUNT 5000
 142
 143 #define DATA_NAME "unames"
 144 #define DATA_TYPE "icu"
 145 #define VERSION_STRING "unam"
 146 #define NAME_SEPARATOR_CHAR ';'
 147
 148 static const UVersionInfo
 149 unicode_3_0={ 3, 0, 0, 0 },
 150 unicode_3_1={ 3, 1, 0, 0 };
 151
 152 /* UDataInfo cf. udata.h */
 153 static UDataInfo dataInfo={
 154     sizeof(UDataInfo),
 155     0,
 156
 157     U_IS_BIG_ENDIAN,
 158     U_CHARSET_FAMILY,
 159     sizeof(UChar),
 160     0,
 161
 162     {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
 163     {1, 0, 0, 0},                 /* formatVersion */
 164     {3, 0, 0, 0}                  /* dataVersion */
 165 };
 166
 167 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
 168
 169 static uint8_t stringStore[STRING_STORE_SIZE],
 170                groupStore[GROUP_STORE_SIZE],
 171                lineLengths[LINES_PER_GROUP];
 172
 173 static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
 174
      /*
       * One entry of the lines[] array.
       * compressLines() reads code, length and s for each entry;
       * generateData() later writes code as a group's groupMSB and
       * (s - stringStore) as the group's string offset.
       */
  175 typedef struct {
  176     uint32_t code;      /* code point, or groupMSB once the entry holds a group */
  177     int16_t length;     /* number of bytes at s */
  178     uint8_t *s;         /* start of the bytes; generateData() treats it as pointing into stringStore */
  179 } Line;
 180
      /*
       * One entry of the words[] dictionary.
       * compress() sorts the entries by descending weight and assigns
       * tokens to the ones whose weight is at least 1.
       */
  181 typedef struct {
  182     int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
  183     int16_t count;  /* compress() subtracts this from weight for words that need a double-byte token */
  184     int16_t length; /* number of bytes at s (printed with "%.*s", so presumably not zero-terminated) */
  185     uint8_t *s;     /* the word's text */
  186 } Word;
 187
 188 static Line lines[MAX_LINE_COUNT];
 189 static Word words[MAX_WORD_COUNT];
 190
 191 static uint32_t lineCount=0, wordCount=0;
 192
 193 static int16_t leadByteCount;
 194
 195 #define LEADBYTE_LIMIT 16
 196
 197 static int16_t tokens[LEADBYTE_LIMIT*256];
 198 static uint32_t tokenCount;
 199
 200 /* prototypes --------------------------------------------------------------- */
 201
 202 static void
 203 init(void);
 204
 205 static void
 206 parseDB(const char *filename, UBool store10Names);
 207
 208 static void
 209 parseName(char *name, int16_t length);
 210
 211 static int16_t
 212 skipNoise(char *line, int16_t start, int16_t limit);
 213
 214 static int16_t
 215 getWord(char *line, int16_t start, int16_t limit);
 216
 217 static void
 218 compress(void);
 219
 220 static void
 221 compressLines(void);
 222
 223 static int16_t
 224 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
 225
 226 static int
 227 compareWords(const void *word1, const void *word2);
 228
 229 static void
 230 generateData(const char *dataDir);
 231
 232 static uint32_t
 233 generateAlgorithmicData(UNewDataMemory *pData);
 234
 235 static int16_t
 236 findToken(uint8_t *s, int16_t length);
 237
 238 static Word *
 239 findWord(char *s, int16_t length);
 240
 241 static Word *
 242 addWord(char *s, int16_t length);
 243
 244 static void
 245 countWord(Word *word);
 246
 247 static void
 248 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
 249
 250 static void
 251 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
 252
 253 static uint32_t
 254 addToken(uint8_t *s, int16_t length);
 255
 256 static void
 257 appendLineLength(int16_t length);
 258
 259 static void
 260 appendLineLengthNibble(uint8_t nibble);
 261
 262 static uint8_t *
 263 allocLine(int32_t length);
 264
 265 static uint8_t *
 266 allocWord(uint32_t length);
 267
 268 /* -------------------------------------------------------------------------- */
 269
      /*
       * Command line options.
       * main() addresses the entries by index, so the order here matters:
       *   0, 1  help (-h, -?)
       *   2     verbose
       *   3     quiet
       *   4     copyright
       *   5     destdir  (value preset to u_getDataDirectory())
       *   6     unicode  (value preset to "3.2")
       *   7     unicode1-names
       */
  270 static UOption options[]={
  271     UOPTION_HELP_H,
  272     UOPTION_HELP_QUESTION_MARK,
  273     UOPTION_VERBOSE,
  274     UOPTION_QUIET,
  275     UOPTION_COPYRIGHT,
  276     UOPTION_DESTDIR,
  277     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
  278     { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }
  279 };
 280
 281 extern int
 282 main(int argc, char* argv[]) {
 283     UVersionInfo version;
 284     UBool store10Names=FALSE;
 285
 286     U_MAIN_INIT_ARGS(argc, argv);
 287
 288     /* preset then read command line options */
 289     options[5].value=u_getDataDirectory();
 290     options[6].value="3.2";
 291     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
 292
 293     /* error handling, printing usage message */
 294     if(argc<0) {
 295         fprintf(stderr,
 296             "error in command line argument \"%s\"\n",
 297             argv[-argc]);
 298     } else if(argc<2) {
 299         argc=-1;
 300     }
 301     if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
 302         /*
 303          * Broken into chucks because the C89 standard says the minimum
 304          * required supported string length is 509 bytes.
 305          */
 306         fprintf(stderr,
 307             "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
 308             "\n"
 309             "Read the UnicodeData.txt file and \n"
 310             "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the character names\n"
 311             "\n"
 312             "\tfilename  absolute path/filename for the Unicode database text file\n"
 313             "\t\t(default: standard input)\n"
 314             "\n",
 315             argv[0]);
 316         fprintf(stderr,
 317             "Options:\n"
 318             "\t-h or -? or --help  this usage text\n"
 319             "\t-v or --verbose     verbose output\n"
 320             "\t-q or --quiet       no output\n"
 321             "\t-c or --copyright   include a copyright notice\n"
 322             "\t-d or --destdir     destination directory, followed by the path\n"
 323             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
 324             "\t-1 or --unicode1-names  store Unicode 1.0 character names\n");
 325         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 326     }
 327
 328     /* get the options values */
 329     beVerbose=options[2].doesOccur;
 330     beQuiet=options[3].doesOccur;
 331     haveCopyright=options[4].doesOccur;
 332     store10Names=options[7].doesOccur;
 333
 334     /* set the Unicode version */
 335     u_versionFromString(version, options[6].value);
 336     uprv_memcpy(dataInfo.dataVersion, version, 4);
 337
 338     init();
 339     parseDB(argc>=2 ? argv[1] : "-", store10Names);
 340     compress();
 341     generateData(options[5].value);
 342
 343     return 0;
 344 }
 345
 346 static void
 347 init() {
 348     int i;
 349
 350     for(i=0; i<256; ++i) {
 351         tokens[i]=0;
 352     }
 353 }
 354
 355 /* parsing ------------------------------------------------------------------ */
 356
 357 static void U_CALLCONV
 358 lineFn(void *context,
 359        char *fields[][2], int32_t fieldCount,
 360        UErrorCode *pErrorCode) {
 361     char *names[3];
 362     int16_t lengths[3];
 363     static uint32_t prevCode=0;
 364     uint32_t code=0;
 365
 366     if(U_FAILURE(*pErrorCode)) {
 367         return;
 368     }
 369     /* get the character code */
 370     code=uprv_strtoul(fields[0][0], NULL, 16);
 371
 372     /* get the character name */
 373     names[0]=fields[1][0];
 374     if(fields[1][0][0]!='<') {
 375         lengths[0]=(int16_t)(fields[1][1]-names[0]);
 376     } else {
 377         /* do not store pseudo-names in <> brackets */
 378         lengths[0]=0;
 379     }
 380
 381     /* store 1.0 names */
 382     /* get the second character name, the one from Unicode 1.0 */
 383     /* do not store pseudo-names in <> brackets */
 384     names[1]=fields[10][0];
 385     if(*(UBool *)context && fields[10][0][0]!='<') {
 386         lengths[1]=(int16_t)(fields[10][1]-names[1]);
 387     } else {
 388         lengths[1]=0;
 389     }
 390
 391     /* get the ISO 10646 comment */
 392     names[2]=fields[11][0];
 393     lengths[2]=(int16_t)(fields[11][1]-names[2]);
 394
 395     if(lengths[0]+lengths[1]+lengths[2]==0) {
 396         return;
 397     }
 398
 399     /* check for non-character code points */
 400     if(!UTF_IS_UNICODE_CHAR(code)) {
 401         fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
 402                 (unsigned long)code);
 403         *pErrorCode=U_PARSE_ERROR;
 404         exit(U_PARSE_ERROR);
 405     }
 406
 407     /* check that the code points (code) are in ascending order */
 408     if(code<=prevCode && code>0) {
 409         fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 410                 (unsigned long)code, (unsigned long)prevCode);
 411         *pErrorCode=U_PARSE_ERROR;
 412         exit(U_PARSE_ERROR);
 413     }
 414     prevCode=code;
 415
 416     parseName(names[0], lengths[0]);
 417     parseName(names[1], lengths[1]);
 418     parseName(names[2], lengths[2]);
 419
 420     /*
 421      * set the count argument to
 422      * 1: only store regular names
 423      * 2: store regular and 1.0 names
 424      * 3: store names and ISO 10646 comment
 425      */
 426     addLine(code, names, lengths, 3);
 427 }
 428
 429 static void
 430 parseDB(const char *filename, UBool store10Names) {
 431     char *fields[15][2];
 432     UErrorCode errorCode=U_ZERO_ERROR;
 433
 434     u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
 435     if(U_FAILURE(errorCode)) {
 436         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
 437         exit(errorCode);
 438     }
 439
 440     if(!beQuiet) {
 441         printf("size of all names in the database: %lu\n",
 442             (unsigned long)lineTop);
 443         printf("number of named Unicode characters: %lu\n",
 444             (unsigned long)lineCount);
 445         printf("number of words in the dictionary from these names: %lu\n",
 446             (unsigned long)wordCount);
 447     }
 448 }
 449
 450 static void
 451 parseName(char *name, int16_t length) {
 452     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
 453     Word *word;
 454
 455     while(start<length) {
 456         /* skip any "noise" characters */
 457         limit=skipNoise(name, start, length);
 458         if(start<limit) {
 459             /*prevStart=-1;*/
 460             start=limit;
 461         }
 462         if(start==length) {
 463             break;
 464         }
 465
 466         /* get a word and add it if it is longer than 1 */
 467         limit=getWord(name, start, length);
 468         wordLength=(int16_t)(limit-start);
 469         if(wordLength>1) {
 470             word=findWord(name+start, wordLength);
 471             if(word==NULL) {
 472                 word=addWord(name+start, wordLength);
 473             }
 474             countWord(word);
 475         }
 476
 477 #if 0
 478         /*
 479          * if there was a word before this
 480          * (with no noise in between), then add the pair of words, too
 481          */
 482         if(prevStart!=-1) {
 483             wordLength=limit-prevStart;
 484             word=findWord(name+prevStart, wordLength);
 485             if(word==NULL) {
 486                 word=addWord(name+prevStart, wordLength);
 487             }
 488             countWord(word);
 489         }
 490 #endif
 491
 492         /*prevStart=start;*/
 493         start=limit;
 494     }
 495 }
 496
 497 static UBool U_INLINE
 498 isWordChar(char c) {
 499     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
 500            ('J'<=c && c<='R') ||
 501            ('S'<=c && c<='Z') ||
 502
 503            ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
 504            ('j'<=c && c<='r') ||
 505            ('s'<=c && c<='z') ||
 506
 507            ('0'<=c && c<='9');
 508 }
 509
 510 static int16_t
 511 skipNoise(char *line, int16_t start, int16_t limit) {
 512     /* skip anything that is not part of a word in this sense */
 513     while(start<limit && !isWordChar(line[start])) {
 514         ++start;
 515     }
 516
 517     return start;
 518 }
 519
 520 static int16_t
 521 getWord(char *line, int16_t start, int16_t limit) {
 522     char c=0; /* initialize to avoid a compiler warning although the code was safe */
 523
 524     /* a unicode character name word consists of A-Z0-9 */
 525     while(start<limit && isWordChar(line[start])) {
 526         ++start;
 527     }
 528
 529     /* include a following space or dash */
 530     if(start<limit && ((c=line[start])==' ' || c=='-')) {
 531         ++start;
 532     }
 533
 534     return start;
 535 }
 536
 537 /* compressing -------------------------------------------------------------- */
 538
 539 static void
 540 compress() {
 541     uint32_t i, letterCount;
 542     int16_t wordNumber;
 543
 544     /* sort the words in reverse order by weight */
 545     qsort(words, wordCount, sizeof(Word), compareWords);
 546
 547     /* remove the words that do not save anything */
 548     while(wordCount>0 && words[wordCount-1].weight<1) {
 549         --wordCount;
 550     }
 551
 552     /* count the letters in the token range */
 553     letterCount=0;
 554     for(i=LEADBYTE_LIMIT; i<256; ++i) {
 555         if(tokens[i]==-1) {
 556             ++letterCount;
 557         }
 558     }
 559     if(!beQuiet) {
 560         printf("number of letters used in the names: %d\n", letterCount);
 561     }
 562
 563     /* do we need double-byte tokens? */
 564     if(wordCount+letterCount<=256) {
 565         /* no, single-byte tokens are enough */
 566         leadByteCount=0;
 567         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
 568             if(tokens[i]!=-1) {
 569                 tokens[i]=wordNumber;
 570                 if(beVerbose) {
 571                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 572                             i, (long)words[wordNumber].weight,
 573                             words[wordNumber].length, words[wordNumber].s);
 574                 }
 575                 ++wordNumber;
 576             }
 577         }
 578         tokenCount=i;
 579     } else {
 580         /*
 581          * The tokens that need two token bytes
 582          * get their weight reduced by their count
 583          * because they save less.
 584          */
 585         tokenCount=256-letterCount;
 586         for(i=tokenCount; i<wordCount; ++i) {
 587             words[i].weight-=words[i].count;
 588         }
 589
 590         /* sort these words in reverse order by weight */
 591         qsort(words+tokenCount, wordCount-tokenCount, sizeof(Word), compareWords);
 592
 593         /* remove the words that do not save anything */
 594         while(wordCount>0 && words[wordCount-1].weight<1) {
 595             --wordCount;
 596         }
 597
 598         /* how many tokens and lead bytes do we have now? */
 599         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
 600         /*
 601          * adjust upwards to take into account that
 602          * double-byte tokens must not
 603          * use NAME_SEPARATOR_CHAR as a second byte
 604          */
 605         tokenCount+=(tokenCount-256+254)/255;
 606
 607         leadByteCount=(int16_t)(tokenCount>>8);
 608         if(leadByteCount<LEADBYTE_LIMIT) {
 609             /* adjust for the real number of lead bytes */
 610             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
 611         } else {
 612             /* limit the number of lead bytes */
 613             leadByteCount=LEADBYTE_LIMIT-1;
 614             tokenCount=LEADBYTE_LIMIT*256;
 615             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
 616             /* adjust again to skip double-byte tokens with ';' */
 617             wordCount-=(tokenCount-256+254)/255;
 618         }
 619
 620         /* set token 0 to word 0 */
 621         tokens[0]=0;
 622         if(beVerbose) {
 623             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
 624                     (long)words[0].weight,
 625                     words[0].length, words[0].s);
 626         }
 627         wordNumber=1;
 628
 629         /* set the lead byte tokens */
 630         for(i=1; (int16_t)i<=leadByteCount; ++i) {
 631             tokens[i]=-2;
 632         }
 633
 634         /* set the tokens */
 635         for(; i<256; ++i) {
 636             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
 637             if(tokens[i]!=-1) {
 638                 tokens[i]=wordNumber;
 639                 if(beVerbose) {
 640                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 641                             i, (long)words[wordNumber].weight,
 642                             words[wordNumber].length, words[wordNumber].s);
 643                 }
 644                 ++wordNumber;
 645             }
 646         }
 647
 648         /* continue above 255 where there are no letters */
 649         for(; (uint32_t)wordNumber<wordCount; ++i) {
 650             if((i&0xff)==NAME_SEPARATOR_CHAR) {
 651                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
 652             } else {
 653                 tokens[i]=wordNumber;
 654                 if(beVerbose) {
 655                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 656                             i, (long)words[wordNumber].weight,
 657                             words[wordNumber].length, words[wordNumber].s);
 658                 }
 659                 ++wordNumber;
 660             }
 661         }
 662         tokenCount=i; /* should be already tokenCount={i or i+1} */
 663     }
 664
 665     if(!beQuiet) {
 666         printf("number of lead bytes: %d\n", leadByteCount);
 667         printf("number of single-byte tokens: %lu\n",
 668             (unsigned long)256-letterCount-leadByteCount);
 669         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
 670     }
 671
 672     compressLines();
 673 }
 674
 675 static void
 676 compressLines() {
 677     Line *line=NULL;
 678     uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
 679              groupMSB=0xffff, lineCount2;
 680     int16_t groupTop=0;
 681
 682     /* store the groups like lines, reusing the lines' memory */
 683     lineTop=0;
 684     lineCount2=lineCount;
 685     lineCount=0;
 686
 687     /* loop over all lines */
 688     while(i<lineCount2) {
 689         line=lines+i++;
 690         inLine=line->code;
 691
 692         /* segment the lines to groups of 32 */
 693         if(inLine>>GROUP_SHIFT!=groupMSB) {
 694             /* finish the current group with empty lines */
 695             while((++outLine&GROUP_MASK)!=0) {
 696                 appendLineLength(0);
 697             }
 698
 699             /* store the group like a line */
 700             if(groupTop>0) {
 701                 if(groupTop>GROUP_STORE_SIZE) {
 702                     fprintf(stderr, "gennames: group store overflow\n");
 703                     exit(U_BUFFER_OVERFLOW_ERROR);
 704                 }
 705                 addGroup(groupMSB, groupStore, groupTop);
 706                 if(lineTop>(uint32_t)(line->s-stringStore)) {
 707                     fprintf(stderr, "gennames: group store runs into string store\n");
 708                     exit(U_INTERNAL_PROGRAM_ERROR);
 709                 }
 710             }
 711
 712             /* start the new group */
 713             lineLengthsTop=0;
 714             groupTop=0;
 715             groupMSB=inLine>>GROUP_SHIFT;
 716             outLine=(inLine&~GROUP_MASK)-1;
 717         }
 718
 719         /* write empty lines between the previous line in the group and this one */
 720         while(++outLine<inLine) {
 721             appendLineLength(0);
 722         }
 723
 724         /* write characters and tokens for this line */
 725         appendLineLength(compressLine(line->s, line->length, &groupTop));
 726     }
 727
 728     /* finish and store the last group */
 729     if(line && groupMSB!=0xffff) {
 730         /* finish the current group with empty lines */
 731         while((++outLine&GROUP_MASK)!=0) {
 732             appendLineLength(0);
 733         }
 734
 735         /* store the group like a line */
 736         if(groupTop>0) {
 737             if(groupTop>GROUP_STORE_SIZE) {
 738                 fprintf(stderr, "gennames: group store overflow\n");
 739                 exit(U_BUFFER_OVERFLOW_ERROR);
 740             }
 741             addGroup(groupMSB, groupStore, groupTop);
 742             if(lineTop>(uint32_t)(line->s-stringStore)) {
 743                 fprintf(stderr, "gennames: group store runs into string store\n");
 744                 exit(U_INTERNAL_PROGRAM_ERROR);
 745             }
 746         }
 747     }
 748
 749     if(!beQuiet) {
 750         printf("number of groups: %lu\n", (unsigned long)lineCount);
 751     }
 752 }
 753
 754 static int16_t
 755 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
 756     int16_t start, limit, token, groupTop=*pGroupTop;
 757
 758     start=0;
 759     do {
 760         /* write any "noise" characters */
 761         limit=skipNoise((char *)s, start, length);
 762         while(start<limit) {
 763             groupStore[groupTop++]=s[start++];
 764         }
 765
 766         if(start==length) {
 767             break;
 768         }
 769
 770         /* write a word, as token or directly */
 771         limit=getWord((char *)s, start, length);
 772         if(limit-start==1) {
 773             groupStore[groupTop++]=s[start++];
 774         } else {
 775             token=findToken(s+start, (int16_t)(limit-start));
 776             if(token!=-1) {
 777                 if(token>0xff) {
 778                     groupStore[groupTop++]=(uint8_t)(token>>8);
 779                 }
 780                 groupStore[groupTop++]=(uint8_t)token;
 781                 start=limit;
 782             } else {
 783                 while(start<limit) {
 784                     groupStore[groupTop++]=s[start++];
 785                 }
 786             }
 787         }
 788     } while(start<length);
 789
 790     length=(int16_t)(groupTop-*pGroupTop);
 791     *pGroupTop=groupTop;
 792     return length;
 793 }
 794
 795 static int
 796 compareWords(const void *word1, const void *word2) {
 797     /* reverse sort by word weight */
 798     return ((Word *)word2)->weight-((Word *)word1)->weight;
 799 }
 800
 801 /* generate output data ----------------------------------------------------- */
 802
 803 static void
 804 generateData(const char *dataDir) {
 805     UNewDataMemory *pData;
 806     UErrorCode errorCode=U_ZERO_ERROR;
 807     uint16_t groupWords[3];
 808     uint32_t i, groupTop=lineTop, offset, size,
 809              tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
 810     long dataLength;
 811     int16_t token;
 812
 813     pData=udata_create(dataDir, DATA_TYPE,U_ICUDATA_NAME "_" DATA_NAME, &dataInfo,
 814                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
 815     if(U_FAILURE(errorCode)) {
 816         fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
 817         exit(errorCode);
 818     }
 819
 820     /* first, see how much space we need, and prepare the token strings */
 821     for(i=0; i<tokenCount; ++i) {
 822         token=tokens[i];
 823         if(token!=-1 && token!=-2) {
 824             tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
 825         }
 826     }
 827
 828     /*
 829      * Calculate the total size in bytes of the data including:
 830      * - the offset to the token strings, uint32_t (4)
 831      * - the offset to the group table, uint32_t (4)
 832      * - the offset to the group strings, uint32_t (4)
 833      * - the offset to the algorithmic names, uint32_t (4)
 834      *
 835      * - the number of tokens, uint16_t (2)
 836      * - the token table, uint16_t[tokenCount] (2*tokenCount)
 837      *
 838      * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
 839      *
 840      * - the number of groups, uint16_t (2)
 841      * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
 842      *
 843      * - the group strings (groupTop), 2-padded
 844      *
 845      * - the size of the data for the algorithmic names
 846      */
 847     tokenStringOffset=4+4+4+4+2+2*tokenCount;
 848     groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
 849     groupStringOffset=groupsOffset+2+6*lineCount;
 850     algNamesOffset=(groupStringOffset+groupTop+3)&~3;
 851
 852     offset=generateAlgorithmicData(NULL);
 853     size=algNamesOffset+offset;
 854
 855     if(!beQuiet) {
 856         printf("size of the Unicode Names data:\n"
 857                "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
 858                 (unsigned long)size, (unsigned long)(lineTop-groupTop),
 859                 (unsigned long)groupTop, (unsigned long)offset);
 860     }
 861
 862     /* write the data to the file */
 863     /* offsets */
 864     udata_write32(pData, tokenStringOffset);
 865     udata_write32(pData, groupsOffset);
 866     udata_write32(pData, groupStringOffset);
 867     udata_write32(pData, algNamesOffset);
 868
 869     /* token table */
 870     udata_write16(pData, (uint16_t)tokenCount);
 871     udata_writeBlock(pData, tokens, 2*tokenCount);
 872
 873     /* token strings */
 874     udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
 875     if((lineTop-groupTop)&1) {
 876         /* 2-padding */
 877         udata_writePadding(pData, 1);
 878     }
 879
 880     /* group table */
 881     udata_write16(pData, (uint16_t)lineCount);
 882     for(i=0; i<lineCount; ++i) {
 883         /* groupMSB */
 884         groupWords[0]=(uint16_t)lines[i].code;
 885
 886         /* offset */
 887         offset = (uint32_t)(lines[i].s - stringStore);
 888         groupWords[1]=(uint16_t)(offset>>16);
 889         groupWords[2]=(uint16_t)(offset);
 890         udata_writeBlock(pData, groupWords, 6);
 891     }
 892
 893     /* group strings */
 894     udata_writeBlock(pData, stringStore, groupTop);
 895
 896     /* 4-align the algorithmic names data */
 897     udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));
 898
 899     generateAlgorithmicData(pData);
 900
 901     /* finish up */
 902     dataLength=udata_finish(pData, &errorCode);
 903     if(U_FAILURE(errorCode)) {
 904         fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
 905         exit(errorCode);
 906     }
 907
 908     if(dataLength!=(long)size) {
 909         fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
 910 dataLength, (unsigned long)size);
 911         exit(U_INTERNAL_PROGRAM_ERROR);
 912     }
 913 }
 914
 915 /* the structure for algorithmic names needs to be 4-aligned */
 916 typedef struct AlgorithmicRange {
 917     uint32_t rangeStart, rangeEnd;
 918     uint8_t algorithmType, algorithmVariant;
 919     uint16_t rangeSize;
 920 } AlgorithmicRange;
 921
 922 static uint32_t
 923 generateAlgorithmicData(UNewDataMemory *pData) {
 924     static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
 925 #   define PREFIX_LENGTH 23
 926 #   define PREFIX_LENGTH_4 24
 927     uint32_t countAlgRanges;
 928
 929     static AlgorithmicRange cjkExtA={
 930         0x3400, 0x4db5,
 931         0, 4,
 932         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
 933     };
 934     static AlgorithmicRange cjk={
 935         0x4e00, 0x9fa5,
 936         0, 4,
 937         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
 938     };
 939     static AlgorithmicRange cjkExtB={
 940         0x20000, 0x2a6d6,
 941         0, 5,
 942         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
 943     };
 944
 945     static char jamo[]=
 946         "HANGUL SYLLABLE \0"
 947
 948         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
 949         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
 950
 951         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
 952         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
 953         "YU\0EU\0YI\0I\0"
 954
 955         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
 956         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
 957         "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
 958     ;
 959
 960     static AlgorithmicRange hangul={
 961         0xac00, 0xd7a3,
 962         1, 3,
 963         sizeof(AlgorithmicRange)+6+sizeof(jamo)
 964     };
 965
 966     /* modulo factors, maximum 8 */
 967     /* 3 factors: 19, 21, 28, most-to-least-significant */
 968     static uint16_t hangulFactors[3]={
 969         19, 21, 28
 970     };
 971
 972     uint32_t size;
 973
 974     size=0;
 975
 976     /* number of ranges of algorithmic names */
 977     if(uprv_memcmp(dataInfo.dataVersion, unicode_3_1, sizeof(UVersionInfo))>=0) {
 978         /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
 979         countAlgRanges=4;
 980     } else if(uprv_memcmp(dataInfo.dataVersion, unicode_3_0, sizeof(UVersionInfo))>=0) {
 981         /* Unicode 3.0 has 3 ranges including CJK Extension A */
 982         countAlgRanges=3;
 983     } else {
 984         /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
 985         countAlgRanges=2;
 986     }
 987
 988     if(pData!=NULL) {
 989         udata_write32(pData, countAlgRanges);
 990     } else {
 991         size+=4;
 992     }
 993
 994     /*
 995      * each range:
 996      * uint32_t rangeStart
 997      * uint32_t rangeEnd
 998      * uint8_t algorithmType
 999      * uint8_t algorithmVariant
1000      * uint16_t size of range data
1001      * uint8_t[size] data
1002      */
1003
1004     /* range 0: cjk extension a */
1005     if(countAlgRanges>=3) {
1006         if(pData!=NULL) {
1007             udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1008             udata_writeString(pData, prefix, PREFIX_LENGTH);
1009             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1010                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1011             }
1012         } else {
1013             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1014         }
1015     }
1016
1017     /* range 1: cjk */
1018     if(pData!=NULL) {
1019         udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1020         udata_writeString(pData, prefix, PREFIX_LENGTH);
1021         if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1022             udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1023         }
1024     } else {
1025         size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1026     }
1027
1028     /* range 2: hangul syllables */
1029     if(pData!=NULL) {
1030         udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1031         udata_writeBlock(pData, hangulFactors, 6);
1032         udata_writeString(pData, jamo, sizeof(jamo));
1033     } else {
1034         size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1035     }
1036
1037     /* range 3: cjk extension b */
1038     if(countAlgRanges>=4) {
1039         if(pData!=NULL) {
1040             udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1041             udata_writeString(pData, prefix, PREFIX_LENGTH);
1042             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1043                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1044             }
1045         } else {
1046             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1047         }
1048     }
1049
1050     return size;
1051 }
1052
1053 /* helpers ------------------------------------------------------------------ */
1054
1055 static int16_t
1056 findToken(uint8_t *s, int16_t length) {
1057     int16_t i, token;
1058
1059     for(i=0; i<(int16_t)tokenCount; ++i) {
1060         token=tokens[i];
1061         if(token!=-1 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1062             return i;
1063         }
1064     }
1065
1066     return -1;
1067 }
1068
1069 static Word *
1070 findWord(char *s, int16_t length) {
1071     uint32_t i;
1072
1073     for(i=0; i<wordCount; ++i) {
1074         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1075             return words+i;
1076         }
1077     }
1078
1079     return NULL;
1080 }
1081
1082 static Word *
1083 addWord(char *s, int16_t length) {
1084     uint8_t *stringStart;
1085     Word *word;
1086
1087     if(wordCount==MAX_WORD_COUNT) {
1088         fprintf(stderr, "gennames: too many words\n");
1089         exit(U_BUFFER_OVERFLOW_ERROR);
1090     }
1091
1092     stringStart=allocWord(length);
1093     uprv_memcpy(stringStart, s, length);
1094
1095     word=words+wordCount;
1096
1097     /*
1098      * Initialize the weight with the costs for this token:
1099      * a zero-terminated string and a 16-bit offset.
1100      */
1101     word->weight=-(length+1+2);
1102     word->count=0;
1103     word->length=length;
1104     word->s=stringStart;
1105
1106     ++wordCount;
1107
1108     return word;
1109 }
1110
1111 static void
1112 countWord(Word *word) {
1113     /* add to the weight the savings: the length of the word minus 1 byte for the token */
1114     word->weight+=word->length-1;
1115     ++word->count;
1116 }
1117
1118 static void
1119 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1120     uint8_t *stringStart;
1121     Line *line;
1122     int16_t i, length;
1123
1124     if(lineCount==MAX_LINE_COUNT) {
1125         fprintf(stderr, "gennames: too many lines\n");
1126         exit(U_BUFFER_OVERFLOW_ERROR);
1127     }
1128
1129     /* find the last non-empty name */
1130     while(count>0 && lengths[count-1]==0) {
1131         --count;
1132     }
1133     if(count==0) {
1134         return; /* should not occur: caller should not have called */
1135     }
1136
1137     /* there will be (count-1) separator characters */
1138     i=count;
1139     length=count-1;
1140
1141     /* add lengths of strings */
1142     while(i>0) {
1143         length+=lengths[--i];
1144     }
1145
1146     /* allocate line memory */
1147     stringStart=allocLine(length);
1148
1149     /* copy all strings into the line memory */
1150     length=0; /* number of chars copied so far */
1151     for(i=0; i<count; ++i) {
1152         if(i>0) {
1153             stringStart[length++]=NAME_SEPARATOR_CHAR;
1154         }
1155         if(lengths[i]>0) {
1156             uprv_memcpy(stringStart+length, names[i], lengths[i]);
1157             length+=lengths[i];
1158         }
1159     }
1160
1161     line=lines+lineCount;
1162
1163     line->code=code;
1164     line->length=length;
1165     line->s=stringStart;
1166
1167     ++lineCount;
1168
1169     /* prevent a character value that is actually in a name from becoming a token */
1170     while(length>0) {
1171         tokens[stringStart[--length]]=-1;
1172     }
1173 }
1174
1175 static void
1176 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1177     uint8_t *stringStart;
1178     Line *line;
1179
1180     if(lineCount==MAX_LINE_COUNT) {
1181         fprintf(stderr, "gennames: too many groups\n");
1182         exit(U_BUFFER_OVERFLOW_ERROR);
1183     }
1184
1185     /* store the line lengths first, then the strings */
1186     lineLengthsTop=(lineLengthsTop+1)/2;
1187     stringStart=allocLine(lineLengthsTop+length);
1188     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1189     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1190
1191     line=lines+lineCount;
1192
1193     line->code=groupMSB;
1194     line->length=length;
1195     line->s=stringStart;
1196
1197     ++lineCount;
1198 }
1199
1200 static uint32_t
1201 addToken(uint8_t *s, int16_t length) {
1202     uint8_t *stringStart;
1203
1204     stringStart=allocLine(length+1);
1205     uprv_memcpy(stringStart, s, length);
1206     stringStart[length]=0;
1207
1208     return (uint32_t)(stringStart - stringStore);
1209 }
1210
1211 static void
1212 appendLineLength(int16_t length) {
1213     if(length>=76) {
1214         fprintf(stderr, "gennames: compressed line too long\n");
1215         exit(U_BUFFER_OVERFLOW_ERROR);
1216     }
1217     if(length>=12) {
1218         length-=12;
1219         appendLineLengthNibble((uint8_t)((length>>4)|12));
1220     }
1221     appendLineLengthNibble((uint8_t)length);
1222 }
1223
1224 static void
1225 appendLineLengthNibble(uint8_t nibble) {
1226     if((lineLengthsTop&1)==0) {
1227         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1228     } else {
1229         lineLengths[lineLengthsTop/2]|=nibble&0xf;
1230     }
1231     ++lineLengthsTop;
1232 }
1233
1234 static uint8_t *
1235 allocLine(int32_t length) {
1236     uint32_t top=lineTop+length;
1237     uint8_t *p;
1238
1239     if(top>wordBottom) {
1240         fprintf(stderr, "gennames: out of memory\n");
1241         exit(U_MEMORY_ALLOCATION_ERROR);
1242     }
1243     p=stringStore+lineTop;
1244     lineTop=top;
1245     return p;
1246 }
1247
1248 static uint8_t *
1249 allocWord(uint32_t length) {
1250     uint32_t bottom=wordBottom-length;
1251
1252     if(lineTop>bottom) {
1253         fprintf(stderr, "gennames: out of memory\n");
1254         exit(U_MEMORY_ALLOCATION_ERROR);
1255     }
1256     wordBottom=bottom;
1257     return stringStore+bottom;
1258 }
1259
1260 /*
1261  * Hey, Emacs, please set the following:
1262  *
1263  * Local Variables:
1264  * indent-tabs-mode: nil
1265  * End:
1266  *
1267  */