icuSources/tools/gennames/gennames.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gennames.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 1999sep30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads the Unicode character database text file,
  17 *   parses it, and extracts the character code,
  18 *   the "modern" character name, and optionally the
  19 *   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
  20 *   It then tokenizes and compresses the names and builds
  21 *   compact binary tables for random-access lookup
  22 *   in a u_charName() API function.
  23 *
  24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
  25 * (all data is static const)
  26 *
  27 * UDataInfo fields:
  28 *   dataFormat "unam"
  29 *   formatVersion 1.0
*   dataVersion = Unicode version from -u or --unicode command line option; main() presets this option to 4.1
  31 *
  32 * -- data-based names
  33 * uint32_t tokenStringOffset,
  34 *          groupsOffset,
  35 *          groupStringOffset,
  36 *          algNamesOffset;
  37 *
  38 * uint16_t tokenCount;
  39 * uint16_t tokenTable[tokenCount];
  40 *
  41 * char     tokenStrings[]; -- padded to even count
  42 *
  43 * -- strings (groupStrings) are tokenized as follows:
  44 *   for each character c
  45 *       if(c>=tokenCount) write that character c directly
  46 *   else
  47 *       token=tokenTable[c];
  48 *       if(token==0xfffe) -- lead byte of double-byte token
  49 *           token=tokenTable[c<<8|next character];
  50 *       if(token==-1)
  51 *           write c directly
  52 *       else
  53 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
  54 *           append zero-terminated tokenString;
  55 *
  56 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
  57 *    are separated by ';'.
  58 *
  59 * uint16_t groupCount;
  60 * struct {
  61 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
  62 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
  63 *   uint16_t offsetLow;
  64 * } groupTable[groupCount];
  65 *
  66 * char     groupStrings[]; -- padded to 4-count
  67 *
  68 * -- The actual, tokenized group strings are not zero-terminated because
  69 *   that would take up too much space.
*   Instead, they are preceded by their length, written in a variable-length sequence:
  71 *   For each of the 32 group strings, one or two nibbles are stored for its length.
  72 *   Nibbles (4-bit values, half-bytes) are read MSB first.
  73 *   A nibble with a value of 0..11 directly indicates the length of the name string.
  74 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
  75 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
  76 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
  77 *   For the de-tokenizing, see token description above; the strings immediately follow the
  78 *   32 lengths.
  79 *
  80 * -- algorithmic names
  81 *
  82 * typedef struct AlgorithmicRange {
  83 *     uint32_t rangeStart, rangeEnd;
  84 *     uint8_t algorithmType, algorithmVariant;
  85 *     uint16_t rangeSize;
  86 * } AlgorithmicRange;
  87 *
  88 * uint32_t algRangesCount; -- number of data blocks for ranges of
  89 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
  90 *
  91 * struct {
  92 *     AlgorithmicRange algRange;
  93 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
* } algRanges[algRangesCount];
  95 * -- not a real array because each part has a different size
  96 *    of algRange.rangeSize (including AlgorithmicRange)
  97 *
  98 * -- algorithmic range types:
  99 *
 100 * 0 Names are formed from a string prefix that is stored in
 101 *   the algRangeData (zero-terminated), followed by the Unicode code point
 102 *   of the character in hexadecimal digits;
 103 *   algRange.algorithmVariant digits are written
 104 *
 105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
 106 *   algRange.algorithmVariant is the count of modulo factors
 107 *   algRangeData contains
 108 *       uint16_t factors[algRange.algorithmVariant];
 109 *       char strings[];
 110 *   the first zero-terminated string is written as the prefix; then:
 111 *
 112 *   The rangeStart is subtracted; with the difference, here "code":
 113 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
 114 *       index[i]=code%factor[i];
 115 *       code/=factor[i];
 116 *
 117 *   The strings after the prefix are short pieces that are then appended to the result
 118 *   according to index[0..algRange.algorithmVariant-1].
 119 */
 120
 121 #include <stdio.h>
 122 #include "unicode/utypes.h"
 123 #include "unicode/putil.h"
 124 #include "unicode/uclean.h"
 125 #include "unicode/udata.h"
 126 #include "cmemory.h"
 127 #include "cstring.h"
 128 #include "uarrsort.h"
 129 #include "unewdata.h"
 130 #include "uoptions.h"
 131 #include "uparse.h"
 132
 133 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
 134
 135 #define STRING_STORE_SIZE 1000000
 136 #define GROUP_STORE_SIZE 5000
 137
 138 #define GROUP_SHIFT 5
 139 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
 140 #define GROUP_MASK (LINES_PER_GROUP-1)
 141
 142 #define MAX_LINE_COUNT 50000
 143 #define MAX_WORD_COUNT 20000
 144 #define MAX_GROUP_COUNT 5000
 145
 146 #define DATA_NAME "unames"
 147 #define DATA_TYPE "icu"
 148 #define VERSION_STRING "unam"
 149 #define NAME_SEPARATOR_CHAR ';'
 150
 151 #define ISO_DATA_NAME "ucomment"
 152
 153 /* Unicode versions --------------------------------------------------------- */
 154
 155 enum {
 156     UNI_1_0,
 157     UNI_1_1,
 158     UNI_2_0,
 159     UNI_3_0,
 160     UNI_3_1,
 161     UNI_3_2,
 162     UNI_4_0,
 163     UNI_4_0_1,
 164     UNI_4_1,
 165     UNI_5_0,
 166     UNI_5_1,
 167     UNI_VER_COUNT
 168 };
 169
 170 static const UVersionInfo
 171 unicodeVersions[]={
 172     { 1, 0, 0, 0 },
 173     { 1, 1, 0, 0 },
 174     { 2, 0, 0, 0 },
 175     { 3, 0, 0, 0 },
 176     { 3, 1, 0, 0 },
 177     { 3, 2, 0, 0 },
 178     { 4, 0, 0, 0 },
 179     { 4, 0, 1, 0 },
 180     { 4, 1, 0, 0 },
 181     { 5, 0, 0, 0 },
 182     { 5, 1, 0, 0 }
 183 };
 184
 185 static int32_t ucdVersion=UNI_5_1;
 186
 187 static int32_t
 188 findUnicodeVersion(const UVersionInfo version) {
 189     int32_t i;
 190
 191     for(i=0; /* while(version>unicodeVersions[i]) {} */
 192         i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
 193         ++i) {}
 194     if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
 195         --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
 196     }
 197     return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
 198 }
 199
 200 /* generator data ----------------------------------------------------------- */
 201
 202 /* UDataInfo cf. udata.h */
 203 static UDataInfo dataInfo={
 204     sizeof(UDataInfo),
 205     0,
 206
 207     U_IS_BIG_ENDIAN,
 208     U_CHARSET_FAMILY,
 209     sizeof(UChar),
 210     0,
 211
 212     {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
 213     {1, 0, 0, 0},                 /* formatVersion */
 214     {3, 0, 0, 0}                  /* dataVersion */
 215 };
 216
 217 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
 218
 219 typedef struct Options {
 220     UBool storeNames;
 221     UBool store10Names;
 222     UBool storeISOComments;
 223 } Options;
 224
 225 static uint8_t stringStore[STRING_STORE_SIZE],
 226                groupStore[GROUP_STORE_SIZE],
 227                lineLengths[LINES_PER_GROUP];
 228
 229 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
 230
 231 typedef struct {
 232     uint32_t code;
 233     int16_t length;
 234     uint8_t *s;
 235 } Line;
 236
 237 typedef struct {
 238     int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
 239     int16_t count;
 240     int16_t length;
 241     uint8_t *s;
 242 } Word;
 243
 244 static Line lines[MAX_LINE_COUNT];
 245 static Word words[MAX_WORD_COUNT];
 246
 247 static uint32_t lineCount=0, wordCount=0;
 248
 249 static int16_t leadByteCount;
 250
 251 #define LEADBYTE_LIMIT 16
 252
 253 static int16_t tokens[LEADBYTE_LIMIT*256];
 254 static uint32_t tokenCount;
 255
 256 /* prototypes --------------------------------------------------------------- */
 257
 258 static void
 259 init(void);
 260
 261 static void
 262 parseDB(const char *filename, Options *options);
 263
 264 static void
 265 parseName(char *name, int16_t length);
 266
 267 static int16_t
 268 skipNoise(char *line, int16_t start, int16_t limit);
 269
 270 static int16_t
 271 getWord(char *line, int16_t start, int16_t limit);
 272
 273 static void
 274 compress(void);
 275
 276 static void
 277 compressLines(void);
 278
 279 static int16_t
 280 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
 281
 282 static int32_t
 283 compareWords(const void *context, const void *word1, const void *word2);
 284
 285 static void
 286 generateData(const char *dataDir, Options *options);
 287
 288 static uint32_t
 289 generateAlgorithmicData(UNewDataMemory *pData, Options *options);
 290
 291 static int16_t
 292 findToken(uint8_t *s, int16_t length);
 293
 294 static Word *
 295 findWord(char *s, int16_t length);
 296
 297 static Word *
 298 addWord(char *s, int16_t length);
 299
 300 static void
 301 countWord(Word *word);
 302
 303 static void
 304 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
 305
 306 static void
 307 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
 308
 309 static uint32_t
 310 addToken(uint8_t *s, int16_t length);
 311
 312 static void
 313 appendLineLength(int16_t length);
 314
 315 static void
 316 appendLineLengthNibble(uint8_t nibble);
 317
 318 static uint8_t *
 319 allocLine(int32_t length);
 320
 321 static uint8_t *
 322 allocWord(uint32_t length);
 323
 324 /* -------------------------------------------------------------------------- */
 325
 326 enum {
 327     HELP_H,
 328     HELP_QUESTION_MARK,
 329     VERBOSE,
 330     QUIET,
 331     COPYRIGHT,
 332     DESTDIR,
 333     UNICODE,
 334     UNICODE1_NAMES,
 335     NO_ISO_COMMENTS,
 336     ONLY_ISO_COMMENTS
 337 };
 338
 339 static UOption options[]={
 340     UOPTION_HELP_H,
 341     UOPTION_HELP_QUESTION_MARK,
 342     UOPTION_VERBOSE,
 343     UOPTION_QUIET,
 344     UOPTION_COPYRIGHT,
 345     UOPTION_DESTDIR,
 346     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
 347     { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
 348     { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
 349     { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
 350 };
 351
 352 extern int
 353 main(int argc, char* argv[]) {
 354     UVersionInfo version;
 355     Options moreOptions={ TRUE, FALSE, TRUE };
 356     UErrorCode errorCode = U_ZERO_ERROR;
 357
 358     U_MAIN_INIT_ARGS(argc, argv);
 359
 360     /* Initialize ICU */
 361     u_init(&errorCode);
 362     if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
 363         /* Note: u_init() will try to open ICU property data.
 364          *       failures here are expected when building ICU from scratch.
 365          *       ignore them.
 366          */
 367         fprintf(stderr, "%s: can not initialize ICU.  errorCode = %s\n",
 368             argv[0], u_errorName(errorCode));
 369         exit(1);
 370     }
 371
 372     /* preset then read command line options */
 373     options[DESTDIR].value=u_getDataDirectory();
 374     options[UNICODE].value="4.1";
 375     argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
 376
 377     /* error handling, printing usage message */
 378     if(argc<0) {
 379         fprintf(stderr,
 380             "error in command line argument \"%s\"\n",
 381             argv[-argc]);
 382     } else if(argc<2) {
 383         argc=-1;
 384     }
 385     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
 386         /*
 387          * Broken into chucks because the C89 standard says the minimum
 388          * required supported string length is 509 bytes.
 389          */
 390         fprintf(stderr,
 391             "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
 392             "\n"
 393             "Read the UnicodeData.txt file and \n"
 394             "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
 395             "\n"
 396             "\tfilename  absolute path/filename for the Unicode database text file\n"
 397             "\t\t(default: standard input)\n"
 398             "\n",
 399             argv[0]);
 400         fprintf(stderr,
 401             "Options:\n"
 402             "\t-h or -? or --help  this usage text\n"
 403             "\t-v or --verbose     verbose output\n"
 404             "\t-q or --quiet       no output\n"
 405             "\t-c or --copyright   include a copyright notice\n"
 406             "\t-d or --destdir     destination directory, followed by the path\n"
 407             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
 408         fprintf(stderr,
 409             "\t-1 or --unicode1-names     store Unicode 1.0 character names\n"
 410             "\t      --no-iso-comments    do not store ISO comments\n"
 411             "\t      --only-iso-comments  write ucomment.icu with only ISO comments\n");
 412         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 413     }
 414
 415     /* get the options values */
 416     beVerbose=options[VERBOSE].doesOccur;
 417     beQuiet=options[QUIET].doesOccur;
 418     haveCopyright=options[COPYRIGHT].doesOccur;
 419     moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
 420     moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
 421     if(options[ONLY_ISO_COMMENTS].doesOccur) {
 422         moreOptions.storeNames=moreOptions.store10Names=FALSE;
 423         moreOptions.storeISOComments=TRUE;
 424     }
 425
 426     /* set the Unicode version */
 427     u_versionFromString(version, options[UNICODE].value);
 428     uprv_memcpy(dataInfo.dataVersion, version, 4);
 429     ucdVersion=findUnicodeVersion(version);
 430
 431     init();
 432     parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
 433     compress();
 434     generateData(options[DESTDIR].value, &moreOptions);
 435
 436     u_cleanup();
 437     return 0;
 438 }
 439
 440 static void
 441 init() {
 442     int i;
 443
 444     for(i=0; i<256; ++i) {
 445         tokens[i]=0;
 446     }
 447 }
 448
 449 /* parsing ------------------------------------------------------------------ */
 450
 451 /* get a name, strip leading and trailing whitespace */
 452 static int16_t
 453 getName(char **pStart, char *limit) {
 454     /* strip leading whitespace */
 455     char *start=(char *)u_skipWhitespace(*pStart);
 456
 457     /* strip trailing whitespace */
 458     while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
 459         --limit;
 460     }
 461
 462     /* return results */
 463     *pStart=start;
 464     return (int16_t)(limit-start);
 465 }
 466
 467 static void U_CALLCONV
 468 lineFn(void *context,
 469        char *fields[][2], int32_t fieldCount,
 470        UErrorCode *pErrorCode) {
 471     Options *storeOptions=(Options *)context;
 472     char *names[3];
 473     int16_t lengths[3]={ 0, 0, 0 };
 474     static uint32_t prevCode=0;
 475     uint32_t code=0;
 476
 477     if(U_FAILURE(*pErrorCode)) {
 478         return;
 479     }
 480     /* get the character code */
 481     code=uprv_strtoul(fields[0][0], NULL, 16);
 482
 483     /* get the character name */
 484     if(storeOptions->storeNames) {
 485         names[0]=fields[1][0];
 486         lengths[0]=getName(names+0, fields[1][1]);
 487         if(names[0][0]=='<') {
 488             /* do not store pseudo-names in <> brackets */
 489             lengths[0]=0;
 490         }
 491     }
 492
 493     /* store 1.0 names */
 494     /* get the second character name, the one from Unicode 1.0 */
 495     if(storeOptions->store10Names) {
 496         names[1]=fields[10][0];
 497         lengths[1]=getName(names+1, fields[10][1]);
 498         if(names[1][0]=='<') {
 499             /* do not store pseudo-names in <> brackets */
 500             lengths[1]=0;
 501         }
 502     }
 503
 504     /* get the ISO 10646 comment */
 505     if(storeOptions->storeISOComments) {
 506         names[2]=fields[11][0];
 507         lengths[2]=getName(names+2, fields[11][1]);
 508     }
 509
 510     if(lengths[0]+lengths[1]+lengths[2]==0) {
 511         return;
 512     }
 513
 514     /* check for non-character code points */
 515     if(!U_IS_UNICODE_CHAR(code)) {
 516         fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
 517                 (unsigned long)code);
 518         *pErrorCode=U_PARSE_ERROR;
 519         exit(U_PARSE_ERROR);
 520     }
 521
 522     /* check that the code points (code) are in ascending order */
 523     if(code<=prevCode && code>0) {
 524         fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 525                 (unsigned long)code, (unsigned long)prevCode);
 526         *pErrorCode=U_PARSE_ERROR;
 527         exit(U_PARSE_ERROR);
 528     }
 529     prevCode=code;
 530
 531     parseName(names[0], lengths[0]);
 532     parseName(names[1], lengths[1]);
 533     parseName(names[2], lengths[2]);
 534
 535     /*
 536      * set the count argument to
 537      * 1: only store regular names, or only store ISO 10646 comments
 538      * 2: store regular and 1.0 names
 539      * 3: store names and ISO 10646 comment
 540      *
 541      * addLine() will ignore empty trailing names
 542      */
 543     if(storeOptions->storeNames) {
 544         /* store names and comments as parsed according to storeOptions */
 545         addLine(code, names, lengths, 3);
 546     } else {
 547         /* store only ISO 10646 comments */
 548         addLine(code, names+2, lengths+2, 1);
 549     }
 550 }
 551
 552 static void
 553 parseDB(const char *filename, Options *storeOptions) {
 554     char *fields[15][2];
 555     UErrorCode errorCode=U_ZERO_ERROR;
 556
 557     u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
 558     if(U_FAILURE(errorCode)) {
 559         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
 560         exit(errorCode);
 561     }
 562
 563     if(!beQuiet) {
 564         printf("size of all names in the database: %lu\n",
 565             (unsigned long)lineTop);
 566         printf("number of named Unicode characters: %lu\n",
 567             (unsigned long)lineCount);
 568         printf("number of words in the dictionary from these names: %lu\n",
 569             (unsigned long)wordCount);
 570     }
 571 }
 572
 573 static void
 574 parseName(char *name, int16_t length) {
 575     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
 576     Word *word;
 577
 578     while(start<length) {
 579         /* skip any "noise" characters */
 580         limit=skipNoise(name, start, length);
 581         if(start<limit) {
 582             /*prevStart=-1;*/
 583             start=limit;
 584         }
 585         if(start==length) {
 586             break;
 587         }
 588
 589         /* get a word and add it if it is longer than 1 */
 590         limit=getWord(name, start, length);
 591         wordLength=(int16_t)(limit-start);
 592         if(wordLength>1) {
 593             word=findWord(name+start, wordLength);
 594             if(word==NULL) {
 595                 word=addWord(name+start, wordLength);
 596             }
 597             countWord(word);
 598         }
 599
 600 #if 0
 601         /*
 602          * if there was a word before this
 603          * (with no noise in between), then add the pair of words, too
 604          */
 605         if(prevStart!=-1) {
 606             wordLength=limit-prevStart;
 607             word=findWord(name+prevStart, wordLength);
 608             if(word==NULL) {
 609                 word=addWord(name+prevStart, wordLength);
 610             }
 611             countWord(word);
 612         }
 613 #endif
 614
 615         /*prevStart=start;*/
 616         start=limit;
 617     }
 618 }
 619
 620 static UBool U_INLINE
 621 isWordChar(char c) {
 622     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
 623            ('J'<=c && c<='R') ||
 624            ('S'<=c && c<='Z') ||
 625
 626            ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
 627            ('j'<=c && c<='r') ||
 628            ('s'<=c && c<='z') ||
 629
 630            ('0'<=c && c<='9');
 631 }
 632
 633 static int16_t
 634 skipNoise(char *line, int16_t start, int16_t limit) {
 635     /* skip anything that is not part of a word in this sense */
 636     while(start<limit && !isWordChar(line[start])) {
 637         ++start;
 638     }
 639
 640     return start;
 641 }
 642
 643 static int16_t
 644 getWord(char *line, int16_t start, int16_t limit) {
 645     char c=0; /* initialize to avoid a compiler warning although the code was safe */
 646
 647     /* a unicode character name word consists of A-Z0-9 */
 648     while(start<limit && isWordChar(line[start])) {
 649         ++start;
 650     }
 651
 652     /* include a following space or dash */
 653     if(start<limit && ((c=line[start])==' ' || c=='-')) {
 654         ++start;
 655     }
 656
 657     return start;
 658 }
 659
 660 /* compressing -------------------------------------------------------------- */
 661
 662 static void
 663 compress() {
 664     uint32_t i, letterCount;
 665     int16_t wordNumber;
 666     UErrorCode errorCode;
 667
 668     /* sort the words in reverse order by weight */
 669     errorCode=U_ZERO_ERROR;
 670     uprv_sortArray(words, wordCount, sizeof(Word),
 671                     compareWords, NULL, FALSE, &errorCode);
 672
 673     /* remove the words that do not save anything */
 674     while(wordCount>0 && words[wordCount-1].weight<1) {
 675         --wordCount;
 676     }
 677
 678     /* count the letters in the token range */
 679     letterCount=0;
 680     for(i=LEADBYTE_LIMIT; i<256; ++i) {
 681         if(tokens[i]==-1) {
 682             ++letterCount;
 683         }
 684     }
 685     if(!beQuiet) {
 686         printf("number of letters used in the names: %d\n", (int)letterCount);
 687     }
 688
 689     /* do we need double-byte tokens? */
 690     if(wordCount+letterCount<=256) {
 691         /* no, single-byte tokens are enough */
 692         leadByteCount=0;
 693         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
 694             if(tokens[i]!=-1) {
 695                 tokens[i]=wordNumber;
 696                 if(beVerbose) {
 697                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 698                             (int)i, (long)words[wordNumber].weight,
 699                             words[wordNumber].length, words[wordNumber].s);
 700                 }
 701                 ++wordNumber;
 702             }
 703         }
 704         tokenCount=i;
 705     } else {
 706         /*
 707          * The tokens that need two token bytes
 708          * get their weight reduced by their count
 709          * because they save less.
 710          */
 711         tokenCount=256-letterCount;
 712         for(i=tokenCount; i<wordCount; ++i) {
 713             words[i].weight-=words[i].count;
 714         }
 715
 716         /* sort these words in reverse order by weight */
 717         errorCode=U_ZERO_ERROR;
 718         uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
 719                         compareWords, NULL, FALSE, &errorCode);
 720
 721         /* remove the words that do not save anything */
 722         while(wordCount>0 && words[wordCount-1].weight<1) {
 723             --wordCount;
 724         }
 725
 726         /* how many tokens and lead bytes do we have now? */
 727         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
 728         /*
 729          * adjust upwards to take into account that
 730          * double-byte tokens must not
 731          * use NAME_SEPARATOR_CHAR as a second byte
 732          */
 733         tokenCount+=(tokenCount-256+254)/255;
 734
 735         leadByteCount=(int16_t)(tokenCount>>8);
 736         if(leadByteCount<LEADBYTE_LIMIT) {
 737             /* adjust for the real number of lead bytes */
 738             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
 739         } else {
 740             /* limit the number of lead bytes */
 741             leadByteCount=LEADBYTE_LIMIT-1;
 742             tokenCount=LEADBYTE_LIMIT*256;
 743             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
 744             /* adjust again to skip double-byte tokens with ';' */
 745             wordCount-=(tokenCount-256+254)/255;
 746         }
 747
 748         /* set token 0 to word 0 */
 749         tokens[0]=0;
 750         if(beVerbose) {
 751             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
 752                     (long)words[0].weight,
 753                     words[0].length, words[0].s);
 754         }
 755         wordNumber=1;
 756
 757         /* set the lead byte tokens */
 758         for(i=1; (int16_t)i<=leadByteCount; ++i) {
 759             tokens[i]=-2;
 760         }
 761
 762         /* set the tokens */
 763         for(; i<256; ++i) {
 764             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
 765             if(tokens[i]!=-1) {
 766                 tokens[i]=wordNumber;
 767                 if(beVerbose) {
 768                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 769                             (int)i, (long)words[wordNumber].weight,
 770                             words[wordNumber].length, words[wordNumber].s);
 771                 }
 772                 ++wordNumber;
 773             }
 774         }
 775
 776         /* continue above 255 where there are no letters */
 777         for(; (uint32_t)wordNumber<wordCount; ++i) {
 778             if((i&0xff)==NAME_SEPARATOR_CHAR) {
 779                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
 780             } else {
 781                 tokens[i]=wordNumber;
 782                 if(beVerbose) {
 783                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 784                             (int)i, (long)words[wordNumber].weight,
 785                             words[wordNumber].length, words[wordNumber].s);
 786                 }
 787                 ++wordNumber;
 788             }
 789         }
 790         tokenCount=i; /* should be already tokenCount={i or i+1} */
 791     }
 792
 793     if(!beQuiet) {
 794         printf("number of lead bytes: %d\n", leadByteCount);
 795         printf("number of single-byte tokens: %lu\n",
 796             (unsigned long)256-letterCount-leadByteCount);
 797         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
 798     }
 799
 800     compressLines();
 801 }
 802
 803 static void
 804 compressLines() {
 805     Line *line=NULL;
 806     uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
 807              groupMSB=0xffff, lineCount2;
 808     int16_t groupTop=0;
 809
 810     /* store the groups like lines, with compressed data after raw strings */
 811     groupBottom=lineTop;
 812     lineCount2=lineCount;
 813     lineCount=0;
 814
 815     /* loop over all lines */
 816     while(i<lineCount2) {
 817         line=lines+i++;
 818         inLine=line->code;
 819
 820         /* segment the lines to groups of 32 */
 821         if(inLine>>GROUP_SHIFT!=groupMSB) {
 822             /* finish the current group with empty lines */
 823             while((++outLine&GROUP_MASK)!=0) {
 824                 appendLineLength(0);
 825             }
 826
 827             /* store the group like a line */
 828             if(groupTop>0) {
 829                 if(groupTop>GROUP_STORE_SIZE) {
 830                     fprintf(stderr, "gennames: group store overflow\n");
 831                     exit(U_BUFFER_OVERFLOW_ERROR);
 832                 }
 833                 addGroup(groupMSB, groupStore, groupTop);
 834             }
 835
 836             /* start the new group */
 837             lineLengthsTop=0;
 838             groupTop=0;
 839             groupMSB=inLine>>GROUP_SHIFT;
 840             outLine=(inLine&~GROUP_MASK)-1;
 841         }
 842
 843         /* write empty lines between the previous line in the group and this one */
 844         while(++outLine<inLine) {
 845             appendLineLength(0);
 846         }
 847
 848         /* write characters and tokens for this line */
 849         appendLineLength(compressLine(line->s, line->length, &groupTop));
 850     }
 851
 852     /* finish and store the last group */
 853     if(line && groupMSB!=0xffff) {
 854         /* finish the current group with empty lines */
 855         while((++outLine&GROUP_MASK)!=0) {
 856             appendLineLength(0);
 857         }
 858
 859         /* store the group like a line */
 860         if(groupTop>0) {
 861             if(groupTop>GROUP_STORE_SIZE) {
 862                 fprintf(stderr, "gennames: group store overflow\n");
 863                 exit(U_BUFFER_OVERFLOW_ERROR);
 864             }
 865             addGroup(groupMSB, groupStore, groupTop);
 866         }
 867     }
 868
 869     if(!beQuiet) {
 870         printf("number of groups: %lu\n", (unsigned long)lineCount);
 871     }
 872 }
 873
 874 static int16_t
 875 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
 876     int16_t start, limit, token, groupTop=*pGroupTop;
 877
 878     start=0;
 879     do {
 880         /* write any "noise" characters */
 881         limit=skipNoise((char *)s, start, length);
 882         while(start<limit) {
 883             groupStore[groupTop++]=s[start++];
 884         }
 885
 886         if(start==length) {
 887             break;
 888         }
 889
 890         /* write a word, as token or directly */
 891         limit=getWord((char *)s, start, length);
 892         if(limit-start==1) {
 893             groupStore[groupTop++]=s[start++];
 894         } else {
 895             token=findToken(s+start, (int16_t)(limit-start));
 896             if(token!=-1) {
 897                 if(token>0xff) {
 898                     groupStore[groupTop++]=(uint8_t)(token>>8);
 899                 }
 900                 groupStore[groupTop++]=(uint8_t)token;
 901                 start=limit;
 902             } else {
 903                 while(start<limit) {
 904                     groupStore[groupTop++]=s[start++];
 905                 }
 906             }
 907         }
 908     } while(start<length);
 909
 910     length=(int16_t)(groupTop-*pGroupTop);
 911     *pGroupTop=groupTop;
 912     return length;
 913 }
 914
 915 static int32_t
 916 compareWords(const void *context, const void *word1, const void *word2) {
 917     /* reverse sort by word weight */
 918     return ((Word *)word2)->weight-((Word *)word1)->weight;
 919 }
 920
 921 /* generate output data ----------------------------------------------------- */
 922
 923 static void
 924 generateData(const char *dataDir, Options *storeOptions) {
 925     UNewDataMemory *pData;
 926     UErrorCode errorCode=U_ZERO_ERROR;
 927     uint16_t groupWords[3];
 928     uint32_t i, groupTop=lineTop, offset, size,
 929              tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
 930     long dataLength;
 931     int16_t token;
 932
 933     pData=udata_create(dataDir,
 934                        DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
 935                        &dataInfo,
 936                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
 937     if(U_FAILURE(errorCode)) {
 938         fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
 939         exit(errorCode);
 940     }
 941
 942     /* first, see how much space we need, and prepare the token strings */
 943     for(i=0; i<tokenCount; ++i) {
 944         token=tokens[i];
 945         if(token!=-1 && token!=-2) {
 946             tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
 947         }
 948     }
 949
 950     /*
 951      * Required padding for data swapping:
 952      * The token table undergoes a permutation during data swapping when the
 953      * input and output charsets are different.
 954      * The token table cannot grow during swapping, so we need to make sure that
 955      * the table is long enough for successful in-place permutation.
 956      *
 957      * We simply round up tokenCount to the next multiple of 256 to account for
 958      * all possible permutations.
 959      *
 960      * An optimization is possible if we only ever swap between ASCII and EBCDIC:
 961      *
 962      * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
 963      * and will be swapped between ASCII and EBCDIC between
 964      * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
 965      * This should be the only -1 entry in tokens[256..511] on which the data
 966      * swapper bases its trail byte permutation map (trailMap[]).
 967      *
 968      * It would be sufficient to increase tokenCount so that its lower 8 bits
 969      * are at least 0x5e+1 to make room for swapping between the two semicolons.
 970      * For values higher than 0x5e, the trail byte permutation map (trailMap[])
 971      * should always be an identity map, where we do not need additional room.
 972      */
 973     i=tokenCount;
 974     tokenCount=(tokenCount+0xff)&~0xff;
 975     if(!beQuiet && i<tokenCount) {
 976         printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
 977     }
 978     for(; i<tokenCount; ++i) {
 979         if((i&0xff)==NAME_SEPARATOR_CHAR) {
 980             tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
 981         } else {
 982             tokens[i]=0; /* unused token for padding */
 983         }
 984     }
 985
 986     /*
 987      * Calculate the total size in bytes of the data including:
 988      * - the offset to the token strings, uint32_t (4)
 989      * - the offset to the group table, uint32_t (4)
 990      * - the offset to the group strings, uint32_t (4)
 991      * - the offset to the algorithmic names, uint32_t (4)
 992      *
 993      * - the number of tokens, uint16_t (2)
 994      * - the token table, uint16_t[tokenCount] (2*tokenCount)
 995      *
 996      * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
 997      *
 998      * - the number of groups, uint16_t (2)
 999      * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
1000      *
1001      * - the group strings (groupTop-groupBottom), 2-padded
1002      *
1003      * - the size of the data for the algorithmic names
1004      */
1005     tokenStringOffset=4+4+4+4+2+2*tokenCount;
1006     groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
1007     groupStringOffset=groupsOffset+2+6*lineCount;
1008     algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
1009
1010     offset=generateAlgorithmicData(NULL, storeOptions);
1011     size=algNamesOffset+offset;
1012
1013     if(!beQuiet) {
1014         printf("size of the Unicode Names data:\n"
1015                "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
1016                 (unsigned long)size, (unsigned long)(lineTop-groupTop),
1017                 (unsigned long)(groupTop-groupBottom), (unsigned long)offset);
1018     }
1019
1020     /* write the data to the file */
1021     /* offsets */
1022     udata_write32(pData, tokenStringOffset);
1023     udata_write32(pData, groupsOffset);
1024     udata_write32(pData, groupStringOffset);
1025     udata_write32(pData, algNamesOffset);
1026
1027     /* token table */
1028     udata_write16(pData, (uint16_t)tokenCount);
1029     udata_writeBlock(pData, tokens, 2*tokenCount);
1030
1031     /* token strings */
1032     udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
1033     if((lineTop-groupTop)&1) {
1034         /* 2-padding */
1035         udata_writePadding(pData, 1);
1036     }
1037
1038     /* group table */
1039     udata_write16(pData, (uint16_t)lineCount);
1040     for(i=0; i<lineCount; ++i) {
1041         /* groupMSB */
1042         groupWords[0]=(uint16_t)lines[i].code;
1043
1044         /* offset */
1045         offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
1046         groupWords[1]=(uint16_t)(offset>>16);
1047         groupWords[2]=(uint16_t)(offset);
1048         udata_writeBlock(pData, groupWords, 6);
1049     }
1050
1051     /* group strings */
1052     udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
1053
1054     /* 4-align the algorithmic names data */
1055     udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
1056
1057     generateAlgorithmicData(pData, storeOptions);
1058
1059     /* finish up */
1060     dataLength=udata_finish(pData, &errorCode);
1061     if(U_FAILURE(errorCode)) {
1062         fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
1063         exit(errorCode);
1064     }
1065
1066     if(dataLength!=(long)size) {
1067         fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
1068 dataLength, (unsigned long)size);
1069         exit(U_INTERNAL_PROGRAM_ERROR);
1070     }
1071 }
1072
/*
 * Header that precedes the data of one algorithmic-names range in the
 * output file.
 * The structure for algorithmic names needs to be 4-aligned.
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart;        /* first code point of the range */
    uint32_t rangeEnd;          /* last code point of the range (initializers in this file use inclusive ends, e.g. 0xd7a3) */
    uint8_t algorithmType;      /* which naming algorithm: this file uses 0 for the CJK ranges, 1 for Hangul */
    uint8_t algorithmVariant;   /* algorithm-specific parameter: this file uses 4 or 5 for CJK, 3 for Hangul */
    uint16_t rangeSize;         /* bytes of this header plus the range data that follows it */
} AlgorithmicRange;
1079
1080 static uint32_t
1081 generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
1082     static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
1083 #   define PREFIX_LENGTH 23
1084 #   define PREFIX_LENGTH_4 24
1085     uint32_t countAlgRanges;
1086
1087     static AlgorithmicRange cjkExtA={
1088         0x3400, 0x4db5,
1089         0, 4,
1090         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1091     };
1092     static AlgorithmicRange cjk={
1093         0x4e00, 0x9fa5,
1094         0, 4,
1095         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1096     };
1097     static AlgorithmicRange cjkExtB={
1098         0x20000, 0x2a6d6,
1099         0, 5,
1100         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1101     };
1102
1103     static char jamo[]=
1104         "HANGUL SYLLABLE \0"
1105
1106         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1107         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1108
1109         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1110         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1111         "YU\0EU\0YI\0I\0"
1112
1113         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1114         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1115         "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1116     ;
1117
1118     static AlgorithmicRange hangul={
1119         0xac00, 0xd7a3,
1120         1, 3,
1121         sizeof(AlgorithmicRange)+6+sizeof(jamo)
1122     };
1123
1124     /* modulo factors, maximum 8 */
1125     /* 3 factors: 19, 21, 28, most-to-least-significant */
1126     static uint16_t hangulFactors[3]={
1127         19, 21, 28
1128     };
1129
1130     uint32_t size;
1131
1132     size=0;
1133
1134     if(ucdVersion>=UNI_5_1) {
1135         /* Unicode 5.1 and up has a longer CJK Unihan range than before */
1136         cjk.rangeEnd=0x9FC3;
1137     } else if(ucdVersion>=UNI_4_1) {
1138         /* Unicode 4.1 and up has a longer CJK Unihan range than before */
1139         cjk.rangeEnd=0x9FBB;
1140     }
1141
1142     /* number of ranges of algorithmic names */
1143     if(!storeOptions->storeNames) {
1144         countAlgRanges=0;
1145     } else if(ucdVersion>=UNI_3_1) {
1146         /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1147         countAlgRanges=4;
1148     } else if(ucdVersion>=UNI_3_0) {
1149         /* Unicode 3.0 has 3 ranges including CJK Extension A */
1150         countAlgRanges=3;
1151     } else {
1152         /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1153         countAlgRanges=2;
1154     }
1155
1156     if(pData!=NULL) {
1157         udata_write32(pData, countAlgRanges);
1158     } else {
1159         size+=4;
1160     }
1161     if(countAlgRanges==0) {
1162         return size;
1163     }
1164
1165     /*
1166      * each range:
1167      * uint32_t rangeStart
1168      * uint32_t rangeEnd
1169      * uint8_t algorithmType
1170      * uint8_t algorithmVariant
1171      * uint16_t size of range data
1172      * uint8_t[size] data
1173      */
1174
1175     /* range 0: cjk extension a */
1176     if(countAlgRanges>=3) {
1177         if(pData!=NULL) {
1178             udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1179             udata_writeString(pData, prefix, PREFIX_LENGTH);
1180             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1181                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1182             }
1183         } else {
1184             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1185         }
1186     }
1187
1188     /* range 1: cjk */
1189     if(pData!=NULL) {
1190         udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1191         udata_writeString(pData, prefix, PREFIX_LENGTH);
1192         if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1193             udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1194         }
1195     } else {
1196         size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1197     }
1198
1199     /* range 2: hangul syllables */
1200     if(pData!=NULL) {
1201         udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1202         udata_writeBlock(pData, hangulFactors, 6);
1203         udata_writeString(pData, jamo, sizeof(jamo));
1204     } else {
1205         size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1206     }
1207
1208     /* range 3: cjk extension b */
1209     if(countAlgRanges>=4) {
1210         if(pData!=NULL) {
1211             udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1212             udata_writeString(pData, prefix, PREFIX_LENGTH);
1213             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1214                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1215             }
1216         } else {
1217             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1218         }
1219     }
1220
1221     return size;
1222 }
1223
1224 /* helpers ------------------------------------------------------------------ */
1225
1226 static int16_t
1227 findToken(uint8_t *s, int16_t length) {
1228     int16_t i, token;
1229
1230     for(i=0; i<(int16_t)tokenCount; ++i) {
1231         token=tokens[i];
1232         if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1233             return i;
1234         }
1235     }
1236
1237     return -1;
1238 }
1239
1240 static Word *
1241 findWord(char *s, int16_t length) {
1242     uint32_t i;
1243
1244     for(i=0; i<wordCount; ++i) {
1245         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1246             return words+i;
1247         }
1248     }
1249
1250     return NULL;
1251 }
1252
1253 static Word *
1254 addWord(char *s, int16_t length) {
1255     uint8_t *stringStart;
1256     Word *word;
1257
1258     if(wordCount==MAX_WORD_COUNT) {
1259         fprintf(stderr, "gennames: too many words\n");
1260         exit(U_BUFFER_OVERFLOW_ERROR);
1261     }
1262
1263     stringStart=allocWord(length);
1264     uprv_memcpy(stringStart, s, length);
1265
1266     word=words+wordCount;
1267
1268     /*
1269      * Initialize the weight with the costs for this token:
1270      * a zero-terminated string and a 16-bit offset.
1271      */
1272     word->weight=-(length+1+2);
1273     word->count=0;
1274     word->length=length;
1275     word->s=stringStart;
1276
1277     ++wordCount;
1278
1279     return word;
1280 }
1281
1282 static void
1283 countWord(Word *word) {
1284     /* add to the weight the savings: the length of the word minus 1 byte for the token */
1285     word->weight+=word->length-1;
1286     ++word->count;
1287 }
1288
1289 static void
1290 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1291     uint8_t *stringStart;
1292     Line *line;
1293     int16_t i, length;
1294
1295     if(lineCount==MAX_LINE_COUNT) {
1296         fprintf(stderr, "gennames: too many lines\n");
1297         exit(U_BUFFER_OVERFLOW_ERROR);
1298     }
1299
1300     /* find the last non-empty name */
1301     while(count>0 && lengths[count-1]==0) {
1302         --count;
1303     }
1304     if(count==0) {
1305         return; /* should not occur: caller should not have called */
1306     }
1307
1308     /* there will be (count-1) separator characters */
1309     i=count;
1310     length=count-1;
1311
1312     /* add lengths of strings */
1313     while(i>0) {
1314         length+=lengths[--i];
1315     }
1316
1317     /* allocate line memory */
1318     stringStart=allocLine(length);
1319
1320     /* copy all strings into the line memory */
1321     length=0; /* number of chars copied so far */
1322     for(i=0; i<count; ++i) {
1323         if(i>0) {
1324             stringStart[length++]=NAME_SEPARATOR_CHAR;
1325         }
1326         if(lengths[i]>0) {
1327             uprv_memcpy(stringStart+length, names[i], lengths[i]);
1328             length+=lengths[i];
1329         }
1330     }
1331
1332     line=lines+lineCount;
1333
1334     line->code=code;
1335     line->length=length;
1336     line->s=stringStart;
1337
1338     ++lineCount;
1339
1340     /* prevent a character value that is actually in a name from becoming a token */
1341     while(length>0) {
1342         tokens[stringStart[--length]]=-1;
1343     }
1344 }
1345
1346 static void
1347 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1348     uint8_t *stringStart;
1349     Line *line;
1350
1351     if(lineCount==MAX_LINE_COUNT) {
1352         fprintf(stderr, "gennames: too many groups\n");
1353         exit(U_BUFFER_OVERFLOW_ERROR);
1354     }
1355
1356     /* store the line lengths first, then the strings */
1357     lineLengthsTop=(lineLengthsTop+1)/2;
1358     stringStart=allocLine(lineLengthsTop+length);
1359     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1360     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1361
1362     line=lines+lineCount;
1363
1364     line->code=groupMSB;
1365     line->length=length;
1366     line->s=stringStart;
1367
1368     ++lineCount;
1369 }
1370
1371 static uint32_t
1372 addToken(uint8_t *s, int16_t length) {
1373     uint8_t *stringStart;
1374
1375     stringStart=allocLine(length+1);
1376     uprv_memcpy(stringStart, s, length);
1377     stringStart[length]=0;
1378
1379     return (uint32_t)(stringStart - stringStore);
1380 }
1381
1382 static void
1383 appendLineLength(int16_t length) {
1384     if(length>=76) {
1385         fprintf(stderr, "gennames: compressed line too long\n");
1386         exit(U_BUFFER_OVERFLOW_ERROR);
1387     }
1388     if(length>=12) {
1389         length-=12;
1390         appendLineLengthNibble((uint8_t)((length>>4)|12));
1391     }
1392     appendLineLengthNibble((uint8_t)length);
1393 }
1394
1395 static void
1396 appendLineLengthNibble(uint8_t nibble) {
1397     if((lineLengthsTop&1)==0) {
1398         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1399     } else {
1400         lineLengths[lineLengthsTop/2]|=nibble&0xf;
1401     }
1402     ++lineLengthsTop;
1403 }
1404
1405 static uint8_t *
1406 allocLine(int32_t length) {
1407     uint32_t top=lineTop+length;
1408     uint8_t *p;
1409
1410     if(top>wordBottom) {
1411         fprintf(stderr, "gennames: out of memory\n");
1412         exit(U_MEMORY_ALLOCATION_ERROR);
1413     }
1414     p=stringStore+lineTop;
1415     lineTop=top;
1416     return p;
1417 }
1418
1419 static uint8_t *
1420 allocWord(uint32_t length) {
1421     uint32_t bottom=wordBottom-length;
1422
1423     if(lineTop>bottom) {
1424         fprintf(stderr, "gennames: out of memory\n");
1425         exit(U_MEMORY_ALLOCATION_ERROR);
1426     }
1427     wordBottom=bottom;
1428     return stringStore+bottom;
1429 }
1430
1431 /*
1432  * Hey, Emacs, please set the following:
1433  *
1434  * Local Variables:
1435  * indent-tabs-mode: nil
1436  * End:
1437  *
1438  */