icuSources/tools/gennames/gennames.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2005, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gennames.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 1999sep30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads the Unicode character database text file,
  17 *   parses it, and extracts the character code,
  18 *   the "modern" character name, and optionally the
  19 *   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
  20 *   It then tokenizes and compresses the names and builds
  21 *   compact binary tables for random-access lookup
  22 *   in a u_charName() API function.
  23 *
  24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
  25 * (all data is static const)
  26 *
  27 * UDataInfo fields:
  28 *   dataFormat "unam"
  29 *   formatVersion 1.0
  30 *   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
  31 *
  32 * -- data-based names
  33 * uint32_t tokenStringOffset,
  34 *          groupsOffset,
  35 *          groupStringOffset,
  36 *          algNamesOffset;
  37 *
  38 * uint16_t tokenCount;
  39 * uint16_t tokenTable[tokenCount];
  40 *
  41 * char     tokenStrings[]; -- padded to even count
  42 *
  43 * -- strings (groupStrings) are tokenized as follows:
  44 *   for each character c
  45 *       if(c>=tokenCount) write that character c directly
  46 *   else
  47 *       token=tokenTable[c];
  48 *       if(token==0xfffe) -- lead byte of double-byte token
  49 *           token=tokenTable[c<<8|next character];
  50 *       if(token==-1)
  51 *           write c directly
  52 *       else
  53 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
  54 *           append zero-terminated tokenString;
  55 *
  56 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
  57 *    are separated by ';'.
  58 *
  59 * uint16_t groupCount;
  60 * struct {
  61 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
  62 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
  63 *   uint16_t offsetLow;
  64 * } groupTable[groupCount];
  65 *
  66 * char     groupStrings[]; -- padded to 4-count
  67 *
  68 * -- The actual, tokenized group strings are not zero-terminated because
  69 *   that would take up too much space.
  70 *   Instead, they are preceeded by their length, written in a variable-length sequence:
  71 *   For each of the 32 group strings, one or two nibbles are stored for its length.
  72 *   Nibbles (4-bit values, half-bytes) are read MSB first.
  73 *   A nibble with a value of 0..11 directly indicates the length of the name string.
  74 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
  75 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
  76 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
  77 *   For the de-tokenizing, see token description above; the strings immediately follow the
  78 *   32 lengths.
  79 *
  80 * -- algorithmic names
  81 *
  82 * typedef struct AlgorithmicRange {
  83 *     uint32_t rangeStart, rangeEnd;
  84 *     uint8_t algorithmType, algorithmVariant;
  85 *     uint16_t rangeSize;
  86 * } AlgorithmicRange;
  87 *
  88 * uint32_t algRangesCount; -- number of data blocks for ranges of
  89 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
  90 *
  91 * struct {
  92 *     AlgorithmicRange algRange;
  93 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
  94 * } algRanges[algNamesCount];
  95 * -- not a real array because each part has a different size
  96 *    of algRange.rangeSize (including AlgorithmicRange)
  97 *
  98 * -- algorithmic range types:
  99 *
 100 * 0 Names are formed from a string prefix that is stored in
 101 *   the algRangeData (zero-terminated), followed by the Unicode code point
 102 *   of the character in hexadecimal digits;
 103 *   algRange.algorithmVariant digits are written
 104 *
 105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
 106 *   algRange.algorithmVariant is the count of modulo factors
 107 *   algRangeData contains
 108 *       uint16_t factors[algRange.algorithmVariant];
 109 *       char strings[];
 110 *   the first zero-terminated string is written as the prefix; then:
 111 *
 112 *   The rangeStart is subtracted; with the difference, here "code":
 113 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
 114 *       index[i]=code%factor[i];
 115 *       code/=factor[i];
 116 *
 117 *   The strings after the prefix are short pieces that are then appended to the result
 118 *   according to index[0..algRange.algorithmVariant-1].
 119 */
 120
 121 #include <stdio.h>
 122 #include "unicode/utypes.h"
 123 #include "unicode/putil.h"
 124 #include "unicode/uclean.h"
 125 #include "unicode/udata.h"
 126 #include "cmemory.h"
 127 #include "cstring.h"
 128 #include "uarrsort.h"
 129 #include "unewdata.h"
 130 #include "uoptions.h"
 131 #include "uparse.h"
 132
 133 #define STRING_STORE_SIZE 1000000
 134 #define GROUP_STORE_SIZE 5000
 135
 136 #define GROUP_SHIFT 5
 137 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
 138 #define GROUP_MASK (LINES_PER_GROUP-1)
 139
 140 #define MAX_LINE_COUNT 50000
 141 #define MAX_WORD_COUNT 20000
 142 #define MAX_GROUP_COUNT 5000
 143
 144 #define DATA_NAME "unames"
 145 #define DATA_TYPE "icu"
 146 #define VERSION_STRING "unam"
 147 #define NAME_SEPARATOR_CHAR ';'
 148
 149 /* Unicode versions --------------------------------------------------------- */
 150
 151 enum {
 152     UNI_1_0,
 153     UNI_1_1,
 154     UNI_2_0,
 155     UNI_3_0,
 156     UNI_3_1,
 157     UNI_3_2,
 158     UNI_4_0,
 159     UNI_4_0_1,
 160     UNI_4_1,
 161     UNI_VER_COUNT
 162 };
 163
 164 static const UVersionInfo
 165 unicodeVersions[]={
 166     { 1, 0, 0, 0 },
 167     { 1, 1, 0, 0 },
 168     { 2, 0, 0, 0 },
 169     { 3, 0, 0, 0 },
 170     { 3, 1, 0, 0 },
 171     { 3, 2, 0, 0 },
 172     { 4, 0, 0, 0 },
 173     { 4, 0, 1, 0 },
 174     { 4, 1, 0, 0 }
 175 };
 176
 177 static int32_t ucdVersion=UNI_4_1;
 178
 179 static int32_t
 180 findUnicodeVersion(const UVersionInfo version) {
 181     int32_t i;
 182
 183     for(i=0; /* while(version>unicodeVersions[i]) {} */
 184         i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
 185         ++i) {}
 186     if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
 187         --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
 188     }
 189     return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
 190 }
 191
 192 /* generator data ----------------------------------------------------------- */
 193
 194 /* UDataInfo cf. udata.h */
 195 static UDataInfo dataInfo={
 196     sizeof(UDataInfo),
 197     0,
 198
 199     U_IS_BIG_ENDIAN,
 200     U_CHARSET_FAMILY,
 201     sizeof(UChar),
 202     0,
 203
 204     {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
 205     {1, 0, 0, 0},                 /* formatVersion */
 206     {3, 0, 0, 0}                  /* dataVersion */
 207 };
 208
 209 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
 210
 211 static uint8_t stringStore[STRING_STORE_SIZE],
 212                groupStore[GROUP_STORE_SIZE],
 213                lineLengths[LINES_PER_GROUP];
 214
 215 static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
 216
 217 typedef struct {
 218     uint32_t code;
 219     int16_t length;
 220     uint8_t *s;
 221 } Line;
 222
 223 typedef struct {
 224     int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
 225     int16_t count;
 226     int16_t length;
 227     uint8_t *s;
 228 } Word;
 229
 230 static Line lines[MAX_LINE_COUNT];
 231 static Word words[MAX_WORD_COUNT];
 232
 233 static uint32_t lineCount=0, wordCount=0;
 234
 235 static int16_t leadByteCount;
 236
 237 #define LEADBYTE_LIMIT 16
 238
 239 static int16_t tokens[LEADBYTE_LIMIT*256];
 240 static uint32_t tokenCount;
 241
 242 /* prototypes --------------------------------------------------------------- */
 243
 244 static void
 245 init(void);
 246
 247 static void
 248 parseDB(const char *filename, UBool store10Names);
 249
 250 static void
 251 parseName(char *name, int16_t length);
 252
 253 static int16_t
 254 skipNoise(char *line, int16_t start, int16_t limit);
 255
 256 static int16_t
 257 getWord(char *line, int16_t start, int16_t limit);
 258
 259 static void
 260 compress(void);
 261
 262 static void
 263 compressLines(void);
 264
 265 static int16_t
 266 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
 267
 268 static int32_t
 269 compareWords(const void *context, const void *word1, const void *word2);
 270
 271 static void
 272 generateData(const char *dataDir);
 273
 274 static uint32_t
 275 generateAlgorithmicData(UNewDataMemory *pData);
 276
 277 static int16_t
 278 findToken(uint8_t *s, int16_t length);
 279
 280 static Word *
 281 findWord(char *s, int16_t length);
 282
 283 static Word *
 284 addWord(char *s, int16_t length);
 285
 286 static void
 287 countWord(Word *word);
 288
 289 static void
 290 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
 291
 292 static void
 293 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
 294
 295 static uint32_t
 296 addToken(uint8_t *s, int16_t length);
 297
 298 static void
 299 appendLineLength(int16_t length);
 300
 301 static void
 302 appendLineLengthNibble(uint8_t nibble);
 303
 304 static uint8_t *
 305 allocLine(int32_t length);
 306
 307 static uint8_t *
 308 allocWord(uint32_t length);
 309
 310 /* -------------------------------------------------------------------------- */
 311
 312 static UOption options[]={
 313     UOPTION_HELP_H,
 314     UOPTION_HELP_QUESTION_MARK,
 315     UOPTION_VERBOSE,
 316     UOPTION_QUIET,
 317     UOPTION_COPYRIGHT,
 318     UOPTION_DESTDIR,
 319     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
 320     { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }
 321 };
 322
 323 extern int
 324 main(int argc, char* argv[]) {
 325     UVersionInfo version;
 326     UBool store10Names=FALSE;
 327     UErrorCode errorCode = U_ZERO_ERROR;
 328
 329     U_MAIN_INIT_ARGS(argc, argv);
 330
 331     /* Initialize ICU */
 332     u_init(&errorCode);
 333     if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
 334         /* Note: u_init() will try to open ICU property data.
 335          *       failures here are expected when building ICU from scratch.
 336          *       ignore them.
 337          */
 338         fprintf(stderr, "%s: can not initialize ICU.  errorCode = %s\n",
 339             argv[0], u_errorName(errorCode));
 340         exit(1);
 341     }
 342
 343     /* preset then read command line options */
 344     options[5].value=u_getDataDirectory();
 345     options[6].value="4.1";
 346     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
 347
 348     /* error handling, printing usage message */
 349     if(argc<0) {
 350         fprintf(stderr,
 351             "error in command line argument \"%s\"\n",
 352             argv[-argc]);
 353     } else if(argc<2) {
 354         argc=-1;
 355     }
 356     if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
 357         /*
 358          * Broken into chucks because the C89 standard says the minimum
 359          * required supported string length is 509 bytes.
 360          */
 361         fprintf(stderr,
 362             "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
 363             "\n"
 364             "Read the UnicodeData.txt file and \n"
 365             "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
 366             "\n"
 367             "\tfilename  absolute path/filename for the Unicode database text file\n"
 368             "\t\t(default: standard input)\n"
 369             "\n",
 370             argv[0]);
 371         fprintf(stderr,
 372             "Options:\n"
 373             "\t-h or -? or --help  this usage text\n"
 374             "\t-v or --verbose     verbose output\n"
 375             "\t-q or --quiet       no output\n"
 376             "\t-c or --copyright   include a copyright notice\n"
 377             "\t-d or --destdir     destination directory, followed by the path\n"
 378             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
 379             "\t-1 or --unicode1-names  store Unicode 1.0 character names\n");
 380         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 381     }
 382
 383     /* get the options values */
 384     beVerbose=options[2].doesOccur;
 385     beQuiet=options[3].doesOccur;
 386     haveCopyright=options[4].doesOccur;
 387     store10Names=options[7].doesOccur;
 388
 389     /* set the Unicode version */
 390     u_versionFromString(version, options[6].value);
 391     uprv_memcpy(dataInfo.dataVersion, version, 4);
 392     ucdVersion=findUnicodeVersion(version);
 393
 394     init();
 395     parseDB(argc>=2 ? argv[1] : "-", store10Names);
 396     compress();
 397     generateData(options[5].value);
 398
 399     u_cleanup();
 400     return 0;
 401 }
 402
 403 static void
 404 init() {
 405     int i;
 406
 407     for(i=0; i<256; ++i) {
 408         tokens[i]=0;
 409     }
 410 }
 411
 412 /* parsing ------------------------------------------------------------------ */
 413
 414 /* get a name, strip leading and trailing whitespace */
 415 static int16_t
 416 getName(char **pStart, char *limit) {
 417     /* strip leading whitespace */
 418     char *start=(char *)u_skipWhitespace(*pStart);
 419
 420     /* strip trailing whitespace */
 421     while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
 422         --limit;
 423     }
 424
 425     /* return results */
 426     *pStart=start;
 427     return (int16_t)(limit-start);
 428 }
 429
 430 static void U_CALLCONV
 431 lineFn(void *context,
 432        char *fields[][2], int32_t fieldCount,
 433        UErrorCode *pErrorCode) {
 434     char *names[3];
 435     int16_t lengths[3];
 436     static uint32_t prevCode=0;
 437     uint32_t code=0;
 438
 439     if(U_FAILURE(*pErrorCode)) {
 440         return;
 441     }
 442     /* get the character code */
 443     code=uprv_strtoul(fields[0][0], NULL, 16);
 444
 445     /* get the character name */
 446     names[0]=fields[1][0];
 447     lengths[0]=getName(names+0, fields[1][1]);
 448     if(names[0][0]=='<') {
 449         /* do not store pseudo-names in <> brackets */
 450         lengths[0]=0;
 451     }
 452
 453     /* store 1.0 names */
 454     /* get the second character name, the one from Unicode 1.0 */
 455     /* do not store pseudo-names in <> brackets */
 456     names[1]=fields[10][0];
 457     lengths[1]=getName(names+1, fields[10][1]);
 458     if(*(UBool *)context && names[1][0]!='<') {
 459         /* keep the name */
 460     } else {
 461         lengths[1]=0;
 462     }
 463
 464     /* get the ISO 10646 comment */
 465     names[2]=fields[11][0];
 466     lengths[2]=getName(names+2, fields[11][1]);
 467
 468     if(lengths[0]+lengths[1]+lengths[2]==0) {
 469         return;
 470     }
 471
 472     /* check for non-character code points */
 473     if(!UTF_IS_UNICODE_CHAR(code)) {
 474         fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
 475                 (unsigned long)code);
 476         *pErrorCode=U_PARSE_ERROR;
 477         exit(U_PARSE_ERROR);
 478     }
 479
 480     /* check that the code points (code) are in ascending order */
 481     if(code<=prevCode && code>0) {
 482         fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 483                 (unsigned long)code, (unsigned long)prevCode);
 484         *pErrorCode=U_PARSE_ERROR;
 485         exit(U_PARSE_ERROR);
 486     }
 487     prevCode=code;
 488
 489     parseName(names[0], lengths[0]);
 490     parseName(names[1], lengths[1]);
 491     parseName(names[2], lengths[2]);
 492
 493     /*
 494      * set the count argument to
 495      * 1: only store regular names
 496      * 2: store regular and 1.0 names
 497      * 3: store names and ISO 10646 comment
 498      */
 499     addLine(code, names, lengths, 3);
 500 }
 501
 502 static void
 503 parseDB(const char *filename, UBool store10Names) {
 504     char *fields[15][2];
 505     UErrorCode errorCode=U_ZERO_ERROR;
 506
 507     u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
 508     if(U_FAILURE(errorCode)) {
 509         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
 510         exit(errorCode);
 511     }
 512
 513     if(!beQuiet) {
 514         printf("size of all names in the database: %lu\n",
 515             (unsigned long)lineTop);
 516         printf("number of named Unicode characters: %lu\n",
 517             (unsigned long)lineCount);
 518         printf("number of words in the dictionary from these names: %lu\n",
 519             (unsigned long)wordCount);
 520     }
 521 }
 522
 523 static void
 524 parseName(char *name, int16_t length) {
 525     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
 526     Word *word;
 527
 528     while(start<length) {
 529         /* skip any "noise" characters */
 530         limit=skipNoise(name, start, length);
 531         if(start<limit) {
 532             /*prevStart=-1;*/
 533             start=limit;
 534         }
 535         if(start==length) {
 536             break;
 537         }
 538
 539         /* get a word and add it if it is longer than 1 */
 540         limit=getWord(name, start, length);
 541         wordLength=(int16_t)(limit-start);
 542         if(wordLength>1) {
 543             word=findWord(name+start, wordLength);
 544             if(word==NULL) {
 545                 word=addWord(name+start, wordLength);
 546             }
 547             countWord(word);
 548         }
 549
 550 #if 0
 551         /*
 552          * if there was a word before this
 553          * (with no noise in between), then add the pair of words, too
 554          */
 555         if(prevStart!=-1) {
 556             wordLength=limit-prevStart;
 557             word=findWord(name+prevStart, wordLength);
 558             if(word==NULL) {
 559                 word=addWord(name+prevStart, wordLength);
 560             }
 561             countWord(word);
 562         }
 563 #endif
 564
 565         /*prevStart=start;*/
 566         start=limit;
 567     }
 568 }
 569
 570 static UBool U_INLINE
 571 isWordChar(char c) {
 572     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
 573            ('J'<=c && c<='R') ||
 574            ('S'<=c && c<='Z') ||
 575
 576            ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
 577            ('j'<=c && c<='r') ||
 578            ('s'<=c && c<='z') ||
 579
 580            ('0'<=c && c<='9');
 581 }
 582
 583 static int16_t
 584 skipNoise(char *line, int16_t start, int16_t limit) {
 585     /* skip anything that is not part of a word in this sense */
 586     while(start<limit && !isWordChar(line[start])) {
 587         ++start;
 588     }
 589
 590     return start;
 591 }
 592
 593 static int16_t
 594 getWord(char *line, int16_t start, int16_t limit) {
 595     char c=0; /* initialize to avoid a compiler warning although the code was safe */
 596
 597     /* a unicode character name word consists of A-Z0-9 */
 598     while(start<limit && isWordChar(line[start])) {
 599         ++start;
 600     }
 601
 602     /* include a following space or dash */
 603     if(start<limit && ((c=line[start])==' ' || c=='-')) {
 604         ++start;
 605     }
 606
 607     return start;
 608 }
 609
 610 /* compressing -------------------------------------------------------------- */
 611
 612 static void
 613 compress() {
 614     uint32_t i, letterCount;
 615     int16_t wordNumber;
 616     UErrorCode errorCode;
 617
 618     /* sort the words in reverse order by weight */
 619     errorCode=U_ZERO_ERROR;
 620     uprv_sortArray(words, wordCount, sizeof(Word),
 621                     compareWords, NULL, FALSE, &errorCode);
 622
 623     /* remove the words that do not save anything */
 624     while(wordCount>0 && words[wordCount-1].weight<1) {
 625         --wordCount;
 626     }
 627
 628     /* count the letters in the token range */
 629     letterCount=0;
 630     for(i=LEADBYTE_LIMIT; i<256; ++i) {
 631         if(tokens[i]==-1) {
 632             ++letterCount;
 633         }
 634     }
 635     if(!beQuiet) {
 636         printf("number of letters used in the names: %d\n", (int)letterCount);
 637     }
 638
 639     /* do we need double-byte tokens? */
 640     if(wordCount+letterCount<=256) {
 641         /* no, single-byte tokens are enough */
 642         leadByteCount=0;
 643         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
 644             if(tokens[i]!=-1) {
 645                 tokens[i]=wordNumber;
 646                 if(beVerbose) {
 647                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 648                             (int)i, (long)words[wordNumber].weight,
 649                             words[wordNumber].length, words[wordNumber].s);
 650                 }
 651                 ++wordNumber;
 652             }
 653         }
 654         tokenCount=i;
 655     } else {
 656         /*
 657          * The tokens that need two token bytes
 658          * get their weight reduced by their count
 659          * because they save less.
 660          */
 661         tokenCount=256-letterCount;
 662         for(i=tokenCount; i<wordCount; ++i) {
 663             words[i].weight-=words[i].count;
 664         }
 665
 666         /* sort these words in reverse order by weight */
 667         errorCode=U_ZERO_ERROR;
 668         uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
 669                         compareWords, NULL, FALSE, &errorCode);
 670
 671         /* remove the words that do not save anything */
 672         while(wordCount>0 && words[wordCount-1].weight<1) {
 673             --wordCount;
 674         }
 675
 676         /* how many tokens and lead bytes do we have now? */
 677         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
 678         /*
 679          * adjust upwards to take into account that
 680          * double-byte tokens must not
 681          * use NAME_SEPARATOR_CHAR as a second byte
 682          */
 683         tokenCount+=(tokenCount-256+254)/255;
 684
 685         leadByteCount=(int16_t)(tokenCount>>8);
 686         if(leadByteCount<LEADBYTE_LIMIT) {
 687             /* adjust for the real number of lead bytes */
 688             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
 689         } else {
 690             /* limit the number of lead bytes */
 691             leadByteCount=LEADBYTE_LIMIT-1;
 692             tokenCount=LEADBYTE_LIMIT*256;
 693             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
 694             /* adjust again to skip double-byte tokens with ';' */
 695             wordCount-=(tokenCount-256+254)/255;
 696         }
 697
 698         /* set token 0 to word 0 */
 699         tokens[0]=0;
 700         if(beVerbose) {
 701             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
 702                     (long)words[0].weight,
 703                     words[0].length, words[0].s);
 704         }
 705         wordNumber=1;
 706
 707         /* set the lead byte tokens */
 708         for(i=1; (int16_t)i<=leadByteCount; ++i) {
 709             tokens[i]=-2;
 710         }
 711
 712         /* set the tokens */
 713         for(; i<256; ++i) {
 714             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
 715             if(tokens[i]!=-1) {
 716                 tokens[i]=wordNumber;
 717                 if(beVerbose) {
 718                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 719                             (int)i, (long)words[wordNumber].weight,
 720                             words[wordNumber].length, words[wordNumber].s);
 721                 }
 722                 ++wordNumber;
 723             }
 724         }
 725
 726         /* continue above 255 where there are no letters */
 727         for(; (uint32_t)wordNumber<wordCount; ++i) {
 728             if((i&0xff)==NAME_SEPARATOR_CHAR) {
 729                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
 730             } else {
 731                 tokens[i]=wordNumber;
 732                 if(beVerbose) {
 733                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
 734                             (int)i, (long)words[wordNumber].weight,
 735                             words[wordNumber].length, words[wordNumber].s);
 736                 }
 737                 ++wordNumber;
 738             }
 739         }
 740         tokenCount=i; /* should be already tokenCount={i or i+1} */
 741     }
 742
 743     if(!beQuiet) {
 744         printf("number of lead bytes: %d\n", leadByteCount);
 745         printf("number of single-byte tokens: %lu\n",
 746             (unsigned long)256-letterCount-leadByteCount);
 747         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
 748     }
 749
 750     compressLines();
 751 }
 752
 753 static void
 754 compressLines() {
 755     Line *line=NULL;
 756     uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
 757              groupMSB=0xffff, lineCount2;
 758     int16_t groupTop=0;
 759
 760     /* store the groups like lines, reusing the lines' memory */
 761     lineTop=0;
 762     lineCount2=lineCount;
 763     lineCount=0;
 764
 765     /* loop over all lines */
 766     while(i<lineCount2) {
 767         line=lines+i++;
 768         inLine=line->code;
 769
 770         /* segment the lines to groups of 32 */
 771         if(inLine>>GROUP_SHIFT!=groupMSB) {
 772             /* finish the current group with empty lines */
 773             while((++outLine&GROUP_MASK)!=0) {
 774                 appendLineLength(0);
 775             }
 776
 777             /* store the group like a line */
 778             if(groupTop>0) {
 779                 if(groupTop>GROUP_STORE_SIZE) {
 780                     fprintf(stderr, "gennames: group store overflow\n");
 781                     exit(U_BUFFER_OVERFLOW_ERROR);
 782                 }
 783                 addGroup(groupMSB, groupStore, groupTop);
 784                 if(lineTop>(uint32_t)(line->s-stringStore)) {
 785                     fprintf(stderr, "gennames: group store runs into string store\n");
 786                     exit(U_INTERNAL_PROGRAM_ERROR);
 787                 }
 788             }
 789
 790             /* start the new group */
 791             lineLengthsTop=0;
 792             groupTop=0;
 793             groupMSB=inLine>>GROUP_SHIFT;
 794             outLine=(inLine&~GROUP_MASK)-1;
 795         }
 796
 797         /* write empty lines between the previous line in the group and this one */
 798         while(++outLine<inLine) {
 799             appendLineLength(0);
 800         }
 801
 802         /* write characters and tokens for this line */
 803         appendLineLength(compressLine(line->s, line->length, &groupTop));
 804     }
 805
 806     /* finish and store the last group */
 807     if(line && groupMSB!=0xffff) {
 808         /* finish the current group with empty lines */
 809         while((++outLine&GROUP_MASK)!=0) {
 810             appendLineLength(0);
 811         }
 812
 813         /* store the group like a line */
 814         if(groupTop>0) {
 815             if(groupTop>GROUP_STORE_SIZE) {
 816                 fprintf(stderr, "gennames: group store overflow\n");
 817                 exit(U_BUFFER_OVERFLOW_ERROR);
 818             }
 819             addGroup(groupMSB, groupStore, groupTop);
 820             if(lineTop>(uint32_t)(line->s-stringStore)) {
 821                 fprintf(stderr, "gennames: group store runs into string store\n");
 822                 exit(U_INTERNAL_PROGRAM_ERROR);
 823             }
 824         }
 825     }
 826
 827     if(!beQuiet) {
 828         printf("number of groups: %lu\n", (unsigned long)lineCount);
 829     }
 830 }
 831
 832 static int16_t
 833 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
 834     int16_t start, limit, token, groupTop=*pGroupTop;
 835
 836     start=0;
 837     do {
 838         /* write any "noise" characters */
 839         limit=skipNoise((char *)s, start, length);
 840         while(start<limit) {
 841             groupStore[groupTop++]=s[start++];
 842         }
 843
 844         if(start==length) {
 845             break;
 846         }
 847
 848         /* write a word, as token or directly */
 849         limit=getWord((char *)s, start, length);
 850         if(limit-start==1) {
 851             groupStore[groupTop++]=s[start++];
 852         } else {
 853             token=findToken(s+start, (int16_t)(limit-start));
 854             if(token!=-1) {
 855                 if(token>0xff) {
 856                     groupStore[groupTop++]=(uint8_t)(token>>8);
 857                 }
 858                 groupStore[groupTop++]=(uint8_t)token;
 859                 start=limit;
 860             } else {
 861                 while(start<limit) {
 862                     groupStore[groupTop++]=s[start++];
 863                 }
 864             }
 865         }
 866     } while(start<length);
 867
 868     length=(int16_t)(groupTop-*pGroupTop);
 869     *pGroupTop=groupTop;
 870     return length;
 871 }
 872
 873 static int32_t
 874 compareWords(const void *context, const void *word1, const void *word2) {
 875     /* reverse sort by word weight */
 876     return ((Word *)word2)->weight-((Word *)word1)->weight;
 877 }
 878
 879 /* generate output data ----------------------------------------------------- */
 880
 881 static void
 882 generateData(const char *dataDir) {
 883     UNewDataMemory *pData;
 884     UErrorCode errorCode=U_ZERO_ERROR;
 885     uint16_t groupWords[3];
 886     uint32_t i, groupTop=lineTop, offset, size,
 887              tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
 888     long dataLength;
 889     int16_t token;
 890
 891     pData=udata_create(dataDir, DATA_TYPE,DATA_NAME, &dataInfo,
 892                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
 893     if(U_FAILURE(errorCode)) {
 894         fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
 895         exit(errorCode);
 896     }
 897
 898     /* first, see how much space we need, and prepare the token strings */
 899     for(i=0; i<tokenCount; ++i) {
 900         token=tokens[i];
 901         if(token!=-1 && token!=-2) {
 902             tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
 903         }
 904     }
 905
 906     /*
 907      * Required padding for data swapping:
 908      * The token table undergoes a permutation during data swapping when the
 909      * input and output charsets are different.
 910      * The token table cannot grow during swapping, so we need to make sure that
 911      * the table is long enough for successful in-place permutation.
 912      *
 913      * We simply round up tokenCount to the next multiple of 256 to account for
 914      * all possible permutations.
 915      *
 916      * An optimization is possible if we only ever swap between ASCII and EBCDIC:
 917      *
 918      * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
 919      * and will be swapped between ASCII and EBCDIC between
 920      * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
 921      * This should be the only -1 entry in tokens[256..511] on which the data
 922      * swapper bases its trail byte permutation map (trailMap[]).
 923      *
 924      * It would be sufficient to increase tokenCount so that its lower 8 bits
 925      * are at least 0x5e+1 to make room for swapping between the two semicolons.
 926      * For values higher than 0x5e, the trail byte permutation map (trailMap[])
 927      * should always be an identity map, where we do not need additional room.
 928      */
 929     i=tokenCount;
 930     tokenCount=(tokenCount+0xff)&~0xff;
 931     if(!beQuiet && i<tokenCount) {
 932         printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
 933     }
 934     for(; i<tokenCount; ++i) {
 935         if((i&0xff)==NAME_SEPARATOR_CHAR) {
 936             tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
 937         } else {
 938             tokens[i]=0; /* unused token for padding */
 939         }
 940     }
 941
 942     /*
 943      * Calculate the total size in bytes of the data including:
 944      * - the offset to the token strings, uint32_t (4)
 945      * - the offset to the group table, uint32_t (4)
 946      * - the offset to the group strings, uint32_t (4)
 947      * - the offset to the algorithmic names, uint32_t (4)
 948      *
 949      * - the number of tokens, uint16_t (2)
 950      * - the token table, uint16_t[tokenCount] (2*tokenCount)
 951      *
 952      * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
 953      *
 954      * - the number of groups, uint16_t (2)
 955      * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
 956      *
 957      * - the group strings (groupTop), 2-padded
 958      *
 959      * - the size of the data for the algorithmic names
 960      */
 961     tokenStringOffset=4+4+4+4+2+2*tokenCount;
 962     groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
 963     groupStringOffset=groupsOffset+2+6*lineCount;
 964     algNamesOffset=(groupStringOffset+groupTop+3)&~3;
 965
 966     offset=generateAlgorithmicData(NULL);
 967     size=algNamesOffset+offset;
 968
 969     if(!beQuiet) {
 970         printf("size of the Unicode Names data:\n"
 971                "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
 972                 (unsigned long)size, (unsigned long)(lineTop-groupTop),
 973                 (unsigned long)groupTop, (unsigned long)offset);
 974     }
 975
 976     /* write the data to the file */
 977     /* offsets */
 978     udata_write32(pData, tokenStringOffset);
 979     udata_write32(pData, groupsOffset);
 980     udata_write32(pData, groupStringOffset);
 981     udata_write32(pData, algNamesOffset);
 982
 983     /* token table */
 984     udata_write16(pData, (uint16_t)tokenCount);
 985     udata_writeBlock(pData, tokens, 2*tokenCount);
 986
 987     /* token strings */
 988     udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
 989     if((lineTop-groupTop)&1) {
 990         /* 2-padding */
 991         udata_writePadding(pData, 1);
 992     }
 993
 994     /* group table */
 995     udata_write16(pData, (uint16_t)lineCount);
 996     for(i=0; i<lineCount; ++i) {
 997         /* groupMSB */
 998         groupWords[0]=(uint16_t)lines[i].code;
 999
1000         /* offset */
1001         offset = (uint32_t)(lines[i].s - stringStore);
1002         groupWords[1]=(uint16_t)(offset>>16);
1003         groupWords[2]=(uint16_t)(offset);
1004         udata_writeBlock(pData, groupWords, 6);
1005     }
1006
1007     /* group strings */
1008     udata_writeBlock(pData, stringStore, groupTop);
1009
1010     /* 4-align the algorithmic names data */
1011     udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));
1012
1013     generateAlgorithmicData(pData);
1014
1015     /* finish up */
1016     dataLength=udata_finish(pData, &errorCode);
1017     if(U_FAILURE(errorCode)) {
1018         fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
1019         exit(errorCode);
1020     }
1021
1022     if(dataLength!=(long)size) {
1023         fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
1024 dataLength, (unsigned long)size);
1025         exit(U_INTERNAL_PROGRAM_ERROR);
1026     }
1027 }
1028
1029 /* the structure for algorithmic names needs to be 4-aligned */
1030 typedef struct AlgorithmicRange {
1031     uint32_t rangeStart, rangeEnd;
1032     uint8_t algorithmType, algorithmVariant;
1033     uint16_t rangeSize;
1034 } AlgorithmicRange;
1035
1036 static uint32_t
1037 generateAlgorithmicData(UNewDataMemory *pData) {
1038     static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
1039 #   define PREFIX_LENGTH 23
1040 #   define PREFIX_LENGTH_4 24
1041     uint32_t countAlgRanges;
1042
1043     static AlgorithmicRange cjkExtA={
1044         0x3400, 0x4db5,
1045         0, 4,
1046         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1047     };
1048     static AlgorithmicRange cjk={
1049         0x4e00, 0x9fa5,
1050         0, 4,
1051         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1052     };
1053     static AlgorithmicRange cjkExtB={
1054         0x20000, 0x2a6d6,
1055         0, 5,
1056         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1057     };
1058
1059     static char jamo[]=
1060         "HANGUL SYLLABLE \0"
1061
1062         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1063         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1064
1065         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1066         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1067         "YU\0EU\0YI\0I\0"
1068
1069         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1070         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1071         "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1072     ;
1073
1074     static AlgorithmicRange hangul={
1075         0xac00, 0xd7a3,
1076         1, 3,
1077         sizeof(AlgorithmicRange)+6+sizeof(jamo)
1078     };
1079
1080     /* modulo factors, maximum 8 */
1081     /* 3 factors: 19, 21, 28, most-to-least-significant */
1082     static uint16_t hangulFactors[3]={
1083         19, 21, 28
1084     };
1085
1086     uint32_t size;
1087
1088     size=0;
1089
1090     if(ucdVersion>=UNI_4_1) {
1091         /* Unicode 4.1 and up has a longer CJK Unihan range than before */
1092         cjk.rangeEnd=0x9FBB;
1093     }
1094
1095     /* number of ranges of algorithmic names */
1096     if(ucdVersion>=UNI_3_1) {
1097         /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1098         countAlgRanges=4;
1099     } else if(ucdVersion>=UNI_3_0) {
1100         /* Unicode 3.0 has 3 ranges including CJK Extension A */
1101         countAlgRanges=3;
1102     } else {
1103         /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1104         countAlgRanges=2;
1105     }
1106
1107     if(pData!=NULL) {
1108         udata_write32(pData, countAlgRanges);
1109     } else {
1110         size+=4;
1111     }
1112
1113     /*
1114      * each range:
1115      * uint32_t rangeStart
1116      * uint32_t rangeEnd
1117      * uint8_t algorithmType
1118      * uint8_t algorithmVariant
1119      * uint16_t size of range data
1120      * uint8_t[size] data
1121      */
1122
1123     /* range 0: cjk extension a */
1124     if(countAlgRanges>=3) {
1125         if(pData!=NULL) {
1126             udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1127             udata_writeString(pData, prefix, PREFIX_LENGTH);
1128             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1129                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1130             }
1131         } else {
1132             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1133         }
1134     }
1135
1136     /* range 1: cjk */
1137     if(pData!=NULL) {
1138         udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1139         udata_writeString(pData, prefix, PREFIX_LENGTH);
1140         if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1141             udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1142         }
1143     } else {
1144         size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1145     }
1146
1147     /* range 2: hangul syllables */
1148     if(pData!=NULL) {
1149         udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1150         udata_writeBlock(pData, hangulFactors, 6);
1151         udata_writeString(pData, jamo, sizeof(jamo));
1152     } else {
1153         size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1154     }
1155
1156     /* range 3: cjk extension b */
1157     if(countAlgRanges>=4) {
1158         if(pData!=NULL) {
1159             udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1160             udata_writeString(pData, prefix, PREFIX_LENGTH);
1161             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1162                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1163             }
1164         } else {
1165             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1166         }
1167     }
1168
1169     return size;
1170 }
1171
1172 /* helpers ------------------------------------------------------------------ */
1173
1174 static int16_t
1175 findToken(uint8_t *s, int16_t length) {
1176     int16_t i, token;
1177
1178     for(i=0; i<(int16_t)tokenCount; ++i) {
1179         token=tokens[i];
1180         if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1181             return i;
1182         }
1183     }
1184
1185     return -1;
1186 }
1187
1188 static Word *
1189 findWord(char *s, int16_t length) {
1190     uint32_t i;
1191
1192     for(i=0; i<wordCount; ++i) {
1193         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1194             return words+i;
1195         }
1196     }
1197
1198     return NULL;
1199 }
1200
1201 static Word *
1202 addWord(char *s, int16_t length) {
1203     uint8_t *stringStart;
1204     Word *word;
1205
1206     if(wordCount==MAX_WORD_COUNT) {
1207         fprintf(stderr, "gennames: too many words\n");
1208         exit(U_BUFFER_OVERFLOW_ERROR);
1209     }
1210
1211     stringStart=allocWord(length);
1212     uprv_memcpy(stringStart, s, length);
1213
1214     word=words+wordCount;
1215
1216     /*
1217      * Initialize the weight with the costs for this token:
1218      * a zero-terminated string and a 16-bit offset.
1219      */
1220     word->weight=-(length+1+2);
1221     word->count=0;
1222     word->length=length;
1223     word->s=stringStart;
1224
1225     ++wordCount;
1226
1227     return word;
1228 }
1229
1230 static void
1231 countWord(Word *word) {
1232     /* add to the weight the savings: the length of the word minus 1 byte for the token */
1233     word->weight+=word->length-1;
1234     ++word->count;
1235 }
1236
1237 static void
1238 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1239     uint8_t *stringStart;
1240     Line *line;
1241     int16_t i, length;
1242
1243     if(lineCount==MAX_LINE_COUNT) {
1244         fprintf(stderr, "gennames: too many lines\n");
1245         exit(U_BUFFER_OVERFLOW_ERROR);
1246     }
1247
1248     /* find the last non-empty name */
1249     while(count>0 && lengths[count-1]==0) {
1250         --count;
1251     }
1252     if(count==0) {
1253         return; /* should not occur: caller should not have called */
1254     }
1255
1256     /* there will be (count-1) separator characters */
1257     i=count;
1258     length=count-1;
1259
1260     /* add lengths of strings */
1261     while(i>0) {
1262         length+=lengths[--i];
1263     }
1264
1265     /* allocate line memory */
1266     stringStart=allocLine(length);
1267
1268     /* copy all strings into the line memory */
1269     length=0; /* number of chars copied so far */
1270     for(i=0; i<count; ++i) {
1271         if(i>0) {
1272             stringStart[length++]=NAME_SEPARATOR_CHAR;
1273         }
1274         if(lengths[i]>0) {
1275             uprv_memcpy(stringStart+length, names[i], lengths[i]);
1276             length+=lengths[i];
1277         }
1278     }
1279
1280     line=lines+lineCount;
1281
1282     line->code=code;
1283     line->length=length;
1284     line->s=stringStart;
1285
1286     ++lineCount;
1287
1288     /* prevent a character value that is actually in a name from becoming a token */
1289     while(length>0) {
1290         tokens[stringStart[--length]]=-1;
1291     }
1292 }
1293
1294 static void
1295 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1296     uint8_t *stringStart;
1297     Line *line;
1298
1299     if(lineCount==MAX_LINE_COUNT) {
1300         fprintf(stderr, "gennames: too many groups\n");
1301         exit(U_BUFFER_OVERFLOW_ERROR);
1302     }
1303
1304     /* store the line lengths first, then the strings */
1305     lineLengthsTop=(lineLengthsTop+1)/2;
1306     stringStart=allocLine(lineLengthsTop+length);
1307     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1308     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1309
1310     line=lines+lineCount;
1311
1312     line->code=groupMSB;
1313     line->length=length;
1314     line->s=stringStart;
1315
1316     ++lineCount;
1317 }
1318
1319 static uint32_t
1320 addToken(uint8_t *s, int16_t length) {
1321     uint8_t *stringStart;
1322
1323     stringStart=allocLine(length+1);
1324     uprv_memcpy(stringStart, s, length);
1325     stringStart[length]=0;
1326
1327     return (uint32_t)(stringStart - stringStore);
1328 }
1329
1330 static void
1331 appendLineLength(int16_t length) {
1332     if(length>=76) {
1333         fprintf(stderr, "gennames: compressed line too long\n");
1334         exit(U_BUFFER_OVERFLOW_ERROR);
1335     }
1336     if(length>=12) {
1337         length-=12;
1338         appendLineLengthNibble((uint8_t)((length>>4)|12));
1339     }
1340     appendLineLengthNibble((uint8_t)length);
1341 }
1342
1343 static void
1344 appendLineLengthNibble(uint8_t nibble) {
1345     if((lineLengthsTop&1)==0) {
1346         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1347     } else {
1348         lineLengths[lineLengthsTop/2]|=nibble&0xf;
1349     }
1350     ++lineLengthsTop;
1351 }
1352
1353 static uint8_t *
1354 allocLine(int32_t length) {
1355     uint32_t top=lineTop+length;
1356     uint8_t *p;
1357
1358     if(top>wordBottom) {
1359         fprintf(stderr, "gennames: out of memory\n");
1360         exit(U_MEMORY_ALLOCATION_ERROR);
1361     }
1362     p=stringStore+lineTop;
1363     lineTop=top;
1364     return p;
1365 }
1366
1367 static uint8_t *
1368 allocWord(uint32_t length) {
1369     uint32_t bottom=wordBottom-length;
1370
1371     if(lineTop>bottom) {
1372         fprintf(stderr, "gennames: out of memory\n");
1373         exit(U_MEMORY_ALLOCATION_ERROR);
1374     }
1375     wordBottom=bottom;
1376     return stringStore+bottom;
1377 }
1378
1379 /*
1380  * Hey, Emacs, please set the following:
1381  *
1382  * Local Variables:
1383  * indent-tabs-mode: nil
1384  * End:
1385  *
1386  */