icuSources/tools/genprops/misc/ucdmerge.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2003, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucdmerge.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2003feb20
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Simple tool for Unicode Character Database files with semicolon-delimited fields.
  17 *   Merges adjacent, identical per-code point data lines into one line with range syntax.
  18 *
  19 *   To compile, just call a C compiler/linker with this source file.
  20 *   On Windows: cl ucdmerge.c
  21 */
  22
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <stdlib.h>
  26
  27 static const char *
  28 skipWhitespace(const char *s) {
  29     while(*s==' ' || *s=='\t') {
  30         ++s;
  31     }
  32     return s;
  33 }
  34
  35 /* return the first character position after the end of the data */
  36 static char *
  37 endOfData(const char *l) {
  38     char *end;
  39     char c;
  40
  41     end=strchr(l, '#');
  42     if(end!=NULL) {
  43         /* ignore whitespace before the comment */
  44         while(l!=end && ((c=*(end-1))==' ' || c=='\t')) {
  45             --end;
  46         }
  47     } else {
  48         end=strchr(l, 0);
  49     }
  50     return end;
  51 }
  52
  53 static int
  54 sameData(const char *l1, const char *l2) {
  55     char *end1, *end2;
  56     int length;
  57
  58     /* find the first semicolon in each line - there must be one */
  59     l1=strchr(l1, ';')+1;
  60     l2=strchr(l2, ';')+1;
  61
  62     /* find the end of data: end of string or start of comment */
  63     end1=endOfData(l1);
  64     end2=endOfData(l2);
  65
  66     /* compare the line data portions */
  67     length=end1-l1;
  68     return length==(end2-l2) && 0==memcmp(l1, l2, length);
  69 }
  70
  71 extern int
  72 main(int argc, const char *argv[]) {
  73     static char line[2000], firstLine[2000], lastLine[2000];
  74     char *end;
  75     long first, last, c;
  76     int finished;
  77
  78     first=last=-1;
  79     finished=0;
  80
  81     for(;;) {
  82         if(gets(line)!=NULL) {
  83             /* parse the initial code point, if any */
  84             c=strtol(line, &end, 16);
  85             if(end!=line && *skipWhitespace(end)==';') {
  86                 /* single code point followed by semicolon and data, keep c */
  87             } else {
  88                 c=-1;
  89             }
  90         } else {
  91             line[0]=0;
  92             c=-1;
  93             finished=1;
  94         }
  95
  96         if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
  97             /* output the current range */
  98             if(first==last) {
  99                 /* there was no range, just output the one line we found */
 100                 puts(firstLine);
 101             } else {
 102                 /* there was a real range, merge their lines */
 103                 end=strchr(lastLine, '#');
 104                 if(end==NULL) {
 105                     /* no comment in second line */
 106                     printf("%04lX..%04lX%s\n",
 107                             first, last,            /* code point range */
 108                             strchr(firstLine, ';'));/* first line starting from the first ; */
 109                 } else if(strchr(firstLine, '#')==NULL) {
 110                     /* no comment in first line */
 111                     printf("%04lX..%04lX%s%s\n",
 112                             first, last,            /* code point range */
 113                             strchr(firstLine, ';'), /* first line starting from the first ; */
 114                             end);                   /* comment from second line */
 115                 } else {
 116                     /* merge comments from both lines */
 117                     printf("%04lX..%04lX%s..%s\n",
 118                             first, last,            /* code point range */
 119                             strchr(firstLine, ';'), /* first line starting from the first ; */
 120                             skipWhitespace(end+1)); /* comment from second line, after # and spaces */
 121                 }
 122             }
 123             first=last=-1;
 124         }
 125
 126         if(c<0) {
 127             if(finished) {
 128                 break;
 129             }
 130
 131             /* no data on this line, output as is */
 132             puts(line);
 133         } else {
 134             /* data on this line, store for possible range compaction */
 135             if(last<0) {
 136                 /* set as the first line in a possible range */
 137                 first=last=c;
 138                 strcpy(firstLine, line);
 139                 lastLine[0]=0;
 140             } else /* must be c==(last+1) && sameData() because of previous conditions */ {
 141                 /* continue with the current range */
 142                 last=c;
 143                 strcpy(lastLine, line);
 144             }
 145         }
 146     }
 147
 148     return 0;
 149 }