]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genprops/misc/ucdmerge.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / genprops / misc / ucdmerge.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucdmerge.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003feb20
14 * created by: Markus W. Scherer
15 *
16 * Simple tool for Unicode Character Database files with semicolon-delimited fields.
17 * Merges adjacent, identical per-code point data lines into one line with range syntax.
18 *
19 * To compile, just call a C compiler/linker with this source file.
20 * On Windows: cl ucdmerge.c
21 */
22
23 #include <stdio.h>
24 #include <string.h>
25 #include <stdlib.h>
26
/*
 * Skip leading blanks.
 * Returns a pointer to the first character at or after s that is neither
 * a space nor a tab; this may be the terminating NUL.
 */
static const char *
skipWhitespace(const char *s) {
    const char *p;

    for(p=s; *p==' ' || *p=='\t'; ++p) {
        /* nothing to do, just advance */
    }
    return p;
}
34
/*
 * Find where the data portion of a line ends.
 * If the line contains a '#', the data ends before it and before any spaces
 * or tabs that immediately precede it; otherwise the data runs to the
 * terminating NUL.
 * Returns a pointer to the first character after the end of the data.
 */
static char *
endOfData(const char *l) {
    char *hash=strchr(l, '#');
    char *stop;

    if(hash==NULL) {
        /* no comment: the data extends to the end of the string */
        return strchr(l, 0);
    }

    /* back up over the whitespace that separates the data from the comment */
    stop=hash;
    while(stop!=l && (stop[-1]==' ' || stop[-1]=='\t')) {
        --stop;
    }
    return stop;
}
52
/*
 * Compare the data portions of two semicolon-delimited lines.
 * The data portion of a line starts right after its first ';' and ends
 * where endOfData() reports: before a '#' comment and the spaces/tabs
 * preceding it, or at the end of the string.
 *
 * l1, l2: NUL-terminated lines.
 * Returns nonzero if both lines contain a ';' and their data portions are
 * byte-for-byte identical, 0 otherwise.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length1, length2;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        /* no field delimiter: there is no data portion that could match */
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /*
     * compare the line data portions;
     * endOfData() never returns a pointer before its argument,
     * so the differences are non-negative
     */
    length1=(size_t)(end1-l1);
    length2=(size_t)(end2-l2);
    return length1==length2 && 0==memcmp(l1, l2, length1);
}
70
71 extern int
72 main(int argc, const char *argv[]) {
73 static char line[2000], firstLine[2000], lastLine[2000];
74 char *end;
75 long first, last, c;
76 int finished;
77
78 first=last=-1;
79 finished=0;
80
81 for(;;) {
82 if(gets(line)!=NULL) {
83 /* parse the initial code point, if any */
84 c=strtol(line, &end, 16);
85 if(end!=line && *skipWhitespace(end)==';') {
86 /* single code point followed by semicolon and data, keep c */
87 } else {
88 c=-1;
89 }
90 } else {
91 line[0]=0;
92 c=-1;
93 finished=1;
94 }
95
96 if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
97 /* output the current range */
98 if(first==last) {
99 /* there was no range, just output the one line we found */
100 puts(firstLine);
101 } else {
102 /* there was a real range, merge their lines */
103 end=strchr(lastLine, '#');
104 if(end==NULL) {
105 /* no comment in second line */
106 printf("%04lX..%04lX%s\n",
107 first, last, /* code point range */
108 strchr(firstLine, ';'));/* first line starting from the first ; */
109 } else if(strchr(firstLine, '#')==NULL) {
110 /* no comment in first line */
111 printf("%04lX..%04lX%s%s\n",
112 first, last, /* code point range */
113 strchr(firstLine, ';'), /* first line starting from the first ; */
114 end); /* comment from second line */
115 } else {
116 /* merge comments from both lines */
117 printf("%04lX..%04lX%s..%s\n",
118 first, last, /* code point range */
119 strchr(firstLine, ';'), /* first line starting from the first ; */
120 skipWhitespace(end+1)); /* comment from second line, after # and spaces */
121 }
122 }
123 first=last=-1;
124 }
125
126 if(c<0) {
127 if(finished) {
128 break;
129 }
130
131 /* no data on this line, output as is */
132 puts(line);
133 } else {
134 /* data on this line, store for possible range compaction */
135 if(last<0) {
136 /* set as the first line in a possible range */
137 first=last=c;
138 strcpy(firstLine, line);
139 lastLine[0]=0;
140 } else /* must be c==(last+1) && sameData() because of previous conditions */ {
141 /* continue with the current range */
142 last=c;
143 strcpy(lastLine, line);
144 }
145 }
146 }
147
148 return 0;
149 }