]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdmerge.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Merges adjacent, identical per-code point data lines into one line with range syntax.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdmerge.c
*/
22 | ||
23 | #include <stdio.h> | |
24 | #include <string.h> | |
25 | #include <stdlib.h> | |
26 | ||
/*
 * Step over a leading run of blanks and horizontal tabs.
 *
 * Returns a pointer to the first character of s that is neither ' ' nor '\t';
 * this may be the terminating NUL if s consists only of such characters.
 */
static const char *
skipWhitespace(const char *s) {
    const char *p;

    for(p=s; *p==' ' || *p=='\t'; ++p) {
        /* nothing to do, the loop header does the scanning */
    }
    return p;
}
34 | ||
/*
 * Find where the data portion of a line stops.
 *
 * If the line contains a '#', the data ends before any spaces/tabs that
 * immediately precede the '#'. Otherwise the data runs to the terminating NUL
 * (trailing whitespace is not trimmed in that case).
 *
 * Returns a pointer to the first character after the data; never before l.
 */
static char *
endOfData(const char *l) {
    char *hash=strchr(l, '#');

    if(hash==NULL) {
        /* no comment on this line: the data extends to the end of the string */
        return strchr(l, 0);
    }

    /* back up over the whitespace that separates the data from the comment */
    while(hash>l && (hash[-1]==' ' || hash[-1]=='\t')) {
        --hash;
    }
    return hash;
}
52 | ||
/*
 * Compare the data portions of two semicolon-delimited lines.
 *
 * The data portion starts right after the first ';' of a line and ends where
 * endOfData() says it ends (before a '#' comment and the whitespace preceding
 * it, or at the end of the string).
 *
 * Returns non-zero if both data portions have the same length and bytes,
 * 0 otherwise. A line without any ';' has no data portion and never compares
 * equal (previously this case did pointer arithmetic on NULL).
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        /* no field delimiter: nothing that could be merged */
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions; endOfData() never returns less than its argument */
    length=(size_t)(end1-l1);
    return length==(size_t)(end2-l2) && 0==memcmp(l1, l2, length);
}
70 | ||
71 | extern int | |
72 | main(int argc, const char *argv[]) { | |
73 | static char line[2000], firstLine[2000], lastLine[2000]; | |
74 | char *end; | |
75 | long first, last, c; | |
76 | int finished; | |
77 | ||
78 | first=last=-1; | |
79 | finished=0; | |
80 | ||
81 | for(;;) { | |
82 | if(gets(line)!=NULL) { | |
83 | /* parse the initial code point, if any */ | |
84 | c=strtol(line, &end, 16); | |
85 | if(end!=line && *skipWhitespace(end)==';') { | |
86 | /* single code point followed by semicolon and data, keep c */ | |
87 | } else { | |
88 | c=-1; | |
89 | } | |
90 | } else { | |
91 | line[0]=0; | |
92 | c=-1; | |
93 | finished=1; | |
94 | } | |
95 | ||
96 | if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) { | |
97 | /* output the current range */ | |
98 | if(first==last) { | |
99 | /* there was no range, just output the one line we found */ | |
100 | puts(firstLine); | |
101 | } else { | |
102 | /* there was a real range, merge their lines */ | |
103 | end=strchr(lastLine, '#'); | |
104 | if(end==NULL) { | |
105 | /* no comment in second line */ | |
106 | printf("%04lX..%04lX%s\n", | |
107 | first, last, /* code point range */ | |
108 | strchr(firstLine, ';'));/* first line starting from the first ; */ | |
109 | } else if(strchr(firstLine, '#')==NULL) { | |
110 | /* no comment in first line */ | |
111 | printf("%04lX..%04lX%s%s\n", | |
112 | first, last, /* code point range */ | |
113 | strchr(firstLine, ';'), /* first line starting from the first ; */ | |
114 | end); /* comment from second line */ | |
115 | } else { | |
116 | /* merge comments from both lines */ | |
117 | printf("%04lX..%04lX%s..%s\n", | |
118 | first, last, /* code point range */ | |
119 | strchr(firstLine, ';'), /* first line starting from the first ; */ | |
120 | skipWhitespace(end+1)); /* comment from second line, after # and spaces */ | |
121 | } | |
122 | } | |
123 | first=last=-1; | |
124 | } | |
125 | ||
126 | if(c<0) { | |
127 | if(finished) { | |
128 | break; | |
129 | } | |
130 | ||
131 | /* no data on this line, output as is */ | |
132 | puts(line); | |
133 | } else { | |
134 | /* data on this line, store for possible range compaction */ | |
135 | if(last<0) { | |
136 | /* set as the first line in a possible range */ | |
137 | first=last=c; | |
138 | strcpy(firstLine, line); | |
139 | lastLine[0]=0; | |
140 | } else /* must be c==(last+1) && sameData() because of previous conditions */ { | |
141 | /* continue with the current range */ | |
142 | last=c; | |
143 | strcpy(lastLine, line); | |
144 | } | |
145 | } | |
146 | } | |
147 | ||
148 | return 0; | |
149 | } |