]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/toolutil/uparse.c
ICU-6.2.10.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / uparse.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2000-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uparse.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2000apr18
14 * created by: Markus W. Scherer
15 *
16 * This file provides a parser for files that are delimited by one single
17 * character like ';' or TAB. Example: the Unicode Character Properties files
18 * like UnicodeData.txt are semicolon-delimited.
19 */
20
21 #include "unicode/utypes.h"
22 #include "cstring.h"
23 #include "filestrm.h"
24 #include "uparse.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "ustr_imp.h"
28
29 #include <stdio.h>
30
31 U_CAPI const char * U_EXPORT2
32 u_skipWhitespace(const char *s) {
33 while(*s==' ' || *s=='\t') {
34 ++s;
35 }
36 return s;
37 }
38
39 U_CAPI void U_EXPORT2
40 u_parseDelimitedFile(const char *filename, char delimiter,
41 char *fields[][2], int32_t fieldCount,
42 UParseLineFn *lineFn, void *context,
43 UErrorCode *pErrorCode) {
44 FileStream *file;
45 char line[300];
46 char *start, *limit;
47 int32_t i, length;
48
49 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
50 return;
51 }
52
53 if(fields==NULL || lineFn==NULL || fieldCount<=0) {
54 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
55 return;
56 }
57
58 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
59 filename=NULL;
60 file=T_FileStream_stdin();
61 } else {
62 file=T_FileStream_open(filename, "r");
63 }
64 if(file==NULL) {
65 *pErrorCode=U_FILE_ACCESS_ERROR;
66 return;
67 }
68
69 while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
70 length=(int32_t)uprv_strlen(line);
71
72 /* remove trailing newline characters */
73 while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
74 line[--length]=0;
75 }
76
77 /* skip this line if it is empty or a comment */
78 if(line[0]==0 || line[0]=='#') {
79 continue;
80 }
81
82 /* remove in-line comments */
83 limit=uprv_strchr(line, '#');
84 if(limit!=NULL) {
85 /* get white space before the pound sign */
86 while(limit>line && (*(limit-1)==' ' || *(limit-1)=='\t')) {
87 --limit;
88 }
89
90 /* truncate the line */
91 *limit=0;
92 }
93
94 /* skip lines with only whitespace */
95 if(u_skipWhitespace(line)[0]==0) {
96 continue;
97 }
98
99 /* for each field, call the corresponding field function */
100 start=line;
101 for(i=0; i<fieldCount; ++i) {
102 /* set the limit pointer of this field */
103 limit=start;
104 while(*limit!=delimiter && *limit!=0) {
105 ++limit;
106 }
107
108 /* set the field start and limit in the fields array */
109 fields[i][0]=start;
110 fields[i][1]=limit;
111
112 /* set start to the beginning of the next field, if any */
113 start=limit;
114 if(*start!=0) {
115 ++start;
116 } else if(i+1<fieldCount) {
117 *pErrorCode=U_PARSE_ERROR;
118 limit=line+length;
119 i=fieldCount;
120 break;
121 }
122 }
123
124 /* error in a field function? */
125 if(U_FAILURE(*pErrorCode)) {
126 break;
127 }
128
129 /* call the field function */
130 lineFn(context, fields, fieldCount, pErrorCode);
131 if(U_FAILURE(*pErrorCode)) {
132 break;
133 }
134 }
135
136 if(filename!=NULL) {
137 T_FileStream_close(file);
138 }
139 }
140
141 /*
142 * parse a list of code points
143 * store them as a UTF-32 string in dest[destCapacity]
144 * return the number of code points
145 */
146 U_CAPI int32_t U_EXPORT2
147 u_parseCodePoints(const char *s,
148 uint32_t *dest, int32_t destCapacity,
149 UErrorCode *pErrorCode) {
150 char *end;
151 uint32_t value;
152 int32_t count;
153
154 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
155 return 0;
156 }
157 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
158 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
159 }
160
161 count=0;
162 for(;;) {
163 s=u_skipWhitespace(s);
164 if(*s==';' || *s==0) {
165 return count;
166 }
167
168 /* read one code point */
169 value=(uint32_t)uprv_strtoul(s, &end, 16);
170 if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
171 *pErrorCode=U_PARSE_ERROR;
172 return 0;
173 }
174
175 /* append it to the destination array */
176 if(count<destCapacity) {
177 dest[count++]=value;
178 } else {
179 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
180 }
181
182 /* go to the following characters */
183 s=end;
184 }
185 }
186
187 /*
188 * parse a list of code points
189 * store them as a string in dest[destCapacity]
190 * set the first code point in *pFirst
191 * @return The length of the string in numbers of UChars.
192 */
193 U_CAPI int32_t U_EXPORT2
194 u_parseString(const char *s,
195 UChar *dest, int32_t destCapacity,
196 uint32_t *pFirst,
197 UErrorCode *pErrorCode) {
198 char *end;
199 uint32_t value;
200 int32_t destLength;
201
202 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
203 return 0;
204 }
205 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
206 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207 }
208
209 if(pFirst!=NULL) {
210 *pFirst=0xffffffff;
211 }
212
213 destLength=0;
214 for(;;) {
215 s=u_skipWhitespace(s);
216 if(*s==';' || *s==0) {
217 if(destLength<destCapacity) {
218 dest[destLength]=0;
219 } else if(destLength==destCapacity) {
220 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
221 } else {
222 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
223 }
224 return destLength;
225 }
226
227 /* read one code point */
228 value=(uint32_t)uprv_strtoul(s, &end, 16);
229 if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
230 *pErrorCode=U_PARSE_ERROR;
231 return 0;
232 }
233
234 /* store the first code point */
235 if(destLength==0 && pFirst!=NULL) {
236 *pFirst=value;
237 }
238
239 /* append it to the destination array */
240 if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
241 UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
242 } else {
243 destLength+=UTF_CHAR_LENGTH(value);
244 }
245
246 /* go to the following characters */
247 s=end;
248 }
249 }
250
251 /* read a range like start or start..end */
252 U_CAPI int32_t U_EXPORT2
253 u_parseCodePointRange(const char *s,
254 uint32_t *pStart, uint32_t *pEnd,
255 UErrorCode *pErrorCode) {
256 char *end;
257 uint32_t value;
258
259 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
260 return 0;
261 }
262 if(s==NULL || pStart==NULL || pEnd==NULL) {
263 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
264 }
265
266 s=u_skipWhitespace(s);
267 if(*s==';' || *s==0) {
268 *pErrorCode=U_PARSE_ERROR;
269 return 0;
270 }
271
272 /* read the start code point */
273 value=(uint32_t)uprv_strtoul(s, &end, 16);
274 if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) {
275 *pErrorCode=U_PARSE_ERROR;
276 return 0;
277 }
278 *pStart=*pEnd=value;
279
280 /* is there a "..end"? */
281 s=u_skipWhitespace(end);
282 if(*s==';' || *s==0) {
283 return 1;
284 }
285
286 if(*s!='.' || s[1]!='.') {
287 *pErrorCode=U_PARSE_ERROR;
288 return 0;
289 }
290 s+=2;
291
292 /* read the end code point */
293 value=(uint32_t)uprv_strtoul(s, &end, 16);
294 if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
295 *pErrorCode=U_PARSE_ERROR;
296 return 0;
297 }
298 *pEnd=value;
299
300 /* is this a valid range? */
301 if(value<*pStart) {
302 *pErrorCode=U_PARSE_ERROR;
303 return 0;
304 }
305
306 /* no garbage after that? */
307 s=u_skipWhitespace(end);
308 if(*s==';' || *s==0) {
309 return value-*pStart+1;
310 } else {
311 *pErrorCode=U_PARSE_ERROR;
312 return 0;
313 }
314 }
315
316 U_CAPI int32_t U_EXPORT2
317 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
318 const char *read = source;
319 int32_t i = 0;
320 unsigned int value = 0;
321 if(sLen == -1) {
322 sLen = (int32_t)strlen(source);
323 }
324
325 while(read < source+sLen) {
326 sscanf(read, "%2x", &value);
327 if(i < destCapacity) {
328 dest[i] = (char)value;
329 }
330 i++;
331 read += 2;
332 }
333 return u_terminateChars(dest, destCapacity, i, status);
334 }