]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/toolutil/uparse.cpp
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / uparse.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4*******************************************************************************
5*
51004dcb 6* Copyright (C) 2000-2012, International Business Machines
b75a7d8f
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uparse.cpp
f3c0d7a5 11* encoding: UTF-8
b75a7d8f
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2000apr18
16* created by: Markus W. Scherer
17*
18* This file provides a parser for files that are delimited by one single
19* character like ';' or TAB. Example: the Unicode Character Properties files
20* like UnicodeData.txt are semicolon-delimited.
21*/
22
23#include "unicode/utypes.h"
4388f060
A
24#include "unicode/uchar.h"
25#include "unicode/ustring.h"
26#include "unicode/utf16.h"
b75a7d8f
A
27#include "cstring.h"
28#include "filestrm.h"
29#include "uparse.h"
b75a7d8f
A
30#include "ustr_imp.h"
31
32#include <stdio.h>
33
34U_CAPI const char * U_EXPORT2
35u_skipWhitespace(const char *s) {
729e4ab9 36 while(U_IS_INV_WHITESPACE(*s)) {
b75a7d8f
A
37 ++s;
38 }
39 return s;
40}
41
729e4ab9
A
42U_CAPI char * U_EXPORT2
43u_rtrim(char *s) {
44 char *end=uprv_strchr(s, 0);
45 while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
46 *--end = 0;
47 }
48 return end;
49}
50
73c04bcf
A
51/*
52 * If the string starts with # @missing: then return the pointer to the
53 * following non-whitespace character.
54 * Otherwise return the original pointer.
55 * Unicode 5.0 adds such lines in some data files to document
56 * default property values.
57 * Poor man's regex for variable amounts of white space.
58 */
59static const char *
60getMissingLimit(const char *s) {
61 const char *s0=s;
62 if(
63 *(s=u_skipWhitespace(s))=='#' &&
64 *(s=u_skipWhitespace(s+1))=='@' &&
65 0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) &&
66 *(s=u_skipWhitespace(s+7))==':'
67 ) {
68 return u_skipWhitespace(s+1);
69 } else {
70 return s0;
71 }
72}
73
b75a7d8f
A
74U_CAPI void U_EXPORT2
75u_parseDelimitedFile(const char *filename, char delimiter,
76 char *fields[][2], int32_t fieldCount,
77 UParseLineFn *lineFn, void *context,
78 UErrorCode *pErrorCode) {
79 FileStream *file;
80 char line[300];
81 char *start, *limit;
82 int32_t i, length;
83
729e4ab9 84 if(U_FAILURE(*pErrorCode)) {
b75a7d8f
A
85 return;
86 }
87
88 if(fields==NULL || lineFn==NULL || fieldCount<=0) {
89 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
90 return;
91 }
92
93 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
94 filename=NULL;
95 file=T_FileStream_stdin();
96 } else {
97 file=T_FileStream_open(filename, "r");
98 }
99 if(file==NULL) {
100 *pErrorCode=U_FILE_ACCESS_ERROR;
101 return;
102 }
103
104 while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
b75a7d8f 105 /* remove trailing newline characters */
729e4ab9 106 length=(int32_t)(u_rtrim(line)-line);
b75a7d8f 107
73c04bcf
A
108 /*
109 * detect a line with # @missing:
110 * start parsing after that, or else from the beginning of the line
111 * set the default warning for @missing lines
112 */
113 start=(char *)getMissingLimit(line);
114 if(start==line) {
115 *pErrorCode=U_ZERO_ERROR;
116 } else {
117 *pErrorCode=U_USING_DEFAULT_WARNING;
118 }
119
b75a7d8f 120 /* skip this line if it is empty or a comment */
73c04bcf 121 if(*start==0 || *start=='#') {
b75a7d8f
A
122 continue;
123 }
124
125 /* remove in-line comments */
73c04bcf 126 limit=uprv_strchr(start, '#');
b75a7d8f
A
127 if(limit!=NULL) {
128 /* get white space before the pound sign */
729e4ab9 129 while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
b75a7d8f
A
130 --limit;
131 }
132
133 /* truncate the line */
134 *limit=0;
135 }
136
137 /* skip lines with only whitespace */
73c04bcf 138 if(u_skipWhitespace(start)[0]==0) {
b75a7d8f
A
139 continue;
140 }
141
142 /* for each field, call the corresponding field function */
b75a7d8f
A
143 for(i=0; i<fieldCount; ++i) {
144 /* set the limit pointer of this field */
145 limit=start;
146 while(*limit!=delimiter && *limit!=0) {
147 ++limit;
148 }
149
150 /* set the field start and limit in the fields array */
151 fields[i][0]=start;
152 fields[i][1]=limit;
153
154 /* set start to the beginning of the next field, if any */
155 start=limit;
156 if(*start!=0) {
157 ++start;
158 } else if(i+1<fieldCount) {
159 *pErrorCode=U_PARSE_ERROR;
160 limit=line+length;
161 i=fieldCount;
162 break;
163 }
164 }
165
166 /* error in a field function? */
167 if(U_FAILURE(*pErrorCode)) {
168 break;
169 }
170
171 /* call the field function */
172 lineFn(context, fields, fieldCount, pErrorCode);
173 if(U_FAILURE(*pErrorCode)) {
174 break;
175 }
176 }
177
178 if(filename!=NULL) {
179 T_FileStream_close(file);
180 }
181}
182
183/*
184 * parse a list of code points
185 * store them as a UTF-32 string in dest[destCapacity]
186 * return the number of code points
187 */
188U_CAPI int32_t U_EXPORT2
189u_parseCodePoints(const char *s,
190 uint32_t *dest, int32_t destCapacity,
191 UErrorCode *pErrorCode) {
192 char *end;
193 uint32_t value;
194 int32_t count;
195
729e4ab9 196 if(U_FAILURE(*pErrorCode)) {
b75a7d8f
A
197 return 0;
198 }
199 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
200 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf 201 return 0;
b75a7d8f
A
202 }
203
204 count=0;
205 for(;;) {
206 s=u_skipWhitespace(s);
207 if(*s==';' || *s==0) {
208 return count;
209 }
210
211 /* read one code point */
212 value=(uint32_t)uprv_strtoul(s, &end, 16);
729e4ab9 213 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
b75a7d8f
A
214 *pErrorCode=U_PARSE_ERROR;
215 return 0;
216 }
217
218 /* append it to the destination array */
219 if(count<destCapacity) {
220 dest[count++]=value;
221 } else {
222 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
223 }
224
225 /* go to the following characters */
226 s=end;
227 }
228}
229
230/*
231 * parse a list of code points
232 * store them as a string in dest[destCapacity]
233 * set the first code point in *pFirst
234 * @return The length of the string in numbers of UChars.
235 */
236U_CAPI int32_t U_EXPORT2
237u_parseString(const char *s,
238 UChar *dest, int32_t destCapacity,
239 uint32_t *pFirst,
240 UErrorCode *pErrorCode) {
241 char *end;
242 uint32_t value;
243 int32_t destLength;
244
729e4ab9 245 if(U_FAILURE(*pErrorCode)) {
b75a7d8f
A
246 return 0;
247 }
248 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
249 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
51004dcb 250 return 0;
b75a7d8f
A
251 }
252
253 if(pFirst!=NULL) {
254 *pFirst=0xffffffff;
255 }
256
257 destLength=0;
258 for(;;) {
259 s=u_skipWhitespace(s);
260 if(*s==';' || *s==0) {
261 if(destLength<destCapacity) {
262 dest[destLength]=0;
263 } else if(destLength==destCapacity) {
264 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
265 } else {
266 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
267 }
268 return destLength;
269 }
270
271 /* read one code point */
272 value=(uint32_t)uprv_strtoul(s, &end, 16);
729e4ab9 273 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
b75a7d8f
A
274 *pErrorCode=U_PARSE_ERROR;
275 return 0;
276 }
277
278 /* store the first code point */
729e4ab9 279 if(pFirst!=NULL) {
b75a7d8f 280 *pFirst=value;
729e4ab9 281 pFirst=NULL;
b75a7d8f
A
282 }
283
284 /* append it to the destination array */
729e4ab9
A
285 if((destLength+U16_LENGTH(value))<=destCapacity) {
286 U16_APPEND_UNSAFE(dest, destLength, value);
b75a7d8f 287 } else {
729e4ab9 288 destLength+=U16_LENGTH(value);
b75a7d8f
A
289 }
290
291 /* go to the following characters */
292 s=end;
293 }
294}
295
296/* read a range like start or start..end */
297U_CAPI int32_t U_EXPORT2
729e4ab9
A
298u_parseCodePointRangeAnyTerminator(const char *s,
299 uint32_t *pStart, uint32_t *pEnd,
300 const char **terminator,
301 UErrorCode *pErrorCode) {
b75a7d8f
A
302 char *end;
303 uint32_t value;
304
729e4ab9 305 if(U_FAILURE(*pErrorCode)) {
b75a7d8f
A
306 return 0;
307 }
308 if(s==NULL || pStart==NULL || pEnd==NULL) {
309 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf 310 return 0;
b75a7d8f
A
311 }
312
b75a7d8f 313 /* read the start code point */
729e4ab9 314 s=u_skipWhitespace(s);
b75a7d8f 315 value=(uint32_t)uprv_strtoul(s, &end, 16);
729e4ab9 316 if(end<=s || value>=0x110000) {
b75a7d8f
A
317 *pErrorCode=U_PARSE_ERROR;
318 return 0;
319 }
320 *pStart=*pEnd=value;
321
322 /* is there a "..end"? */
323 s=u_skipWhitespace(end);
b75a7d8f 324 if(*s!='.' || s[1]!='.') {
729e4ab9
A
325 *terminator=end;
326 return 1;
b75a7d8f 327 }
729e4ab9 328 s=u_skipWhitespace(s+2);
b75a7d8f
A
329
330 /* read the end code point */
331 value=(uint32_t)uprv_strtoul(s, &end, 16);
729e4ab9 332 if(end<=s || value>=0x110000) {
b75a7d8f
A
333 *pErrorCode=U_PARSE_ERROR;
334 return 0;
335 }
336 *pEnd=value;
337
338 /* is this a valid range? */
339 if(value<*pStart) {
340 *pErrorCode=U_PARSE_ERROR;
341 return 0;
342 }
343
729e4ab9
A
344 *terminator=end;
345 return value-*pStart+1;
346}
347
348U_CAPI int32_t U_EXPORT2
349u_parseCodePointRange(const char *s,
350 uint32_t *pStart, uint32_t *pEnd,
351 UErrorCode *pErrorCode) {
352 const char *terminator;
353 int32_t rangeLength=
354 u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
355 if(U_SUCCESS(*pErrorCode)) {
356 terminator=u_skipWhitespace(terminator);
357 if(*terminator!=';' && *terminator!=0) {
358 *pErrorCode=U_PARSE_ERROR;
359 return 0;
360 }
b75a7d8f 361 }
729e4ab9 362 return rangeLength;
b75a7d8f
A
363}
364
b75a7d8f
A
365U_CAPI int32_t U_EXPORT2
366u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
367 const char *read = source;
368 int32_t i = 0;
369 unsigned int value = 0;
370 if(sLen == -1) {
374ca955 371 sLen = (int32_t)strlen(source);
b75a7d8f
A
372 }
373
374 while(read < source+sLen) {
375 sscanf(read, "%2x", &value);
376 if(i < destCapacity) {
377 dest[i] = (char)value;
378 }
379 i++;
380 read += 2;
381 }
382 return u_terminateChars(dest, destCapacity, i, status);
383}