2 *******************************************************************************
4 * Copyright (C) 2000-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2000apr18
14 * created by: Markus W. Scherer
16 * This file provides a parser for files that are delimited by one single
17 * character like ';' or TAB. Example: the Unicode Character Properties files
18 * like UnicodeData.txt are semicolon-delimited.
21 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
31 U_CAPI
const char * U_EXPORT2
32 u_skipWhitespace(const char *s
) {
33 while(U_IS_INV_WHITESPACE(*s
)) {
39 U_CAPI
char * U_EXPORT2
41 char *end
=uprv_strchr(s
, 0);
42 while(s
<end
&& U_IS_INV_WHITESPACE(*(end
-1))) {
49 * If the string starts with # @missing: then return the pointer to the
50 * following non-whitespace character.
51 * Otherwise return the original pointer.
52 * Unicode 5.0 adds such lines in some data files to document
53 * default property values.
54 * Poor man's regex for variable amounts of white space.
57 getMissingLimit(const char *s
) {
60 *(s
=u_skipWhitespace(s
))=='#' &&
61 *(s
=u_skipWhitespace(s
+1))=='@' &&
62 0==strncmp((s
=u_skipWhitespace(s
+1)), "missing", 7) &&
63 *(s
=u_skipWhitespace(s
+7))==':'
65 return u_skipWhitespace(s
+1);
72 u_parseDelimitedFile(const char *filename
, char delimiter
,
73 char *fields
[][2], int32_t fieldCount
,
74 UParseLineFn
*lineFn
, void *context
,
75 UErrorCode
*pErrorCode
) {
81 if(U_FAILURE(*pErrorCode
)) {
85 if(fields
==NULL
|| lineFn
==NULL
|| fieldCount
<=0) {
86 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
90 if(filename
==NULL
|| *filename
==0 || (*filename
=='-' && filename
[1]==0)) {
92 file
=T_FileStream_stdin();
94 file
=T_FileStream_open(filename
, "r");
97 *pErrorCode
=U_FILE_ACCESS_ERROR
;
101 while(T_FileStream_readLine(file
, line
, sizeof(line
))!=NULL
) {
102 /* remove trailing newline characters */
103 length
=(int32_t)(u_rtrim(line
)-line
);
106 * detect a line with # @missing:
107 * start parsing after that, or else from the beginning of the line
108 * set the default warning for @missing lines
110 start
=(char *)getMissingLimit(line
);
112 *pErrorCode
=U_ZERO_ERROR
;
114 *pErrorCode
=U_USING_DEFAULT_WARNING
;
117 /* skip this line if it is empty or a comment */
118 if(*start
==0 || *start
=='#') {
122 /* remove in-line comments */
123 limit
=uprv_strchr(start
, '#');
125 /* get white space before the pound sign */
126 while(limit
>start
&& U_IS_INV_WHITESPACE(*(limit
-1))) {
130 /* truncate the line */
134 /* skip lines with only whitespace */
135 if(u_skipWhitespace(start
)[0]==0) {
139 /* for each field, call the corresponding field function */
140 for(i
=0; i
<fieldCount
; ++i
) {
141 /* set the limit pointer of this field */
143 while(*limit
!=delimiter
&& *limit
!=0) {
147 /* set the field start and limit in the fields array */
151 /* set start to the beginning of the next field, if any */
155 } else if(i
+1<fieldCount
) {
156 *pErrorCode
=U_PARSE_ERROR
;
163 /* error in a field function? */
164 if(U_FAILURE(*pErrorCode
)) {
168 /* call the field function */
169 lineFn(context
, fields
, fieldCount
, pErrorCode
);
170 if(U_FAILURE(*pErrorCode
)) {
176 T_FileStream_close(file
);
181 * parse a list of code points
182 * store them as a UTF-32 string in dest[destCapacity]
183 * return the number of code points
185 U_CAPI
int32_t U_EXPORT2
186 u_parseCodePoints(const char *s
,
187 uint32_t *dest
, int32_t destCapacity
,
188 UErrorCode
*pErrorCode
) {
193 if(U_FAILURE(*pErrorCode
)) {
196 if(s
==NULL
|| destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
197 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
203 s
=u_skipWhitespace(s
);
204 if(*s
==';' || *s
==0) {
208 /* read one code point */
209 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
210 if(end
<=s
|| (!U_IS_INV_WHITESPACE(*end
) && *end
!=';' && *end
!=0) || value
>=0x110000) {
211 *pErrorCode
=U_PARSE_ERROR
;
215 /* append it to the destination array */
216 if(count
<destCapacity
) {
219 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
222 /* go to the following characters */
228 * parse a list of code points
229 * store them as a string in dest[destCapacity]
230 * set the first code point in *pFirst
231 * @return The length of the string in numbers of UChars.
233 U_CAPI
int32_t U_EXPORT2
234 u_parseString(const char *s
,
235 UChar
*dest
, int32_t destCapacity
,
237 UErrorCode
*pErrorCode
) {
242 if(U_FAILURE(*pErrorCode
)) {
245 if(s
==NULL
|| destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
246 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
255 s
=u_skipWhitespace(s
);
256 if(*s
==';' || *s
==0) {
257 if(destLength
<destCapacity
) {
259 } else if(destLength
==destCapacity
) {
260 *pErrorCode
=U_STRING_NOT_TERMINATED_WARNING
;
262 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
267 /* read one code point */
268 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
269 if(end
<=s
|| (!U_IS_INV_WHITESPACE(*end
) && *end
!=';' && *end
!=0) || value
>=0x110000) {
270 *pErrorCode
=U_PARSE_ERROR
;
274 /* store the first code point */
280 /* append it to the destination array */
281 if((destLength
+U16_LENGTH(value
))<=destCapacity
) {
282 U16_APPEND_UNSAFE(dest
, destLength
, value
);
284 destLength
+=U16_LENGTH(value
);
287 /* go to the following characters */
292 /* read a range like start or start..end */
293 U_CAPI
int32_t U_EXPORT2
294 u_parseCodePointRangeAnyTerminator(const char *s
,
295 uint32_t *pStart
, uint32_t *pEnd
,
296 const char **terminator
,
297 UErrorCode
*pErrorCode
) {
301 if(U_FAILURE(*pErrorCode
)) {
304 if(s
==NULL
|| pStart
==NULL
|| pEnd
==NULL
) {
305 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
309 /* read the start code point */
310 s
=u_skipWhitespace(s
);
311 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
312 if(end
<=s
|| value
>=0x110000) {
313 *pErrorCode
=U_PARSE_ERROR
;
318 /* is there a "..end"? */
319 s
=u_skipWhitespace(end
);
320 if(*s
!='.' || s
[1]!='.') {
324 s
=u_skipWhitespace(s
+2);
326 /* read the end code point */
327 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
328 if(end
<=s
|| value
>=0x110000) {
329 *pErrorCode
=U_PARSE_ERROR
;
334 /* is this a valid range? */
336 *pErrorCode
=U_PARSE_ERROR
;
341 return value
-*pStart
+1;
344 U_CAPI
int32_t U_EXPORT2
345 u_parseCodePointRange(const char *s
,
346 uint32_t *pStart
, uint32_t *pEnd
,
347 UErrorCode
*pErrorCode
) {
348 const char *terminator
;
350 u_parseCodePointRangeAnyTerminator(s
, pStart
, pEnd
, &terminator
, pErrorCode
);
351 if(U_SUCCESS(*pErrorCode
)) {
352 terminator
=u_skipWhitespace(terminator
);
353 if(*terminator
!=';' && *terminator
!=0) {
354 *pErrorCode
=U_PARSE_ERROR
;
361 U_CAPI
int32_t U_EXPORT2
362 u_parseUTF8(const char *source
, int32_t sLen
, char *dest
, int32_t destCapacity
, UErrorCode
*status
) {
363 const char *read
= source
;
365 unsigned int value
= 0;
367 sLen
= (int32_t)strlen(source
);
370 while(read
< source
+sLen
) {
371 sscanf(read
, "%2x", &value
);
372 if(i
< destCapacity
) {
373 dest
[i
] = (char)value
;
378 return u_terminateChars(dest
, destCapacity
, i
, status
);