/*
*******************************************************************************
*
*   Copyright (C) 2000-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   tab size:   8 (not used)
*
*   created on: 2000apr18
*   created by: Markus W. Scherer
*
*   This file provides a parser for files that are delimited by one single
*   character like ';' or TAB. Example: the Unicode Character Properties files
*   like UnicodeData.txt are semicolon-delimited.
*/
21 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
31 U_CAPI
const char * U_EXPORT2
32 u_skipWhitespace(const char *s
) {
33 while(*s
==' ' || *s
=='\t') {
40 u_parseDelimitedFile(const char *filename
, char delimiter
,
41 char *fields
[][2], int32_t fieldCount
,
42 UParseLineFn
*lineFn
, void *context
,
43 UErrorCode
*pErrorCode
) {
49 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
53 if(fields
==NULL
|| lineFn
==NULL
|| fieldCount
<=0) {
54 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
58 if(filename
==NULL
|| *filename
==0 || (*filename
=='-' && filename
[1]==0)) {
60 file
=T_FileStream_stdin();
62 file
=T_FileStream_open(filename
, "r");
65 *pErrorCode
=U_FILE_ACCESS_ERROR
;
69 while(T_FileStream_readLine(file
, line
, sizeof(line
))!=NULL
) {
70 length
=(int32_t)uprv_strlen(line
);
72 /* remove trailing newline characters */
73 while(length
>0 && (line
[length
-1]=='\r' || line
[length
-1]=='\n')) {
77 /* skip this line if it is empty or a comment */
78 if(line
[0]==0 || line
[0]=='#') {
82 /* remove in-line comments */
83 limit
=uprv_strchr(line
, '#');
85 /* get white space before the pound sign */
86 while(limit
>line
&& (*(limit
-1)==' ' || *(limit
-1)=='\t')) {
90 /* truncate the line */
94 /* skip lines with only whitespace */
95 if(u_skipWhitespace(line
)[0]==0) {
99 /* for each field, call the corresponding field function */
101 for(i
=0; i
<fieldCount
; ++i
) {
102 /* set the limit pointer of this field */
104 while(*limit
!=delimiter
&& *limit
!=0) {
108 /* set the field start and limit in the fields array */
112 /* set start to the beginning of the next field, if any */
116 } else if(i
+1<fieldCount
) {
117 *pErrorCode
=U_PARSE_ERROR
;
124 /* error in a field function? */
125 if(U_FAILURE(*pErrorCode
)) {
129 /* call the field function */
130 lineFn(context
, fields
, fieldCount
, pErrorCode
);
131 if(U_FAILURE(*pErrorCode
)) {
137 T_FileStream_close(file
);
142 * parse a list of code points
143 * store them as a UTF-32 string in dest[destCapacity]
144 * return the number of code points
146 U_CAPI
int32_t U_EXPORT2
147 u_parseCodePoints(const char *s
,
148 uint32_t *dest
, int32_t destCapacity
,
149 UErrorCode
*pErrorCode
) {
154 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
157 if(s
==NULL
|| destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
158 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
163 s
=u_skipWhitespace(s
);
164 if(*s
==';' || *s
==0) {
168 /* read one code point */
169 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
170 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!=';' && *end
!=0) || value
>=0x110000) {
171 *pErrorCode
=U_PARSE_ERROR
;
175 /* append it to the destination array */
176 if(count
<destCapacity
) {
179 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
182 /* go to the following characters */
188 * parse a list of code points
189 * store them as a string in dest[destCapacity]
190 * set the first code point in *pFirst
191 * @return The length of the string in numbers of UChars.
193 U_CAPI
int32_t U_EXPORT2
194 u_parseString(const char *s
,
195 UChar
*dest
, int32_t destCapacity
,
197 UErrorCode
*pErrorCode
) {
202 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
205 if(s
==NULL
|| destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
206 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
215 s
=u_skipWhitespace(s
);
216 if(*s
==';' || *s
==0) {
217 if(destLength
<destCapacity
) {
219 } else if(destLength
==destCapacity
) {
220 *pErrorCode
=U_STRING_NOT_TERMINATED_WARNING
;
222 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
227 /* read one code point */
228 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
229 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!=';' && *end
!=0) || value
>=0x110000) {
230 *pErrorCode
=U_PARSE_ERROR
;
234 /* store the first code point */
235 if(destLength
==0 && pFirst
!=NULL
) {
239 /* append it to the destination array */
240 if((destLength
+UTF_CHAR_LENGTH(value
))<=destCapacity
) {
241 UTF_APPEND_CHAR_UNSAFE(dest
, destLength
, value
);
243 destLength
+=UTF_CHAR_LENGTH(value
);
246 /* go to the following characters */
251 /* read a range like start or start..end */
252 U_CAPI
int32_t U_EXPORT2
253 u_parseCodePointRange(const char *s
,
254 uint32_t *pStart
, uint32_t *pEnd
,
255 UErrorCode
*pErrorCode
) {
259 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
262 if(s
==NULL
|| pStart
==NULL
|| pEnd
==NULL
) {
263 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
266 s
=u_skipWhitespace(s
);
267 if(*s
==';' || *s
==0) {
268 *pErrorCode
=U_PARSE_ERROR
;
272 /* read the start code point */
273 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
274 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!='.' && *end
!=';') || value
>=0x110000) {
275 *pErrorCode
=U_PARSE_ERROR
;
280 /* is there a "..end"? */
281 s
=u_skipWhitespace(end
);
282 if(*s
==';' || *s
==0) {
286 if(*s
!='.' || s
[1]!='.') {
287 *pErrorCode
=U_PARSE_ERROR
;
292 /* read the end code point */
293 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
294 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!=';') || value
>=0x110000) {
295 *pErrorCode
=U_PARSE_ERROR
;
300 /* is this a valid range? */
302 *pErrorCode
=U_PARSE_ERROR
;
306 /* no garbage after that? */
307 s
=u_skipWhitespace(end
);
308 if(*s
==';' || *s
==0) {
309 return value
-*pStart
+1;
311 *pErrorCode
=U_PARSE_ERROR
;
316 U_CAPI
int32_t U_EXPORT2
317 u_parseUTF8(const char *source
, int32_t sLen
, char *dest
, int32_t destCapacity
, UErrorCode
*status
) {
318 const char *read
= source
;
320 unsigned int value
= 0;
322 sLen
= (int32_t)strlen(source
);
325 while(read
< source
+sLen
) {
326 sscanf(read
, "%2x", &value
);
327 if(i
< destCapacity
) {
328 dest
[i
] = (char)value
;
333 return u_terminateChars(dest
, destCapacity
, i
, status
);