/*
*******************************************************************************
*
*   Copyright (C) 2000-2006, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   tab size:   8 (not used)
*
*   created on: 2000apr18
*   created by: Markus W. Scherer
*
*   This file provides a parser for files that are delimited by one single
*   character like ';' or TAB. Example: the Unicode Character Properties files
*   like UnicodeData.txt are semicolon-delimited.
*/
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"

#include <stdio.h>
#include <string.h>
/*
 * Skip leading blanks in a NUL-terminated string.
 *
 * s       The string to scan; must not be NULL.
 * return  Pointer to the first character of s that is neither a space nor
 *         a TAB. This may be the terminating NUL.
 *
 * NOTE(review): the loop body and the return statement were missing from the
 * listing this was restored from; confirm against the pristine file.
 */
U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char *s) {
    while(*s==' ' || *s=='\t') {
        ++s;
    }
    return s;
}
/*
 * If the string starts with # @missing: then return the pointer to the
 * following non-whitespace character.
 * Otherwise return the original pointer.
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 * Poor man's regex for variable amounts of white space.
 *
 * Spaces and TABs are allowed before '#', between '#' and '@', between '@'
 * and "missing", and between "missing" and ':'.
 *
 * NOTE(review): the storage class, the if/else wrapper and the fallback
 * return were missing from the listing this was restored from; confirm
 * against the pristine file.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(0!=strncmp(p, "missing", 7)) {
        return s;
    }

    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    /* the whole prefix matched: continue after the colon */
    return u_skipWhitespace(p+1);
}
63 u_parseDelimitedFile(const char *filename
, char delimiter
,
64 char *fields
[][2], int32_t fieldCount
,
65 UParseLineFn
*lineFn
, void *context
,
66 UErrorCode
*pErrorCode
) {
72 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
76 if(fields
==NULL
|| lineFn
==NULL
|| fieldCount
<=0) {
77 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
81 if(filename
==NULL
|| *filename
==0 || (*filename
=='-' && filename
[1]==0)) {
83 file
=T_FileStream_stdin();
85 file
=T_FileStream_open(filename
, "r");
88 *pErrorCode
=U_FILE_ACCESS_ERROR
;
92 while(T_FileStream_readLine(file
, line
, sizeof(line
))!=NULL
) {
93 length
=(int32_t)uprv_strlen(line
);
95 /* remove trailing newline characters */
96 while(length
>0 && (line
[length
-1]=='\r' || line
[length
-1]=='\n')) {
101 * detect a line with # @missing:
102 * start parsing after that, or else from the beginning of the line
103 * set the default warning for @missing lines
105 start
=(char *)getMissingLimit(line
);
107 *pErrorCode
=U_ZERO_ERROR
;
109 *pErrorCode
=U_USING_DEFAULT_WARNING
;
112 /* skip this line if it is empty or a comment */
113 if(*start
==0 || *start
=='#') {
117 /* remove in-line comments */
118 limit
=uprv_strchr(start
, '#');
120 /* get white space before the pound sign */
121 while(limit
>start
&& (*(limit
-1)==' ' || *(limit
-1)=='\t')) {
125 /* truncate the line */
129 /* skip lines with only whitespace */
130 if(u_skipWhitespace(start
)[0]==0) {
134 /* for each field, call the corresponding field function */
135 for(i
=0; i
<fieldCount
; ++i
) {
136 /* set the limit pointer of this field */
138 while(*limit
!=delimiter
&& *limit
!=0) {
142 /* set the field start and limit in the fields array */
146 /* set start to the beginning of the next field, if any */
150 } else if(i
+1<fieldCount
) {
151 *pErrorCode
=U_PARSE_ERROR
;
158 /* error in a field function? */
159 if(U_FAILURE(*pErrorCode
)) {
163 /* call the field function */
164 lineFn(context
, fields
, fieldCount
, pErrorCode
);
165 if(U_FAILURE(*pErrorCode
)) {
171 T_FileStream_close(file
);
176 * parse a list of code points
177 * store them as a UTF-32 string in dest[destCapacity]
178 * return the number of code points
180 U_CAPI
int32_t U_EXPORT2
181 u_parseCodePoints(const char *s
,
182 uint32_t *dest
, int32_t destCapacity
,
183 UErrorCode
*pErrorCode
) {
188 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
191 if(s
==NULL
|| destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
192 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
198 s
=u_skipWhitespace(s
);
199 if(*s
==';' || *s
==0) {
203 /* read one code point */
204 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
205 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!=';' && *end
!=0) || value
>=0x110000) {
206 *pErrorCode
=U_PARSE_ERROR
;
210 /* append it to the destination array */
211 if(count
<destCapacity
) {
214 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
217 /* go to the following characters */
223 * parse a list of code points
224 * store them as a string in dest[destCapacity]
225 * set the first code point in *pFirst
226 * @return The length of the string in numbers of UChars.
228 U_CAPI
int32_t U_EXPORT2
229 u_parseString(const char *s
,
230 UChar
*dest
, int32_t destCapacity
,
232 UErrorCode
*pErrorCode
) {
237 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
240 if(s
==NULL
|| destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
241 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
250 s
=u_skipWhitespace(s
);
251 if(*s
==';' || *s
==0) {
252 if(destLength
<destCapacity
) {
254 } else if(destLength
==destCapacity
) {
255 *pErrorCode
=U_STRING_NOT_TERMINATED_WARNING
;
257 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
262 /* read one code point */
263 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
264 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!=';' && *end
!=0) || value
>=0x110000) {
265 *pErrorCode
=U_PARSE_ERROR
;
269 /* store the first code point */
270 if(destLength
==0 && pFirst
!=NULL
) {
274 /* append it to the destination array */
275 if((destLength
+UTF_CHAR_LENGTH(value
))<=destCapacity
) {
276 UTF_APPEND_CHAR_UNSAFE(dest
, destLength
, value
);
278 destLength
+=UTF_CHAR_LENGTH(value
);
281 /* go to the following characters */
286 /* read a range like start or start..end */
287 U_CAPI
int32_t U_EXPORT2
288 u_parseCodePointRange(const char *s
,
289 uint32_t *pStart
, uint32_t *pEnd
,
290 UErrorCode
*pErrorCode
) {
294 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
297 if(s
==NULL
|| pStart
==NULL
|| pEnd
==NULL
) {
298 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
302 s
=u_skipWhitespace(s
);
303 if(*s
==';' || *s
==0) {
304 *pErrorCode
=U_PARSE_ERROR
;
308 /* read the start code point */
309 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
310 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!='.' && *end
!=';') || value
>=0x110000) {
311 *pErrorCode
=U_PARSE_ERROR
;
316 /* is there a "..end"? */
317 s
=u_skipWhitespace(end
);
318 if(*s
==';' || *s
==0) {
322 if(*s
!='.' || s
[1]!='.') {
323 *pErrorCode
=U_PARSE_ERROR
;
328 /* read the end code point */
329 value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
330 if(end
<=s
|| (*end
!=' ' && *end
!='\t' && *end
!=';') || value
>=0x110000) {
331 *pErrorCode
=U_PARSE_ERROR
;
336 /* is this a valid range? */
338 *pErrorCode
=U_PARSE_ERROR
;
342 /* no garbage after that? */
343 s
=u_skipWhitespace(end
);
344 if(*s
==';' || *s
==0) {
345 return value
-*pStart
+1;
347 *pErrorCode
=U_PARSE_ERROR
;
352 U_CAPI
int32_t U_EXPORT2
353 u_parseUTF8(const char *source
, int32_t sLen
, char *dest
, int32_t destCapacity
, UErrorCode
*status
) {
354 const char *read
= source
;
356 unsigned int value
= 0;
358 sLen
= (int32_t)strlen(source
);
361 while(read
< source
+sLen
) {
362 sscanf(read
, "%2x", &value
);
363 if(i
< destCapacity
) {
364 dest
[i
] = (char)value
;
369 return u_terminateChars(dest
, destCapacity
, i
, status
);