2 *******************************************************************************
4 * Copyright (C) 1998-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 05/26/99 stephen Creation.
15 * 5/10/01 Ram removed ustdio dependency
16 *******************************************************************************
21 #include "unicode/ustring.h"
22 #include "unicode/utf16.h"
24 #define OPENBRACE 0x007B
25 #define CLOSEBRACE 0x007D
30 #define ASTERISK 0x002A
37 static int32_t lineCount
;
40 static enum ETokenType
getStringToken(UCHARBUF
*buf
,
42 struct UString
*token
,
45 static UChar32
getNextChar (UCHARBUF
*buf
, UBool skipwhite
, struct UString
*token
, UErrorCode
*status
);
46 static void seekUntilNewline (UCHARBUF
*buf
, struct UString
*token
, UErrorCode
*status
);
47 static void seekUntilEndOfComment (UCHARBUF
*buf
, struct UString
*token
, UErrorCode
*status
);
48 static UBool
isWhitespace (UChar32 c
);
49 static UBool
isNewline (UChar32 c
);
51 U_CFUNC
void resetLineNumber() {
/* Read and return the next token from the stream. If the token is of
   type eString, fill in the token parameter with the token. If the
   token is eError, then the status parameter will contain the
   specific error. This will be eItemNotFound at the end of file,
   indicating that all tokens have been returned. This method will
   never return eString twice in a row; instead, multiple adjacent
   string tokens will be merged into one, with no intervening
   space. */
63 U_CFUNC
enum ETokenType
64 getNextToken(UCHARBUF
* buf
,
65 struct UString
*token
,
66 uint32_t *linenumber
, /* out: linenumber of token */
67 struct UString
*comment
,
69 enum ETokenType result
;
72 if (U_FAILURE(*status
)) {
77 c
= getNextChar(buf
, TRUE
, comment
, status
);
79 if (U_FAILURE(*status
)) {
83 *linenumber
= lineCount
;
89 return TOK_OPEN_BRACE
;
91 return TOK_CLOSE_BRACE
;
100 result
= getStringToken(buf
, c
, token
, status
);
103 *linenumber
= lineCount
;
/* Copy a string token into the given UString. Upon entry, we
   have already read the first character of the string token, which is
   not a whitespace character (but may be a QUOTE or ESCAPE). This
   function reads all subsequent characters that belong with this
   string, and copies them into the token parameter. The other
   important, and slightly convoluted, purpose of this function is to
   merge adjacent strings. It looks forward a bit, and if the next
   non-comment, non-whitespace item is a string, it reads it in as
   well. If two adjacent strings are quoted, they are merged without
   intervening space. Otherwise a single SPACE character is
   inserted. */
118 static enum ETokenType
getStringToken(UCHARBUF
* buf
,
120 struct UString
*token
,
121 UErrorCode
*status
) {
122 UBool lastStringWasQuoted
;
124 UChar target
[3] = { '\0' };
125 UChar
*pTarget
= target
;
127 UBool isFollowingCharEscaped
=FALSE
;
128 UBool isNLUnescaped
= FALSE
;
131 /* We are guaranteed on entry that initialChar is not a whitespace
132 character. If we are at the EOF, or have some other problem, it
133 doesn't matter; we still want to validly return the initialChar
134 (if nothing else) as a string token. */
136 if (U_FAILURE(*status
)) {
141 lastStringWasQuoted
= FALSE
;
143 ustr_setlen(token
, 0, status
);
145 if (U_FAILURE(*status
)) {
151 if (!lastStringWasQuoted
&& token
->fLength
> 0) {
152 ustr_ucat(token
, SPACE
, status
);
154 if (U_FAILURE(*status
)) {
159 lastStringWasQuoted
= TRUE
;
162 c
= ucbuf_getc(buf
,status
);
169 /* Unterminated quoted strings */
170 if (U_FAILURE(*status
)) {
174 if (c
== QUOTE
&& !isFollowingCharEscaped
) {
178 if (c
== ESCAPE
&& !isFollowingCharEscaped
) {
180 c
= unescape(buf
, status
);
185 if(c
== CR
|| c
== LF
){
186 isNLUnescaped
= TRUE
;
190 if(c
==ESCAPE
&& !isFollowingCharEscaped
){
191 isFollowingCharEscaped
= TRUE
;
193 U_APPEND_CHAR32(c
, pTarget
,len
);
195 ustr_uscat(token
, pTarget
,len
, status
);
196 isFollowingCharEscaped
= FALSE
;
198 if(c
== CR
|| c
== LF
){
199 if(isNLUnescaped
== FALSE
&& prevC
!=CR
){
202 isNLUnescaped
= FALSE
;
206 if (U_FAILURE(*status
)) {
212 if (token
->fLength
> 0) {
213 ustr_ucat(token
, SPACE
, status
);
215 if (U_FAILURE(*status
)) {
220 if(lastStringWasQuoted
){
221 if(getShowWarning()){
222 warning(lineCount
, "Mixing quoted and unquoted strings");
230 lastStringWasQuoted
= FALSE
;
232 /* if we reach here we are mixing
233 * quoted and unquoted strings
234 * warn in normal mode and error in
240 c
= unescape(buf
, status
);
248 U_APPEND_CHAR32(c
, pTarget
,len
);
250 ustr_uscat(token
, pTarget
,len
, status
);
253 if (U_FAILURE(*status
)) {
258 /* DON'T skip whitespace */
259 c
= getNextChar(buf
, FALSE
, NULL
, status
);
263 ucbuf_ungetc(c
, buf
);
267 if (U_FAILURE(*status
)) {
276 ucbuf_ungetc(c
, buf
);
280 if (isWhitespace(c
)) {
286 c
= unescape(buf
, status
);
293 U_APPEND_CHAR32(c
, pTarget
,len
);
295 ustr_uscat(token
, pTarget
,len
, status
);
297 if (U_FAILURE(*status
)) {
303 /* DO skip whitespace */
304 c
= getNextChar(buf
, TRUE
, NULL
, status
);
306 if (U_FAILURE(*status
)) {
310 if (c
== OPENBRACE
|| c
== CLOSEBRACE
|| c
== COMMA
|| c
== COLON
) {
311 ucbuf_ungetc(c
, buf
);
317 /* Retrieve the next character. If skipwhite is
318 true, whitespace is skipped as well. */
319 static UChar32
getNextChar(UCHARBUF
* buf
,
321 struct UString
*token
,
322 UErrorCode
*status
) {
325 if (U_FAILURE(*status
)) {
330 c
= ucbuf_getc(buf
,status
);
336 if (skipwhite
&& isWhitespace(c
)) {
340 /* This also handles the get() failing case */
345 c
= ucbuf_getc(buf
,status
); /* "/c" */
352 case SLASH
: /* "//" */
353 seekUntilNewline(buf
, NULL
, status
);
356 case ASTERISK
: /* " / * " */
357 c2
= ucbuf_getc(buf
, status
); /* "/ * c" */
358 if(c2
== ASTERISK
){ /* "/ * *" */
359 /* parse multi-line comment and store it in token*/
360 seekUntilEndOfComment(buf
, token
, status
);
362 ucbuf_ungetc(c2
, buf
); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
363 seekUntilEndOfComment(buf
, NULL
, status
);
368 ucbuf_ungetc(c
, buf
); /* "/c" - put back the c */
369 /* If get() failed this is a NOP */
376 static void seekUntilNewline(UCHARBUF
* buf
,
377 struct UString
*token
,
378 UErrorCode
*status
) {
381 if (U_FAILURE(*status
)) {
386 c
= ucbuf_getc(buf
,status
);
387 /* add the char to token */
389 ustr_u32cat(token
, c
, status
);
391 } while (!isNewline(c
) && c
!= U_EOF
&& *status
== U_ZERO_ERROR
);
394 static void seekUntilEndOfComment(UCHARBUF
*buf
,
395 struct UString
*token
,
396 UErrorCode
*status
) {
400 if (U_FAILURE(*status
)) {
407 c
= ucbuf_getc(buf
, status
);
410 d
= ucbuf_getc(buf
, status
);
413 ucbuf_ungetc(d
, buf
);
418 /* add the char to token */
420 ustr_u32cat(token
, c
, status
);
422 /* increment the lineCount */
425 } while (c
!= U_EOF
&& *status
== U_ZERO_ERROR
);
428 *status
= U_INVALID_FORMAT_ERROR
;
429 error(line
, "unterminated comment detected");
433 U_CFUNC UChar32
unescape(UCHARBUF
*buf
, UErrorCode
*status
) {
434 if (U_FAILURE(*status
)) {
438 /* We expect to be called after the ESCAPE has been seen, but
439 * u_fgetcx needs an ESCAPE to do its magic. */
440 ucbuf_ungetc(ESCAPE
, buf
);
442 return ucbuf_getcx32(buf
, status
);
445 static UBool
isWhitespace(UChar32 c
) {
447 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
462 static UBool
isNewline(UChar32 c
) {
464 /* '\n', '\r', 0x2029 */