1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 1998-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
13 * Modification History:
15 * Date Name Description
16 * 05/26/99 stephen Creation.
17 * 5/10/01 Ram removed ustdio dependency
18 *******************************************************************************
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
26 #define OPENBRACE 0x007B
27 #define CLOSEBRACE 0x007D
32 #define ASTERISK 0x002A
/* File-scope line counter: getNextToken() copies it into *linenumber, and it
   is passed to warning() as the line argument. */
39 static int32_t lineCount
;
/* Forward declarations of the static helper functions defined later in this
   file.
   NOTE(review): the getStringToken() prototype is cut off in this extract
   (its remaining parameters and terminator are not visible). */
42 static enum ETokenType
getStringToken(UCHARBUF
*buf
,
44 struct UString
*token
,
47 static UChar32
getNextChar (UCHARBUF
*buf
, UBool skipwhite
, struct UString
*token
, UErrorCode
*status
);
48 static void seekUntilNewline (UCHARBUF
*buf
, struct UString
*token
, UErrorCode
*status
);
49 static void seekUntilEndOfComment (UCHARBUF
*buf
, struct UString
*token
, UErrorCode
*status
);
50 static UBool
isWhitespace (UChar32 c
);
51 static UBool
isNewline (UChar32 c
);
/* NOTE(review): only the opening line of resetLineNumber() is visible in this
   extract; its body is missing here. Presumably it resets lineCount --
   confirm against the complete source. */
53 U_CFUNC
void resetLineNumber() {
57 /* Read and return the next token from the stream. If the token is of
58 type eString, fill in the token parameter with the token. If the
59 token is eError, then the status parameter will contain the
60 specific error. This will be eItemNotFound at the end of file,
61 indicating that all tokens have been returned. This method will
62 never return eString twice in a row; instead, multiple adjacent
63 string tokens will be merged into one, with no intervening space. */
/* getNextToken: read the next token from buf.
   - token      is passed on to getStringToken() to receive string text.
   - linenumber receives the current value of lineCount.
   - comment    is passed to getNextChar() as its token argument.
   - status     is checked with U_FAILURE() on entry and after reading.
   Visible results are TOK_OPEN_BRACE, TOK_CLOSE_BRACE, or whatever
   getStringToken() produces.
   NOTE(review): several lines of this function are missing from this
   extract, so the branch structure around the visible statements cannot be
   confirmed here. */
65 U_CFUNC
enum ETokenType
66 getNextToken(UCHARBUF
* buf
,
67 struct UString
*token
,
68 uint32_t *linenumber
, /* out: linenumber of token */
69 struct UString
*comment
,
71 enum ETokenType result
;
74 if (U_FAILURE(*status
)) {
/* Fetch the next character with skipwhite set to TRUE; comment is handed
   over as getNextChar()'s token argument. */
79 c
= getNextChar(buf
, TRUE
, comment
, status
);
81 if (U_FAILURE(*status
)) {
85 *linenumber
= lineCount
;
91 return TOK_OPEN_BRACE
;
93 return TOK_CLOSE_BRACE
;
/* Hand the character just read to getStringToken() together with token. */
102 result
= getStringToken(buf
, c
, token
, status
);
105 *linenumber
= lineCount
;
109 /* Copy a string token into the given UString. Upon entry, we
110 have already read the first character of the string token, which is
111 not a whitespace character (but may be a QUOTE or ESCAPE). This
112 function reads all subsequent characters that belong with this
113 string, and copies them into the token parameter. The other
114 important, and slightly convoluted, purpose of this function is to
115 merge adjacent strings. It looks forward a bit, and if the next
116 non-comment, non-whitespace item is a string, it reads it in as
117 well. If two adjacent strings are quoted, they are merged without
118 intervening space. Otherwise a single SPACE character is inserted. */
/* getStringToken: accumulate a string token into *token.
   What the visible statements show:
   - token is first truncated to length 0 with ustr_setlen().
   - lastStringWasQuoted records whether the previous piece was quoted.
   - a SPACE is appended with ustr_ucat() when token is non-empty; the first
     of the two places additionally requires that the previous piece was not
     quoted.
   - each character is encoded into the small target buffer with
     U_APPEND_CHAR32 and appended to token with ustr_uscat().
   - ESCAPE sequences are resolved through unescape().
   - OPENBRACE, CLOSEBRACE, COMMA and COLON are pushed back onto buf with
     ucbuf_ungetc().
   NOTE(review): many lines of this function (including part of its
   parameter list) are missing from this extract, so the loop and branch
   structure around the visible statements cannot be confirmed here. */
120 static enum ETokenType
getStringToken(UCHARBUF
* buf
,
122 struct UString
*token
,
123 UErrorCode
*status
) {
124 UBool lastStringWasQuoted
;
126 UChar target
[3] = { '\0' };
127 UChar
*pTarget
= target
;
129 UBool isFollowingCharEscaped
=FALSE
;
130 UBool isNLUnescaped
= FALSE
;
133 /* We are guaranteed on entry that initialChar is not a whitespace
134 character. If we are at the EOF, or have some other problem, it
135 doesn't matter; we still want to validly return the initialChar
136 (if nothing else) as a string token. */
138 if (U_FAILURE(*status
)) {
143 lastStringWasQuoted
= FALSE
;
145 ustr_setlen(token
, 0, status
);
147 if (U_FAILURE(*status
)) {
/* Append a SPACE only when the previous piece was not quoted and token
   already holds text. */
153 if (!lastStringWasQuoted
&& token
->fLength
> 0) {
154 ustr_ucat(token
, SPACE
, status
);
156 if (U_FAILURE(*status
)) {
161 lastStringWasQuoted
= TRUE
;
164 c
= ucbuf_getc(buf
,status
);
171 /* Unterminated quoted strings */
172 if (U_FAILURE(*status
)) {
176 if (c
== QUOTE
&& !isFollowingCharEscaped
) {
180 if (c
== ESCAPE
&& !isFollowingCharEscaped
) {
182 c
= unescape(buf
, status
);
187 if(c
== CR
|| c
== LF
){
188 isNLUnescaped
= TRUE
;
192 if(c
==ESCAPE
&& !isFollowingCharEscaped
){
193 isFollowingCharEscaped
= TRUE
;
195 U_APPEND_CHAR32(c
, pTarget
,len
);
197 ustr_uscat(token
, pTarget
,len
, status
);
198 isFollowingCharEscaped
= FALSE
;
200 if(c
== CR
|| c
== LF
){
201 if(isNLUnescaped
== FALSE
&& prevC
!=CR
){
204 isNLUnescaped
= FALSE
;
208 if (U_FAILURE(*status
)) {
214 if (token
->fLength
> 0) {
215 ustr_ucat(token
, SPACE
, status
);
217 if (U_FAILURE(*status
)) {
/* The previous piece was quoted: report the mix through warning() when
   getShowWarning() is set. */
222 if(lastStringWasQuoted
){
223 if(getShowWarning()){
224 warning(lineCount
, "Mixing quoted and unquoted strings");
232 lastStringWasQuoted
= FALSE
;
/* NOTE(review): the comment that follows is cut off in this extract and has
   no visible terminator of its own. */
234 /* if we reach here we are mixing
235 * quoted and unquoted strings
236 * warn in normal mode and error in
242 c
= unescape(buf
, status
);
250 U_APPEND_CHAR32(c
, pTarget
,len
);
252 ustr_uscat(token
, pTarget
,len
, status
);
255 if (U_FAILURE(*status
)) {
260 /* DON'T skip whitespace */
261 c
= getNextChar(buf
, FALSE
, NULL
, status
);
265 ucbuf_ungetc(c
, buf
);
269 if (U_FAILURE(*status
)) {
278 ucbuf_ungetc(c
, buf
);
282 if (isWhitespace(c
)) {
288 c
= unescape(buf
, status
);
295 U_APPEND_CHAR32(c
, pTarget
,len
);
297 ustr_uscat(token
, pTarget
,len
, status
);
299 if (U_FAILURE(*status
)) {
305 /* DO skip whitespace */
306 c
= getNextChar(buf
, TRUE
, NULL
, status
);
308 if (U_FAILURE(*status
)) {
/* These four characters are pushed back onto buf instead of being consumed
   here. */
312 if (c
== OPENBRACE
|| c
== CLOSEBRACE
|| c
== COMMA
|| c
== COLON
) {
313 ucbuf_ungetc(c
, buf
);
319 /* Retrieve the next character. If skipwhite is
320 true, whitespace is skipped as well. */
/* Comment handling shown by the visible statements: after a second character
   is read, SLASH (a line comment) is consumed with seekUntilNewline(), and
   ASTERISK (a block comment) is consumed with seekUntilEndOfComment(). The
   block comment text is stored in token only when a further ASTERISK
   follows; otherwise that character is pushed back and NULL is passed as
   the token. Any other second character is pushed back with ucbuf_ungetc().
   NOTE(review): several lines of this function (including the skipwhite
   parameter line and the switch header) are missing from this extract. */
321 static UChar32
getNextChar(UCHARBUF
* buf
,
323 struct UString
*token
,
324 UErrorCode
*status
) {
327 if (U_FAILURE(*status
)) {
332 c
= ucbuf_getc(buf
,status
);
338 if (skipwhite
&& isWhitespace(c
)) {
342 /* This also handles the get() failing case */
347 c
= ucbuf_getc(buf
,status
); /* "/c" */
354 case SLASH
: /* "//" */
355 seekUntilNewline(buf
, NULL
, status
);
358 case ASTERISK
: /* " / * " */
359 c2
= ucbuf_getc(buf
, status
); /* "/ * c" */
360 if(c2
== ASTERISK
){ /* "/ * *" */
361 /* parse multi-line comment and store it in token*/
362 seekUntilEndOfComment(buf
, token
, status
);
364 ucbuf_ungetc(c2
, buf
); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
365 seekUntilEndOfComment(buf
, NULL
, status
);
370 ucbuf_ungetc(c
, buf
); /* "/c" - put back the c */
371 /* If get() failed this is a NOP */
/* seekUntilNewline: read characters from buf with ucbuf_getc() and append
   them to token with ustr_u32cat(). The loop ends once isNewline(c) is true,
   c is U_EOF, or *status is anything other than U_ZERO_ERROR.
   NOTE(review): getNextChar() calls this with a NULL token; the line that
   would guard the ustr_u32cat() call is not visible in this extract --
   confirm against the complete source. */
378 static void seekUntilNewline(UCHARBUF
* buf
,
379 struct UString
*token
,
380 UErrorCode
*status
) {
383 if (U_FAILURE(*status
)) {
388 c
= ucbuf_getc(buf
,status
);
389 /* add the char to token */
391 ustr_u32cat(token
, c
, status
);
/* Note the comparison with U_ZERO_ERROR: any non-zero status stops the
   loop, not only values for which U_FAILURE() is true. */
393 } while (!isNewline(c
) && c
!= U_EOF
&& *status
== U_ZERO_ERROR
);
/* seekUntilEndOfComment: consume characters from buf, appending each one to
   token with ustr_u32cat(). A second character d is read as look-ahead and
   pushed back with ucbuf_ungetc(). The loop runs while c is not U_EOF and
   *status is still U_ZERO_ERROR. The visible tail sets *status to
   U_INVALID_FORMAT_ERROR and reports "unterminated comment detected".
   NOTE(review): the conditions guarding the look-ahead, the append, the
   lineCount increment and the error report are missing from this extract;
   getNextChar() also calls this with a NULL token -- confirm the guard in
   the complete source. */
396 static void seekUntilEndOfComment(UCHARBUF
*buf
,
397 struct UString
*token
,
398 UErrorCode
*status
) {
402 if (U_FAILURE(*status
)) {
409 c
= ucbuf_getc(buf
, status
);
/* Look one character ahead, then return it to the buffer. */
412 d
= ucbuf_getc(buf
, status
);
415 ucbuf_ungetc(d
, buf
);
420 /* add the char to token */
422 ustr_u32cat(token
, c
, status
);
424 /* increment the lineCount */
427 } while (c
!= U_EOF
&& *status
== U_ZERO_ERROR
);
430 *status
= U_INVALID_FORMAT_ERROR
;
431 error(line
, "unterminated comment detected");
/* unescape: push an ESCAPE character back onto buf and return the value
   produced by ucbuf_getcx32(buf, status).
   NOTE(review): the statement executed when *status already indicates
   failure is not visible in this extract. */
435 U_CFUNC UChar32
unescape(UCHARBUF
*buf
, UErrorCode
*status
) {
436 if (U_FAILURE(*status
)) {
440 /* We expect to be called after the ESCAPE has been seen, but
441 * ucbuf_getcx32 needs an ESCAPE to do its magic. */
442 ucbuf_ungetc(ESCAPE
, buf
);
444 return ucbuf_getcx32(buf
, status
);
/* isWhitespace: takes a code point c and yields a UBool.
   NOTE(review): the body is missing from this extract; presumably it tests c
   against the characters listed in the comment below -- confirm against the
   complete source. */
447 static UBool
isWhitespace(UChar32 c
) {
449 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
464 static UBool
isNewline(UChar32 c
) {
466 /* '\n', '\r', 0x2029 */