]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4388f060 | 4 | * Copyright (C) 1998-2011, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * | |
9 | * File read.c | |
10 | * | |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
14 | * 05/26/99 stephen Creation. | |
15 | * 5/10/01 Ram removed ustdio dependency | |
16 | ******************************************************************************* | |
17 | */ | |
18 | ||
19 | #include "read.h" | |
20 | #include "errmsg.h" | |
21 | #include "unicode/ustring.h" | |
22 | ||
/* Code points the tokenizer compares against while scanning a resource file. */
#define OPENBRACE    0x007B    /* '{'  */
#define CLOSEBRACE   0x007D    /* '}'  */
#define COMMA        0x002C    /* ','  */
#define QUOTE        0x0022    /* '"'  */
#define ESCAPE       0x005C    /* '\\' */
#define SLASH        0x002F    /* '/'  */
#define ASTERISK     0x002A    /* '*'  */
#define SPACE        0x0020    /* ' '  */
#define COLON        0x003A    /* ':'  */
#define BADBOM       0xFFFE    /* U+FEFF with its two bytes swapped; getNextToken() reports it as an error */
#define CR           0x000D    /* carriage return */
#define LF           0x000A    /* line feed */
35 | ||
b75a7d8f A |
36 | static int32_t lineCount; |
37 | ||
38 | /* Protos */ | |
39 | static enum ETokenType getStringToken(UCHARBUF *buf, | |
40 | UChar32 initialChar, | |
41 | struct UString *token, | |
42 | UErrorCode *status); | |
43 | ||
374ca955 A |
44 | static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); |
45 | static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); | |
46 | static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); | |
b75a7d8f A |
47 | static UBool isWhitespace (UChar32 c); |
48 | static UBool isNewline (UChar32 c); | |
49 | ||
4388f060 | 50 | U_CFUNC void resetLineNumber() { |
b75a7d8f A |
51 | lineCount = 1; |
52 | } | |
53 | ||
54 | /* Read and return the next token from the stream. If the token is of | |
55 | type eString, fill in the token parameter with the token. If the | |
56 | token is eError, then the status parameter will contain the | |
57 | specific error. This will be eItemNotFound at the end of file, | |
58 | indicating that all tokens have been returned. This method will | |
59 | never return eString twice in a row; instead, multiple adjacent | |
60 | string tokens will be merged into one, with no intervening | |
61 | space. */ | |
4388f060 A |
62 | U_CFUNC enum ETokenType |
63 | getNextToken(UCHARBUF* buf, | |
64 | struct UString *token, | |
65 | uint32_t *linenumber, /* out: linenumber of token */ | |
66 | struct UString *comment, | |
67 | UErrorCode *status) { | |
b75a7d8f A |
68 | enum ETokenType result; |
69 | UChar32 c; | |
70 | ||
71 | if (U_FAILURE(*status)) { | |
72 | return TOK_ERROR; | |
73 | } | |
74 | ||
75 | /* Skip whitespace */ | |
374ca955 | 76 | c = getNextChar(buf, TRUE, comment, status); |
b75a7d8f A |
77 | |
78 | if (U_FAILURE(*status)) { | |
79 | return TOK_ERROR; | |
80 | } | |
81 | ||
82 | *linenumber = lineCount; | |
83 | ||
84 | switch(c) { | |
85 | case BADBOM: | |
86 | return TOK_ERROR; | |
87 | case OPENBRACE: | |
88 | return TOK_OPEN_BRACE; | |
89 | case CLOSEBRACE: | |
90 | return TOK_CLOSE_BRACE; | |
91 | case COMMA: | |
92 | return TOK_COMMA; | |
93 | case U_EOF: | |
94 | return TOK_EOF; | |
95 | case COLON: | |
96 | return TOK_COLON; | |
97 | ||
98 | default: | |
99 | result = getStringToken(buf, c, token, status); | |
100 | } | |
101 | ||
102 | *linenumber = lineCount; | |
103 | return result; | |
104 | } | |
105 | ||
106 | /* Copy a string token into the given UnicodeString. Upon entry, we | |
107 | have already read the first character of the string token, which is | |
108 | not a whitespace character (but may be a QUOTE or ESCAPE). This | |
109 | function reads all subsequent characters that belong with this | |
110 | string, and copy them into the token parameter. The other | |
111 | important, and slightly convoluted purpose of this function is to | |
112 | merge adjacent strings. It looks forward a bit, and if the next | |
113 | non comment, non whitespace item is a string, it reads it in as | |
114 | well. If two adjacent strings are quoted, they are merged without | |
115 | intervening space. Otherwise a single SPACE character is | |
116 | inserted. */ | |
117 | static enum ETokenType getStringToken(UCHARBUF* buf, | |
118 | UChar32 initialChar, | |
119 | struct UString *token, | |
120 | UErrorCode *status) { | |
121 | UBool lastStringWasQuoted; | |
122 | UChar32 c; | |
123 | UChar target[3] = { '\0' }; | |
124 | UChar *pTarget = target; | |
125 | int len=0; | |
126 | UBool isFollowingCharEscaped=FALSE; | |
374ca955 A |
127 | UBool isNLUnescaped = FALSE; |
128 | UChar32 prevC=0; | |
b75a7d8f A |
129 | |
130 | /* We are guaranteed on entry that initialChar is not a whitespace | |
131 | character. If we are at the EOF, or have some other problem, it | |
132 | doesn't matter; we still want to validly return the initialChar | |
133 | (if nothing else) as a string token. */ | |
134 | ||
135 | if (U_FAILURE(*status)) { | |
136 | return TOK_ERROR; | |
137 | } | |
138 | ||
139 | /* setup */ | |
140 | lastStringWasQuoted = FALSE; | |
141 | c = initialChar; | |
142 | ustr_setlen(token, 0, status); | |
143 | ||
144 | if (U_FAILURE(*status)) { | |
145 | return TOK_ERROR; | |
146 | } | |
147 | ||
148 | for (;;) { | |
149 | if (c == QUOTE) { | |
150 | if (!lastStringWasQuoted && token->fLength > 0) { | |
151 | ustr_ucat(token, SPACE, status); | |
152 | ||
153 | if (U_FAILURE(*status)) { | |
154 | return TOK_ERROR; | |
155 | } | |
156 | } | |
157 | ||
158 | lastStringWasQuoted = TRUE; | |
159 | ||
160 | for (;;) { | |
161 | c = ucbuf_getc(buf,status); | |
162 | ||
163 | /* EOF reached */ | |
164 | if (c == U_EOF) { | |
165 | return TOK_EOF; | |
166 | } | |
167 | ||
168 | /* Unterminated quoted strings */ | |
169 | if (U_FAILURE(*status)) { | |
170 | return TOK_ERROR; | |
171 | } | |
172 | ||
173 | if (c == QUOTE && !isFollowingCharEscaped) { | |
174 | break; | |
175 | } | |
176 | ||
177 | if (c == ESCAPE && !isFollowingCharEscaped) { | |
178 | pTarget = target; | |
179 | c = unescape(buf, status); | |
180 | ||
181 | if (c == U_ERR) { | |
182 | return TOK_ERROR; | |
183 | } | |
374ca955 A |
184 | if(c == CR || c == LF){ |
185 | isNLUnescaped = TRUE; | |
186 | } | |
b75a7d8f A |
187 | } |
188 | ||
189 | if(c==ESCAPE && !isFollowingCharEscaped){ | |
190 | isFollowingCharEscaped = TRUE; | |
191 | }else{ | |
192 | U_APPEND_CHAR32(c, pTarget,len); | |
193 | pTarget = target; | |
194 | ustr_uscat(token, pTarget,len, status); | |
195 | isFollowingCharEscaped = FALSE; | |
196 | len=0; | |
374ca955 A |
197 | if(c == CR || c == LF){ |
198 | if(isNLUnescaped == FALSE && prevC!=CR){ | |
199 | lineCount++; | |
200 | } | |
201 | isNLUnescaped = FALSE; | |
202 | } | |
b75a7d8f A |
203 | } |
204 | ||
205 | if (U_FAILURE(*status)) { | |
206 | return TOK_ERROR; | |
207 | } | |
374ca955 | 208 | prevC = c; |
b75a7d8f A |
209 | } |
210 | } else { | |
211 | if (token->fLength > 0) { | |
212 | ustr_ucat(token, SPACE, status); | |
213 | ||
214 | if (U_FAILURE(*status)) { | |
215 | return TOK_ERROR; | |
216 | } | |
217 | } | |
218 | ||
219 | if(lastStringWasQuoted){ | |
220 | if(getShowWarning()){ | |
221 | warning(lineCount, "Mixing quoted and unquoted strings"); | |
222 | } | |
223 | if(isStrict()){ | |
224 | return TOK_ERROR; | |
225 | } | |
226 | ||
227 | } | |
228 | ||
229 | lastStringWasQuoted = FALSE; | |
230 | ||
231 | /* if we reach here we are mixing | |
232 | * quoted and unquoted strings | |
233 | * warn in normal mode and error in | |
234 | * pedantic mode | |
235 | */ | |
236 | ||
237 | if (c == ESCAPE) { | |
238 | pTarget = target; | |
239 | c = unescape(buf, status); | |
240 | ||
241 | /* EOF reached */ | |
242 | if (c == U_EOF) { | |
243 | return TOK_ERROR; | |
244 | } | |
245 | } | |
246 | ||
247 | U_APPEND_CHAR32(c, pTarget,len); | |
248 | pTarget = target; | |
249 | ustr_uscat(token, pTarget,len, status); | |
250 | len=0; | |
374ca955 | 251 | |
b75a7d8f A |
252 | if (U_FAILURE(*status)) { |
253 | return TOK_ERROR; | |
254 | } | |
255 | ||
256 | for (;;) { | |
257 | /* DON'T skip whitespace */ | |
374ca955 | 258 | c = getNextChar(buf, FALSE, NULL, status); |
b75a7d8f A |
259 | |
260 | /* EOF reached */ | |
261 | if (c == U_EOF) { | |
262 | ucbuf_ungetc(c, buf); | |
263 | return TOK_STRING; | |
264 | } | |
265 | ||
266 | if (U_FAILURE(*status)) { | |
267 | return TOK_STRING; | |
268 | } | |
269 | ||
270 | if (c == QUOTE | |
271 | || c == OPENBRACE | |
272 | || c == CLOSEBRACE | |
273 | || c == COMMA | |
274 | || c == COLON) { | |
275 | ucbuf_ungetc(c, buf); | |
276 | break; | |
277 | } | |
278 | ||
279 | if (isWhitespace(c)) { | |
280 | break; | |
281 | } | |
282 | ||
283 | if (c == ESCAPE) { | |
284 | pTarget = target; | |
285 | c = unescape(buf, status); | |
286 | ||
287 | if (c == U_ERR) { | |
288 | return TOK_ERROR; | |
289 | } | |
290 | } | |
291 | ||
292 | U_APPEND_CHAR32(c, pTarget,len); | |
293 | pTarget = target; | |
294 | ustr_uscat(token, pTarget,len, status); | |
295 | len=0; | |
296 | if (U_FAILURE(*status)) { | |
297 | return TOK_ERROR; | |
298 | } | |
299 | } | |
300 | } | |
301 | ||
302 | /* DO skip whitespace */ | |
374ca955 | 303 | c = getNextChar(buf, TRUE, NULL, status); |
b75a7d8f A |
304 | |
305 | if (U_FAILURE(*status)) { | |
306 | return TOK_STRING; | |
307 | } | |
308 | ||
309 | if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { | |
310 | ucbuf_ungetc(c, buf); | |
311 | return TOK_STRING; | |
312 | } | |
313 | } | |
314 | } | |
315 | ||
374ca955 | 316 | /* Retrieve the next character. If skipwhite is |
b75a7d8f A |
317 | true, whitespace is skipped as well. */ |
318 | static UChar32 getNextChar(UCHARBUF* buf, | |
319 | UBool skipwhite, | |
374ca955 | 320 | struct UString *token, |
b75a7d8f | 321 | UErrorCode *status) { |
374ca955 | 322 | UChar32 c, c2; |
b75a7d8f A |
323 | |
324 | if (U_FAILURE(*status)) { | |
325 | return U_EOF; | |
326 | } | |
327 | ||
328 | for (;;) { | |
329 | c = ucbuf_getc(buf,status); | |
330 | ||
331 | if (c == U_EOF) { | |
332 | return U_EOF; | |
333 | } | |
334 | ||
335 | if (skipwhite && isWhitespace(c)) { | |
336 | continue; | |
337 | } | |
338 | ||
339 | /* This also handles the get() failing case */ | |
340 | if (c != SLASH) { | |
341 | return c; | |
342 | } | |
343 | ||
46f4442e | 344 | c = ucbuf_getc(buf,status); /* "/c" */ |
b75a7d8f A |
345 | |
346 | if (c == U_EOF) { | |
347 | return U_EOF; | |
348 | } | |
349 | ||
350 | switch (c) { | |
46f4442e | 351 | case SLASH: /* "//" */ |
374ca955 | 352 | seekUntilNewline(buf, NULL, status); |
b75a7d8f A |
353 | break; |
354 | ||
729e4ab9 A |
355 | case ASTERISK: /* " / * " */ |
356 | c2 = ucbuf_getc(buf, status); /* "/ * c" */ | |
357 | if(c2 == ASTERISK){ /* "/ * *" */ | |
374ca955 A |
358 | /* parse multi-line comment and store it in token*/ |
359 | seekUntilEndOfComment(buf, token, status); | |
46f4442e | 360 | } else { |
729e4ab9 | 361 | ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ |
374ca955 A |
362 | seekUntilEndOfComment(buf, NULL, status); |
363 | } | |
b75a7d8f A |
364 | break; |
365 | ||
366 | default: | |
46f4442e | 367 | ucbuf_ungetc(c, buf); /* "/c" - put back the c */ |
b75a7d8f A |
368 | /* If get() failed this is a NOP */ |
369 | return SLASH; | |
370 | } | |
374ca955 | 371 | |
b75a7d8f A |
372 | } |
373 | } | |
374 | ||
375 | static void seekUntilNewline(UCHARBUF* buf, | |
374ca955 | 376 | struct UString *token, |
b75a7d8f A |
377 | UErrorCode *status) { |
378 | UChar32 c; | |
379 | ||
380 | if (U_FAILURE(*status)) { | |
381 | return; | |
382 | } | |
383 | ||
384 | do { | |
385 | c = ucbuf_getc(buf,status); | |
374ca955 A |
386 | /* add the char to token */ |
387 | if(token!=NULL){ | |
388 | ustr_u32cat(token, c, status); | |
389 | } | |
b75a7d8f A |
390 | } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); |
391 | } | |
392 | ||
393 | static void seekUntilEndOfComment(UCHARBUF *buf, | |
374ca955 | 394 | struct UString *token, |
b75a7d8f A |
395 | UErrorCode *status) { |
396 | UChar32 c, d; | |
397 | uint32_t line; | |
398 | ||
399 | if (U_FAILURE(*status)) { | |
400 | return; | |
401 | } | |
402 | ||
403 | line = lineCount; | |
404 | ||
405 | do { | |
406 | c = ucbuf_getc(buf, status); | |
407 | ||
408 | if (c == ASTERISK) { | |
409 | d = ucbuf_getc(buf, status); | |
410 | ||
411 | if (d != SLASH) { | |
412 | ucbuf_ungetc(d, buf); | |
413 | } else { | |
414 | break; | |
415 | } | |
416 | } | |
374ca955 A |
417 | /* add the char to token */ |
418 | if(token!=NULL){ | |
419 | ustr_u32cat(token, c, status); | |
420 | } | |
421 | /* increment the lineCount */ | |
422 | isNewline(c); | |
423 | ||
b75a7d8f A |
424 | } while (c != U_EOF && *status == U_ZERO_ERROR); |
425 | ||
426 | if (c == U_EOF) { | |
427 | *status = U_INVALID_FORMAT_ERROR; | |
428 | error(line, "unterminated comment detected"); | |
429 | } | |
430 | } | |
431 | ||
4388f060 | 432 | U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { |
b75a7d8f A |
433 | if (U_FAILURE(*status)) { |
434 | return U_EOF; | |
435 | } | |
436 | ||
437 | /* We expect to be called after the ESCAPE has been seen, but | |
438 | * u_fgetcx needs an ESCAPE to do its magic. */ | |
439 | ucbuf_ungetc(ESCAPE, buf); | |
440 | ||
441 | return ucbuf_getcx32(buf, status); | |
442 | } | |
443 | ||
444 | static UBool isWhitespace(UChar32 c) { | |
445 | switch (c) { | |
446 | /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ | |
447 | case 0x000A: | |
448 | case 0x2029: | |
449 | lineCount++; | |
450 | case 0x000D: | |
451 | case 0x0020: | |
452 | case 0x0009: | |
453 | case 0xFEFF: | |
454 | return TRUE; | |
455 | ||
456 | default: | |
457 | return FALSE; | |
458 | } | |
459 | } | |
460 | ||
461 | static UBool isNewline(UChar32 c) { | |
462 | switch (c) { | |
463 | /* '\n', '\r', 0x2029 */ | |
464 | case 0x000A: | |
465 | case 0x2029: | |
466 | lineCount++; | |
467 | case 0x000D: | |
468 | return TRUE; | |
469 | ||
470 | default: | |
471 | return FALSE; | |
472 | } | |
473 | } |