]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 1998-2011, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * | |
9 | * File read.c | |
10 | * | |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
14 | * 05/26/99 stephen Creation. | |
15 | * 5/10/01 Ram removed ustdio dependency | |
16 | ******************************************************************************* | |
17 | */ | |
18 | ||
19 | #include "read.h" | |
20 | #include "errmsg.h" | |
21 | #include "unicode/ustring.h" | |
22 | ||
23 | #define OPENBRACE 0x007B | |
24 | #define CLOSEBRACE 0x007D | |
25 | #define COMMA 0x002C | |
26 | #define QUOTE 0x0022 | |
27 | #define ESCAPE 0x005C | |
28 | #define SLASH 0x002F | |
29 | #define ASTERISK 0x002A | |
30 | #define SPACE 0x0020 | |
31 | #define COLON 0x003A | |
32 | #define BADBOM 0xFFFE | |
33 | #define CR 0x000D | |
34 | #define LF 0x000A | |
35 | ||
36 | static int32_t lineCount; | |
37 | ||
38 | /* Protos */ | |
39 | static enum ETokenType getStringToken(UCHARBUF *buf, | |
40 | UChar32 initialChar, | |
41 | struct UString *token, | |
42 | UErrorCode *status); | |
43 | ||
44 | static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); | |
45 | static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); | |
46 | static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); | |
47 | static UBool isWhitespace (UChar32 c); | |
48 | static UBool isNewline (UChar32 c); | |
49 | ||
50 | U_CFUNC void resetLineNumber() { | |
51 | lineCount = 1; | |
52 | } | |
53 | ||
54 | /* Read and return the next token from the stream. If the token is of | |
55 | type eString, fill in the token parameter with the token. If the | |
56 | token is eError, then the status parameter will contain the | |
57 | specific error. This will be eItemNotFound at the end of file, | |
58 | indicating that all tokens have been returned. This method will | |
59 | never return eString twice in a row; instead, multiple adjacent | |
60 | string tokens will be merged into one, with no intervening | |
61 | space. */ | |
62 | U_CFUNC enum ETokenType | |
63 | getNextToken(UCHARBUF* buf, | |
64 | struct UString *token, | |
65 | uint32_t *linenumber, /* out: linenumber of token */ | |
66 | struct UString *comment, | |
67 | UErrorCode *status) { | |
68 | enum ETokenType result; | |
69 | UChar32 c; | |
70 | ||
71 | if (U_FAILURE(*status)) { | |
72 | return TOK_ERROR; | |
73 | } | |
74 | ||
75 | /* Skip whitespace */ | |
76 | c = getNextChar(buf, TRUE, comment, status); | |
77 | ||
78 | if (U_FAILURE(*status)) { | |
79 | return TOK_ERROR; | |
80 | } | |
81 | ||
82 | *linenumber = lineCount; | |
83 | ||
84 | switch(c) { | |
85 | case BADBOM: | |
86 | return TOK_ERROR; | |
87 | case OPENBRACE: | |
88 | return TOK_OPEN_BRACE; | |
89 | case CLOSEBRACE: | |
90 | return TOK_CLOSE_BRACE; | |
91 | case COMMA: | |
92 | return TOK_COMMA; | |
93 | case U_EOF: | |
94 | return TOK_EOF; | |
95 | case COLON: | |
96 | return TOK_COLON; | |
97 | ||
98 | default: | |
99 | result = getStringToken(buf, c, token, status); | |
100 | } | |
101 | ||
102 | *linenumber = lineCount; | |
103 | return result; | |
104 | } | |
105 | ||
106 | /* Copy a string token into the given UnicodeString. Upon entry, we | |
107 | have already read the first character of the string token, which is | |
108 | not a whitespace character (but may be a QUOTE or ESCAPE). This | |
109 | function reads all subsequent characters that belong with this | |
110 | string, and copy them into the token parameter. The other | |
111 | important, and slightly convoluted purpose of this function is to | |
112 | merge adjacent strings. It looks forward a bit, and if the next | |
113 | non comment, non whitespace item is a string, it reads it in as | |
114 | well. If two adjacent strings are quoted, they are merged without | |
115 | intervening space. Otherwise a single SPACE character is | |
116 | inserted. */ | |
117 | static enum ETokenType getStringToken(UCHARBUF* buf, | |
118 | UChar32 initialChar, | |
119 | struct UString *token, | |
120 | UErrorCode *status) { | |
121 | UBool lastStringWasQuoted; | |
122 | UChar32 c; | |
123 | UChar target[3] = { '\0' }; | |
124 | UChar *pTarget = target; | |
125 | int len=0; | |
126 | UBool isFollowingCharEscaped=FALSE; | |
127 | UBool isNLUnescaped = FALSE; | |
128 | UChar32 prevC=0; | |
129 | ||
130 | /* We are guaranteed on entry that initialChar is not a whitespace | |
131 | character. If we are at the EOF, or have some other problem, it | |
132 | doesn't matter; we still want to validly return the initialChar | |
133 | (if nothing else) as a string token. */ | |
134 | ||
135 | if (U_FAILURE(*status)) { | |
136 | return TOK_ERROR; | |
137 | } | |
138 | ||
139 | /* setup */ | |
140 | lastStringWasQuoted = FALSE; | |
141 | c = initialChar; | |
142 | ustr_setlen(token, 0, status); | |
143 | ||
144 | if (U_FAILURE(*status)) { | |
145 | return TOK_ERROR; | |
146 | } | |
147 | ||
148 | for (;;) { | |
149 | if (c == QUOTE) { | |
150 | if (!lastStringWasQuoted && token->fLength > 0) { | |
151 | ustr_ucat(token, SPACE, status); | |
152 | ||
153 | if (U_FAILURE(*status)) { | |
154 | return TOK_ERROR; | |
155 | } | |
156 | } | |
157 | ||
158 | lastStringWasQuoted = TRUE; | |
159 | ||
160 | for (;;) { | |
161 | c = ucbuf_getc(buf,status); | |
162 | ||
163 | /* EOF reached */ | |
164 | if (c == U_EOF) { | |
165 | return TOK_EOF; | |
166 | } | |
167 | ||
168 | /* Unterminated quoted strings */ | |
169 | if (U_FAILURE(*status)) { | |
170 | return TOK_ERROR; | |
171 | } | |
172 | ||
173 | if (c == QUOTE && !isFollowingCharEscaped) { | |
174 | break; | |
175 | } | |
176 | ||
177 | if (c == ESCAPE && !isFollowingCharEscaped) { | |
178 | pTarget = target; | |
179 | c = unescape(buf, status); | |
180 | ||
181 | if (c == U_ERR) { | |
182 | return TOK_ERROR; | |
183 | } | |
184 | if(c == CR || c == LF){ | |
185 | isNLUnescaped = TRUE; | |
186 | } | |
187 | } | |
188 | ||
189 | if(c==ESCAPE && !isFollowingCharEscaped){ | |
190 | isFollowingCharEscaped = TRUE; | |
191 | }else{ | |
192 | U_APPEND_CHAR32(c, pTarget,len); | |
193 | pTarget = target; | |
194 | ustr_uscat(token, pTarget,len, status); | |
195 | isFollowingCharEscaped = FALSE; | |
196 | len=0; | |
197 | if(c == CR || c == LF){ | |
198 | if(isNLUnescaped == FALSE && prevC!=CR){ | |
199 | lineCount++; | |
200 | } | |
201 | isNLUnescaped = FALSE; | |
202 | } | |
203 | } | |
204 | ||
205 | if (U_FAILURE(*status)) { | |
206 | return TOK_ERROR; | |
207 | } | |
208 | prevC = c; | |
209 | } | |
210 | } else { | |
211 | if (token->fLength > 0) { | |
212 | ustr_ucat(token, SPACE, status); | |
213 | ||
214 | if (U_FAILURE(*status)) { | |
215 | return TOK_ERROR; | |
216 | } | |
217 | } | |
218 | ||
219 | if(lastStringWasQuoted){ | |
220 | if(getShowWarning()){ | |
221 | warning(lineCount, "Mixing quoted and unquoted strings"); | |
222 | } | |
223 | if(isStrict()){ | |
224 | return TOK_ERROR; | |
225 | } | |
226 | ||
227 | } | |
228 | ||
229 | lastStringWasQuoted = FALSE; | |
230 | ||
231 | /* if we reach here we are mixing | |
232 | * quoted and unquoted strings | |
233 | * warn in normal mode and error in | |
234 | * pedantic mode | |
235 | */ | |
236 | ||
237 | if (c == ESCAPE) { | |
238 | pTarget = target; | |
239 | c = unescape(buf, status); | |
240 | ||
241 | /* EOF reached */ | |
242 | if (c == U_EOF) { | |
243 | return TOK_ERROR; | |
244 | } | |
245 | } | |
246 | ||
247 | U_APPEND_CHAR32(c, pTarget,len); | |
248 | pTarget = target; | |
249 | ustr_uscat(token, pTarget,len, status); | |
250 | len=0; | |
251 | ||
252 | if (U_FAILURE(*status)) { | |
253 | return TOK_ERROR; | |
254 | } | |
255 | ||
256 | for (;;) { | |
257 | /* DON'T skip whitespace */ | |
258 | c = getNextChar(buf, FALSE, NULL, status); | |
259 | ||
260 | /* EOF reached */ | |
261 | if (c == U_EOF) { | |
262 | ucbuf_ungetc(c, buf); | |
263 | return TOK_STRING; | |
264 | } | |
265 | ||
266 | if (U_FAILURE(*status)) { | |
267 | return TOK_STRING; | |
268 | } | |
269 | ||
270 | if (c == QUOTE | |
271 | || c == OPENBRACE | |
272 | || c == CLOSEBRACE | |
273 | || c == COMMA | |
274 | || c == COLON) { | |
275 | ucbuf_ungetc(c, buf); | |
276 | break; | |
277 | } | |
278 | ||
279 | if (isWhitespace(c)) { | |
280 | break; | |
281 | } | |
282 | ||
283 | if (c == ESCAPE) { | |
284 | pTarget = target; | |
285 | c = unescape(buf, status); | |
286 | ||
287 | if (c == U_ERR) { | |
288 | return TOK_ERROR; | |
289 | } | |
290 | } | |
291 | ||
292 | U_APPEND_CHAR32(c, pTarget,len); | |
293 | pTarget = target; | |
294 | ustr_uscat(token, pTarget,len, status); | |
295 | len=0; | |
296 | if (U_FAILURE(*status)) { | |
297 | return TOK_ERROR; | |
298 | } | |
299 | } | |
300 | } | |
301 | ||
302 | /* DO skip whitespace */ | |
303 | c = getNextChar(buf, TRUE, NULL, status); | |
304 | ||
305 | if (U_FAILURE(*status)) { | |
306 | return TOK_STRING; | |
307 | } | |
308 | ||
309 | if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { | |
310 | ucbuf_ungetc(c, buf); | |
311 | return TOK_STRING; | |
312 | } | |
313 | } | |
314 | } | |
315 | ||
316 | /* Retrieve the next character. If skipwhite is | |
317 | true, whitespace is skipped as well. */ | |
318 | static UChar32 getNextChar(UCHARBUF* buf, | |
319 | UBool skipwhite, | |
320 | struct UString *token, | |
321 | UErrorCode *status) { | |
322 | UChar32 c, c2; | |
323 | ||
324 | if (U_FAILURE(*status)) { | |
325 | return U_EOF; | |
326 | } | |
327 | ||
328 | for (;;) { | |
329 | c = ucbuf_getc(buf,status); | |
330 | ||
331 | if (c == U_EOF) { | |
332 | return U_EOF; | |
333 | } | |
334 | ||
335 | if (skipwhite && isWhitespace(c)) { | |
336 | continue; | |
337 | } | |
338 | ||
339 | /* This also handles the get() failing case */ | |
340 | if (c != SLASH) { | |
341 | return c; | |
342 | } | |
343 | ||
344 | c = ucbuf_getc(buf,status); /* "/c" */ | |
345 | ||
346 | if (c == U_EOF) { | |
347 | return U_EOF; | |
348 | } | |
349 | ||
350 | switch (c) { | |
351 | case SLASH: /* "//" */ | |
352 | seekUntilNewline(buf, NULL, status); | |
353 | break; | |
354 | ||
355 | case ASTERISK: /* " / * " */ | |
356 | c2 = ucbuf_getc(buf, status); /* "/ * c" */ | |
357 | if(c2 == ASTERISK){ /* "/ * *" */ | |
358 | /* parse multi-line comment and store it in token*/ | |
359 | seekUntilEndOfComment(buf, token, status); | |
360 | } else { | |
361 | ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ | |
362 | seekUntilEndOfComment(buf, NULL, status); | |
363 | } | |
364 | break; | |
365 | ||
366 | default: | |
367 | ucbuf_ungetc(c, buf); /* "/c" - put back the c */ | |
368 | /* If get() failed this is a NOP */ | |
369 | return SLASH; | |
370 | } | |
371 | ||
372 | } | |
373 | } | |
374 | ||
375 | static void seekUntilNewline(UCHARBUF* buf, | |
376 | struct UString *token, | |
377 | UErrorCode *status) { | |
378 | UChar32 c; | |
379 | ||
380 | if (U_FAILURE(*status)) { | |
381 | return; | |
382 | } | |
383 | ||
384 | do { | |
385 | c = ucbuf_getc(buf,status); | |
386 | /* add the char to token */ | |
387 | if(token!=NULL){ | |
388 | ustr_u32cat(token, c, status); | |
389 | } | |
390 | } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); | |
391 | } | |
392 | ||
393 | static void seekUntilEndOfComment(UCHARBUF *buf, | |
394 | struct UString *token, | |
395 | UErrorCode *status) { | |
396 | UChar32 c, d; | |
397 | uint32_t line; | |
398 | ||
399 | if (U_FAILURE(*status)) { | |
400 | return; | |
401 | } | |
402 | ||
403 | line = lineCount; | |
404 | ||
405 | do { | |
406 | c = ucbuf_getc(buf, status); | |
407 | ||
408 | if (c == ASTERISK) { | |
409 | d = ucbuf_getc(buf, status); | |
410 | ||
411 | if (d != SLASH) { | |
412 | ucbuf_ungetc(d, buf); | |
413 | } else { | |
414 | break; | |
415 | } | |
416 | } | |
417 | /* add the char to token */ | |
418 | if(token!=NULL){ | |
419 | ustr_u32cat(token, c, status); | |
420 | } | |
421 | /* increment the lineCount */ | |
422 | isNewline(c); | |
423 | ||
424 | } while (c != U_EOF && *status == U_ZERO_ERROR); | |
425 | ||
426 | if (c == U_EOF) { | |
427 | *status = U_INVALID_FORMAT_ERROR; | |
428 | error(line, "unterminated comment detected"); | |
429 | } | |
430 | } | |
431 | ||
432 | U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { | |
433 | if (U_FAILURE(*status)) { | |
434 | return U_EOF; | |
435 | } | |
436 | ||
437 | /* We expect to be called after the ESCAPE has been seen, but | |
438 | * u_fgetcx needs an ESCAPE to do its magic. */ | |
439 | ucbuf_ungetc(ESCAPE, buf); | |
440 | ||
441 | return ucbuf_getcx32(buf, status); | |
442 | } | |
443 | ||
444 | static UBool isWhitespace(UChar32 c) { | |
445 | switch (c) { | |
446 | /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ | |
447 | case 0x000A: | |
448 | case 0x2029: | |
449 | lineCount++; | |
450 | case 0x000D: | |
451 | case 0x0020: | |
452 | case 0x0009: | |
453 | case 0xFEFF: | |
454 | return TRUE; | |
455 | ||
456 | default: | |
457 | return FALSE; | |
458 | } | |
459 | } | |
460 | ||
461 | static UBool isNewline(UChar32 c) { | |
462 | switch (c) { | |
463 | /* '\n', '\r', 0x2029 */ | |
464 | case 0x000A: | |
465 | case 0x2029: | |
466 | lineCount++; | |
467 | case 0x000D: | |
468 | return TRUE; | |
469 | ||
470 | default: | |
471 | return FALSE; | |
472 | } | |
473 | } |