]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genrb/read.c
ICU-491.11.3.tar.gz
[apple/icu.git] / icuSources / tools / genrb / read.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
4388f060 4* Copyright (C) 1998-2011, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8*
9* File read.c
10*
11* Modification History:
12*
13* Date Name Description
14* 05/26/99 stephen Creation.
15* 5/10/01 Ram removed ustdio dependency
16*******************************************************************************
17*/
18
19#include "read.h"
20#include "errmsg.h"
21#include "unicode/ustring.h"
22
/* Code points that have syntactic meaning to the tokenizer. */
#define OPENBRACE    0x007B  /* '{'  */
#define CLOSEBRACE   0x007D  /* '}'  */
#define COMMA        0x002C  /* ','  */
#define QUOTE        0x0022  /* '"'  */
#define ESCAPE       0x005C  /* '\\' */
#define SLASH        0x002F  /* '/'  */
#define ASTERISK     0x002A  /* '*'  */
#define SPACE        0x0020  /* ' '  */
#define COLON        0x003A  /* ':'  */
#define BADBOM       0xFFFE  /* getNextToken() reports TOK_ERROR for this value;
                                presumably a byte-swapped U+FEFF BOM - confirm */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed       */

/* Line counter shared by every routine in this file. resetLineNumber() sets
   it to 1; isWhitespace(), isNewline() and getStringToken() increment it as
   line terminators are consumed. */
static int32_t lineCount;
37
38/* Protos */
39static enum ETokenType getStringToken(UCHARBUF *buf,
40 UChar32 initialChar,
41 struct UString *token,
42 UErrorCode *status);
43
374ca955
A
44static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
45static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
46static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
b75a7d8f
A
47static UBool isWhitespace (UChar32 c);
48static UBool isNewline (UChar32 c);
49
4388f060 50U_CFUNC void resetLineNumber() {
b75a7d8f
A
51 lineCount = 1;
52}
53
54/* Read and return the next token from the stream. If the token is of
55 type eString, fill in the token parameter with the token. If the
56 token is eError, then the status parameter will contain the
57 specific error. This will be eItemNotFound at the end of file,
58 indicating that all tokens have been returned. This method will
59 never return eString twice in a row; instead, multiple adjacent
60 string tokens will be merged into one, with no intervening
61 space. */
4388f060
A
62U_CFUNC enum ETokenType
63getNextToken(UCHARBUF* buf,
64 struct UString *token,
65 uint32_t *linenumber, /* out: linenumber of token */
66 struct UString *comment,
67 UErrorCode *status) {
b75a7d8f
A
68 enum ETokenType result;
69 UChar32 c;
70
71 if (U_FAILURE(*status)) {
72 return TOK_ERROR;
73 }
74
75 /* Skip whitespace */
374ca955 76 c = getNextChar(buf, TRUE, comment, status);
b75a7d8f
A
77
78 if (U_FAILURE(*status)) {
79 return TOK_ERROR;
80 }
81
82 *linenumber = lineCount;
83
84 switch(c) {
85 case BADBOM:
86 return TOK_ERROR;
87 case OPENBRACE:
88 return TOK_OPEN_BRACE;
89 case CLOSEBRACE:
90 return TOK_CLOSE_BRACE;
91 case COMMA:
92 return TOK_COMMA;
93 case U_EOF:
94 return TOK_EOF;
95 case COLON:
96 return TOK_COLON;
97
98 default:
99 result = getStringToken(buf, c, token, status);
100 }
101
102 *linenumber = lineCount;
103 return result;
104}
105
106/* Copy a string token into the given UnicodeString. Upon entry, we
107 have already read the first character of the string token, which is
108 not a whitespace character (but may be a QUOTE or ESCAPE). This
109 function reads all subsequent characters that belong with this
110 string, and copy them into the token parameter. The other
111 important, and slightly convoluted purpose of this function is to
112 merge adjacent strings. It looks forward a bit, and if the next
113 non comment, non whitespace item is a string, it reads it in as
114 well. If two adjacent strings are quoted, they are merged without
115 intervening space. Otherwise a single SPACE character is
116 inserted. */
117static enum ETokenType getStringToken(UCHARBUF* buf,
118 UChar32 initialChar,
119 struct UString *token,
120 UErrorCode *status) {
121 UBool lastStringWasQuoted;
122 UChar32 c;
123 UChar target[3] = { '\0' };
124 UChar *pTarget = target;
125 int len=0;
126 UBool isFollowingCharEscaped=FALSE;
374ca955
A
127 UBool isNLUnescaped = FALSE;
128 UChar32 prevC=0;
b75a7d8f
A
129
130 /* We are guaranteed on entry that initialChar is not a whitespace
131 character. If we are at the EOF, or have some other problem, it
132 doesn't matter; we still want to validly return the initialChar
133 (if nothing else) as a string token. */
134
135 if (U_FAILURE(*status)) {
136 return TOK_ERROR;
137 }
138
139 /* setup */
140 lastStringWasQuoted = FALSE;
141 c = initialChar;
142 ustr_setlen(token, 0, status);
143
144 if (U_FAILURE(*status)) {
145 return TOK_ERROR;
146 }
147
148 for (;;) {
149 if (c == QUOTE) {
150 if (!lastStringWasQuoted && token->fLength > 0) {
151 ustr_ucat(token, SPACE, status);
152
153 if (U_FAILURE(*status)) {
154 return TOK_ERROR;
155 }
156 }
157
158 lastStringWasQuoted = TRUE;
159
160 for (;;) {
161 c = ucbuf_getc(buf,status);
162
163 /* EOF reached */
164 if (c == U_EOF) {
165 return TOK_EOF;
166 }
167
168 /* Unterminated quoted strings */
169 if (U_FAILURE(*status)) {
170 return TOK_ERROR;
171 }
172
173 if (c == QUOTE && !isFollowingCharEscaped) {
174 break;
175 }
176
177 if (c == ESCAPE && !isFollowingCharEscaped) {
178 pTarget = target;
179 c = unescape(buf, status);
180
181 if (c == U_ERR) {
182 return TOK_ERROR;
183 }
374ca955
A
184 if(c == CR || c == LF){
185 isNLUnescaped = TRUE;
186 }
b75a7d8f
A
187 }
188
189 if(c==ESCAPE && !isFollowingCharEscaped){
190 isFollowingCharEscaped = TRUE;
191 }else{
192 U_APPEND_CHAR32(c, pTarget,len);
193 pTarget = target;
194 ustr_uscat(token, pTarget,len, status);
195 isFollowingCharEscaped = FALSE;
196 len=0;
374ca955
A
197 if(c == CR || c == LF){
198 if(isNLUnescaped == FALSE && prevC!=CR){
199 lineCount++;
200 }
201 isNLUnescaped = FALSE;
202 }
b75a7d8f
A
203 }
204
205 if (U_FAILURE(*status)) {
206 return TOK_ERROR;
207 }
374ca955 208 prevC = c;
b75a7d8f
A
209 }
210 } else {
211 if (token->fLength > 0) {
212 ustr_ucat(token, SPACE, status);
213
214 if (U_FAILURE(*status)) {
215 return TOK_ERROR;
216 }
217 }
218
219 if(lastStringWasQuoted){
220 if(getShowWarning()){
221 warning(lineCount, "Mixing quoted and unquoted strings");
222 }
223 if(isStrict()){
224 return TOK_ERROR;
225 }
226
227 }
228
229 lastStringWasQuoted = FALSE;
230
231 /* if we reach here we are mixing
232 * quoted and unquoted strings
233 * warn in normal mode and error in
234 * pedantic mode
235 */
236
237 if (c == ESCAPE) {
238 pTarget = target;
239 c = unescape(buf, status);
240
241 /* EOF reached */
242 if (c == U_EOF) {
243 return TOK_ERROR;
244 }
245 }
246
247 U_APPEND_CHAR32(c, pTarget,len);
248 pTarget = target;
249 ustr_uscat(token, pTarget,len, status);
250 len=0;
374ca955 251
b75a7d8f
A
252 if (U_FAILURE(*status)) {
253 return TOK_ERROR;
254 }
255
256 for (;;) {
257 /* DON'T skip whitespace */
374ca955 258 c = getNextChar(buf, FALSE, NULL, status);
b75a7d8f
A
259
260 /* EOF reached */
261 if (c == U_EOF) {
262 ucbuf_ungetc(c, buf);
263 return TOK_STRING;
264 }
265
266 if (U_FAILURE(*status)) {
267 return TOK_STRING;
268 }
269
270 if (c == QUOTE
271 || c == OPENBRACE
272 || c == CLOSEBRACE
273 || c == COMMA
274 || c == COLON) {
275 ucbuf_ungetc(c, buf);
276 break;
277 }
278
279 if (isWhitespace(c)) {
280 break;
281 }
282
283 if (c == ESCAPE) {
284 pTarget = target;
285 c = unescape(buf, status);
286
287 if (c == U_ERR) {
288 return TOK_ERROR;
289 }
290 }
291
292 U_APPEND_CHAR32(c, pTarget,len);
293 pTarget = target;
294 ustr_uscat(token, pTarget,len, status);
295 len=0;
296 if (U_FAILURE(*status)) {
297 return TOK_ERROR;
298 }
299 }
300 }
301
302 /* DO skip whitespace */
374ca955 303 c = getNextChar(buf, TRUE, NULL, status);
b75a7d8f
A
304
305 if (U_FAILURE(*status)) {
306 return TOK_STRING;
307 }
308
309 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
310 ucbuf_ungetc(c, buf);
311 return TOK_STRING;
312 }
313 }
314}
315
374ca955 316/* Retrieve the next character. If skipwhite is
b75a7d8f
A
317 true, whitespace is skipped as well. */
318static UChar32 getNextChar(UCHARBUF* buf,
319 UBool skipwhite,
374ca955 320 struct UString *token,
b75a7d8f 321 UErrorCode *status) {
374ca955 322 UChar32 c, c2;
b75a7d8f
A
323
324 if (U_FAILURE(*status)) {
325 return U_EOF;
326 }
327
328 for (;;) {
329 c = ucbuf_getc(buf,status);
330
331 if (c == U_EOF) {
332 return U_EOF;
333 }
334
335 if (skipwhite && isWhitespace(c)) {
336 continue;
337 }
338
339 /* This also handles the get() failing case */
340 if (c != SLASH) {
341 return c;
342 }
343
46f4442e 344 c = ucbuf_getc(buf,status); /* "/c" */
b75a7d8f
A
345
346 if (c == U_EOF) {
347 return U_EOF;
348 }
349
350 switch (c) {
46f4442e 351 case SLASH: /* "//" */
374ca955 352 seekUntilNewline(buf, NULL, status);
b75a7d8f
A
353 break;
354
729e4ab9
A
355 case ASTERISK: /* " / * " */
356 c2 = ucbuf_getc(buf, status); /* "/ * c" */
357 if(c2 == ASTERISK){ /* "/ * *" */
374ca955
A
358 /* parse multi-line comment and store it in token*/
359 seekUntilEndOfComment(buf, token, status);
46f4442e 360 } else {
729e4ab9 361 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
374ca955
A
362 seekUntilEndOfComment(buf, NULL, status);
363 }
b75a7d8f
A
364 break;
365
366 default:
46f4442e 367 ucbuf_ungetc(c, buf); /* "/c" - put back the c */
b75a7d8f
A
368 /* If get() failed this is a NOP */
369 return SLASH;
370 }
374ca955 371
b75a7d8f
A
372 }
373}
374
375static void seekUntilNewline(UCHARBUF* buf,
374ca955 376 struct UString *token,
b75a7d8f
A
377 UErrorCode *status) {
378 UChar32 c;
379
380 if (U_FAILURE(*status)) {
381 return;
382 }
383
384 do {
385 c = ucbuf_getc(buf,status);
374ca955
A
386 /* add the char to token */
387 if(token!=NULL){
388 ustr_u32cat(token, c, status);
389 }
b75a7d8f
A
390 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
391}
392
393static void seekUntilEndOfComment(UCHARBUF *buf,
374ca955 394 struct UString *token,
b75a7d8f
A
395 UErrorCode *status) {
396 UChar32 c, d;
397 uint32_t line;
398
399 if (U_FAILURE(*status)) {
400 return;
401 }
402
403 line = lineCount;
404
405 do {
406 c = ucbuf_getc(buf, status);
407
408 if (c == ASTERISK) {
409 d = ucbuf_getc(buf, status);
410
411 if (d != SLASH) {
412 ucbuf_ungetc(d, buf);
413 } else {
414 break;
415 }
416 }
374ca955
A
417 /* add the char to token */
418 if(token!=NULL){
419 ustr_u32cat(token, c, status);
420 }
421 /* increment the lineCount */
422 isNewline(c);
423
b75a7d8f
A
424 } while (c != U_EOF && *status == U_ZERO_ERROR);
425
426 if (c == U_EOF) {
427 *status = U_INVALID_FORMAT_ERROR;
428 error(line, "unterminated comment detected");
429 }
430}
431
4388f060 432U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
b75a7d8f
A
433 if (U_FAILURE(*status)) {
434 return U_EOF;
435 }
436
437 /* We expect to be called after the ESCAPE has been seen, but
438 * u_fgetcx needs an ESCAPE to do its magic. */
439 ucbuf_ungetc(ESCAPE, buf);
440
441 return ucbuf_getcx32(buf, status);
442}
443
444static UBool isWhitespace(UChar32 c) {
445 switch (c) {
446 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
447 case 0x000A:
448 case 0x2029:
449 lineCount++;
450 case 0x000D:
451 case 0x0020:
452 case 0x0009:
453 case 0xFEFF:
454 return TRUE;
455
456 default:
457 return FALSE;
458 }
459}
460
461static UBool isNewline(UChar32 c) {
462 switch (c) {
463 /* '\n', '\r', 0x2029 */
464 case 0x000A:
465 case 0x2029:
466 lineCount++;
467 case 0x000D:
468 return TRUE;
469
470 default:
471 return FALSE;
472 }
473}