]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genrb/read.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / tools / genrb / read.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 1998-2003, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8*
9* File read.c
10*
11* Modification History:
12*
13* Date Name Description
14* 05/26/99 stephen Creation.
15* 5/10/01 Ram removed ustdio dependency
16*******************************************************************************
17*/
18
19#include "read.h"
20#include "errmsg.h"
21#include "unicode/ustring.h"
22
/* Code points the tokenizer treats specially. */
#define OPENBRACE   0x007B  /* '{'  */
#define CLOSEBRACE  0x007D  /* '}'  */
#define COMMA       0x002C  /* ','  */
#define QUOTE       0x0022  /* '"'  */
#define ESCAPE      0x005C  /* '\\' */
#define SLASH       0x002F  /* '/'  */
#define ASTERISK    0x002A  /* '*'  */
#define SPACE       0x0020  /* ' '  */
#define COLON       0x003A  /* ':'  */
/* getNextToken() reports TOK_ERROR for this value; presumably a
   byte-swapped U+FEFF byte order mark -- TODO confirm. */
#define BADBOM      0xFFFE
33
34static int32_t lineCount;
35
36/* Protos */
37static enum ETokenType getStringToken(UCHARBUF *buf,
38 UChar32 initialChar,
39 struct UString *token,
40 UErrorCode *status);
41
42static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, UErrorCode *status);
43static void seekUntilNewline (UCHARBUF *buf, UErrorCode *status);
44static void seekUntilEndOfComment (UCHARBUF *buf, UErrorCode *status);
45static UBool isWhitespace (UChar32 c);
46static UBool isNewline (UChar32 c);
47
48void resetLineNumber() {
49 lineCount = 1;
50}
51
52/* Read and return the next token from the stream. If the token is of
53 type eString, fill in the token parameter with the token. If the
54 token is eError, then the status parameter will contain the
55 specific error. This will be eItemNotFound at the end of file,
56 indicating that all tokens have been returned. This method will
57 never return eString twice in a row; instead, multiple adjacent
58 string tokens will be merged into one, with no intervening
59 space. */
60enum ETokenType getNextToken(UCHARBUF* buf,
61 struct UString *token,
62 uint32_t *linenumber, /* out: linenumber of token */
63 UErrorCode *status) {
64 enum ETokenType result;
65 UChar32 c;
66
67 if (U_FAILURE(*status)) {
68 return TOK_ERROR;
69 }
70
71 /* Skip whitespace */
72 c = getNextChar(buf, TRUE, status);
73
74 if (U_FAILURE(*status)) {
75 return TOK_ERROR;
76 }
77
78 *linenumber = lineCount;
79
80 switch(c) {
81 case BADBOM:
82 return TOK_ERROR;
83 case OPENBRACE:
84 return TOK_OPEN_BRACE;
85 case CLOSEBRACE:
86 return TOK_CLOSE_BRACE;
87 case COMMA:
88 return TOK_COMMA;
89 case U_EOF:
90 return TOK_EOF;
91 case COLON:
92 return TOK_COLON;
93
94 default:
95 result = getStringToken(buf, c, token, status);
96 }
97
98 *linenumber = lineCount;
99 return result;
100}
101
102/* Copy a string token into the given UnicodeString. Upon entry, we
103 have already read the first character of the string token, which is
104 not a whitespace character (but may be a QUOTE or ESCAPE). This
105 function reads all subsequent characters that belong with this
106 string, and copy them into the token parameter. The other
107 important, and slightly convoluted purpose of this function is to
108 merge adjacent strings. It looks forward a bit, and if the next
109 non comment, non whitespace item is a string, it reads it in as
110 well. If two adjacent strings are quoted, they are merged without
111 intervening space. Otherwise a single SPACE character is
112 inserted. */
113static enum ETokenType getStringToken(UCHARBUF* buf,
114 UChar32 initialChar,
115 struct UString *token,
116 UErrorCode *status) {
117 UBool lastStringWasQuoted;
118 UChar32 c;
119 UChar target[3] = { '\0' };
120 UChar *pTarget = target;
121 int len=0;
122 UBool isFollowingCharEscaped=FALSE;
123
124 /* We are guaranteed on entry that initialChar is not a whitespace
125 character. If we are at the EOF, or have some other problem, it
126 doesn't matter; we still want to validly return the initialChar
127 (if nothing else) as a string token. */
128
129 if (U_FAILURE(*status)) {
130 return TOK_ERROR;
131 }
132
133 /* setup */
134 lastStringWasQuoted = FALSE;
135 c = initialChar;
136 ustr_setlen(token, 0, status);
137
138 if (U_FAILURE(*status)) {
139 return TOK_ERROR;
140 }
141
142 for (;;) {
143 if (c == QUOTE) {
144 if (!lastStringWasQuoted && token->fLength > 0) {
145 ustr_ucat(token, SPACE, status);
146
147 if (U_FAILURE(*status)) {
148 return TOK_ERROR;
149 }
150 }
151
152 lastStringWasQuoted = TRUE;
153
154 for (;;) {
155 c = ucbuf_getc(buf,status);
156
157 /* EOF reached */
158 if (c == U_EOF) {
159 return TOK_EOF;
160 }
161
162 /* Unterminated quoted strings */
163 if (U_FAILURE(*status)) {
164 return TOK_ERROR;
165 }
166
167 if (c == QUOTE && !isFollowingCharEscaped) {
168 break;
169 }
170
171 if (c == ESCAPE && !isFollowingCharEscaped) {
172 pTarget = target;
173 c = unescape(buf, status);
174
175 if (c == U_ERR) {
176 return TOK_ERROR;
177 }
178 }
179
180 if(c==ESCAPE && !isFollowingCharEscaped){
181 isFollowingCharEscaped = TRUE;
182 }else{
183 U_APPEND_CHAR32(c, pTarget,len);
184 pTarget = target;
185 ustr_uscat(token, pTarget,len, status);
186 isFollowingCharEscaped = FALSE;
187 len=0;
188 }
189
190 if (U_FAILURE(*status)) {
191 return TOK_ERROR;
192 }
193 }
194 } else {
195 if (token->fLength > 0) {
196 ustr_ucat(token, SPACE, status);
197
198 if (U_FAILURE(*status)) {
199 return TOK_ERROR;
200 }
201 }
202
203 if(lastStringWasQuoted){
204 if(getShowWarning()){
205 warning(lineCount, "Mixing quoted and unquoted strings");
206 }
207 if(isStrict()){
208 return TOK_ERROR;
209 }
210
211 }
212
213 lastStringWasQuoted = FALSE;
214
215 /* if we reach here we are mixing
216 * quoted and unquoted strings
217 * warn in normal mode and error in
218 * pedantic mode
219 */
220
221 if (c == ESCAPE) {
222 pTarget = target;
223 c = unescape(buf, status);
224
225 /* EOF reached */
226 if (c == U_EOF) {
227 return TOK_ERROR;
228 }
229 }
230
231 U_APPEND_CHAR32(c, pTarget,len);
232 pTarget = target;
233 ustr_uscat(token, pTarget,len, status);
234 len=0;
235
236 if (U_FAILURE(*status)) {
237 return TOK_ERROR;
238 }
239
240 for (;;) {
241 /* DON'T skip whitespace */
242 c = getNextChar(buf, FALSE, status);
243
244 /* EOF reached */
245 if (c == U_EOF) {
246 ucbuf_ungetc(c, buf);
247 return TOK_STRING;
248 }
249
250 if (U_FAILURE(*status)) {
251 return TOK_STRING;
252 }
253
254 if (c == QUOTE
255 || c == OPENBRACE
256 || c == CLOSEBRACE
257 || c == COMMA
258 || c == COLON) {
259 ucbuf_ungetc(c, buf);
260 break;
261 }
262
263 if (isWhitespace(c)) {
264 break;
265 }
266
267 if (c == ESCAPE) {
268 pTarget = target;
269 c = unescape(buf, status);
270
271 if (c == U_ERR) {
272 return TOK_ERROR;
273 }
274 }
275
276 U_APPEND_CHAR32(c, pTarget,len);
277 pTarget = target;
278 ustr_uscat(token, pTarget,len, status);
279 len=0;
280 if (U_FAILURE(*status)) {
281 return TOK_ERROR;
282 }
283 }
284 }
285
286 /* DO skip whitespace */
287 c = getNextChar(buf, TRUE, status);
288
289 if (U_FAILURE(*status)) {
290 return TOK_STRING;
291 }
292
293 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
294 ucbuf_ungetc(c, buf);
295 return TOK_STRING;
296 }
297 }
298}
299
300/* Retrieve the next character, ignoring comments. If skipwhite is
301 true, whitespace is skipped as well. */
302static UChar32 getNextChar(UCHARBUF* buf,
303 UBool skipwhite,
304 UErrorCode *status) {
305 UChar32 c;
306
307 if (U_FAILURE(*status)) {
308 return U_EOF;
309 }
310
311 for (;;) {
312 c = ucbuf_getc(buf,status);
313
314 if (c == U_EOF) {
315 return U_EOF;
316 }
317
318 if (skipwhite && isWhitespace(c)) {
319 continue;
320 }
321
322 /* This also handles the get() failing case */
323 if (c != SLASH) {
324 return c;
325 }
326
327 c = ucbuf_getc(buf,status);
328
329 if (c == U_EOF) {
330 return U_EOF;
331 }
332
333 switch (c) {
334 case SLASH:
335 seekUntilNewline(buf, status);
336 break;
337
338 case ASTERISK:
339 seekUntilEndOfComment(buf, status);
340 break;
341
342 default:
343 ucbuf_ungetc(c, buf);
344 /* If get() failed this is a NOP */
345 return SLASH;
346 }
347 }
348}
349
350static void seekUntilNewline(UCHARBUF* buf,
351 UErrorCode *status) {
352 UChar32 c;
353
354 if (U_FAILURE(*status)) {
355 return;
356 }
357
358 do {
359 c = ucbuf_getc(buf,status);
360 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
361}
362
363static void seekUntilEndOfComment(UCHARBUF *buf,
364 UErrorCode *status) {
365 UChar32 c, d;
366 uint32_t line;
367
368 if (U_FAILURE(*status)) {
369 return;
370 }
371
372 line = lineCount;
373
374 do {
375 c = ucbuf_getc(buf, status);
376
377 if (c == ASTERISK) {
378 d = ucbuf_getc(buf, status);
379
380 if (d != SLASH) {
381 ucbuf_ungetc(d, buf);
382 } else {
383 break;
384 }
385 }
386 } while (c != U_EOF && *status == U_ZERO_ERROR);
387
388 if (c == U_EOF) {
389 *status = U_INVALID_FORMAT_ERROR;
390 error(line, "unterminated comment detected");
391 }
392}
393
394UChar32 unescape(UCHARBUF *buf,
395 UErrorCode *status) {
396 if (U_FAILURE(*status)) {
397 return U_EOF;
398 }
399
400 /* We expect to be called after the ESCAPE has been seen, but
401 * u_fgetcx needs an ESCAPE to do its magic. */
402 ucbuf_ungetc(ESCAPE, buf);
403
404 return ucbuf_getcx32(buf, status);
405}
406
407static UBool isWhitespace(UChar32 c) {
408 switch (c) {
409 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
410 case 0x000A:
411 case 0x2029:
412 lineCount++;
413 case 0x000D:
414 case 0x0020:
415 case 0x0009:
416 case 0xFEFF:
417 return TRUE;
418
419 default:
420 return FALSE;
421 }
422}
423
424static UBool isNewline(UChar32 c) {
425 switch (c) {
426 /* '\n', '\r', 0x2029 */
427 case 0x000A:
428 case 0x2029:
429 lineCount++;
430 case 0x000D:
431 return TRUE;
432
433 default:
434 return FALSE;
435 }
436}