]>
Commit | Line | Data |
---|---|---|
40675e7c | 1 | /* Token-reader for Bison's input parser, |
a44c2277 | 2 | Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc. |
40675e7c DM |
3 | |
4 | This file is part of Bison, the GNU Compiler Compiler. | |
5 | ||
6 | Bison is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 2, or (at your option) | |
9 | any later version. | |
10 | ||
11 | Bison is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with Bison; see the file COPYING. If not, write to | |
18 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ | |
19 | ||
20 | ||
a083fbbf | 21 | /* |
a44c2277 | 22 | lex is the entry point. It is called from reader.c. |
40675e7c DM |
23 | It returns one of the token-type codes defined in lex.h. |
24 | When an identifier is seen, the code IDENTIFIER is returned | |
25 | and the name is looked up in the symbol table using symtab.c; | |
26 | symval is set to a pointer to the entry found. */ | |
27 | ||
28 | #include <stdio.h> | |
29 | #include <ctype.h> | |
30 | #include "system.h" | |
31 | #include "files.h" | |
a44c2277 | 32 | #include "getopt.h" /* for optarg */ |
40675e7c DM |
33 | #include "symtab.h" |
34 | #include "lex.h" | |
7612000c | 35 | #include "alloc.h" |
40675e7c | 36 | |
a44c2277 RS |
/* Flags set by % directives.  Each one is defined elsewhere; the
   trailing comment names the corresponding command-line option.  */
extern int definesflag; /* for -d */
extern int toknumflag; /* for -k */
extern int noparserflag; /* for -n */
extern int fixed_outfiles; /* for -y */
extern int nolinesflag; /* for -l */
extern int rawtoknumflag; /* for -r */
extern int verboseflag; /* for -v */
extern int debugflag; /* for -t */
extern char *spec_name_prefix; /* for -p */
extern char *spec_file_prefix; /* for -b */
/* spec_outfile is declared in files.h, for -o */

/* Incremented in this file each time a newline is consumed from finput.  */
extern int lineno;
/* Set to -1 in lex whenever a quoted character or string literal token
   is read.  NOTE(review): what the value means to its owner is not
   visible from this file -- confirm against the definition.  */
extern int translations;

/* Defined later in this file; declared here because lex calls it first.  */
int parse_percent_token();

/* functions from main.c */
extern char *printable_version();
extern void fatal();
extern void warni();
extern void warn();
40675e7c DM |
60 | |
/* Buffer for storing the current token.  Allocated by init_lex and
   enlarged on demand by grow_token_buffer.  */
char *token_buffer;

/* Allocated size of token_buffer, not including space for terminator. */
static int maxtoken;

/* Symbol table entry for the token most recently returned by lex when
   that token is an identifier or a quoted literal.  */
bucket *symval;
/* Value of the digit string most recently read by lex as a NUMBER.  */
int numval;

/* Pushed-back token code (-1 when there is none) and the symval that
   goes with it; see unlex and the start of lex.  */
static int unlexed; /* these two describe a token to be reread */
static bucket *unlexed_symval; /* by the next call to lex */
72 | ||
73 | ||
74 | void | |
75 | init_lex() | |
76 | { | |
77 | maxtoken = 100; | |
78 | token_buffer = NEW2 (maxtoken + 1, char); | |
79 | unlexed = -1; | |
80 | } | |
81 | ||
82 | ||
83 | static char * | |
84 | grow_token_buffer (p) | |
85 | char *p; | |
86 | { | |
87 | int offset = p - token_buffer; | |
88 | maxtoken *= 2; | |
89 | token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1); | |
90 | return token_buffer + offset; | |
91 | } | |
92 | ||
93 | ||
94 | int | |
95 | skip_white_space() | |
96 | { | |
97 | register int c; | |
98 | register int inside; | |
99 | ||
100 | c = getc(finput); | |
101 | ||
102 | for (;;) | |
103 | { | |
104 | int cplus_comment; | |
105 | ||
106 | switch (c) | |
107 | { | |
108 | case '/': | |
109 | c = getc(finput); | |
a083fbbf | 110 | if (c != '*' && c != '/') |
a44c2277 | 111 | { |
a083fbbf | 112 | warn(_("unexpected `/' found and ignored")); |
a44c2277 RS |
113 | break; |
114 | } | |
40675e7c DM |
115 | cplus_comment = (c == '/'); |
116 | ||
117 | c = getc(finput); | |
118 | ||
119 | inside = 1; | |
120 | while (inside) | |
121 | { | |
122 | if (!cplus_comment && c == '*') | |
123 | { | |
124 | while (c == '*') | |
125 | c = getc(finput); | |
126 | ||
127 | if (c == '/') | |
128 | { | |
129 | inside = 0; | |
130 | c = getc(finput); | |
131 | } | |
132 | } | |
133 | else if (c == '\n') | |
134 | { | |
135 | lineno++; | |
136 | if (cplus_comment) | |
137 | inside = 0; | |
138 | c = getc(finput); | |
139 | } | |
140 | else if (c == EOF) | |
a083fbbf | 141 | fatal(_("unterminated comment")); |
40675e7c DM |
142 | else |
143 | c = getc(finput); | |
144 | } | |
145 | ||
146 | break; | |
147 | ||
148 | case '\n': | |
149 | lineno++; | |
150 | ||
151 | case ' ': | |
152 | case '\t': | |
153 | case '\f': | |
154 | c = getc(finput); | |
155 | break; | |
156 | ||
157 | default: | |
158 | return (c); | |
159 | } | |
160 | } | |
161 | } | |
162 | ||
a44c2277 RS |
/* Like getc on stream F, except that hitting end of file is reported
   through fatal.  */
int
safegetc(f)
     FILE *f;
{
  register int ch;

  ch = getc(f);
  if (ch != EOF)
    return ch;

  fatal(_("Unexpected end of file"));
  return ch;
}
173 | ||
174 | /* read one literal character from finput. process \ escapes. | |
175 | append the normalized string version of the char to *pp. | |
176 | assign the character code to *pcode | |
177 | return 1 unless the character is an unescaped `term' or \n | |
178 | report error for \n | |
179 | */ | |
180 | int | |
181 | literalchar(pp, pcode, term) | |
182 | char **pp; | |
183 | int *pcode; | |
184 | char term; | |
185 | { | |
186 | register int c; | |
187 | register char *p; | |
188 | register int code; | |
189 | int wasquote = 0; | |
190 | ||
191 | c = safegetc(finput); | |
a083fbbf | 192 | if (c == '\n') |
a44c2277 | 193 | { |
a083fbbf | 194 | warn(_("unescaped newline in constant")); |
a44c2277 RS |
195 | ungetc(c, finput); |
196 | code = '?'; | |
197 | wasquote = 1; | |
198 | } | |
199 | else if (c != '\\') | |
200 | { | |
201 | code = c; | |
a083fbbf | 202 | if (c == term) |
a44c2277 RS |
203 | wasquote = 1; |
204 | } | |
205 | else | |
206 | { | |
207 | c = safegetc(finput); | |
208 | if (c == 't') code = '\t'; | |
209 | else if (c == 'n') code = '\n'; | |
210 | else if (c == 'a') code = '\007'; | |
211 | else if (c == 'r') code = '\r'; | |
212 | else if (c == 'f') code = '\f'; | |
213 | else if (c == 'b') code = '\b'; | |
214 | else if (c == 'v') code = 013; | |
215 | else if (c == '\\') code = '\\'; | |
216 | else if (c == '\'') code = '\''; | |
217 | else if (c == '\"') code = '\"'; | |
218 | else if (c <= '7' && c >= '0') | |
219 | { | |
220 | code = 0; | |
221 | while (c <= '7' && c >= '0') | |
222 | { | |
223 | code = (code * 8) + (c - '0'); | |
224 | if (code >= 256 || code < 0) | |
225 | { | |
a083fbbf | 226 | warni(_("octal value outside range 0...255: `\\%o'"), code); |
a44c2277 RS |
227 | code &= 0xFF; |
228 | break; | |
229 | } | |
230 | c = safegetc(finput); | |
231 | } | |
232 | ungetc(c, finput); | |
233 | } | |
234 | else if (c == 'x') | |
235 | { | |
236 | c = safegetc(finput); | |
237 | code = 0; | |
238 | while (1) | |
239 | { | |
240 | if (c >= '0' && c <= '9') | |
241 | code *= 16, code += c - '0'; | |
242 | else if (c >= 'a' && c <= 'f') | |
243 | code *= 16, code += c - 'a' + 10; | |
244 | else if (c >= 'A' && c <= 'F') | |
245 | code *= 16, code += c - 'A' + 10; | |
a083fbbf | 246 | else |
a44c2277 RS |
247 | break; |
248 | if (code >= 256 || code<0) | |
249 | { | |
a083fbbf | 250 | warni(_("hexadecimal value above 255: `\\x%x'"), code); |
a44c2277 RS |
251 | code &= 0xFF; |
252 | break; | |
253 | } | |
254 | c = safegetc(finput); | |
255 | } | |
256 | ungetc(c, finput); | |
257 | } | |
258 | else | |
259 | { | |
b0180c64 | 260 | warns (_("unknown escape sequence: `\\' followed by `%s'"), |
a44c2277 RS |
261 | printable_version(c)); |
262 | code = '?'; | |
263 | } | |
264 | } /* has \ */ | |
265 | ||
266 | /* now fill token_buffer with the canonical name for this character | |
267 | as a literal token. Do not use what the user typed, | |
268 | so that `\012' and `\n' can be interchangeable. */ | |
269 | ||
270 | p = *pp; | |
5ce94c29 | 271 | if (code == '\\') {*p++ = '\\'; *p++ = '\\';} |
a44c2277 RS |
272 | else if (code == '\'') {*p++ = '\\'; *p++ = '\'';} |
273 | else if (code == '\"') {*p++ = '\\'; *p++ = '\"';} | |
5ce94c29 RS |
274 | else if (code >= 040 && code < 0177) |
275 | *p++ = code; | |
a44c2277 RS |
276 | else if (code == '\t') {*p++ = '\\'; *p++ = 't';} |
277 | else if (code == '\n') {*p++ = '\\'; *p++ = 'n';} | |
278 | else if (code == '\r') {*p++ = '\\'; *p++ = 'r';} | |
279 | else if (code == '\v') {*p++ = '\\'; *p++ = 'v';} | |
280 | else if (code == '\b') {*p++ = '\\'; *p++ = 'b';} | |
281 | else if (code == '\f') {*p++ = '\\'; *p++ = 'f';} | |
282 | else | |
283 | { | |
284 | *p++ = '\\'; | |
285 | *p++ = code / 0100 + '0'; | |
286 | *p++ = ((code / 010) & 07) + '0'; | |
287 | *p++ = (code & 07) + '0'; | |
288 | } | |
289 | *pp = p; | |
290 | *pcode = code; | |
291 | return ! wasquote; | |
292 | } | |
293 | ||
40675e7c DM |
294 | |
295 | void | |
296 | unlex(token) | |
a44c2277 | 297 | int token; |
40675e7c DM |
298 | { |
299 | unlexed = token; | |
300 | unlexed_symval = symval; | |
301 | } | |
302 | ||
303 | ||
40675e7c DM |
304 | int |
305 | lex() | |
306 | { | |
307 | register int c; | |
a44c2277 | 308 | char *p; |
40675e7c DM |
309 | |
310 | if (unlexed >= 0) | |
311 | { | |
312 | symval = unlexed_symval; | |
313 | c = unlexed; | |
314 | unlexed = -1; | |
315 | return (c); | |
316 | } | |
317 | ||
318 | c = skip_white_space(); | |
a44c2277 RS |
319 | *token_buffer = c; /* for error messages (token buffer always valid) */ |
320 | token_buffer[1] = 0; | |
40675e7c DM |
321 | |
322 | switch (c) | |
323 | { | |
324 | case EOF: | |
a44c2277 | 325 | strcpy(token_buffer, "EOF"); |
40675e7c DM |
326 | return (ENDFILE); |
327 | ||
328 | case 'A': case 'B': case 'C': case 'D': case 'E': | |
329 | case 'F': case 'G': case 'H': case 'I': case 'J': | |
330 | case 'K': case 'L': case 'M': case 'N': case 'O': | |
331 | case 'P': case 'Q': case 'R': case 'S': case 'T': | |
332 | case 'U': case 'V': case 'W': case 'X': case 'Y': | |
333 | case 'Z': | |
334 | case 'a': case 'b': case 'c': case 'd': case 'e': | |
335 | case 'f': case 'g': case 'h': case 'i': case 'j': | |
336 | case 'k': case 'l': case 'm': case 'n': case 'o': | |
337 | case 'p': case 'q': case 'r': case 's': case 't': | |
338 | case 'u': case 'v': case 'w': case 'x': case 'y': | |
339 | case 'z': | |
340 | case '.': case '_': | |
341 | p = token_buffer; | |
342 | while (isalnum(c) || c == '_' || c == '.') | |
343 | { | |
344 | if (p == token_buffer + maxtoken) | |
345 | p = grow_token_buffer(p); | |
346 | ||
347 | *p++ = c; | |
348 | c = getc(finput); | |
349 | } | |
350 | ||
351 | *p = 0; | |
352 | ungetc(c, finput); | |
353 | symval = getsym(token_buffer); | |
354 | return (IDENTIFIER); | |
355 | ||
356 | case '0': case '1': case '2': case '3': case '4': | |
357 | case '5': case '6': case '7': case '8': case '9': | |
358 | { | |
359 | numval = 0; | |
360 | ||
a44c2277 | 361 | p = token_buffer; |
40675e7c DM |
362 | while (isdigit(c)) |
363 | { | |
a44c2277 RS |
364 | if (p == token_buffer + maxtoken) |
365 | p = grow_token_buffer(p); | |
366 | ||
367 | *p++ = c; | |
40675e7c DM |
368 | numval = numval*10 + c - '0'; |
369 | c = getc(finput); | |
370 | } | |
a44c2277 | 371 | *p = 0; |
40675e7c DM |
372 | ungetc(c, finput); |
373 | return (NUMBER); | |
374 | } | |
375 | ||
376 | case '\'': | |
40675e7c DM |
377 | |
378 | /* parse the literal token and compute character code in code */ | |
379 | ||
a44c2277 | 380 | translations = -1; |
40675e7c | 381 | { |
a44c2277 RS |
382 | int code, discode; |
383 | char discard[10], *dp; | |
5ce94c29 | 384 | |
a44c2277 RS |
385 | p = token_buffer; |
386 | *p++ = '\''; | |
387 | literalchar(&p, &code, '\''); | |
40675e7c | 388 | |
a44c2277 RS |
389 | c = getc(finput); |
390 | if (c != '\'') | |
40675e7c | 391 | { |
a083fbbf | 392 | warn(_("use \"...\" for multi-character literal tokens")); |
5ce94c29 RS |
393 | while (1) |
394 | { | |
395 | dp = discard; | |
396 | if (! literalchar(&dp, &discode, '\'')) | |
397 | break; | |
398 | } | |
40675e7c | 399 | } |
a44c2277 RS |
400 | *p++ = '\''; |
401 | *p = 0; | |
402 | symval = getsym(token_buffer); | |
403 | symval->class = STOKEN; | |
404 | if (! symval->user_token_number) | |
405 | symval->user_token_number = code; | |
406 | return (IDENTIFIER); | |
407 | } | |
40675e7c | 408 | |
a44c2277 | 409 | case '\"': |
40675e7c | 410 | |
a44c2277 RS |
411 | /* parse the literal string token and treat as an identifier */ |
412 | ||
413 | translations = -1; | |
414 | { | |
415 | int code; /* ignored here */ | |
40675e7c | 416 | p = token_buffer; |
a44c2277 RS |
417 | *p++ = '\"'; |
418 | while (literalchar(&p, &code, '\"')) /* read up to and including " */ | |
40675e7c | 419 | { |
a44c2277 RS |
420 | if (p >= token_buffer + maxtoken - 4) |
421 | p = grow_token_buffer(p); | |
40675e7c | 422 | } |
40675e7c | 423 | *p = 0; |
a44c2277 | 424 | |
40675e7c DM |
425 | symval = getsym(token_buffer); |
426 | symval->class = STOKEN; | |
a44c2277 | 427 | |
40675e7c DM |
428 | return (IDENTIFIER); |
429 | } | |
430 | ||
431 | case ',': | |
432 | return (COMMA); | |
433 | ||
434 | case ':': | |
435 | return (COLON); | |
436 | ||
437 | case ';': | |
438 | return (SEMICOLON); | |
439 | ||
440 | case '|': | |
441 | return (BAR); | |
442 | ||
443 | case '{': | |
444 | return (LEFT_CURLY); | |
445 | ||
446 | case '=': | |
447 | do | |
448 | { | |
449 | c = getc(finput); | |
450 | if (c == '\n') lineno++; | |
451 | } | |
452 | while(c==' ' || c=='\n' || c=='\t'); | |
453 | ||
454 | if (c == '{') | |
a44c2277 RS |
455 | { |
456 | strcpy(token_buffer, "={"); | |
457 | return(LEFT_CURLY); | |
458 | } | |
40675e7c DM |
459 | else |
460 | { | |
461 | ungetc(c, finput); | |
462 | return(ILLEGAL); | |
463 | } | |
464 | ||
465 | case '<': | |
466 | p = token_buffer; | |
467 | c = getc(finput); | |
468 | while (c != '>') | |
469 | { | |
a44c2277 | 470 | if (c == EOF) |
a083fbbf RS |
471 | fatal(_("unterminated type name at end of file")); |
472 | if (c == '\n') | |
a44c2277 | 473 | { |
a083fbbf | 474 | warn(_("unterminated type name")); |
a44c2277 RS |
475 | ungetc(c, finput); |
476 | break; | |
477 | } | |
40675e7c DM |
478 | |
479 | if (p == token_buffer + maxtoken) | |
480 | p = grow_token_buffer(p); | |
481 | ||
482 | *p++ = c; | |
483 | c = getc(finput); | |
484 | } | |
485 | *p = 0; | |
486 | return (TYPENAME); | |
a083fbbf | 487 | |
40675e7c DM |
488 | |
489 | case '%': | |
490 | return (parse_percent_token()); | |
491 | ||
492 | default: | |
493 | return (ILLEGAL); | |
494 | } | |
495 | } | |
496 | ||
a083fbbf | 497 | /* the following table dictates the action taken for the various |
a44c2277 RS |
498 | % directives. A setflag value causes the named flag to be |
499 | set. A retval action returns the code. | |
500 | */ | |
501 | struct percent_table_struct { | |
502 | char *name; | |
a083fbbf | 503 | void *setflag; |
a44c2277 RS |
504 | int retval; |
505 | } percent_table[] = | |
506 | { | |
507 | {"token", NULL, TOKEN}, | |
508 | {"term", NULL, TOKEN}, | |
509 | {"nterm", NULL, NTERM}, | |
510 | {"type", NULL, TYPE}, | |
511 | {"guard", NULL, GUARD}, | |
512 | {"union", NULL, UNION}, | |
513 | {"expect", NULL, EXPECT}, | |
514 | {"thong", NULL, THONG}, | |
515 | {"start", NULL, START}, | |
516 | {"left", NULL, LEFT}, | |
517 | {"right", NULL, RIGHT}, | |
518 | {"nonassoc", NULL, NONASSOC}, | |
519 | {"binary", NULL, NONASSOC}, | |
520 | {"semantic_parser", NULL, SEMANTIC_PARSER}, | |
521 | {"pure_parser", NULL, PURE_PARSER}, | |
522 | {"prec", NULL, PREC}, | |
523 | ||
524 | {"no_lines", &nolinesflag, NOOP}, /* -l */ | |
525 | {"raw", &rawtoknumflag, NOOP}, /* -r */ | |
526 | {"token_table", &toknumflag, NOOP}, /* -k */ | |
527 | ||
528 | #if 0 | |
529 | /* These can be utilized after main is reoganized so | |
530 | open_files() is deferred 'til after read_declarations(). | |
531 | But %{ and %union both put information into files | |
532 | that have to be opened before read_declarations(). | |
533 | */ | |
534 | {"yacc", &fixed_outfiles, NOOP}, /* -y */ | |
535 | {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */ | |
536 | {"defines", &definesflag, NOOP}, /* -d */ | |
537 | {"no_parser", &noparserflag, NOOP}, /* -n */ | |
538 | {"output_file", &spec_outfile, SETOPT}, /* -o */ | |
539 | {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */ | |
540 | {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */ | |
541 | ||
542 | /* These would be acceptable, but they do not affect processing */ | |
543 | {"verbose", &verboseflag, NOOP}, /* -v */ | |
544 | {"debug", &debugflag, NOOP}, /* -t */ | |
545 | /* {"help", <print usage stmt>, NOOP}, /* -h */ | |
546 | /* {"version", <print version number> , NOOP}, /* -V */ | |
547 | #endif | |
548 | ||
549 | {NULL, NULL, ILLEGAL} | |
550 | }; | |
551 | ||
552 | /* Parse a token which starts with %. | |
553 | Assumes the % has already been read and discarded. */ | |
40675e7c DM |
554 | |
555 | int | |
556 | parse_percent_token () | |
557 | { | |
558 | register int c; | |
559 | register char *p; | |
a44c2277 | 560 | register struct percent_table_struct *tx; |
40675e7c DM |
561 | |
562 | p = token_buffer; | |
563 | c = getc(finput); | |
a44c2277 RS |
564 | *p++ = '%'; |
565 | *p++ = c; /* for error msg */ | |
566 | *p = 0; | |
40675e7c DM |
567 | |
568 | switch (c) | |
569 | { | |
570 | case '%': | |
571 | return (TWO_PERCENTS); | |
572 | ||
573 | case '{': | |
574 | return (PERCENT_LEFT_CURLY); | |
575 | ||
576 | case '<': | |
577 | return (LEFT); | |
578 | ||
579 | case '>': | |
580 | return (RIGHT); | |
581 | ||
582 | case '2': | |
583 | return (NONASSOC); | |
584 | ||
585 | case '0': | |
586 | return (TOKEN); | |
587 | ||
588 | case '=': | |
589 | return (PREC); | |
590 | } | |
a083fbbf | 591 | if (!isalpha(c)) |
40675e7c DM |
592 | return (ILLEGAL); |
593 | ||
a44c2277 RS |
594 | p = token_buffer; |
595 | *p++ = '%'; | |
596 | while (isalpha(c) || c == '_' || c == '-') | |
40675e7c DM |
597 | { |
598 | if (p == token_buffer + maxtoken) | |
599 | p = grow_token_buffer(p); | |
600 | ||
a44c2277 | 601 | if (c == '-') c = '_'; |
40675e7c DM |
602 | *p++ = c; |
603 | c = getc(finput); | |
604 | } | |
605 | ||
606 | ungetc(c, finput); | |
607 | ||
608 | *p = 0; | |
609 | ||
a44c2277 RS |
610 | /* table lookup % directive */ |
611 | for (tx = percent_table; tx->name; tx++) | |
612 | if (strcmp(token_buffer+1, tx->name) == 0) | |
613 | break; | |
614 | if (tx->retval == SETOPT) | |
615 | { | |
616 | *((char **)(tx->setflag)) = optarg; | |
617 | return NOOP; | |
618 | } | |
619 | if (tx->setflag) | |
620 | { | |
621 | *((int *)(tx->setflag)) = 1; | |
622 | return NOOP; | |
623 | } | |
624 | return tx->retval; | |
40675e7c | 625 | } |