]>
Commit | Line | Data |
---|---|---|
40675e7c | 1 | /* Token-reader for Bison's input parser, |
a44c2277 | 2 | Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc. |
40675e7c DM |
3 | |
4 | This file is part of Bison, the GNU Compiler Compiler. | |
5 | ||
6 | Bison is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 2, or (at your option) | |
9 | any later version. | |
10 | ||
11 | Bison is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with Bison; see the file COPYING. If not, write to | |
18 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ | |
19 | ||
20 | ||
21 | /* | |
a44c2277 | 22 | lex is the entry point. It is called from reader.c. |
40675e7c DM |
23 | It returns one of the token-type codes defined in lex.h. |
24 | When an identifier is seen, the code IDENTIFIER is returned | |
25 | and the name is looked up in the symbol table using symtab.c; | |
26 | symval is set to a pointer to the entry found. */ | |
27 | ||
28 | #include <stdio.h> | |
29 | #include <ctype.h> | |
30 | #include "system.h" | |
31 | #include "files.h" | |
a44c2277 | 32 | #include "getopt.h" /* for optarg */ |
40675e7c DM |
33 | #include "symtab.h" |
34 | #include "lex.h" | |
35 | #include "new.h" | |
36 | ||
a44c2277 RS |
/* Flags that % directives in the grammar file can set; the
   corresponding command-line option is noted beside each one.  */
extern int definesflag;         /* for -d */
extern int toknumflag;          /* for -k */
extern int noparserflag;        /* for -n */
extern int fixed_outfiles;      /* for -y */
extern int nolinesflag;         /* for -l */
extern int rawtoknumflag;       /* for -r */
extern int verboseflag;         /* for -v */
extern int debugflag;           /* for -t */
extern char *spec_name_prefix;  /* for -p */
extern char *spec_file_prefix;  /* for -b */
/* spec_outfile (for -o) is declared in files.h.  */

/* Current input line number; incremented here for every newline read.  */
extern int lineno;
extern int translations;

int parse_percent_token();

/* Functions from main.c.  */
extern char *printable_version();
extern void fatal();
extern void warni();
extern void warn();
40675e7c DM |
60 | |
61 | /* Buffer for storing the current token. */ | |
62 | char *token_buffer; | |
63 | ||
64 | /* Allocated size of token_buffer, not including space for terminator. */ | |
65 | static int maxtoken; | |
66 | ||
67 | bucket *symval; | |
68 | int numval; | |
69 | ||
70 | static int unlexed; /* these two describe a token to be reread */ | |
71 | static bucket *unlexed_symval; /* by the next call to lex */ | |
72 | ||
73 | ||
74 | void | |
75 | init_lex() | |
76 | { | |
77 | maxtoken = 100; | |
78 | token_buffer = NEW2 (maxtoken + 1, char); | |
79 | unlexed = -1; | |
80 | } | |
81 | ||
82 | ||
83 | static char * | |
84 | grow_token_buffer (p) | |
85 | char *p; | |
86 | { | |
87 | int offset = p - token_buffer; | |
88 | maxtoken *= 2; | |
89 | token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1); | |
90 | return token_buffer + offset; | |
91 | } | |
92 | ||
93 | ||
94 | int | |
95 | skip_white_space() | |
96 | { | |
97 | register int c; | |
98 | register int inside; | |
99 | ||
100 | c = getc(finput); | |
101 | ||
102 | for (;;) | |
103 | { | |
104 | int cplus_comment; | |
105 | ||
106 | switch (c) | |
107 | { | |
108 | case '/': | |
109 | c = getc(finput); | |
a44c2277 RS |
110 | if (c != '*' && c != '/') |
111 | { | |
112 | warn("unexpected `/' found and ignored"); | |
113 | break; | |
114 | } | |
40675e7c DM |
115 | cplus_comment = (c == '/'); |
116 | ||
117 | c = getc(finput); | |
118 | ||
119 | inside = 1; | |
120 | while (inside) | |
121 | { | |
122 | if (!cplus_comment && c == '*') | |
123 | { | |
124 | while (c == '*') | |
125 | c = getc(finput); | |
126 | ||
127 | if (c == '/') | |
128 | { | |
129 | inside = 0; | |
130 | c = getc(finput); | |
131 | } | |
132 | } | |
133 | else if (c == '\n') | |
134 | { | |
135 | lineno++; | |
136 | if (cplus_comment) | |
137 | inside = 0; | |
138 | c = getc(finput); | |
139 | } | |
140 | else if (c == EOF) | |
141 | fatal("unterminated comment"); | |
142 | else | |
143 | c = getc(finput); | |
144 | } | |
145 | ||
146 | break; | |
147 | ||
148 | case '\n': | |
149 | lineno++; | |
150 | ||
151 | case ' ': | |
152 | case '\t': | |
153 | case '\f': | |
154 | c = getc(finput); | |
155 | break; | |
156 | ||
157 | default: | |
158 | return (c); | |
159 | } | |
160 | } | |
161 | } | |
162 | ||
a44c2277 RS |
/* Read one character from F with getc and return it.  If getc
   reports EOF, call fatal with an "Unexpected end of file" message.  */
int
safegetc(f)
     FILE *f;
{
  register int ch;

  ch = getc(f);
  if (ch != EOF)
    return ch;

  fatal("Unexpected end of file");
  return ch;
}
173 | ||
174 | /* read one literal character from finput. process \ escapes. | |
175 | append the normalized string version of the char to *pp. | |
176 | assign the character code to *pcode | |
177 | return 1 unless the character is an unescaped `term' or \n | |
178 | report error for \n | |
179 | */ | |
180 | int | |
181 | literalchar(pp, pcode, term) | |
182 | char **pp; | |
183 | int *pcode; | |
184 | char term; | |
185 | { | |
186 | register int c; | |
187 | register char *p; | |
188 | register int code; | |
189 | int wasquote = 0; | |
190 | ||
191 | c = safegetc(finput); | |
192 | if (c == '\n') | |
193 | { | |
194 | warn("unescaped newline in constant"); | |
195 | ungetc(c, finput); | |
196 | code = '?'; | |
197 | wasquote = 1; | |
198 | } | |
199 | else if (c != '\\') | |
200 | { | |
201 | code = c; | |
202 | if (c == term) | |
203 | wasquote = 1; | |
204 | } | |
205 | else | |
206 | { | |
207 | c = safegetc(finput); | |
208 | if (c == 't') code = '\t'; | |
209 | else if (c == 'n') code = '\n'; | |
210 | else if (c == 'a') code = '\007'; | |
211 | else if (c == 'r') code = '\r'; | |
212 | else if (c == 'f') code = '\f'; | |
213 | else if (c == 'b') code = '\b'; | |
214 | else if (c == 'v') code = 013; | |
215 | else if (c == '\\') code = '\\'; | |
216 | else if (c == '\'') code = '\''; | |
217 | else if (c == '\"') code = '\"'; | |
218 | else if (c <= '7' && c >= '0') | |
219 | { | |
220 | code = 0; | |
221 | while (c <= '7' && c >= '0') | |
222 | { | |
223 | code = (code * 8) + (c - '0'); | |
224 | if (code >= 256 || code < 0) | |
225 | { | |
226 | warni("octal value outside range 0...255: `\\%o'", code); | |
227 | code &= 0xFF; | |
228 | break; | |
229 | } | |
230 | c = safegetc(finput); | |
231 | } | |
232 | ungetc(c, finput); | |
233 | } | |
234 | else if (c == 'x') | |
235 | { | |
236 | c = safegetc(finput); | |
237 | code = 0; | |
238 | while (1) | |
239 | { | |
240 | if (c >= '0' && c <= '9') | |
241 | code *= 16, code += c - '0'; | |
242 | else if (c >= 'a' && c <= 'f') | |
243 | code *= 16, code += c - 'a' + 10; | |
244 | else if (c >= 'A' && c <= 'F') | |
245 | code *= 16, code += c - 'A' + 10; | |
246 | else | |
247 | break; | |
248 | if (code >= 256 || code<0) | |
249 | { | |
250 | warni("hexadecimal value above 255: `\\x%x'", code); | |
251 | code &= 0xFF; | |
252 | break; | |
253 | } | |
254 | c = safegetc(finput); | |
255 | } | |
256 | ungetc(c, finput); | |
257 | } | |
258 | else | |
259 | { | |
260 | warni ("unknown escape sequence: `\\' followed by `%s'", | |
261 | printable_version(c)); | |
262 | code = '?'; | |
263 | } | |
264 | } /* has \ */ | |
265 | ||
266 | /* now fill token_buffer with the canonical name for this character | |
267 | as a literal token. Do not use what the user typed, | |
268 | so that `\012' and `\n' can be interchangeable. */ | |
269 | ||
270 | p = *pp; | |
271 | if (code >= 040 && code < 0177) | |
272 | *p++ = code; | |
273 | else if (code == '\\') {*p++ = '\\'; *p++ = '\\';} | |
274 | else if (code == '\'') {*p++ = '\\'; *p++ = '\'';} | |
275 | else if (code == '\"') {*p++ = '\\'; *p++ = '\"';} | |
276 | else if (code == '\t') {*p++ = '\\'; *p++ = 't';} | |
277 | else if (code == '\n') {*p++ = '\\'; *p++ = 'n';} | |
278 | else if (code == '\r') {*p++ = '\\'; *p++ = 'r';} | |
279 | else if (code == '\v') {*p++ = '\\'; *p++ = 'v';} | |
280 | else if (code == '\b') {*p++ = '\\'; *p++ = 'b';} | |
281 | else if (code == '\f') {*p++ = '\\'; *p++ = 'f';} | |
282 | else | |
283 | { | |
284 | *p++ = '\\'; | |
285 | *p++ = code / 0100 + '0'; | |
286 | *p++ = ((code / 010) & 07) + '0'; | |
287 | *p++ = (code & 07) + '0'; | |
288 | } | |
289 | *pp = p; | |
290 | *pcode = code; | |
291 | return ! wasquote; | |
292 | } | |
293 | ||
40675e7c DM |
294 | |
295 | void | |
296 | unlex(token) | |
a44c2277 | 297 | int token; |
40675e7c DM |
298 | { |
299 | unlexed = token; | |
300 | unlexed_symval = symval; | |
301 | } | |
302 | ||
303 | ||
40675e7c DM |
304 | int |
305 | lex() | |
306 | { | |
307 | register int c; | |
a44c2277 | 308 | char *p; |
40675e7c DM |
309 | |
310 | if (unlexed >= 0) | |
311 | { | |
312 | symval = unlexed_symval; | |
313 | c = unlexed; | |
314 | unlexed = -1; | |
315 | return (c); | |
316 | } | |
317 | ||
318 | c = skip_white_space(); | |
a44c2277 RS |
319 | *token_buffer = c; /* for error messages (token buffer always valid) */ |
320 | token_buffer[1] = 0; | |
40675e7c DM |
321 | |
322 | switch (c) | |
323 | { | |
324 | case EOF: | |
a44c2277 | 325 | strcpy(token_buffer, "EOF"); |
40675e7c DM |
326 | return (ENDFILE); |
327 | ||
328 | case 'A': case 'B': case 'C': case 'D': case 'E': | |
329 | case 'F': case 'G': case 'H': case 'I': case 'J': | |
330 | case 'K': case 'L': case 'M': case 'N': case 'O': | |
331 | case 'P': case 'Q': case 'R': case 'S': case 'T': | |
332 | case 'U': case 'V': case 'W': case 'X': case 'Y': | |
333 | case 'Z': | |
334 | case 'a': case 'b': case 'c': case 'd': case 'e': | |
335 | case 'f': case 'g': case 'h': case 'i': case 'j': | |
336 | case 'k': case 'l': case 'm': case 'n': case 'o': | |
337 | case 'p': case 'q': case 'r': case 's': case 't': | |
338 | case 'u': case 'v': case 'w': case 'x': case 'y': | |
339 | case 'z': | |
340 | case '.': case '_': | |
341 | p = token_buffer; | |
342 | while (isalnum(c) || c == '_' || c == '.') | |
343 | { | |
344 | if (p == token_buffer + maxtoken) | |
345 | p = grow_token_buffer(p); | |
346 | ||
347 | *p++ = c; | |
348 | c = getc(finput); | |
349 | } | |
350 | ||
351 | *p = 0; | |
352 | ungetc(c, finput); | |
353 | symval = getsym(token_buffer); | |
354 | return (IDENTIFIER); | |
355 | ||
356 | case '0': case '1': case '2': case '3': case '4': | |
357 | case '5': case '6': case '7': case '8': case '9': | |
358 | { | |
359 | numval = 0; | |
360 | ||
a44c2277 | 361 | p = token_buffer; |
40675e7c DM |
362 | while (isdigit(c)) |
363 | { | |
a44c2277 RS |
364 | if (p == token_buffer + maxtoken) |
365 | p = grow_token_buffer(p); | |
366 | ||
367 | *p++ = c; | |
40675e7c DM |
368 | numval = numval*10 + c - '0'; |
369 | c = getc(finput); | |
370 | } | |
a44c2277 | 371 | *p = 0; |
40675e7c DM |
372 | ungetc(c, finput); |
373 | return (NUMBER); | |
374 | } | |
375 | ||
376 | case '\'': | |
40675e7c DM |
377 | |
378 | /* parse the literal token and compute character code in code */ | |
379 | ||
a44c2277 | 380 | translations = -1; |
40675e7c | 381 | { |
a44c2277 RS |
382 | int code, discode; |
383 | char discard[10], *dp; | |
384 | p = token_buffer; | |
385 | *p++ = '\''; | |
386 | literalchar(&p, &code, '\''); | |
40675e7c | 387 | |
a44c2277 RS |
388 | c = getc(finput); |
389 | if (c != '\'') | |
40675e7c | 390 | { |
a44c2277 RS |
391 | warn("use \"...\" for multi-character literal tokens"); |
392 | dp = discard; | |
393 | while (literalchar(&dp, &discode, '\'')) {} | |
40675e7c | 394 | } |
a44c2277 RS |
395 | *p++ = '\''; |
396 | *p = 0; | |
397 | symval = getsym(token_buffer); | |
398 | symval->class = STOKEN; | |
399 | if (! symval->user_token_number) | |
400 | symval->user_token_number = code; | |
401 | return (IDENTIFIER); | |
402 | } | |
40675e7c | 403 | |
a44c2277 | 404 | case '\"': |
40675e7c | 405 | |
a44c2277 RS |
406 | /* parse the literal string token and treat as an identifier */ |
407 | ||
408 | translations = -1; | |
409 | { | |
410 | int code; /* ignored here */ | |
40675e7c | 411 | p = token_buffer; |
a44c2277 RS |
412 | *p++ = '\"'; |
413 | while (literalchar(&p, &code, '\"')) /* read up to and including " */ | |
40675e7c | 414 | { |
a44c2277 RS |
415 | if (p >= token_buffer + maxtoken - 4) |
416 | p = grow_token_buffer(p); | |
40675e7c | 417 | } |
40675e7c | 418 | *p = 0; |
a44c2277 | 419 | |
40675e7c DM |
420 | symval = getsym(token_buffer); |
421 | symval->class = STOKEN; | |
a44c2277 | 422 | |
40675e7c DM |
423 | return (IDENTIFIER); |
424 | } | |
425 | ||
426 | case ',': | |
427 | return (COMMA); | |
428 | ||
429 | case ':': | |
430 | return (COLON); | |
431 | ||
432 | case ';': | |
433 | return (SEMICOLON); | |
434 | ||
435 | case '|': | |
436 | return (BAR); | |
437 | ||
438 | case '{': | |
439 | return (LEFT_CURLY); | |
440 | ||
441 | case '=': | |
442 | do | |
443 | { | |
444 | c = getc(finput); | |
445 | if (c == '\n') lineno++; | |
446 | } | |
447 | while(c==' ' || c=='\n' || c=='\t'); | |
448 | ||
449 | if (c == '{') | |
a44c2277 RS |
450 | { |
451 | strcpy(token_buffer, "={"); | |
452 | return(LEFT_CURLY); | |
453 | } | |
40675e7c DM |
454 | else |
455 | { | |
456 | ungetc(c, finput); | |
457 | return(ILLEGAL); | |
458 | } | |
459 | ||
460 | case '<': | |
461 | p = token_buffer; | |
462 | c = getc(finput); | |
463 | while (c != '>') | |
464 | { | |
a44c2277 RS |
465 | if (c == EOF) |
466 | fatal("unterminated type name at end of file"); | |
467 | if (c == '\n') | |
468 | { | |
469 | warn("unterminated type name"); | |
470 | ungetc(c, finput); | |
471 | break; | |
472 | } | |
40675e7c DM |
473 | |
474 | if (p == token_buffer + maxtoken) | |
475 | p = grow_token_buffer(p); | |
476 | ||
477 | *p++ = c; | |
478 | c = getc(finput); | |
479 | } | |
480 | *p = 0; | |
481 | return (TYPENAME); | |
482 | ||
483 | ||
484 | case '%': | |
485 | return (parse_percent_token()); | |
486 | ||
487 | default: | |
488 | return (ILLEGAL); | |
489 | } | |
490 | } | |
491 | ||
a44c2277 RS |
492 | /* the following table dictates the action taken for the various |
493 | % directives. A setflag value causes the named flag to be | |
494 | set. A retval action returns the code. | |
495 | */ | |
496 | struct percent_table_struct { | |
497 | char *name; | |
498 | void *setflag; | |
499 | int retval; | |
500 | } percent_table[] = | |
501 | { | |
502 | {"token", NULL, TOKEN}, | |
503 | {"term", NULL, TOKEN}, | |
504 | {"nterm", NULL, NTERM}, | |
505 | {"type", NULL, TYPE}, | |
506 | {"guard", NULL, GUARD}, | |
507 | {"union", NULL, UNION}, | |
508 | {"expect", NULL, EXPECT}, | |
509 | {"thong", NULL, THONG}, | |
510 | {"start", NULL, START}, | |
511 | {"left", NULL, LEFT}, | |
512 | {"right", NULL, RIGHT}, | |
513 | {"nonassoc", NULL, NONASSOC}, | |
514 | {"binary", NULL, NONASSOC}, | |
515 | {"semantic_parser", NULL, SEMANTIC_PARSER}, | |
516 | {"pure_parser", NULL, PURE_PARSER}, | |
517 | {"prec", NULL, PREC}, | |
518 | ||
519 | {"no_lines", &nolinesflag, NOOP}, /* -l */ | |
520 | {"raw", &rawtoknumflag, NOOP}, /* -r */ | |
521 | {"token_table", &toknumflag, NOOP}, /* -k */ | |
522 | ||
523 | #if 0 | |
524 | /* These can be utilized after main is reoganized so | |
525 | open_files() is deferred 'til after read_declarations(). | |
526 | But %{ and %union both put information into files | |
527 | that have to be opened before read_declarations(). | |
528 | */ | |
529 | {"yacc", &fixed_outfiles, NOOP}, /* -y */ | |
530 | {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */ | |
531 | {"defines", &definesflag, NOOP}, /* -d */ | |
532 | {"no_parser", &noparserflag, NOOP}, /* -n */ | |
533 | {"output_file", &spec_outfile, SETOPT}, /* -o */ | |
534 | {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */ | |
535 | {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */ | |
536 | ||
537 | /* These would be acceptable, but they do not affect processing */ | |
538 | {"verbose", &verboseflag, NOOP}, /* -v */ | |
539 | {"debug", &debugflag, NOOP}, /* -t */ | |
540 | /* {"help", <print usage stmt>, NOOP}, /* -h */ | |
541 | /* {"version", <print version number> , NOOP}, /* -V */ | |
542 | #endif | |
543 | ||
544 | {NULL, NULL, ILLEGAL} | |
545 | }; | |
546 | ||
547 | /* Parse a token which starts with %. | |
548 | Assumes the % has already been read and discarded. */ | |
40675e7c DM |
549 | |
550 | int | |
551 | parse_percent_token () | |
552 | { | |
553 | register int c; | |
554 | register char *p; | |
a44c2277 | 555 | register struct percent_table_struct *tx; |
40675e7c DM |
556 | |
557 | p = token_buffer; | |
558 | c = getc(finput); | |
a44c2277 RS |
559 | *p++ = '%'; |
560 | *p++ = c; /* for error msg */ | |
561 | *p = 0; | |
40675e7c DM |
562 | |
563 | switch (c) | |
564 | { | |
565 | case '%': | |
566 | return (TWO_PERCENTS); | |
567 | ||
568 | case '{': | |
569 | return (PERCENT_LEFT_CURLY); | |
570 | ||
571 | case '<': | |
572 | return (LEFT); | |
573 | ||
574 | case '>': | |
575 | return (RIGHT); | |
576 | ||
577 | case '2': | |
578 | return (NONASSOC); | |
579 | ||
580 | case '0': | |
581 | return (TOKEN); | |
582 | ||
583 | case '=': | |
584 | return (PREC); | |
585 | } | |
a44c2277 | 586 | if (!isalpha(c)) |
40675e7c DM |
587 | return (ILLEGAL); |
588 | ||
a44c2277 RS |
589 | p = token_buffer; |
590 | *p++ = '%'; | |
591 | while (isalpha(c) || c == '_' || c == '-') | |
40675e7c DM |
592 | { |
593 | if (p == token_buffer + maxtoken) | |
594 | p = grow_token_buffer(p); | |
595 | ||
a44c2277 | 596 | if (c == '-') c = '_'; |
40675e7c DM |
597 | *p++ = c; |
598 | c = getc(finput); | |
599 | } | |
600 | ||
601 | ungetc(c, finput); | |
602 | ||
603 | *p = 0; | |
604 | ||
a44c2277 RS |
605 | /* table lookup % directive */ |
606 | for (tx = percent_table; tx->name; tx++) | |
607 | if (strcmp(token_buffer+1, tx->name) == 0) | |
608 | break; | |
609 | if (tx->retval == SETOPT) | |
610 | { | |
611 | *((char **)(tx->setflag)) = optarg; | |
612 | return NOOP; | |
613 | } | |
614 | if (tx->setflag) | |
615 | { | |
616 | *((int *)(tx->setflag)) = 1; | |
617 | return NOOP; | |
618 | } | |
619 | return tx->retval; | |
40675e7c | 620 | } |