]>
Commit | Line | Data |
---|---|---|
1 | /* Token-reader for Bison's input parser, | |
2 | Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc. | |
3 | ||
4 | This file is part of Bison, the GNU Compiler Compiler. | |
5 | ||
6 | Bison is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 2, or (at your option) | |
9 | any later version. | |
10 | ||
11 | Bison is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with Bison; see the file COPYING. If not, write to | |
18 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ | |
19 | ||
20 | ||
21 | /* | |
22 | lex is the entry point. It is called from reader.c. | |
23 | It returns one of the token-type codes defined in lex.h. | |
24 | When an identifier is seen, the code IDENTIFIER is returned | |
25 | and the name is looked up in the symbol table using symtab.c; | |
26 | symval is set to a pointer to the entry found. */ | |
27 | ||
#include <limits.h>
#include <stdio.h>
#include "system.h"
#include "files.h"
#include "getopt.h"		/* for optarg */
#include "symtab.h"
#include "lex.h"
#include "alloc.h"
35 | ||
36 | /* flags set by % directives */ | |
37 | extern int definesflag; /* for -d */ | |
38 | extern int toknumflag; /* for -k */ | |
39 | extern int noparserflag; /* for -n */ | |
40 | extern int fixed_outfiles; /* for -y */ | |
41 | extern int nolinesflag; /* for -l */ | |
42 | extern int rawtoknumflag; /* for -r */ | |
43 | extern int verboseflag; /* for -v */ | |
44 | extern int debugflag; /* for -t */ | |
45 | extern char *spec_name_prefix; /* for -p */ | |
46 | extern char *spec_file_prefix; /* for -b */ | |
47 | /*spec_outfile is declared in files.h, for -o */ | |
48 | ||
49 | extern int lineno; | |
50 | extern int translations; | |
51 | ||
52 | void init_lex PARAMS((void)); | |
53 | char *grow_token_buffer PARAMS((char *)); | |
54 | int skip_white_space PARAMS((void)); | |
55 | int safegetc PARAMS((FILE *)); | |
56 | int literalchar PARAMS((char **, int *, char)); | |
57 | void unlex PARAMS((int)); | |
58 | int lex PARAMS((void)); | |
59 | int parse_percent_token PARAMS((void)); | |
60 | ||
61 | /* functions from main.c */ | |
62 | extern char *printable_version PARAMS((int)); | |
63 | extern void fatal PARAMS((char *)); | |
64 | extern void warn PARAMS((char *)); | |
65 | extern void warni PARAMS((char *, int)); | |
66 | extern void warns PARAMS((char *, char *)); | |
67 | ||
68 | /* Buffer for storing the current token. */ | |
69 | char *token_buffer; | |
70 | ||
71 | /* Allocated size of token_buffer, not including space for terminator. */ | |
72 | int maxtoken; | |
73 | ||
74 | bucket *symval; | |
75 | int numval; | |
76 | ||
77 | static int unlexed; /* these two describe a token to be reread */ | |
78 | static bucket *unlexed_symval; /* by the next call to lex */ | |
79 | ||
80 | ||
81 | void | |
82 | init_lex (void) | |
83 | { | |
84 | maxtoken = 100; | |
85 | token_buffer = NEW2 (maxtoken + 1, char); | |
86 | unlexed = -1; | |
87 | } | |
88 | ||
89 | ||
90 | char * | |
91 | grow_token_buffer (char *p) | |
92 | { | |
93 | int offset = p - token_buffer; | |
94 | maxtoken *= 2; | |
95 | token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1); | |
96 | return token_buffer + offset; | |
97 | } | |
98 | ||
99 | ||
100 | int | |
101 | skip_white_space (void) | |
102 | { | |
103 | register int c; | |
104 | register int inside; | |
105 | ||
106 | c = getc(finput); | |
107 | ||
108 | for (;;) | |
109 | { | |
110 | int cplus_comment; | |
111 | ||
112 | switch (c) | |
113 | { | |
114 | case '/': | |
115 | c = getc(finput); | |
116 | if (c != '*' && c != '/') | |
117 | { | |
118 | warn(_("unexpected `/' found and ignored")); | |
119 | break; | |
120 | } | |
121 | cplus_comment = (c == '/'); | |
122 | ||
123 | c = getc(finput); | |
124 | ||
125 | inside = 1; | |
126 | while (inside) | |
127 | { | |
128 | if (!cplus_comment && c == '*') | |
129 | { | |
130 | while (c == '*') | |
131 | c = getc(finput); | |
132 | ||
133 | if (c == '/') | |
134 | { | |
135 | inside = 0; | |
136 | c = getc(finput); | |
137 | } | |
138 | } | |
139 | else if (c == '\n') | |
140 | { | |
141 | lineno++; | |
142 | if (cplus_comment) | |
143 | inside = 0; | |
144 | c = getc(finput); | |
145 | } | |
146 | else if (c == EOF) | |
147 | fatal(_("unterminated comment")); | |
148 | else | |
149 | c = getc(finput); | |
150 | } | |
151 | ||
152 | break; | |
153 | ||
154 | case '\n': | |
155 | lineno++; | |
156 | ||
157 | case ' ': | |
158 | case '\t': | |
159 | case '\f': | |
160 | c = getc(finput); | |
161 | break; | |
162 | ||
163 | default: | |
164 | return (c); | |
165 | } | |
166 | } | |
167 | } | |
168 | ||
/* Like getc (F), except that hitting end of file is reported
   through fatal.  */
int
safegetc (FILE *f)
{
  register int ch = getc (f);

  if (ch != EOF)
    return ch;

  fatal (_("Unexpected end of file"));
  return ch;
}
178 | ||
179 | /* read one literal character from finput. process \ escapes. | |
180 | append the normalized string version of the char to *pp. | |
181 | assign the character code to *pcode | |
182 | return 1 unless the character is an unescaped `term' or \n | |
183 | report error for \n | |
184 | */ | |
185 | int | |
186 | literalchar (char **pp, int *pcode, char term) | |
187 | { | |
188 | register int c; | |
189 | register char *p; | |
190 | register int code; | |
191 | int wasquote = 0; | |
192 | ||
193 | c = safegetc(finput); | |
194 | if (c == '\n') | |
195 | { | |
196 | warn(_("unescaped newline in constant")); | |
197 | ungetc(c, finput); | |
198 | code = '?'; | |
199 | wasquote = 1; | |
200 | } | |
201 | else if (c != '\\') | |
202 | { | |
203 | code = c; | |
204 | if (c == term) | |
205 | wasquote = 1; | |
206 | } | |
207 | else | |
208 | { | |
209 | c = safegetc(finput); | |
210 | if (c == 't') code = '\t'; | |
211 | else if (c == 'n') code = '\n'; | |
212 | else if (c == 'a') code = '\007'; | |
213 | else if (c == 'r') code = '\r'; | |
214 | else if (c == 'f') code = '\f'; | |
215 | else if (c == 'b') code = '\b'; | |
216 | else if (c == 'v') code = '\013'; | |
217 | else if (c == '\\') code = '\\'; | |
218 | else if (c == '\'') code = '\''; | |
219 | else if (c == '\"') code = '\"'; | |
220 | else if (c <= '7' && c >= '0') | |
221 | { | |
222 | code = 0; | |
223 | while (c <= '7' && c >= '0') | |
224 | { | |
225 | code = (code * 8) + (c - '0'); | |
226 | if (code >= 256 || code < 0) | |
227 | { | |
228 | warni(_("octal value outside range 0...255: `\\%o'"), code); | |
229 | code &= 0xFF; | |
230 | break; | |
231 | } | |
232 | c = safegetc(finput); | |
233 | } | |
234 | ungetc(c, finput); | |
235 | } | |
236 | else if (c == 'x') | |
237 | { | |
238 | c = safegetc(finput); | |
239 | code = 0; | |
240 | while (1) | |
241 | { | |
242 | if (c >= '0' && c <= '9') | |
243 | code *= 16, code += c - '0'; | |
244 | else if (c >= 'a' && c <= 'f') | |
245 | code *= 16, code += c - 'a' + 10; | |
246 | else if (c >= 'A' && c <= 'F') | |
247 | code *= 16, code += c - 'A' + 10; | |
248 | else | |
249 | break; | |
250 | if (code >= 256 || code<0) | |
251 | { | |
252 | warni(_("hexadecimal value above 255: `\\x%x'"), code); | |
253 | code &= 0xFF; | |
254 | break; | |
255 | } | |
256 | c = safegetc(finput); | |
257 | } | |
258 | ungetc(c, finput); | |
259 | } | |
260 | else | |
261 | { | |
262 | warns (_("unknown escape sequence: `\\' followed by `%s'"), | |
263 | printable_version(c)); | |
264 | code = '?'; | |
265 | } | |
266 | } /* has \ */ | |
267 | ||
268 | /* now fill token_buffer with the canonical name for this character | |
269 | as a literal token. Do not use what the user typed, | |
270 | so that `\012' and `\n' can be interchangeable. */ | |
271 | ||
272 | p = *pp; | |
273 | if (code == '\\') {*p++ = '\\'; *p++ = '\\';} | |
274 | else if (code == '\'') {*p++ = '\\'; *p++ = '\'';} | |
275 | else if (code == '\"') {*p++ = '\\'; *p++ = '\"';} | |
276 | else if (code >= 040 && code < 0177) | |
277 | *p++ = code; | |
278 | else if (code == '\t') {*p++ = '\\'; *p++ = 't';} | |
279 | else if (code == '\n') {*p++ = '\\'; *p++ = 'n';} | |
280 | else if (code == '\r') {*p++ = '\\'; *p++ = 'r';} | |
281 | else if (code == '\v') {*p++ = '\\'; *p++ = 'v';} | |
282 | else if (code == '\b') {*p++ = '\\'; *p++ = 'b';} | |
283 | else if (code == '\f') {*p++ = '\\'; *p++ = 'f';} | |
284 | else | |
285 | { | |
286 | *p++ = '\\'; | |
287 | *p++ = code / 0100 + '0'; | |
288 | *p++ = ((code / 010) & 07) + '0'; | |
289 | *p++ = (code & 07) + '0'; | |
290 | } | |
291 | *pp = p; | |
292 | *pcode = code; | |
293 | return ! wasquote; | |
294 | } | |
295 | ||
296 | ||
297 | void | |
298 | unlex (int token) | |
299 | { | |
300 | unlexed = token; | |
301 | unlexed_symval = symval; | |
302 | } | |
303 | ||
304 | ||
305 | int | |
306 | lex (void) | |
307 | { | |
308 | register int c; | |
309 | char *p; | |
310 | ||
311 | if (unlexed >= 0) | |
312 | { | |
313 | symval = unlexed_symval; | |
314 | c = unlexed; | |
315 | unlexed = -1; | |
316 | return (c); | |
317 | } | |
318 | ||
319 | c = skip_white_space(); | |
320 | *token_buffer = c; /* for error messages (token buffer always valid) */ | |
321 | token_buffer[1] = 0; | |
322 | ||
323 | switch (c) | |
324 | { | |
325 | case EOF: | |
326 | strcpy(token_buffer, "EOF"); | |
327 | return (ENDFILE); | |
328 | ||
329 | case 'A': case 'B': case 'C': case 'D': case 'E': | |
330 | case 'F': case 'G': case 'H': case 'I': case 'J': | |
331 | case 'K': case 'L': case 'M': case 'N': case 'O': | |
332 | case 'P': case 'Q': case 'R': case 'S': case 'T': | |
333 | case 'U': case 'V': case 'W': case 'X': case 'Y': | |
334 | case 'Z': | |
335 | case 'a': case 'b': case 'c': case 'd': case 'e': | |
336 | case 'f': case 'g': case 'h': case 'i': case 'j': | |
337 | case 'k': case 'l': case 'm': case 'n': case 'o': | |
338 | case 'p': case 'q': case 'r': case 's': case 't': | |
339 | case 'u': case 'v': case 'w': case 'x': case 'y': | |
340 | case 'z': | |
341 | case '.': case '_': | |
342 | p = token_buffer; | |
343 | while (isalnum(c) || c == '_' || c == '.') | |
344 | { | |
345 | if (p == token_buffer + maxtoken) | |
346 | p = grow_token_buffer(p); | |
347 | ||
348 | *p++ = c; | |
349 | c = getc(finput); | |
350 | } | |
351 | ||
352 | *p = 0; | |
353 | ungetc(c, finput); | |
354 | symval = getsym(token_buffer); | |
355 | return (IDENTIFIER); | |
356 | ||
357 | case '0': case '1': case '2': case '3': case '4': | |
358 | case '5': case '6': case '7': case '8': case '9': | |
359 | { | |
360 | numval = 0; | |
361 | ||
362 | p = token_buffer; | |
363 | while (isdigit(c)) | |
364 | { | |
365 | if (p == token_buffer + maxtoken) | |
366 | p = grow_token_buffer(p); | |
367 | ||
368 | *p++ = c; | |
369 | numval = numval*10 + c - '0'; | |
370 | c = getc(finput); | |
371 | } | |
372 | *p = 0; | |
373 | ungetc(c, finput); | |
374 | return (NUMBER); | |
375 | } | |
376 | ||
377 | case '\'': | |
378 | ||
379 | /* parse the literal token and compute character code in code */ | |
380 | ||
381 | translations = -1; | |
382 | { | |
383 | int code, discode; | |
384 | char discard[10], *dp; | |
385 | ||
386 | p = token_buffer; | |
387 | *p++ = '\''; | |
388 | literalchar(&p, &code, '\''); | |
389 | ||
390 | c = getc(finput); | |
391 | if (c != '\'') | |
392 | { | |
393 | warn(_("use \"...\" for multi-character literal tokens")); | |
394 | while (1) | |
395 | { | |
396 | dp = discard; | |
397 | if (! literalchar(&dp, &discode, '\'')) | |
398 | break; | |
399 | } | |
400 | } | |
401 | *p++ = '\''; | |
402 | *p = 0; | |
403 | symval = getsym(token_buffer); | |
404 | symval->class = STOKEN; | |
405 | if (! symval->user_token_number) | |
406 | symval->user_token_number = code; | |
407 | return (IDENTIFIER); | |
408 | } | |
409 | ||
410 | case '\"': | |
411 | ||
412 | /* parse the literal string token and treat as an identifier */ | |
413 | ||
414 | translations = -1; | |
415 | { | |
416 | int code; /* ignored here */ | |
417 | p = token_buffer; | |
418 | *p++ = '\"'; | |
419 | while (literalchar(&p, &code, '\"')) /* read up to and including " */ | |
420 | { | |
421 | if (p >= token_buffer + maxtoken - 4) | |
422 | p = grow_token_buffer(p); | |
423 | } | |
424 | *p = 0; | |
425 | ||
426 | symval = getsym(token_buffer); | |
427 | symval->class = STOKEN; | |
428 | ||
429 | return (IDENTIFIER); | |
430 | } | |
431 | ||
432 | case ',': | |
433 | return (COMMA); | |
434 | ||
435 | case ':': | |
436 | return (COLON); | |
437 | ||
438 | case ';': | |
439 | return (SEMICOLON); | |
440 | ||
441 | case '|': | |
442 | return (BAR); | |
443 | ||
444 | case '{': | |
445 | return (LEFT_CURLY); | |
446 | ||
447 | case '=': | |
448 | do | |
449 | { | |
450 | c = getc(finput); | |
451 | if (c == '\n') lineno++; | |
452 | } | |
453 | while(c==' ' || c=='\n' || c=='\t'); | |
454 | ||
455 | if (c == '{') | |
456 | { | |
457 | strcpy(token_buffer, "={"); | |
458 | return(LEFT_CURLY); | |
459 | } | |
460 | else | |
461 | { | |
462 | ungetc(c, finput); | |
463 | return(ILLEGAL); | |
464 | } | |
465 | ||
466 | case '<': | |
467 | p = token_buffer; | |
468 | c = getc(finput); | |
469 | while (c != '>') | |
470 | { | |
471 | if (c == EOF) | |
472 | fatal(_("unterminated type name at end of file")); | |
473 | if (c == '\n') | |
474 | { | |
475 | warn(_("unterminated type name")); | |
476 | ungetc(c, finput); | |
477 | break; | |
478 | } | |
479 | ||
480 | if (p == token_buffer + maxtoken) | |
481 | p = grow_token_buffer(p); | |
482 | ||
483 | *p++ = c; | |
484 | c = getc(finput); | |
485 | } | |
486 | *p = 0; | |
487 | return (TYPENAME); | |
488 | ||
489 | ||
490 | case '%': | |
491 | return (parse_percent_token()); | |
492 | ||
493 | default: | |
494 | return (ILLEGAL); | |
495 | } | |
496 | } | |
497 | ||
498 | /* the following table dictates the action taken for the various | |
499 | % directives. A setflag value causes the named flag to be | |
500 | set. A retval action returns the code. | |
501 | */ | |
502 | struct percent_table_struct { | |
503 | char *name; | |
504 | void *setflag; | |
505 | int retval; | |
506 | } percent_table[] = | |
507 | { | |
508 | {"token", NULL, TOKEN}, | |
509 | {"term", NULL, TOKEN}, | |
510 | {"nterm", NULL, NTERM}, | |
511 | {"type", NULL, TYPE}, | |
512 | {"guard", NULL, GUARD}, | |
513 | {"union", NULL, UNION}, | |
514 | {"expect", NULL, EXPECT}, | |
515 | {"thong", NULL, THONG}, | |
516 | {"start", NULL, START}, | |
517 | {"left", NULL, LEFT}, | |
518 | {"right", NULL, RIGHT}, | |
519 | {"nonassoc", NULL, NONASSOC}, | |
520 | {"binary", NULL, NONASSOC}, | |
521 | {"semantic_parser", NULL, SEMANTIC_PARSER}, | |
522 | {"pure_parser", NULL, PURE_PARSER}, | |
523 | {"prec", NULL, PREC}, | |
524 | ||
525 | {"no_lines", &nolinesflag, NOOP}, /* -l */ | |
526 | {"raw", &rawtoknumflag, NOOP}, /* -r */ | |
527 | {"token_table", &toknumflag, NOOP}, /* -k */ | |
528 | ||
529 | #if 0 | |
530 | /* These can be utilized after main is reoganized so | |
531 | open_files() is deferred 'til after read_declarations(). | |
532 | But %{ and %union both put information into files | |
533 | that have to be opened before read_declarations(). | |
534 | */ | |
535 | {"yacc", &fixed_outfiles, NOOP}, /* -y */ | |
536 | {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */ | |
537 | {"defines", &definesflag, NOOP}, /* -d */ | |
538 | {"no_parser", &noparserflag, NOOP}, /* -n */ | |
539 | {"output_file", &spec_outfile, SETOPT}, /* -o */ | |
540 | {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */ | |
541 | {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */ | |
542 | ||
543 | /* These would be acceptable, but they do not affect processing */ | |
544 | {"verbose", &verboseflag, NOOP}, /* -v */ | |
545 | {"debug", &debugflag, NOOP}, /* -t */ | |
546 | /* {"help", <print usage stmt>, NOOP},*/ /* -h */ | |
547 | /* {"version", <print version number> , NOOP},*/ /* -V */ | |
548 | #endif | |
549 | ||
550 | {NULL, NULL, ILLEGAL} | |
551 | }; | |
552 | ||
553 | /* Parse a token which starts with %. | |
554 | Assumes the % has already been read and discarded. */ | |
555 | ||
556 | int | |
557 | parse_percent_token (void) | |
558 | { | |
559 | register int c; | |
560 | register char *p; | |
561 | register struct percent_table_struct *tx; | |
562 | ||
563 | p = token_buffer; | |
564 | c = getc(finput); | |
565 | *p++ = '%'; | |
566 | *p++ = c; /* for error msg */ | |
567 | *p = 0; | |
568 | ||
569 | switch (c) | |
570 | { | |
571 | case '%': | |
572 | return (TWO_PERCENTS); | |
573 | ||
574 | case '{': | |
575 | return (PERCENT_LEFT_CURLY); | |
576 | ||
577 | case '<': | |
578 | return (LEFT); | |
579 | ||
580 | case '>': | |
581 | return (RIGHT); | |
582 | ||
583 | case '2': | |
584 | return (NONASSOC); | |
585 | ||
586 | case '0': | |
587 | return (TOKEN); | |
588 | ||
589 | case '=': | |
590 | return (PREC); | |
591 | } | |
592 | if (!isalpha(c)) | |
593 | return (ILLEGAL); | |
594 | ||
595 | p = token_buffer; | |
596 | *p++ = '%'; | |
597 | while (isalpha(c) || c == '_' || c == '-') | |
598 | { | |
599 | if (p == token_buffer + maxtoken) | |
600 | p = grow_token_buffer(p); | |
601 | ||
602 | if (c == '-') c = '_'; | |
603 | *p++ = c; | |
604 | c = getc(finput); | |
605 | } | |
606 | ||
607 | ungetc(c, finput); | |
608 | ||
609 | *p = 0; | |
610 | ||
611 | /* table lookup % directive */ | |
612 | for (tx = percent_table; tx->name; tx++) | |
613 | if (strcmp(token_buffer+1, tx->name) == 0) | |
614 | break; | |
615 | if (tx->retval == SETOPT) | |
616 | { | |
617 | *((char **)(tx->setflag)) = optarg; | |
618 | return NOOP; | |
619 | } | |
620 | if (tx->setflag) | |
621 | { | |
622 | *((int *)(tx->setflag)) = 1; | |
623 | return NOOP; | |
624 | } | |
625 | return tx->retval; | |
626 | } |