]> git.saurik.com Git - bison.git/blob - src/lex.c
* src/gram.h (rprec, rprecsym, rassoc): Remove, now part of...
[bison.git] / src / lex.c
1 /* Token-reader for Bison's input parser,
2 Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
3
4 This file is part of Bison, the GNU Compiler Compiler.
5
6 Bison is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 Bison is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Bison; see the file COPYING. If not, write to
18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
20
21 #include "system.h"
22 #include "getargs.h"
23 #include "files.h"
24 #include "symtab.h"
25 #include "lex.h"
26 #include "complain.h"
27 #include "gram.h"
28 #include "quote.h"
29
30 /* Buffer for storing the current token. */
31 struct obstack token_obstack;
32 const char *token_buffer = NULL;
33
34 bucket *symval = NULL;
35 int numval;
36
37 /* A token to be reread, see unlex and lex. */
38 static token_t unlexed = tok_undef;
39 static bucket *unlexed_symval = NULL;
40 static const char *unlexed_token_buffer = NULL;
41
42 void
43 lex_init (void)
44 {
45 obstack_init (&token_obstack);
46 unlexed = tok_undef;
47 }
48
49
50 void
51 lex_free (void)
52 {
53 obstack_free (&token_obstack, NULL);
54 }
55
56
57 int
58 skip_white_space (void)
59 {
60 int c;
61 int inside;
62
63 c = getc (finput);
64
65 for (;;)
66 {
67 int cplus_comment;
68
69 switch (c)
70 {
71 case '/':
72 /* FIXME: Should probably be merged with copy_comment. */
73 c = getc (finput);
74 if (c != '*' && c != '/')
75 {
76 complain (_("unexpected `/' found and ignored"));
77 break;
78 }
79 cplus_comment = (c == '/');
80
81 c = getc (finput);
82
83 inside = 1;
84 while (inside)
85 {
86 if (!cplus_comment && c == '*')
87 {
88 while (c == '*')
89 c = getc (finput);
90
91 if (c == '/')
92 {
93 inside = 0;
94 c = getc (finput);
95 }
96 }
97 else if (c == '\n')
98 {
99 lineno++;
100 if (cplus_comment)
101 inside = 0;
102 c = getc (finput);
103 }
104 else if (c == EOF)
105 fatal (_("unterminated comment"));
106 else
107 c = getc (finput);
108 }
109
110 break;
111
112 case '\n':
113 lineno++;
114
115 case ' ':
116 case '\t':
117 case '\f':
118 c = getc (finput);
119 break;
120
121 default:
122 return c;
123 }
124 }
125 }
126
127
/* Read one character from F with getc and return it.  Hitting the
   end of the file is reported through fatal.  */

static int
xgetc (FILE *f)
{
  int c = getc (f);

  if (c != EOF)
    return c;

  fatal (_("unexpected end of file"));
  return c;
}
140
141
142 /*------------------------------------------------------------------.
143 | Read one literal character from finput. Process \ escapes. |
144 | Append the normalized string version of the char to OUT. Assign |
145 | the character code to *PCODE. Return 1 unless the character is an |
146 | unescaped `term' or \n report error for \n. |
147 `------------------------------------------------------------------*/
148
149 /* FIXME: We could directly work in the obstack, but that would make
150 it more difficult to move to quotearg some day. So for the time
151 being, I prefer have literalchar behave like quotearg, and change
152 my mind later if I was wrong. */
153
154 static int
155 literalchar (struct obstack *out, int *pcode, char term)
156 {
157 int c;
158 char buf[4096];
159 char *cp;
160 int code;
161 int wasquote = 0;
162
163 c = xgetc (finput);
164 if (c == '\n')
165 {
166 complain (_("unescaped newline in constant"));
167 ungetc (c, finput);
168 code = '?';
169 wasquote = 1;
170 }
171 else if (c != '\\')
172 {
173 code = c;
174 if (c == term)
175 wasquote = 1;
176 }
177 else
178 {
179 c = xgetc (finput);
180 if (c == 't')
181 code = '\t';
182 else if (c == 'n')
183 code = '\n';
184 else if (c == 'a')
185 code = '\007';
186 else if (c == 'r')
187 code = '\r';
188 else if (c == 'f')
189 code = '\f';
190 else if (c == 'b')
191 code = '\b';
192 else if (c == 'v')
193 code = '\013';
194 else if (c == '\\')
195 code = '\\';
196 else if (c == '\'')
197 code = '\'';
198 else if (c == '\"')
199 code = '\"';
200 else if (c <= '7' && c >= '0')
201 {
202 code = 0;
203 while (c <= '7' && c >= '0')
204 {
205 code = (code * 8) + (c - '0');
206 if (code >= 256 || code < 0)
207 {
208 complain (_("octal value outside range 0...255: `\\%o'"),
209 code);
210 code &= 0xFF;
211 break;
212 }
213 c = xgetc (finput);
214 }
215 ungetc (c, finput);
216 }
217 else if (c == 'x')
218 {
219 c = xgetc (finput);
220 code = 0;
221 while (1)
222 {
223 if (c >= '0' && c <= '9')
224 code *= 16, code += c - '0';
225 else if (c >= 'a' && c <= 'f')
226 code *= 16, code += c - 'a' + 10;
227 else if (c >= 'A' && c <= 'F')
228 code *= 16, code += c - 'A' + 10;
229 else
230 break;
231 if (code >= 256 || code < 0)
232 {
233 complain (_("hexadecimal value above 255: `\\x%x'"), code);
234 code &= 0xFF;
235 break;
236 }
237 c = xgetc (finput);
238 }
239 ungetc (c, finput);
240 }
241 else
242 {
243 char badchar [] = "c";
244 badchar[0] = c;
245 complain (_("unknown escape sequence: `\\' followed by `%s'"),
246 quote (badchar));
247 code = '?';
248 }
249 } /* has \ */
250
251 /* now fill BUF with the canonical name for this character as a
252 literal token. Do not use what the user typed, so that `\012'
253 and `\n' can be interchangeable. */
254
255 cp = buf;
256 if (code == term && wasquote)
257 *cp++ = code;
258 else if (code == '\\')
259 {
260 *cp++ = '\\';
261 *cp++ = '\\';
262 }
263 else if (code == '\'')
264 {
265 *cp++ = '\\';
266 *cp++ = '\'';
267 }
268 else if (code == '\"')
269 {
270 *cp++ = '\\';
271 *cp++ = '\"';
272 }
273 else if (code >= 040 && code < 0177)
274 *cp++ = code;
275 else if (code == '\t')
276 {
277 *cp++ = '\\';
278 *cp++ = 't';
279 }
280 else if (code == '\n')
281 {
282 *cp++ = '\\';
283 *cp++ = 'n';
284 }
285 else if (code == '\r')
286 {
287 *cp++ = '\\';
288 *cp++ = 'r';
289 }
290 else if (code == '\v')
291 {
292 *cp++ = '\\';
293 *cp++ = 'v';
294 }
295 else if (code == '\b')
296 {
297 *cp++ = '\\';
298 *cp++ = 'b';
299 }
300 else if (code == '\f')
301 {
302 *cp++ = '\\';
303 *cp++ = 'f';
304 }
305 else
306 {
307 *cp++ = '\\';
308 *cp++ = code / 0100 + '0';
309 *cp++ = ((code / 010) & 07) + '0';
310 *cp++ = (code & 07) + '0';
311 }
312 *cp = '\0';
313
314 if (out)
315 obstack_sgrow (out, buf);
316 *pcode = code;
317 return !wasquote;
318 }
319
320
321 void
322 unlex (token_t token)
323 {
324 unlexed = token;
325 unlexed_token_buffer = token_buffer;
326 unlexed_symval = symval;
327 }
328
329 /*-----------------------------------------------------------------.
330 | We just read `<' from FIN. Store in TOKEN_BUFFER, the type name |
331 | specified between the `<...>'. |
332 `-----------------------------------------------------------------*/
333
334 void
335 read_type_name (FILE *fin)
336 {
337 int c = getc (fin);
338
339 while (c != '>')
340 {
341 if (c == EOF)
342 fatal (_("unterminated type name at end of file"));
343 if (c == '\n')
344 {
345 complain (_("unterminated type name"));
346 ungetc (c, fin);
347 break;
348 }
349
350 obstack_1grow (&token_obstack, c);
351 c = getc (fin);
352 }
353 obstack_1grow (&token_obstack, '\0');
354 token_buffer = obstack_finish (&token_obstack);
355 }
356
357
358 token_t
359 lex (void)
360 {
361 int c;
362
363 /* Just to make sure. */
364 token_buffer = NULL;
365
366 if (unlexed != tok_undef)
367 {
368 token_t res = unlexed;
369 symval = unlexed_symval;
370 token_buffer = unlexed_token_buffer;
371 unlexed = tok_undef;
372 return res;
373 }
374
375 c = skip_white_space ();
376
377 switch (c)
378 {
379 case EOF:
380 token_buffer = "EOF";
381 return tok_eof;
382
383 case 'A': case 'B': case 'C': case 'D': case 'E':
384 case 'F': case 'G': case 'H': case 'I': case 'J':
385 case 'K': case 'L': case 'M': case 'N': case 'O':
386 case 'P': case 'Q': case 'R': case 'S': case 'T':
387 case 'U': case 'V': case 'W': case 'X': case 'Y':
388 case 'Z':
389 case 'a': case 'b': case 'c': case 'd': case 'e':
390 case 'f': case 'g': case 'h': case 'i': case 'j':
391 case 'k': case 'l': case 'm': case 'n': case 'o':
392 case 'p': case 'q': case 'r': case 's': case 't':
393 case 'u': case 'v': case 'w': case 'x': case 'y':
394 case 'z':
395 case '.': case '_':
396
397 while (isalnum (c) || c == '_' || c == '.')
398 {
399 obstack_1grow (&token_obstack, c);
400 c = getc (finput);
401 }
402 obstack_1grow (&token_obstack, '\0');
403 token_buffer = obstack_finish (&token_obstack);
404 ungetc (c, finput);
405 symval = getsym (token_buffer);
406 return tok_identifier;
407
408 case '0': case '1': case '2': case '3': case '4':
409 case '5': case '6': case '7': case '8': case '9':
410 {
411 numval = 0;
412
413 while (isdigit (c))
414 {
415 obstack_1grow (&token_obstack, c);
416 numval = numval * 10 + c - '0';
417 c = getc (finput);
418 }
419 obstack_1grow (&token_obstack, '\0');
420 token_buffer = obstack_finish (&token_obstack);
421 ungetc (c, finput);
422 return tok_number;
423 }
424
425 case '\'':
426 /* parse the literal token and compute character code in code */
427
428 {
429 int code;
430
431 obstack_1grow (&token_obstack, '\'');
432 literalchar (&token_obstack, &code, '\'');
433
434 c = getc (finput);
435 if (c != '\'')
436 {
437 int discode;
438 complain (_("use \"...\" for multi-character literal tokens"));
439 while (1)
440 if (!literalchar (0, &discode, '\''))
441 break;
442 }
443 obstack_1grow (&token_obstack, '\'');
444 obstack_1grow (&token_obstack, '\0');
445 token_buffer = obstack_finish (&token_obstack);
446 symval = getsym (token_buffer);
447 symval->class = token_sym;
448 if (symval->user_token_number == SUNDEF)
449 symval->user_token_number = code;
450 return tok_identifier;
451 }
452
453 case '\"':
454 /* parse the literal string token and treat as an identifier */
455
456 {
457 int code; /* ignored here */
458
459 obstack_1grow (&token_obstack, '\"');
460 /* Read up to and including ". */
461 while (literalchar (&token_obstack, &code, '\"'))
462 /* nothing */;
463 obstack_1grow (&token_obstack, '\0');
464 token_buffer = obstack_finish (&token_obstack);
465
466 symval = getsym (token_buffer);
467 symval->class = token_sym;
468
469 return tok_identifier;
470 }
471
472 case ',':
473 token_buffer = ",";
474 return tok_comma;
475
476 case ':':
477 token_buffer = ":";
478 return tok_colon;
479
480 case ';':
481 token_buffer = ";";
482 return tok_semicolon;
483
484 case '|':
485 token_buffer = "|";
486 return tok_bar;
487
488 case '{':
489 token_buffer = "{";
490 return tok_left_curly;
491
492 case '=':
493 obstack_1grow (&token_obstack, c);
494 do
495 {
496 c = getc (finput);
497 obstack_1grow (&token_obstack, c);
498 if (c == '\n')
499 lineno++;
500 }
501 while (c == ' ' || c == '\n' || c == '\t');
502 obstack_1grow (&token_obstack, '\0');
503 token_buffer = obstack_finish (&token_obstack);
504
505 if (c == '{')
506 {
507 return tok_left_curly;
508 }
509 else
510 {
511 ungetc (c, finput);
512 return tok_illegal;
513 }
514
515 case '<':
516 read_type_name (finput);
517 return tok_typename;
518
519 case '%':
520 return parse_percent_token ();
521
522 default:
523 obstack_1grow (&token_obstack, c);
524 obstack_1grow (&token_obstack, '\0');
525 token_buffer = obstack_finish (&token_obstack);
526 return tok_illegal;
527 }
528 }
529
530 /* the following table dictates the action taken for the various %
531 directives. A set_flag value causes the named flag to be set. A
532 retval action returns the code. */
533 struct percent_table_struct
534 {
535 const char *name;
536 void *set_flag;
537 token_t retval;
538 };
539
540 struct percent_table_struct percent_table[] =
541 {
542 { "token", NULL, tok_token },
543 { "term", NULL, tok_token },
544 { "nterm", NULL, tok_nterm },
545 { "type", NULL, tok_type },
546 { "guard", NULL, tok_guard },
547 { "union", NULL, tok_union },
548 { "expect", NULL, tok_expect },
549 { "thong", NULL, tok_thong },
550 { "start", NULL, tok_start },
551 { "left", NULL, tok_left },
552 { "right", NULL, tok_right },
553 { "nonassoc", NULL, tok_nonassoc },
554 { "binary", NULL, tok_nonassoc },
555 { "prec", NULL, tok_prec },
556 { "locations", &locations_flag, tok_intopt }, /* -l */
557 { "no-lines", &no_lines_flag, tok_intopt }, /* -l */
558 { "raw", NULL, tok_obsolete }, /* -r */
559 { "token-table", &token_table_flag, tok_intopt }, /* -k */
560 { "yacc", &yacc_flag, tok_intopt }, /* -y */
561 { "fixed-output-files",&yacc_flag, tok_intopt }, /* -y */
562 { "defines", &defines_flag, tok_intopt }, /* -d */
563 { "no-parser", &no_parser_flag, tok_intopt }, /* -n */
564 { "graph", &graph_flag, tok_intopt }, /* -g */
565
566 /* FIXME: semantic parsers which will output an `include' of an
567 output file: be sure that the name included is indeed the name of
568 the output file. */
569 { "output", &spec_outfile, tok_stropt }, /* -o */
570 { "file-prefix", &spec_file_prefix, tok_stropt }, /* -b */
571 { "name-prefix", &spec_name_prefix, tok_stropt }, /* -p */
572
573 { "verbose", &verbose_flag, tok_intopt }, /* -v */
574 { "debug", &debug_flag, tok_intopt }, /* -t */
575 { "semantic-parser", &semantic_parser, tok_intopt },
576 { "pure-parser", &pure_parser, tok_intopt },
577
578 { NULL, NULL, tok_illegal}
579 };
580
581 /* Parse a token which starts with %.
582 Assumes the % has already been read and discarded. */
583
584 token_t
585 parse_percent_token (void)
586 {
587 struct percent_table_struct *tx = NULL;
588 const char *arg = NULL;
589 /* Where the ARG was found in token_buffer. */
590 size_t arg_offset = 0;
591
592 int c = getc (finput);
593
594 switch (c)
595 {
596 case '%':
597 return tok_two_percents;
598
599 case '{':
600 return tok_percent_left_curly;
601
602 /* FIXME: Who the heck are those 5 guys!?! `%<' = `%left'!!!
603 Let's ask for there removal. */
604 case '<':
605 return tok_left;
606
607 case '>':
608 return tok_right;
609
610 case '2':
611 return tok_nonassoc;
612
613 case '0':
614 return tok_token;
615
616 case '=':
617 return tok_prec;
618 }
619
620 if (!isalpha (c))
621 return tok_illegal;
622
623 obstack_1grow (&token_obstack, '%');
624 while (isalpha (c) || c == '_' || c == '-')
625 {
626 if (c == '_')
627 c = '-';
628 obstack_1grow (&token_obstack, c);
629 c = getc (finput);
630 }
631
632 /* %DIRECTIVE="ARG". Separate into
633 TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'.
634 This is a bit hackish, but once we move to a Bison parser,
635 things will be cleaned up. */
636 if (c == '=')
637 {
638 /* End of the directive. We skip the `='. */
639 obstack_1grow (&token_obstack, '\0');
640 /* Fetch the ARG if present. */
641 c = getc (finput);
642 if (c == '"')
643 {
644 int code;
645 arg_offset = obstack_object_size (&token_obstack);
646 /* Read up to and including `"'. Do not append the closing
647 `"' in the output: it's not part of the ARG. */
648 while (literalchar (NULL, &code, '"'))
649 obstack_1grow (&token_obstack, code);
650 }
651 /* else: should be an error. */
652 }
653 else
654 ungetc (c, finput);
655
656 obstack_1grow (&token_obstack, '\0');
657 token_buffer = obstack_finish (&token_obstack);
658 if (arg_offset)
659 arg = token_buffer + arg_offset;
660
661 /* table lookup % directive */
662 for (tx = percent_table; tx->name; tx++)
663 if (strcmp (token_buffer + 1, tx->name) == 0)
664 break;
665
666 if (arg && tx->retval != tok_stropt)
667 fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg));
668
669 switch (tx->retval)
670 {
671 case tok_stropt:
672 assert (tx->set_flag);
673 if (arg)
674 {
675 /* Keep only the first assignment: command line options have
676 already been processed, and we want them to have
677 precedence. Side effect: if this %-option is used
678 several times, only the first is honored. Bah. */
679 if (!*((char **) (tx->set_flag)))
680 *((char **) (tx->set_flag)) = xstrdup (arg);
681 }
682 else
683 fatal (_("`%s' requires an argument"), token_buffer);
684 return tok_noop;
685 break;
686
687 case tok_intopt:
688 assert (tx->set_flag);
689 *((int *) (tx->set_flag)) = 1;
690 return tok_noop;
691 break;
692
693 case tok_obsolete:
694 fatal (_("`%s' is no longer supported"), token_buffer);
695 return tok_noop;
696 break;
697
698 default:
699 return tx->retval;
700 break;
701 }
702 abort ();
703 }