]> git.saurik.com Git - bison.git/blame - src/lex.c
Update.
[bison.git] / src / lex.c
CommitLineData
40675e7c 1/* Token-reader for Bison's input parser,
62ab6972 2 Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
40675e7c 3
a0f6b076 4 This file is part of Bison, the GNU Compiler Compiler.
40675e7c 5
a0f6b076
AD
6 Bison is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
40675e7c 10
a0f6b076
AD
11 Bison is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
40675e7c 15
a0f6b076
AD
16 You should have received a copy of the GNU General Public License
17 along with Bison; see the file COPYING. If not, write to
18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
40675e7c 20
#include "system.h"
#include <limits.h>
#include "getargs.h"
#include "files.h"
#include "getopt.h"             /* for optarg */
#include "symtab.h"
#include "options.h"
#include "lex.h"
#include "xalloc.h"
#include "complain.h"
#include "gram.h"
#include "quote.h"
40675e7c
DM
32
33/* Buffer for storing the current token. */
f17bcd1f 34struct obstack token_obstack;
b0ce6046 35const char *token_buffer = NULL;
40675e7c
DM
36
37bucket *symval;
38int numval;
39
40static int unlexed; /* these two describe a token to be reread */
41static bucket *unlexed_symval; /* by the next call to lex */
42
43
44void
d2729d44 45init_lex (void)
40675e7c 46{
f17bcd1f 47 obstack_init (&token_obstack);
40675e7c
DM
48 unlexed = -1;
49}
50
51
40675e7c 52int
d2729d44 53skip_white_space (void)
40675e7c 54{
abadc117
AD
55 int c;
56 int inside;
40675e7c 57
abadc117 58 c = getc (finput);
40675e7c
DM
59
60 for (;;)
61 {
62 int cplus_comment;
63
64 switch (c)
65 {
66 case '/':
79282c5a 67 /* FIXME: Should probably be merged with copy_comment. */
abadc117 68 c = getc (finput);
a083fbbf 69 if (c != '*' && c != '/')
a44c2277 70 {
a0f6b076 71 complain (_("unexpected `/' found and ignored"));
a44c2277
RS
72 break;
73 }
40675e7c
DM
74 cplus_comment = (c == '/');
75
abadc117 76 c = getc (finput);
40675e7c
DM
77
78 inside = 1;
79 while (inside)
80 {
81 if (!cplus_comment && c == '*')
82 {
83 while (c == '*')
abadc117 84 c = getc (finput);
40675e7c
DM
85
86 if (c == '/')
87 {
88 inside = 0;
abadc117 89 c = getc (finput);
40675e7c
DM
90 }
91 }
92 else if (c == '\n')
93 {
94 lineno++;
95 if (cplus_comment)
96 inside = 0;
abadc117 97 c = getc (finput);
40675e7c
DM
98 }
99 else if (c == EOF)
a0f6b076 100 fatal (_("unterminated comment"));
40675e7c 101 else
abadc117 102 c = getc (finput);
40675e7c
DM
103 }
104
105 break;
106
107 case '\n':
108 lineno++;
109
110 case ' ':
111 case '\t':
112 case '\f':
abadc117 113 c = getc (finput);
40675e7c
DM
114 break;
115
116 default:
36281465 117 return c;
40675e7c
DM
118 }
119 }
120}
121
79282c5a
AD
122
/*--------------------------------------------------------------.
| Read one character from F.  Hitting EOF is reported through   |
| fatal; otherwise the character read is returned.              |
`--------------------------------------------------------------*/

static int
xgetc (FILE *f)
{
  int ch = getc (f);

  if (ch != EOF)
    return ch;

  fatal (_("unexpected end of file"));
  return ch;
}
135
abadc117
AD
136
137/*------------------------------------------------------------------.
138| Read one literal character from finput. Process \ escapes. |
f17bcd1f 139| Append the normalized string version of the char to OUT. Assign |
abadc117 140| the character code to *PCODE. Return 1 unless the character is an |
f17bcd1f 141| unescaped `term' or \n report error for \n. |
abadc117
AD
142`------------------------------------------------------------------*/
143
f17bcd1f
AD
144/* FIXME: We could directly work in the obstack, but that would make
145 it more difficult to move to quotearg some day. So for the time
146 being, I prefer have literalchar behave like quotearg, and change
147 my mind later if I was wrong. */
148
4a120d45 149static int
f17bcd1f 150literalchar (struct obstack *out, int *pcode, char term)
a44c2277 151{
abadc117 152 int c;
f17bcd1f
AD
153 char buf[4096];
154 char *cp;
abadc117 155 int code;
a44c2277
RS
156 int wasquote = 0;
157
abadc117 158 c = xgetc (finput);
a083fbbf 159 if (c == '\n')
a44c2277 160 {
a0f6b076 161 complain (_("unescaped newline in constant"));
abadc117 162 ungetc (c, finput);
a44c2277
RS
163 code = '?';
164 wasquote = 1;
165 }
166 else if (c != '\\')
167 {
168 code = c;
a083fbbf 169 if (c == term)
a44c2277
RS
170 wasquote = 1;
171 }
172 else
173 {
abadc117
AD
174 c = xgetc (finput);
175 if (c == 't')
176 code = '\t';
177 else if (c == 'n')
178 code = '\n';
179 else if (c == 'a')
180 code = '\007';
181 else if (c == 'r')
182 code = '\r';
183 else if (c == 'f')
184 code = '\f';
185 else if (c == 'b')
186 code = '\b';
187 else if (c == 'v')
188 code = '\013';
189 else if (c == '\\')
190 code = '\\';
191 else if (c == '\'')
192 code = '\'';
193 else if (c == '\"')
194 code = '\"';
a44c2277
RS
195 else if (c <= '7' && c >= '0')
196 {
197 code = 0;
198 while (c <= '7' && c >= '0')
199 {
200 code = (code * 8) + (c - '0');
201 if (code >= 256 || code < 0)
202 {
a0f6b076
AD
203 complain (_("octal value outside range 0...255: `\\%o'"),
204 code);
a44c2277
RS
205 code &= 0xFF;
206 break;
207 }
abadc117 208 c = xgetc (finput);
a44c2277 209 }
abadc117 210 ungetc (c, finput);
a44c2277
RS
211 }
212 else if (c == 'x')
213 {
abadc117 214 c = xgetc (finput);
a44c2277
RS
215 code = 0;
216 while (1)
217 {
218 if (c >= '0' && c <= '9')
abadc117 219 code *= 16, code += c - '0';
a44c2277 220 else if (c >= 'a' && c <= 'f')
abadc117 221 code *= 16, code += c - 'a' + 10;
a44c2277 222 else if (c >= 'A' && c <= 'F')
abadc117 223 code *= 16, code += c - 'A' + 10;
a083fbbf 224 else
a44c2277 225 break;
abadc117 226 if (code >= 256 || code < 0)
a44c2277 227 {
abadc117 228 complain (_("hexadecimal value above 255: `\\x%x'"), code);
a44c2277
RS
229 code &= 0xFF;
230 break;
231 }
abadc117 232 c = xgetc (finput);
a44c2277 233 }
abadc117 234 ungetc (c, finput);
a44c2277
RS
235 }
236 else
237 {
b0ce6046
AD
238 char badchar [] = "c";
239 badchar[0] = c;
a0f6b076 240 complain (_("unknown escape sequence: `\\' followed by `%s'"),
b0ce6046 241 quote (badchar));
a44c2277
RS
242 code = '?';
243 }
abadc117 244 } /* has \ */
a44c2277 245
f17bcd1f
AD
246 /* now fill BUF with the canonical name for this character as a
247 literal token. Do not use what the user typed, so that `\012'
248 and `\n' can be interchangeable. */
a44c2277 249
f17bcd1f 250 cp = buf;
e5335b74 251 if (code == term && wasquote)
f17bcd1f 252 *cp++ = code;
abadc117
AD
253 else if (code == '\\')
254 {
f17bcd1f
AD
255 *cp++ = '\\';
256 *cp++ = '\\';
abadc117
AD
257 }
258 else if (code == '\'')
259 {
f17bcd1f
AD
260 *cp++ = '\\';
261 *cp++ = '\'';
abadc117
AD
262 }
263 else if (code == '\"')
264 {
f17bcd1f
AD
265 *cp++ = '\\';
266 *cp++ = '\"';
abadc117 267 }
5ce94c29 268 else if (code >= 040 && code < 0177)
f17bcd1f 269 *cp++ = code;
abadc117
AD
270 else if (code == '\t')
271 {
f17bcd1f
AD
272 *cp++ = '\\';
273 *cp++ = 't';
abadc117
AD
274 }
275 else if (code == '\n')
276 {
f17bcd1f
AD
277 *cp++ = '\\';
278 *cp++ = 'n';
abadc117
AD
279 }
280 else if (code == '\r')
281 {
f17bcd1f
AD
282 *cp++ = '\\';
283 *cp++ = 'r';
abadc117
AD
284 }
285 else if (code == '\v')
286 {
f17bcd1f
AD
287 *cp++ = '\\';
288 *cp++ = 'v';
abadc117
AD
289 }
290 else if (code == '\b')
291 {
f17bcd1f
AD
292 *cp++ = '\\';
293 *cp++ = 'b';
abadc117
AD
294 }
295 else if (code == '\f')
296 {
f17bcd1f
AD
297 *cp++ = '\\';
298 *cp++ = 'f';
abadc117 299 }
a44c2277
RS
300 else
301 {
f17bcd1f
AD
302 *cp++ = '\\';
303 *cp++ = code / 0100 + '0';
304 *cp++ = ((code / 010) & 07) + '0';
305 *cp++ = (code & 07) + '0';
a44c2277 306 }
f17bcd1f
AD
307 *cp = '\0';
308
309 if (out)
310 obstack_sgrow (out, buf);
a44c2277 311 *pcode = code;
abadc117 312 return !wasquote;
a44c2277
RS
313}
314
40675e7c
DM
315
316void
d2729d44 317unlex (int token)
40675e7c
DM
318{
319 unlexed = token;
320 unlexed_symval = symval;
321}
322
f282676b
AD
323/*-----------------------------------------------------------------.
324| We just read `<' from FIN. Store in TOKEN_BUFFER, the type name |
325| specified between the `<...>'. |
326`-----------------------------------------------------------------*/
327
328void
329read_type_name (FILE *fin)
330{
f282676b
AD
331 int c = getc (fin);
332
333 while (c != '>')
334 {
335 if (c == EOF)
336 fatal (_("unterminated type name at end of file"));
337 if (c == '\n')
338 {
339 complain (_("unterminated type name"));
340 ungetc (c, fin);
341 break;
342 }
343
f17bcd1f 344 obstack_1grow (&token_obstack, c);
f282676b
AD
345 c = getc (fin);
346 }
f17bcd1f
AD
347 obstack_1grow (&token_obstack, '\0');
348 token_buffer = obstack_finish (&token_obstack);
f282676b
AD
349}
350
40675e7c 351
511e79b3 352token_t
d2729d44 353lex (void)
40675e7c 354{
abadc117 355 int c;
f17bcd1f
AD
356
357 /* Just to make sure. */
358 token_buffer = NULL;
40675e7c
DM
359
360 if (unlexed >= 0)
361 {
362 symval = unlexed_symval;
363 c = unlexed;
364 unlexed = -1;
36281465 365 return c;
40675e7c
DM
366 }
367
abadc117 368 c = skip_white_space ();
40675e7c
DM
369
370 switch (c)
371 {
372 case EOF:
f17bcd1f 373 token_buffer = "EOF";
511e79b3 374 return tok_eof;
40675e7c 375
abadc117
AD
376 case 'A': case 'B': case 'C': case 'D': case 'E':
377 case 'F': case 'G': case 'H': case 'I': case 'J':
378 case 'K': case 'L': case 'M': case 'N': case 'O':
379 case 'P': case 'Q': case 'R': case 'S': case 'T':
380 case 'U': case 'V': case 'W': case 'X': case 'Y':
40675e7c 381 case 'Z':
abadc117
AD
382 case 'a': case 'b': case 'c': case 'd': case 'e':
383 case 'f': case 'g': case 'h': case 'i': case 'j':
384 case 'k': case 'l': case 'm': case 'n': case 'o':
385 case 'p': case 'q': case 'r': case 's': case 't':
386 case 'u': case 'v': case 'w': case 'x': case 'y':
40675e7c 387 case 'z':
abadc117
AD
388 case '.': case '_':
389
abadc117 390 while (isalnum (c) || c == '_' || c == '.')
40675e7c 391 {
f17bcd1f 392 obstack_1grow (&token_obstack, c);
abadc117 393 c = getc (finput);
40675e7c 394 }
f17bcd1f
AD
395 obstack_1grow (&token_obstack, '\0');
396 token_buffer = obstack_finish (&token_obstack);
abadc117
AD
397 ungetc (c, finput);
398 symval = getsym (token_buffer);
511e79b3 399 return tok_identifier;
40675e7c 400
abadc117
AD
401 case '0': case '1': case '2': case '3': case '4':
402 case '5': case '6': case '7': case '8': case '9':
40675e7c
DM
403 {
404 numval = 0;
405
abadc117 406 while (isdigit (c))
40675e7c 407 {
f17bcd1f 408 obstack_1grow (&token_obstack, c);
abadc117
AD
409 numval = numval * 10 + c - '0';
410 c = getc (finput);
40675e7c 411 }
f17bcd1f
AD
412 obstack_1grow (&token_obstack, '\0');
413 token_buffer = obstack_finish (&token_obstack);
abadc117 414 ungetc (c, finput);
511e79b3 415 return tok_number;
40675e7c
DM
416 }
417
418 case '\'':
40675e7c
DM
419 /* parse the literal token and compute character code in code */
420
a44c2277 421 translations = -1;
40675e7c 422 {
a44c2277 423 int code, discode;
5ce94c29 424
f17bcd1f
AD
425 obstack_1grow (&token_obstack, '\'');
426 literalchar (&token_obstack, &code, '\'');
40675e7c 427
abadc117 428 c = getc (finput);
a44c2277 429 if (c != '\'')
40675e7c 430 {
a0f6b076 431 complain (_("use \"...\" for multi-character literal tokens"));
5ce94c29 432 while (1)
f17bcd1f
AD
433 if (!literalchar (0, &discode, '\''))
434 break;
40675e7c 435 }
f17bcd1f
AD
436 obstack_1grow (&token_obstack, '\'');
437 obstack_1grow (&token_obstack, '\0');
438 token_buffer = obstack_finish (&token_obstack);
abadc117 439 symval = getsym (token_buffer);
d7020c20 440 symval->class = token_sym;
abadc117 441 if (!symval->user_token_number)
a44c2277 442 symval->user_token_number = code;
511e79b3 443 return tok_identifier;
a44c2277 444 }
40675e7c 445
a44c2277 446 case '\"':
a44c2277
RS
447 /* parse the literal string token and treat as an identifier */
448
449 translations = -1;
450 {
abadc117 451 int code; /* ignored here */
f17bcd1f
AD
452
453 obstack_1grow (&token_obstack, '\"');
79282c5a 454 /* Read up to and including ". */
f17bcd1f
AD
455 while (literalchar (&token_obstack, &code, '\"'))
456 /* nothing */;
457 obstack_1grow (&token_obstack, '\0');
458 token_buffer = obstack_finish (&token_obstack);
a44c2277 459
abadc117 460 symval = getsym (token_buffer);
d7020c20 461 symval->class = token_sym;
a44c2277 462
511e79b3 463 return tok_identifier;
40675e7c
DM
464 }
465
466 case ',':
511e79b3 467 return tok_comma;
40675e7c
DM
468
469 case ':':
511e79b3 470 return tok_colon;
40675e7c
DM
471
472 case ';':
511e79b3 473 return tok_semicolon;
40675e7c
DM
474
475 case '|':
511e79b3 476 return tok_bar;
40675e7c
DM
477
478 case '{':
511e79b3 479 return tok_left_curly;
40675e7c
DM
480
481 case '=':
482 do
483 {
abadc117
AD
484 c = getc (finput);
485 if (c == '\n')
486 lineno++;
40675e7c 487 }
abadc117 488 while (c == ' ' || c == '\n' || c == '\t');
40675e7c
DM
489
490 if (c == '{')
a44c2277 491 {
f17bcd1f 492 token_buffer = "={";
511e79b3 493 return tok_left_curly;
a44c2277 494 }
40675e7c
DM
495 else
496 {
abadc117 497 ungetc (c, finput);
511e79b3 498 return tok_illegal;
40675e7c
DM
499 }
500
501 case '<':
f282676b 502 read_type_name (finput);
511e79b3 503 return tok_typename;
a083fbbf 504
40675e7c 505 case '%':
abadc117 506 return parse_percent_token ();
40675e7c
DM
507
508 default:
511e79b3 509 return tok_illegal;
40675e7c
DM
510 }
511}
512
82b6d266
PB
/* Compare LEFT and RIGHT like strcmp, except that `-' and `_' are
   considered to be the same character.  Return zero when the strings
   are equal under that rule, otherwise the (negative or positive)
   difference between the first pair of characters that differ, as
   unsigned chars, after mapping `-' to `_'.  Neither argument may be
   NULL.  No byte past either terminating NUL is read.  */

static int
option_strcmp (const char *left, const char *right)
{
  const unsigned char *l = (const unsigned char *) left;
  const unsigned char *r = (const unsigned char *) right;

  assert (left != NULL && right != NULL);

  for (;; l++, r++)
    {
      /* Fold the two separators onto a single representative.  */
      int lc = (*l == '-') ? '_' : *l;
      int rc = (*r == '-') ? '_' : *r;

      /* Stop at the first difference, or at the common end.  */
      if (lc != rc || lc == '\0')
        return lc - rc;
    }
}
a44c2277
RS
530
531/* Parse a token which starts with %.
532 Assumes the % has already been read and discarded. */
40675e7c
DM
533
534int
d2729d44 535parse_percent_token (void)
40675e7c 536{
abadc117 537 int c;
82b6d266 538 const struct option_table_struct *tx;
40675e7c 539
abadc117 540 c = getc (finput);
40675e7c
DM
541
542 switch (c)
543 {
544 case '%':
511e79b3 545 return tok_two_percents;
40675e7c
DM
546
547 case '{':
511e79b3 548 return tok_percent_left_curly;
40675e7c
DM
549
550 case '<':
511e79b3 551 return tok_left;
40675e7c
DM
552
553 case '>':
511e79b3 554 return tok_right;
40675e7c
DM
555
556 case '2':
511e79b3 557 return tok_nonassoc;
40675e7c
DM
558
559 case '0':
511e79b3 560 return tok_token;
40675e7c
DM
561
562 case '=':
511e79b3 563 return tok_prec;
40675e7c 564 }
f17bcd1f 565
abadc117 566 if (!isalpha (c))
511e79b3 567 return tok_illegal;
40675e7c 568
f17bcd1f 569 obstack_1grow (&token_obstack, '%');
abadc117 570 while (isalpha (c) || c == '_' || c == '-')
40675e7c 571 {
abadc117
AD
572 if (c == '-')
573 c = '_';
f17bcd1f 574 obstack_1grow (&token_obstack, c);
abadc117 575 c = getc (finput);
40675e7c
DM
576 }
577
abadc117 578 ungetc (c, finput);
f17bcd1f
AD
579 obstack_1grow (&token_obstack, '\0');
580 token_buffer = obstack_finish (&token_obstack);
40675e7c 581
a44c2277 582 /* table lookup % directive */
82b6d266
PB
583 for (tx = option_table; tx->name; tx++)
584 if ((tx->access == opt_percent || tx->access == opt_both)
585 && option_strcmp (token_buffer + 1, tx->name) == 0)
a44c2277 586 break;
6deb4447 587
62ab6972 588 if (tx->set_flag)
a44c2277 589 {
62ab6972 590 *((int *) (tx->set_flag)) = 1;
511e79b3 591 return tok_noop;
a44c2277 592 }
62ab6972 593
82b6d266 594 switch (tx->ret_val)
a44c2277 595 {
62ab6972
AD
596 case tok_setopt:
597 *((char **) (tx->set_flag)) = optarg;
511e79b3 598 return tok_noop;
62ab6972
AD
599 break;
600
601 case tok_obsolete:
602 fatal (_("`%s' is no longer supported"), token_buffer);
603 break;
a44c2277 604 }
62ab6972 605
82b6d266 606 return tx->ret_val;
40675e7c 607}