]> git.saurik.com Git - bison.git/blame - src/lex.c
* configure.in: Append WARNING_CFLAGS to CFLAGS.
[bison.git] / src / lex.c
CommitLineData
/* Token-reader for Bison's input parser,
   Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.

   This file is part of Bison, the GNU Compiler Compiler.

   Bison is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   Bison is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Bison; see the file COPYING.  If not, write to
   the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */
40675e7c
DM
20
21
/*
   lex is the entry point.  It is called from reader.c.
   It returns one of the token-type codes defined in lex.h.
   When an identifier is seen, the code IDENTIFIER is returned
   and the name is looked up in the symbol table using symtab.c;
   symval is set to a pointer to the entry found.  */
28
#include <stdio.h>
#include <limits.h>
#include "system.h"
#include "files.h"
#include "getopt.h"		/* for optarg */
#include "symtab.h"
#include "lex.h"
#include "alloc.h"
#include "complain.h"
40675e7c 37
a44c2277
RS
38/* flags set by % directives */
39extern int definesflag; /* for -d */
40extern int toknumflag; /* for -k */
41extern int noparserflag; /* for -n */
42extern int fixed_outfiles; /* for -y */
43extern int nolinesflag; /* for -l */
44extern int rawtoknumflag; /* for -r */
45extern int verboseflag; /* for -v */
46extern int debugflag; /* for -t */
47extern char *spec_name_prefix; /* for -p */
48extern char *spec_file_prefix; /* for -b */
49/*spec_outfile is declared in files.h, for -o */
40675e7c 50
40675e7c
DM
51extern int translations;
52
4a120d45
JT
53extern void init_lex PARAMS((void));
54extern char *grow_token_buffer PARAMS((char *));
55extern int skip_white_space PARAMS((void));
56extern void unlex PARAMS((int));
57extern int lex PARAMS((void));
58extern int parse_percent_token PARAMS((void));
59
60static int safegetc PARAMS((FILE *));
61static int literalchar PARAMS((char **, int *, char));
40675e7c 62
a44c2277 63/* functions from main.c */
d2729d44 64extern char *printable_version PARAMS((int));
40675e7c
DM
65
66/* Buffer for storing the current token. */
67char *token_buffer;
68
69/* Allocated size of token_buffer, not including space for terminator. */
d2729d44 70int maxtoken;
40675e7c
DM
71
72bucket *symval;
73int numval;
74
75static int unlexed; /* these two describe a token to be reread */
76static bucket *unlexed_symval; /* by the next call to lex */
77
78
79void
d2729d44 80init_lex (void)
40675e7c
DM
81{
82 maxtoken = 100;
83 token_buffer = NEW2 (maxtoken + 1, char);
84 unlexed = -1;
85}
86
87
d2729d44
JT
88char *
89grow_token_buffer (char *p)
40675e7c
DM
90{
91 int offset = p - token_buffer;
92 maxtoken *= 2;
93 token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
94 return token_buffer + offset;
95}
96
97
98int
d2729d44 99skip_white_space (void)
40675e7c
DM
100{
101 register int c;
102 register int inside;
103
104 c = getc(finput);
105
106 for (;;)
107 {
108 int cplus_comment;
109
110 switch (c)
111 {
112 case '/':
113 c = getc(finput);
a083fbbf 114 if (c != '*' && c != '/')
a44c2277 115 {
a0f6b076 116 complain (_("unexpected `/' found and ignored"));
a44c2277
RS
117 break;
118 }
40675e7c
DM
119 cplus_comment = (c == '/');
120
121 c = getc(finput);
122
123 inside = 1;
124 while (inside)
125 {
126 if (!cplus_comment && c == '*')
127 {
128 while (c == '*')
129 c = getc(finput);
130
131 if (c == '/')
132 {
133 inside = 0;
134 c = getc(finput);
135 }
136 }
137 else if (c == '\n')
138 {
139 lineno++;
140 if (cplus_comment)
141 inside = 0;
142 c = getc(finput);
143 }
144 else if (c == EOF)
a0f6b076 145 fatal (_("unterminated comment"));
40675e7c
DM
146 else
147 c = getc(finput);
148 }
149
150 break;
151
152 case '\n':
153 lineno++;
154
155 case ' ':
156 case '\t':
157 case '\f':
158 c = getc(finput);
159 break;
160
161 default:
36281465 162 return c;
40675e7c
DM
163 }
164 }
165}
166
/* Read one character from F with getc.  End of file is treated as
   an error and reported through fatal; otherwise the character is
   returned.  */
static int
safegetc (FILE *f)
{
  register int c = getc (f);

  if (c != EOF)
    return c;

  fatal (_("unexpected end of file"));
  return c;
}
176
177/* read one literal character from finput. process \ escapes.
178 append the normalized string version of the char to *pp.
179 assign the character code to *pcode
180 return 1 unless the character is an unescaped `term' or \n
181 report error for \n
182*/
4a120d45 183static int
d2729d44 184literalchar (char **pp, int *pcode, char term)
a44c2277
RS
185{
186 register int c;
187 register char *p;
188 register int code;
189 int wasquote = 0;
190
191 c = safegetc(finput);
a083fbbf 192 if (c == '\n')
a44c2277 193 {
a0f6b076 194 complain (_("unescaped newline in constant"));
a44c2277
RS
195 ungetc(c, finput);
196 code = '?';
197 wasquote = 1;
198 }
199 else if (c != '\\')
200 {
201 code = c;
a083fbbf 202 if (c == term)
a44c2277
RS
203 wasquote = 1;
204 }
205 else
206 {
207 c = safegetc(finput);
208 if (c == 't') code = '\t';
209 else if (c == 'n') code = '\n';
210 else if (c == 'a') code = '\007';
211 else if (c == 'r') code = '\r';
212 else if (c == 'f') code = '\f';
213 else if (c == 'b') code = '\b';
d2729d44 214 else if (c == 'v') code = '\013';
a44c2277
RS
215 else if (c == '\\') code = '\\';
216 else if (c == '\'') code = '\'';
217 else if (c == '\"') code = '\"';
218 else if (c <= '7' && c >= '0')
219 {
220 code = 0;
221 while (c <= '7' && c >= '0')
222 {
223 code = (code * 8) + (c - '0');
224 if (code >= 256 || code < 0)
225 {
a0f6b076
AD
226 complain (_("octal value outside range 0...255: `\\%o'"),
227 code);
a44c2277
RS
228 code &= 0xFF;
229 break;
230 }
231 c = safegetc(finput);
232 }
233 ungetc(c, finput);
234 }
235 else if (c == 'x')
236 {
237 c = safegetc(finput);
238 code = 0;
239 while (1)
240 {
241 if (c >= '0' && c <= '9')
242 code *= 16, code += c - '0';
243 else if (c >= 'a' && c <= 'f')
244 code *= 16, code += c - 'a' + 10;
245 else if (c >= 'A' && c <= 'F')
246 code *= 16, code += c - 'A' + 10;
a083fbbf 247 else
a44c2277
RS
248 break;
249 if (code >= 256 || code<0)
250 {
a0f6b076
AD
251 complain (_("hexadecimal value above 255: `\\x%x'"),
252 code);
a44c2277
RS
253 code &= 0xFF;
254 break;
255 }
256 c = safegetc(finput);
257 }
258 ungetc(c, finput);
259 }
260 else
261 {
a0f6b076
AD
262 complain (_("unknown escape sequence: `\\' followed by `%s'"),
263 printable_version(c));
a44c2277
RS
264 code = '?';
265 }
266 } /* has \ */
267
268 /* now fill token_buffer with the canonical name for this character
269 as a literal token. Do not use what the user typed,
270 so that `\012' and `\n' can be interchangeable. */
271
272 p = *pp;
e5335b74
JT
273 if (code == term && wasquote)
274 *p++ = code;
275 else if (code == '\\') {*p++ = '\\'; *p++ = '\\';}
a44c2277
RS
276 else if (code == '\'') {*p++ = '\\'; *p++ = '\'';}
277 else if (code == '\"') {*p++ = '\\'; *p++ = '\"';}
5ce94c29
RS
278 else if (code >= 040 && code < 0177)
279 *p++ = code;
a44c2277
RS
280 else if (code == '\t') {*p++ = '\\'; *p++ = 't';}
281 else if (code == '\n') {*p++ = '\\'; *p++ = 'n';}
282 else if (code == '\r') {*p++ = '\\'; *p++ = 'r';}
283 else if (code == '\v') {*p++ = '\\'; *p++ = 'v';}
284 else if (code == '\b') {*p++ = '\\'; *p++ = 'b';}
285 else if (code == '\f') {*p++ = '\\'; *p++ = 'f';}
286 else
287 {
288 *p++ = '\\';
289 *p++ = code / 0100 + '0';
290 *p++ = ((code / 010) & 07) + '0';
291 *p++ = (code & 07) + '0';
292 }
293 *pp = p;
294 *pcode = code;
295 return ! wasquote;
296}
297
40675e7c
DM
298
299void
d2729d44 300unlex (int token)
40675e7c
DM
301{
302 unlexed = token;
303 unlexed_symval = symval;
304}
305
306
40675e7c 307int
d2729d44 308lex (void)
40675e7c
DM
309{
310 register int c;
a44c2277 311 char *p;
40675e7c
DM
312
313 if (unlexed >= 0)
314 {
315 symval = unlexed_symval;
316 c = unlexed;
317 unlexed = -1;
36281465 318 return c;
40675e7c
DM
319 }
320
321 c = skip_white_space();
a44c2277
RS
322 *token_buffer = c; /* for error messages (token buffer always valid) */
323 token_buffer[1] = 0;
40675e7c
DM
324
325 switch (c)
326 {
327 case EOF:
a44c2277 328 strcpy(token_buffer, "EOF");
36281465 329 return ENDFILE;
40675e7c
DM
330
331 case 'A': case 'B': case 'C': case 'D': case 'E':
332 case 'F': case 'G': case 'H': case 'I': case 'J':
333 case 'K': case 'L': case 'M': case 'N': case 'O':
334 case 'P': case 'Q': case 'R': case 'S': case 'T':
335 case 'U': case 'V': case 'W': case 'X': case 'Y':
336 case 'Z':
337 case 'a': case 'b': case 'c': case 'd': case 'e':
338 case 'f': case 'g': case 'h': case 'i': case 'j':
339 case 'k': case 'l': case 'm': case 'n': case 'o':
340 case 'p': case 'q': case 'r': case 's': case 't':
341 case 'u': case 'v': case 'w': case 'x': case 'y':
342 case 'z':
343 case '.': case '_':
344 p = token_buffer;
345 while (isalnum(c) || c == '_' || c == '.')
346 {
347 if (p == token_buffer + maxtoken)
348 p = grow_token_buffer(p);
349
350 *p++ = c;
351 c = getc(finput);
352 }
353
354 *p = 0;
355 ungetc(c, finput);
356 symval = getsym(token_buffer);
36281465 357 return IDENTIFIER;
40675e7c
DM
358
359 case '0': case '1': case '2': case '3': case '4':
360 case '5': case '6': case '7': case '8': case '9':
361 {
362 numval = 0;
363
a44c2277 364 p = token_buffer;
40675e7c
DM
365 while (isdigit(c))
366 {
a44c2277
RS
367 if (p == token_buffer + maxtoken)
368 p = grow_token_buffer(p);
369
370 *p++ = c;
40675e7c
DM
371 numval = numval*10 + c - '0';
372 c = getc(finput);
373 }
a44c2277 374 *p = 0;
40675e7c 375 ungetc(c, finput);
36281465 376 return NUMBER;
40675e7c
DM
377 }
378
379 case '\'':
40675e7c
DM
380
381 /* parse the literal token and compute character code in code */
382
a44c2277 383 translations = -1;
40675e7c 384 {
a44c2277
RS
385 int code, discode;
386 char discard[10], *dp;
5ce94c29 387
a44c2277
RS
388 p = token_buffer;
389 *p++ = '\'';
390 literalchar(&p, &code, '\'');
40675e7c 391
a44c2277
RS
392 c = getc(finput);
393 if (c != '\'')
40675e7c 394 {
a0f6b076 395 complain (_("use \"...\" for multi-character literal tokens"));
5ce94c29
RS
396 while (1)
397 {
398 dp = discard;
399 if (! literalchar(&dp, &discode, '\''))
400 break;
401 }
40675e7c 402 }
a44c2277
RS
403 *p++ = '\'';
404 *p = 0;
405 symval = getsym(token_buffer);
406 symval->class = STOKEN;
407 if (! symval->user_token_number)
408 symval->user_token_number = code;
36281465 409 return IDENTIFIER;
a44c2277 410 }
40675e7c 411
a44c2277 412 case '\"':
40675e7c 413
a44c2277
RS
414 /* parse the literal string token and treat as an identifier */
415
416 translations = -1;
417 {
418 int code; /* ignored here */
40675e7c 419 p = token_buffer;
a44c2277
RS
420 *p++ = '\"';
421 while (literalchar(&p, &code, '\"')) /* read up to and including " */
40675e7c 422 {
a44c2277
RS
423 if (p >= token_buffer + maxtoken - 4)
424 p = grow_token_buffer(p);
40675e7c 425 }
40675e7c 426 *p = 0;
a44c2277 427
40675e7c
DM
428 symval = getsym(token_buffer);
429 symval->class = STOKEN;
a44c2277 430
36281465 431 return IDENTIFIER;
40675e7c
DM
432 }
433
434 case ',':
36281465 435 return COMMA;
40675e7c
DM
436
437 case ':':
36281465 438 return COLON;
40675e7c
DM
439
440 case ';':
36281465 441 return SEMICOLON;
40675e7c
DM
442
443 case '|':
36281465 444 return BAR;
40675e7c
DM
445
446 case '{':
36281465 447 return LEFT_CURLY;
40675e7c
DM
448
449 case '=':
450 do
451 {
452 c = getc(finput);
453 if (c == '\n') lineno++;
454 }
455 while(c==' ' || c=='\n' || c=='\t');
456
457 if (c == '{')
a44c2277
RS
458 {
459 strcpy(token_buffer, "={");
36281465 460 return LEFT_CURLY;
a44c2277 461 }
40675e7c
DM
462 else
463 {
464 ungetc(c, finput);
36281465 465 return ILLEGAL;
40675e7c
DM
466 }
467
468 case '<':
469 p = token_buffer;
470 c = getc(finput);
471 while (c != '>')
472 {
a44c2277 473 if (c == EOF)
a0f6b076 474 fatal (_("unterminated type name at end of file"));
a083fbbf 475 if (c == '\n')
a44c2277 476 {
a0f6b076 477 complain (_("unterminated type name"));
a44c2277
RS
478 ungetc(c, finput);
479 break;
480 }
40675e7c
DM
481
482 if (p == token_buffer + maxtoken)
483 p = grow_token_buffer(p);
484
485 *p++ = c;
486 c = getc(finput);
487 }
488 *p = 0;
36281465 489 return TYPENAME;
a083fbbf 490
40675e7c
DM
491
492 case '%':
36281465 493 return parse_percent_token();
40675e7c
DM
494
495 default:
36281465 496 return ILLEGAL;
40675e7c
DM
497 }
498}
499
a083fbbf 500/* the following table dictates the action taken for the various
a44c2277
RS
501 % directives. A setflag value causes the named flag to be
502 set. A retval action returns the code.
503*/
504struct percent_table_struct {
4a120d45 505 const char *name;
a083fbbf 506 void *setflag;
a44c2277
RS
507 int retval;
508} percent_table[] =
509{
510 {"token", NULL, TOKEN},
511 {"term", NULL, TOKEN},
512 {"nterm", NULL, NTERM},
513 {"type", NULL, TYPE},
514 {"guard", NULL, GUARD},
515 {"union", NULL, UNION},
516 {"expect", NULL, EXPECT},
517 {"thong", NULL, THONG},
518 {"start", NULL, START},
519 {"left", NULL, LEFT},
520 {"right", NULL, RIGHT},
521 {"nonassoc", NULL, NONASSOC},
522 {"binary", NULL, NONASSOC},
523 {"semantic_parser", NULL, SEMANTIC_PARSER},
524 {"pure_parser", NULL, PURE_PARSER},
525 {"prec", NULL, PREC},
526
527 {"no_lines", &nolinesflag, NOOP}, /* -l */
528 {"raw", &rawtoknumflag, NOOP}, /* -r */
529 {"token_table", &toknumflag, NOOP}, /* -k */
530
531#if 0
532 /* These can be utilized after main is reoganized so
533 open_files() is deferred 'til after read_declarations().
534 But %{ and %union both put information into files
535 that have to be opened before read_declarations().
536 */
537 {"yacc", &fixed_outfiles, NOOP}, /* -y */
538 {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
539 {"defines", &definesflag, NOOP}, /* -d */
540 {"no_parser", &noparserflag, NOOP}, /* -n */
541 {"output_file", &spec_outfile, SETOPT}, /* -o */
542 {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
543 {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
544
545 /* These would be acceptable, but they do not affect processing */
546 {"verbose", &verboseflag, NOOP}, /* -v */
547 {"debug", &debugflag, NOOP}, /* -t */
d2729d44
JT
548 /* {"help", <print usage stmt>, NOOP},*/ /* -h */
549 /* {"version", <print version number> , NOOP},*/ /* -V */
a44c2277
RS
550#endif
551
552 {NULL, NULL, ILLEGAL}
553};
554
555/* Parse a token which starts with %.
556 Assumes the % has already been read and discarded. */
40675e7c
DM
557
558int
d2729d44 559parse_percent_token (void)
40675e7c
DM
560{
561 register int c;
562 register char *p;
a44c2277 563 register struct percent_table_struct *tx;
40675e7c
DM
564
565 p = token_buffer;
566 c = getc(finput);
a44c2277
RS
567 *p++ = '%';
568 *p++ = c; /* for error msg */
569 *p = 0;
40675e7c
DM
570
571 switch (c)
572 {
573 case '%':
36281465 574 return TWO_PERCENTS;
40675e7c
DM
575
576 case '{':
36281465 577 return PERCENT_LEFT_CURLY;
40675e7c
DM
578
579 case '<':
36281465 580 return LEFT;
40675e7c
DM
581
582 case '>':
36281465 583 return RIGHT;
40675e7c
DM
584
585 case '2':
36281465 586 return NONASSOC;
40675e7c
DM
587
588 case '0':
36281465 589 return TOKEN;
40675e7c
DM
590
591 case '=':
36281465 592 return PREC;
40675e7c 593 }
a083fbbf 594 if (!isalpha(c))
36281465 595 return ILLEGAL;
40675e7c 596
a44c2277
RS
597 p = token_buffer;
598 *p++ = '%';
599 while (isalpha(c) || c == '_' || c == '-')
40675e7c
DM
600 {
601 if (p == token_buffer + maxtoken)
602 p = grow_token_buffer(p);
603
a44c2277 604 if (c == '-') c = '_';
40675e7c
DM
605 *p++ = c;
606 c = getc(finput);
607 }
608
609 ungetc(c, finput);
610
611 *p = 0;
612
a44c2277
RS
613 /* table lookup % directive */
614 for (tx = percent_table; tx->name; tx++)
615 if (strcmp(token_buffer+1, tx->name) == 0)
616 break;
617 if (tx->retval == SETOPT)
618 {
619 *((char **)(tx->setflag)) = optarg;
620 return NOOP;
621 }
622 if (tx->setflag)
623 {
624 *((int *)(tx->setflag)) = 1;
625 return NOOP;
626 }
627 return tx->retval;
40675e7c 628}