]> git.saurik.com Git - bison.git/blame - src/lex.c
Ditch sprintf to statically-sized buffers in fatal/warn functions in
[bison.git] / src / lex.c
CommitLineData
/* Token-reader for Bison's input parser,
   Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc.

This file is part of Bison, the GNU Compiler Compiler.

Bison is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

Bison is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Bison; see the file COPYING.  If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
19
20
/*
   lex is the entry point.  It is called from reader.c.
   It returns one of the token-type codes defined in lex.h.
   When an identifier is seen, the code IDENTIFIER is returned
   and the name is looked up in the symbol table using symtab.c;
   symval is set to a pointer to the entry found.  */
27
28#include <stdio.h>
40675e7c
DM
29#include "system.h"
30#include "files.h"
a44c2277 31#include "getopt.h" /* for optarg */
40675e7c
DM
32#include "symtab.h"
33#include "lex.h"
7612000c 34#include "alloc.h"
40675e7c 35
a44c2277
RS
36/* flags set by % directives */
37extern int definesflag; /* for -d */
38extern int toknumflag; /* for -k */
39extern int noparserflag; /* for -n */
40extern int fixed_outfiles; /* for -y */
41extern int nolinesflag; /* for -l */
42extern int rawtoknumflag; /* for -r */
43extern int verboseflag; /* for -v */
44extern int debugflag; /* for -t */
45extern char *spec_name_prefix; /* for -p */
46extern char *spec_file_prefix; /* for -b */
47/*spec_outfile is declared in files.h, for -o */
40675e7c
DM
48
49extern int lineno;
50extern int translations;
51
d2729d44
JT
52void init_lex PARAMS((void));
53char *grow_token_buffer PARAMS((char *));
54int skip_white_space PARAMS((void));
55int safegetc PARAMS((FILE *));
56int literalchar PARAMS((char **, int *, char));
57void unlex PARAMS((int));
58int lex PARAMS((void));
59int parse_percent_token PARAMS((void));
40675e7c 60
a44c2277 61/* functions from main.c */
d2729d44
JT
62extern char *printable_version PARAMS((int));
63extern void fatal PARAMS((char *));
64extern void warn PARAMS((char *));
65extern void warni PARAMS((char *, int));
66extern void warns PARAMS((char *, char *));
40675e7c
DM
67
68/* Buffer for storing the current token. */
69char *token_buffer;
70
71/* Allocated size of token_buffer, not including space for terminator. */
d2729d44 72int maxtoken;
40675e7c
DM
73
74bucket *symval;
75int numval;
76
77static int unlexed; /* these two describe a token to be reread */
78static bucket *unlexed_symval; /* by the next call to lex */
79
80
81void
d2729d44 82init_lex (void)
40675e7c
DM
83{
84 maxtoken = 100;
85 token_buffer = NEW2 (maxtoken + 1, char);
86 unlexed = -1;
87}
88
89
d2729d44
JT
90char *
91grow_token_buffer (char *p)
40675e7c
DM
92{
93 int offset = p - token_buffer;
94 maxtoken *= 2;
95 token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
96 return token_buffer + offset;
97}
98
99
100int
d2729d44 101skip_white_space (void)
40675e7c
DM
102{
103 register int c;
104 register int inside;
105
106 c = getc(finput);
107
108 for (;;)
109 {
110 int cplus_comment;
111
112 switch (c)
113 {
114 case '/':
115 c = getc(finput);
a083fbbf 116 if (c != '*' && c != '/')
a44c2277 117 {
a083fbbf 118 warn(_("unexpected `/' found and ignored"));
a44c2277
RS
119 break;
120 }
40675e7c
DM
121 cplus_comment = (c == '/');
122
123 c = getc(finput);
124
125 inside = 1;
126 while (inside)
127 {
128 if (!cplus_comment && c == '*')
129 {
130 while (c == '*')
131 c = getc(finput);
132
133 if (c == '/')
134 {
135 inside = 0;
136 c = getc(finput);
137 }
138 }
139 else if (c == '\n')
140 {
141 lineno++;
142 if (cplus_comment)
143 inside = 0;
144 c = getc(finput);
145 }
146 else if (c == EOF)
a083fbbf 147 fatal(_("unterminated comment"));
40675e7c
DM
148 else
149 c = getc(finput);
150 }
151
152 break;
153
154 case '\n':
155 lineno++;
156
157 case ' ':
158 case '\t':
159 case '\f':
160 c = getc(finput);
161 break;
162
163 default:
164 return (c);
165 }
166 }
167}
168
a44c2277
RS
/* Read one character from F.  End of file is reported through fatal;
   any other character is returned to the caller.  */
int
safegetc (FILE *f)
{
  register int ch;

  ch = getc (f);
  if (ch != EOF)
    return ch;

  fatal (_("Unexpected end of file"));
  return ch;
}
178
179/* read one literal character from finput. process \ escapes.
180 append the normalized string version of the char to *pp.
181 assign the character code to *pcode
182 return 1 unless the character is an unescaped `term' or \n
183 report error for \n
184*/
185int
d2729d44 186literalchar (char **pp, int *pcode, char term)
a44c2277
RS
187{
188 register int c;
189 register char *p;
190 register int code;
191 int wasquote = 0;
192
193 c = safegetc(finput);
a083fbbf 194 if (c == '\n')
a44c2277 195 {
a083fbbf 196 warn(_("unescaped newline in constant"));
a44c2277
RS
197 ungetc(c, finput);
198 code = '?';
199 wasquote = 1;
200 }
201 else if (c != '\\')
202 {
203 code = c;
a083fbbf 204 if (c == term)
a44c2277
RS
205 wasquote = 1;
206 }
207 else
208 {
209 c = safegetc(finput);
210 if (c == 't') code = '\t';
211 else if (c == 'n') code = '\n';
212 else if (c == 'a') code = '\007';
213 else if (c == 'r') code = '\r';
214 else if (c == 'f') code = '\f';
215 else if (c == 'b') code = '\b';
d2729d44 216 else if (c == 'v') code = '\013';
a44c2277
RS
217 else if (c == '\\') code = '\\';
218 else if (c == '\'') code = '\'';
219 else if (c == '\"') code = '\"';
220 else if (c <= '7' && c >= '0')
221 {
222 code = 0;
223 while (c <= '7' && c >= '0')
224 {
225 code = (code * 8) + (c - '0');
226 if (code >= 256 || code < 0)
227 {
a083fbbf 228 warni(_("octal value outside range 0...255: `\\%o'"), code);
a44c2277
RS
229 code &= 0xFF;
230 break;
231 }
232 c = safegetc(finput);
233 }
234 ungetc(c, finput);
235 }
236 else if (c == 'x')
237 {
238 c = safegetc(finput);
239 code = 0;
240 while (1)
241 {
242 if (c >= '0' && c <= '9')
243 code *= 16, code += c - '0';
244 else if (c >= 'a' && c <= 'f')
245 code *= 16, code += c - 'a' + 10;
246 else if (c >= 'A' && c <= 'F')
247 code *= 16, code += c - 'A' + 10;
a083fbbf 248 else
a44c2277
RS
249 break;
250 if (code >= 256 || code<0)
251 {
a083fbbf 252 warni(_("hexadecimal value above 255: `\\x%x'"), code);
a44c2277
RS
253 code &= 0xFF;
254 break;
255 }
256 c = safegetc(finput);
257 }
258 ungetc(c, finput);
259 }
260 else
261 {
b0180c64 262 warns (_("unknown escape sequence: `\\' followed by `%s'"),
a44c2277
RS
263 printable_version(c));
264 code = '?';
265 }
266 } /* has \ */
267
268 /* now fill token_buffer with the canonical name for this character
269 as a literal token. Do not use what the user typed,
270 so that `\012' and `\n' can be interchangeable. */
271
272 p = *pp;
5ce94c29 273 if (code == '\\') {*p++ = '\\'; *p++ = '\\';}
a44c2277
RS
274 else if (code == '\'') {*p++ = '\\'; *p++ = '\'';}
275 else if (code == '\"') {*p++ = '\\'; *p++ = '\"';}
5ce94c29
RS
276 else if (code >= 040 && code < 0177)
277 *p++ = code;
a44c2277
RS
278 else if (code == '\t') {*p++ = '\\'; *p++ = 't';}
279 else if (code == '\n') {*p++ = '\\'; *p++ = 'n';}
280 else if (code == '\r') {*p++ = '\\'; *p++ = 'r';}
281 else if (code == '\v') {*p++ = '\\'; *p++ = 'v';}
282 else if (code == '\b') {*p++ = '\\'; *p++ = 'b';}
283 else if (code == '\f') {*p++ = '\\'; *p++ = 'f';}
284 else
285 {
286 *p++ = '\\';
287 *p++ = code / 0100 + '0';
288 *p++ = ((code / 010) & 07) + '0';
289 *p++ = (code & 07) + '0';
290 }
291 *pp = p;
292 *pcode = code;
293 return ! wasquote;
294}
295
40675e7c
DM
296
297void
d2729d44 298unlex (int token)
40675e7c
DM
299{
300 unlexed = token;
301 unlexed_symval = symval;
302}
303
304
40675e7c 305int
d2729d44 306lex (void)
40675e7c
DM
307{
308 register int c;
a44c2277 309 char *p;
40675e7c
DM
310
311 if (unlexed >= 0)
312 {
313 symval = unlexed_symval;
314 c = unlexed;
315 unlexed = -1;
316 return (c);
317 }
318
319 c = skip_white_space();
a44c2277
RS
320 *token_buffer = c; /* for error messages (token buffer always valid) */
321 token_buffer[1] = 0;
40675e7c
DM
322
323 switch (c)
324 {
325 case EOF:
a44c2277 326 strcpy(token_buffer, "EOF");
40675e7c
DM
327 return (ENDFILE);
328
329 case 'A': case 'B': case 'C': case 'D': case 'E':
330 case 'F': case 'G': case 'H': case 'I': case 'J':
331 case 'K': case 'L': case 'M': case 'N': case 'O':
332 case 'P': case 'Q': case 'R': case 'S': case 'T':
333 case 'U': case 'V': case 'W': case 'X': case 'Y':
334 case 'Z':
335 case 'a': case 'b': case 'c': case 'd': case 'e':
336 case 'f': case 'g': case 'h': case 'i': case 'j':
337 case 'k': case 'l': case 'm': case 'n': case 'o':
338 case 'p': case 'q': case 'r': case 's': case 't':
339 case 'u': case 'v': case 'w': case 'x': case 'y':
340 case 'z':
341 case '.': case '_':
342 p = token_buffer;
343 while (isalnum(c) || c == '_' || c == '.')
344 {
345 if (p == token_buffer + maxtoken)
346 p = grow_token_buffer(p);
347
348 *p++ = c;
349 c = getc(finput);
350 }
351
352 *p = 0;
353 ungetc(c, finput);
354 symval = getsym(token_buffer);
355 return (IDENTIFIER);
356
357 case '0': case '1': case '2': case '3': case '4':
358 case '5': case '6': case '7': case '8': case '9':
359 {
360 numval = 0;
361
a44c2277 362 p = token_buffer;
40675e7c
DM
363 while (isdigit(c))
364 {
a44c2277
RS
365 if (p == token_buffer + maxtoken)
366 p = grow_token_buffer(p);
367
368 *p++ = c;
40675e7c
DM
369 numval = numval*10 + c - '0';
370 c = getc(finput);
371 }
a44c2277 372 *p = 0;
40675e7c
DM
373 ungetc(c, finput);
374 return (NUMBER);
375 }
376
377 case '\'':
40675e7c
DM
378
379 /* parse the literal token and compute character code in code */
380
a44c2277 381 translations = -1;
40675e7c 382 {
a44c2277
RS
383 int code, discode;
384 char discard[10], *dp;
5ce94c29 385
a44c2277
RS
386 p = token_buffer;
387 *p++ = '\'';
388 literalchar(&p, &code, '\'');
40675e7c 389
a44c2277
RS
390 c = getc(finput);
391 if (c != '\'')
40675e7c 392 {
a083fbbf 393 warn(_("use \"...\" for multi-character literal tokens"));
5ce94c29
RS
394 while (1)
395 {
396 dp = discard;
397 if (! literalchar(&dp, &discode, '\''))
398 break;
399 }
40675e7c 400 }
a44c2277
RS
401 *p++ = '\'';
402 *p = 0;
403 symval = getsym(token_buffer);
404 symval->class = STOKEN;
405 if (! symval->user_token_number)
406 symval->user_token_number = code;
407 return (IDENTIFIER);
408 }
40675e7c 409
a44c2277 410 case '\"':
40675e7c 411
a44c2277
RS
412 /* parse the literal string token and treat as an identifier */
413
414 translations = -1;
415 {
416 int code; /* ignored here */
40675e7c 417 p = token_buffer;
a44c2277
RS
418 *p++ = '\"';
419 while (literalchar(&p, &code, '\"')) /* read up to and including " */
40675e7c 420 {
a44c2277
RS
421 if (p >= token_buffer + maxtoken - 4)
422 p = grow_token_buffer(p);
40675e7c 423 }
40675e7c 424 *p = 0;
a44c2277 425
40675e7c
DM
426 symval = getsym(token_buffer);
427 symval->class = STOKEN;
a44c2277 428
40675e7c
DM
429 return (IDENTIFIER);
430 }
431
432 case ',':
433 return (COMMA);
434
435 case ':':
436 return (COLON);
437
438 case ';':
439 return (SEMICOLON);
440
441 case '|':
442 return (BAR);
443
444 case '{':
445 return (LEFT_CURLY);
446
447 case '=':
448 do
449 {
450 c = getc(finput);
451 if (c == '\n') lineno++;
452 }
453 while(c==' ' || c=='\n' || c=='\t');
454
455 if (c == '{')
a44c2277
RS
456 {
457 strcpy(token_buffer, "={");
458 return(LEFT_CURLY);
459 }
40675e7c
DM
460 else
461 {
462 ungetc(c, finput);
463 return(ILLEGAL);
464 }
465
466 case '<':
467 p = token_buffer;
468 c = getc(finput);
469 while (c != '>')
470 {
a44c2277 471 if (c == EOF)
a083fbbf
RS
472 fatal(_("unterminated type name at end of file"));
473 if (c == '\n')
a44c2277 474 {
a083fbbf 475 warn(_("unterminated type name"));
a44c2277
RS
476 ungetc(c, finput);
477 break;
478 }
40675e7c
DM
479
480 if (p == token_buffer + maxtoken)
481 p = grow_token_buffer(p);
482
483 *p++ = c;
484 c = getc(finput);
485 }
486 *p = 0;
487 return (TYPENAME);
a083fbbf 488
40675e7c
DM
489
490 case '%':
491 return (parse_percent_token());
492
493 default:
494 return (ILLEGAL);
495 }
496}
497
a083fbbf 498/* the following table dictates the action taken for the various
a44c2277
RS
499 % directives. A setflag value causes the named flag to be
500 set. A retval action returns the code.
501*/
502struct percent_table_struct {
503 char *name;
a083fbbf 504 void *setflag;
a44c2277
RS
505 int retval;
506} percent_table[] =
507{
508 {"token", NULL, TOKEN},
509 {"term", NULL, TOKEN},
510 {"nterm", NULL, NTERM},
511 {"type", NULL, TYPE},
512 {"guard", NULL, GUARD},
513 {"union", NULL, UNION},
514 {"expect", NULL, EXPECT},
515 {"thong", NULL, THONG},
516 {"start", NULL, START},
517 {"left", NULL, LEFT},
518 {"right", NULL, RIGHT},
519 {"nonassoc", NULL, NONASSOC},
520 {"binary", NULL, NONASSOC},
521 {"semantic_parser", NULL, SEMANTIC_PARSER},
522 {"pure_parser", NULL, PURE_PARSER},
523 {"prec", NULL, PREC},
524
525 {"no_lines", &nolinesflag, NOOP}, /* -l */
526 {"raw", &rawtoknumflag, NOOP}, /* -r */
527 {"token_table", &toknumflag, NOOP}, /* -k */
528
529#if 0
530 /* These can be utilized after main is reoganized so
531 open_files() is deferred 'til after read_declarations().
532 But %{ and %union both put information into files
533 that have to be opened before read_declarations().
534 */
535 {"yacc", &fixed_outfiles, NOOP}, /* -y */
536 {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
537 {"defines", &definesflag, NOOP}, /* -d */
538 {"no_parser", &noparserflag, NOOP}, /* -n */
539 {"output_file", &spec_outfile, SETOPT}, /* -o */
540 {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
541 {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
542
543 /* These would be acceptable, but they do not affect processing */
544 {"verbose", &verboseflag, NOOP}, /* -v */
545 {"debug", &debugflag, NOOP}, /* -t */
d2729d44
JT
546 /* {"help", <print usage stmt>, NOOP},*/ /* -h */
547 /* {"version", <print version number> , NOOP},*/ /* -V */
a44c2277
RS
548#endif
549
550 {NULL, NULL, ILLEGAL}
551};
552
553/* Parse a token which starts with %.
554 Assumes the % has already been read and discarded. */
40675e7c
DM
555
556int
d2729d44 557parse_percent_token (void)
40675e7c
DM
558{
559 register int c;
560 register char *p;
a44c2277 561 register struct percent_table_struct *tx;
40675e7c
DM
562
563 p = token_buffer;
564 c = getc(finput);
a44c2277
RS
565 *p++ = '%';
566 *p++ = c; /* for error msg */
567 *p = 0;
40675e7c
DM
568
569 switch (c)
570 {
571 case '%':
572 return (TWO_PERCENTS);
573
574 case '{':
575 return (PERCENT_LEFT_CURLY);
576
577 case '<':
578 return (LEFT);
579
580 case '>':
581 return (RIGHT);
582
583 case '2':
584 return (NONASSOC);
585
586 case '0':
587 return (TOKEN);
588
589 case '=':
590 return (PREC);
591 }
a083fbbf 592 if (!isalpha(c))
40675e7c
DM
593 return (ILLEGAL);
594
a44c2277
RS
595 p = token_buffer;
596 *p++ = '%';
597 while (isalpha(c) || c == '_' || c == '-')
40675e7c
DM
598 {
599 if (p == token_buffer + maxtoken)
600 p = grow_token_buffer(p);
601
a44c2277 602 if (c == '-') c = '_';
40675e7c
DM
603 *p++ = c;
604 c = getc(finput);
605 }
606
607 ungetc(c, finput);
608
609 *p = 0;
610
a44c2277
RS
611 /* table lookup % directive */
612 for (tx = percent_table; tx->name; tx++)
613 if (strcmp(token_buffer+1, tx->name) == 0)
614 break;
615 if (tx->retval == SETOPT)
616 {
617 *((char **)(tx->setflag)) = optarg;
618 return NOOP;
619 }
620 if (tx->setflag)
621 {
622 *((int *)(tx->setflag)) = 1;
623 return NOOP;
624 }
625 return tx->retval;
40675e7c 626}