]> git.saurik.com Git - bison.git/blame - src/lex.c
* reader.c (copy_comment): New function, factored out from:
[bison.git] / src / lex.c
CommitLineData
/* Token-reader for Bison's input parser,
   Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc.

This file is part of Bison, the GNU Compiler Compiler.

Bison is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

Bison is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Bison; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */
40675e7c
DM
20
21
/*
   lex is the entry point.  It is called from reader.c.
   It returns one of the token-type codes defined in lex.h.
   When an identifier is seen, the code IDENTIFIER is returned
   and the name is looked up in the symbol table using symtab.c;
   symval is set to a pointer to the entry found.  */
28
29#include <stdio.h>
40675e7c
DM
30#include "system.h"
31#include "files.h"
a44c2277 32#include "getopt.h" /* for optarg */
40675e7c
DM
33#include "symtab.h"
34#include "lex.h"
7612000c 35#include "alloc.h"
40675e7c 36
a44c2277
RS
37/* flags set by % directives */
38extern int definesflag; /* for -d */
39extern int toknumflag; /* for -k */
40extern int noparserflag; /* for -n */
41extern int fixed_outfiles; /* for -y */
42extern int nolinesflag; /* for -l */
43extern int rawtoknumflag; /* for -r */
44extern int verboseflag; /* for -v */
45extern int debugflag; /* for -t */
46extern char *spec_name_prefix; /* for -p */
47extern char *spec_file_prefix; /* for -b */
48/*spec_outfile is declared in files.h, for -o */
40675e7c
DM
49
50extern int lineno;
51extern int translations;
52
d2729d44
JT
53void init_lex PARAMS((void));
54char *grow_token_buffer PARAMS((char *));
55int skip_white_space PARAMS((void));
56int safegetc PARAMS((FILE *));
57int literalchar PARAMS((char **, int *, char));
58void unlex PARAMS((int));
59int lex PARAMS((void));
60int parse_percent_token PARAMS((void));
40675e7c 61
a44c2277 62/* functions from main.c */
d2729d44
JT
63extern char *printable_version PARAMS((int));
64extern void fatal PARAMS((char *));
65extern void warn PARAMS((char *));
66extern void warni PARAMS((char *, int));
67extern void warns PARAMS((char *, char *));
40675e7c
DM
68
69/* Buffer for storing the current token. */
70char *token_buffer;
71
72/* Allocated size of token_buffer, not including space for terminator. */
d2729d44 73int maxtoken;
40675e7c
DM
74
75bucket *symval;
76int numval;
77
78static int unlexed; /* these two describe a token to be reread */
79static bucket *unlexed_symval; /* by the next call to lex */
80
81
82void
d2729d44 83init_lex (void)
40675e7c
DM
84{
85 maxtoken = 100;
86 token_buffer = NEW2 (maxtoken + 1, char);
87 unlexed = -1;
88}
89
90
d2729d44
JT
91char *
92grow_token_buffer (char *p)
40675e7c
DM
93{
94 int offset = p - token_buffer;
95 maxtoken *= 2;
96 token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
97 return token_buffer + offset;
98}
99
100
101int
d2729d44 102skip_white_space (void)
40675e7c
DM
103{
104 register int c;
105 register int inside;
106
107 c = getc(finput);
108
109 for (;;)
110 {
111 int cplus_comment;
112
113 switch (c)
114 {
115 case '/':
116 c = getc(finput);
a083fbbf 117 if (c != '*' && c != '/')
a44c2277 118 {
a083fbbf 119 warn(_("unexpected `/' found and ignored"));
a44c2277
RS
120 break;
121 }
40675e7c
DM
122 cplus_comment = (c == '/');
123
124 c = getc(finput);
125
126 inside = 1;
127 while (inside)
128 {
129 if (!cplus_comment && c == '*')
130 {
131 while (c == '*')
132 c = getc(finput);
133
134 if (c == '/')
135 {
136 inside = 0;
137 c = getc(finput);
138 }
139 }
140 else if (c == '\n')
141 {
142 lineno++;
143 if (cplus_comment)
144 inside = 0;
145 c = getc(finput);
146 }
147 else if (c == EOF)
a083fbbf 148 fatal(_("unterminated comment"));
40675e7c
DM
149 else
150 c = getc(finput);
151 }
152
153 break;
154
155 case '\n':
156 lineno++;
157
158 case ' ':
159 case '\t':
160 case '\f':
161 c = getc(finput);
162 break;
163
164 default:
165 return (c);
166 }
167 }
168}
169
a44c2277
RS
/* Read one character from F like getc, but report end of file
   through fatal instead of handing EOF back silently.  */
int
safegetc (FILE *f)
{
  int ch = getc (f);

  if (ch != EOF)
    return ch;

  fatal (_("Unexpected end of file"));
  return ch;
}
179
180/* read one literal character from finput. process \ escapes.
181 append the normalized string version of the char to *pp.
182 assign the character code to *pcode
183 return 1 unless the character is an unescaped `term' or \n
184 report error for \n
185*/
186int
d2729d44 187literalchar (char **pp, int *pcode, char term)
a44c2277
RS
188{
189 register int c;
190 register char *p;
191 register int code;
192 int wasquote = 0;
193
194 c = safegetc(finput);
a083fbbf 195 if (c == '\n')
a44c2277 196 {
a083fbbf 197 warn(_("unescaped newline in constant"));
a44c2277
RS
198 ungetc(c, finput);
199 code = '?';
200 wasquote = 1;
201 }
202 else if (c != '\\')
203 {
204 code = c;
a083fbbf 205 if (c == term)
a44c2277
RS
206 wasquote = 1;
207 }
208 else
209 {
210 c = safegetc(finput);
211 if (c == 't') code = '\t';
212 else if (c == 'n') code = '\n';
213 else if (c == 'a') code = '\007';
214 else if (c == 'r') code = '\r';
215 else if (c == 'f') code = '\f';
216 else if (c == 'b') code = '\b';
d2729d44 217 else if (c == 'v') code = '\013';
a44c2277
RS
218 else if (c == '\\') code = '\\';
219 else if (c == '\'') code = '\'';
220 else if (c == '\"') code = '\"';
221 else if (c <= '7' && c >= '0')
222 {
223 code = 0;
224 while (c <= '7' && c >= '0')
225 {
226 code = (code * 8) + (c - '0');
227 if (code >= 256 || code < 0)
228 {
a083fbbf 229 warni(_("octal value outside range 0...255: `\\%o'"), code);
a44c2277
RS
230 code &= 0xFF;
231 break;
232 }
233 c = safegetc(finput);
234 }
235 ungetc(c, finput);
236 }
237 else if (c == 'x')
238 {
239 c = safegetc(finput);
240 code = 0;
241 while (1)
242 {
243 if (c >= '0' && c <= '9')
244 code *= 16, code += c - '0';
245 else if (c >= 'a' && c <= 'f')
246 code *= 16, code += c - 'a' + 10;
247 else if (c >= 'A' && c <= 'F')
248 code *= 16, code += c - 'A' + 10;
a083fbbf 249 else
a44c2277
RS
250 break;
251 if (code >= 256 || code<0)
252 {
a083fbbf 253 warni(_("hexadecimal value above 255: `\\x%x'"), code);
a44c2277
RS
254 code &= 0xFF;
255 break;
256 }
257 c = safegetc(finput);
258 }
259 ungetc(c, finput);
260 }
261 else
262 {
b0180c64 263 warns (_("unknown escape sequence: `\\' followed by `%s'"),
a44c2277
RS
264 printable_version(c));
265 code = '?';
266 }
267 } /* has \ */
268
269 /* now fill token_buffer with the canonical name for this character
270 as a literal token. Do not use what the user typed,
271 so that `\012' and `\n' can be interchangeable. */
272
273 p = *pp;
e5335b74
JT
274 if (code == term && wasquote)
275 *p++ = code;
276 else if (code == '\\') {*p++ = '\\'; *p++ = '\\';}
a44c2277
RS
277 else if (code == '\'') {*p++ = '\\'; *p++ = '\'';}
278 else if (code == '\"') {*p++ = '\\'; *p++ = '\"';}
5ce94c29
RS
279 else if (code >= 040 && code < 0177)
280 *p++ = code;
a44c2277
RS
281 else if (code == '\t') {*p++ = '\\'; *p++ = 't';}
282 else if (code == '\n') {*p++ = '\\'; *p++ = 'n';}
283 else if (code == '\r') {*p++ = '\\'; *p++ = 'r';}
284 else if (code == '\v') {*p++ = '\\'; *p++ = 'v';}
285 else if (code == '\b') {*p++ = '\\'; *p++ = 'b';}
286 else if (code == '\f') {*p++ = '\\'; *p++ = 'f';}
287 else
288 {
289 *p++ = '\\';
290 *p++ = code / 0100 + '0';
291 *p++ = ((code / 010) & 07) + '0';
292 *p++ = (code & 07) + '0';
293 }
294 *pp = p;
295 *pcode = code;
296 return ! wasquote;
297}
298
40675e7c
DM
299
300void
d2729d44 301unlex (int token)
40675e7c
DM
302{
303 unlexed = token;
304 unlexed_symval = symval;
305}
306
307
40675e7c 308int
d2729d44 309lex (void)
40675e7c
DM
310{
311 register int c;
a44c2277 312 char *p;
40675e7c
DM
313
314 if (unlexed >= 0)
315 {
316 symval = unlexed_symval;
317 c = unlexed;
318 unlexed = -1;
319 return (c);
320 }
321
322 c = skip_white_space();
a44c2277
RS
323 *token_buffer = c; /* for error messages (token buffer always valid) */
324 token_buffer[1] = 0;
40675e7c
DM
325
326 switch (c)
327 {
328 case EOF:
a44c2277 329 strcpy(token_buffer, "EOF");
40675e7c
DM
330 return (ENDFILE);
331
332 case 'A': case 'B': case 'C': case 'D': case 'E':
333 case 'F': case 'G': case 'H': case 'I': case 'J':
334 case 'K': case 'L': case 'M': case 'N': case 'O':
335 case 'P': case 'Q': case 'R': case 'S': case 'T':
336 case 'U': case 'V': case 'W': case 'X': case 'Y':
337 case 'Z':
338 case 'a': case 'b': case 'c': case 'd': case 'e':
339 case 'f': case 'g': case 'h': case 'i': case 'j':
340 case 'k': case 'l': case 'm': case 'n': case 'o':
341 case 'p': case 'q': case 'r': case 's': case 't':
342 case 'u': case 'v': case 'w': case 'x': case 'y':
343 case 'z':
344 case '.': case '_':
345 p = token_buffer;
346 while (isalnum(c) || c == '_' || c == '.')
347 {
348 if (p == token_buffer + maxtoken)
349 p = grow_token_buffer(p);
350
351 *p++ = c;
352 c = getc(finput);
353 }
354
355 *p = 0;
356 ungetc(c, finput);
357 symval = getsym(token_buffer);
358 return (IDENTIFIER);
359
360 case '0': case '1': case '2': case '3': case '4':
361 case '5': case '6': case '7': case '8': case '9':
362 {
363 numval = 0;
364
a44c2277 365 p = token_buffer;
40675e7c
DM
366 while (isdigit(c))
367 {
a44c2277
RS
368 if (p == token_buffer + maxtoken)
369 p = grow_token_buffer(p);
370
371 *p++ = c;
40675e7c
DM
372 numval = numval*10 + c - '0';
373 c = getc(finput);
374 }
a44c2277 375 *p = 0;
40675e7c
DM
376 ungetc(c, finput);
377 return (NUMBER);
378 }
379
380 case '\'':
40675e7c
DM
381
382 /* parse the literal token and compute character code in code */
383
a44c2277 384 translations = -1;
40675e7c 385 {
a44c2277
RS
386 int code, discode;
387 char discard[10], *dp;
5ce94c29 388
a44c2277
RS
389 p = token_buffer;
390 *p++ = '\'';
391 literalchar(&p, &code, '\'');
40675e7c 392
a44c2277
RS
393 c = getc(finput);
394 if (c != '\'')
40675e7c 395 {
a083fbbf 396 warn(_("use \"...\" for multi-character literal tokens"));
5ce94c29
RS
397 while (1)
398 {
399 dp = discard;
400 if (! literalchar(&dp, &discode, '\''))
401 break;
402 }
40675e7c 403 }
a44c2277
RS
404 *p++ = '\'';
405 *p = 0;
406 symval = getsym(token_buffer);
407 symval->class = STOKEN;
408 if (! symval->user_token_number)
409 symval->user_token_number = code;
410 return (IDENTIFIER);
411 }
40675e7c 412
a44c2277 413 case '\"':
40675e7c 414
a44c2277
RS
415 /* parse the literal string token and treat as an identifier */
416
417 translations = -1;
418 {
419 int code; /* ignored here */
40675e7c 420 p = token_buffer;
a44c2277
RS
421 *p++ = '\"';
422 while (literalchar(&p, &code, '\"')) /* read up to and including " */
40675e7c 423 {
a44c2277
RS
424 if (p >= token_buffer + maxtoken - 4)
425 p = grow_token_buffer(p);
40675e7c 426 }
40675e7c 427 *p = 0;
a44c2277 428
40675e7c
DM
429 symval = getsym(token_buffer);
430 symval->class = STOKEN;
a44c2277 431
40675e7c
DM
432 return (IDENTIFIER);
433 }
434
435 case ',':
436 return (COMMA);
437
438 case ':':
439 return (COLON);
440
441 case ';':
442 return (SEMICOLON);
443
444 case '|':
445 return (BAR);
446
447 case '{':
448 return (LEFT_CURLY);
449
450 case '=':
451 do
452 {
453 c = getc(finput);
454 if (c == '\n') lineno++;
455 }
456 while(c==' ' || c=='\n' || c=='\t');
457
458 if (c == '{')
a44c2277
RS
459 {
460 strcpy(token_buffer, "={");
461 return(LEFT_CURLY);
462 }
40675e7c
DM
463 else
464 {
465 ungetc(c, finput);
466 return(ILLEGAL);
467 }
468
469 case '<':
470 p = token_buffer;
471 c = getc(finput);
472 while (c != '>')
473 {
a44c2277 474 if (c == EOF)
a083fbbf
RS
475 fatal(_("unterminated type name at end of file"));
476 if (c == '\n')
a44c2277 477 {
a083fbbf 478 warn(_("unterminated type name"));
a44c2277
RS
479 ungetc(c, finput);
480 break;
481 }
40675e7c
DM
482
483 if (p == token_buffer + maxtoken)
484 p = grow_token_buffer(p);
485
486 *p++ = c;
487 c = getc(finput);
488 }
489 *p = 0;
490 return (TYPENAME);
a083fbbf 491
40675e7c
DM
492
493 case '%':
494 return (parse_percent_token());
495
496 default:
497 return (ILLEGAL);
498 }
499}
500
a083fbbf 501/* the following table dictates the action taken for the various
a44c2277
RS
502 % directives. A setflag value causes the named flag to be
503 set. A retval action returns the code.
504*/
505struct percent_table_struct {
506 char *name;
a083fbbf 507 void *setflag;
a44c2277
RS
508 int retval;
509} percent_table[] =
510{
511 {"token", NULL, TOKEN},
512 {"term", NULL, TOKEN},
513 {"nterm", NULL, NTERM},
514 {"type", NULL, TYPE},
515 {"guard", NULL, GUARD},
516 {"union", NULL, UNION},
517 {"expect", NULL, EXPECT},
518 {"thong", NULL, THONG},
519 {"start", NULL, START},
520 {"left", NULL, LEFT},
521 {"right", NULL, RIGHT},
522 {"nonassoc", NULL, NONASSOC},
523 {"binary", NULL, NONASSOC},
524 {"semantic_parser", NULL, SEMANTIC_PARSER},
525 {"pure_parser", NULL, PURE_PARSER},
526 {"prec", NULL, PREC},
527
528 {"no_lines", &nolinesflag, NOOP}, /* -l */
529 {"raw", &rawtoknumflag, NOOP}, /* -r */
530 {"token_table", &toknumflag, NOOP}, /* -k */
531
532#if 0
533 /* These can be utilized after main is reoganized so
534 open_files() is deferred 'til after read_declarations().
535 But %{ and %union both put information into files
536 that have to be opened before read_declarations().
537 */
538 {"yacc", &fixed_outfiles, NOOP}, /* -y */
539 {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
540 {"defines", &definesflag, NOOP}, /* -d */
541 {"no_parser", &noparserflag, NOOP}, /* -n */
542 {"output_file", &spec_outfile, SETOPT}, /* -o */
543 {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
544 {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
545
546 /* These would be acceptable, but they do not affect processing */
547 {"verbose", &verboseflag, NOOP}, /* -v */
548 {"debug", &debugflag, NOOP}, /* -t */
d2729d44
JT
549 /* {"help", <print usage stmt>, NOOP},*/ /* -h */
550 /* {"version", <print version number> , NOOP},*/ /* -V */
a44c2277
RS
551#endif
552
553 {NULL, NULL, ILLEGAL}
554};
555
556/* Parse a token which starts with %.
557 Assumes the % has already been read and discarded. */
40675e7c
DM
558
559int
d2729d44 560parse_percent_token (void)
40675e7c
DM
561{
562 register int c;
563 register char *p;
a44c2277 564 register struct percent_table_struct *tx;
40675e7c
DM
565
566 p = token_buffer;
567 c = getc(finput);
a44c2277
RS
568 *p++ = '%';
569 *p++ = c; /* for error msg */
570 *p = 0;
40675e7c
DM
571
572 switch (c)
573 {
574 case '%':
575 return (TWO_PERCENTS);
576
577 case '{':
578 return (PERCENT_LEFT_CURLY);
579
580 case '<':
581 return (LEFT);
582
583 case '>':
584 return (RIGHT);
585
586 case '2':
587 return (NONASSOC);
588
589 case '0':
590 return (TOKEN);
591
592 case '=':
593 return (PREC);
594 }
a083fbbf 595 if (!isalpha(c))
40675e7c
DM
596 return (ILLEGAL);
597
a44c2277
RS
598 p = token_buffer;
599 *p++ = '%';
600 while (isalpha(c) || c == '_' || c == '-')
40675e7c
DM
601 {
602 if (p == token_buffer + maxtoken)
603 p = grow_token_buffer(p);
604
a44c2277 605 if (c == '-') c = '_';
40675e7c
DM
606 *p++ = c;
607 c = getc(finput);
608 }
609
610 ungetc(c, finput);
611
612 *p = 0;
613
a44c2277
RS
614 /* table lookup % directive */
615 for (tx = percent_table; tx->name; tx++)
616 if (strcmp(token_buffer+1, tx->name) == 0)
617 break;
618 if (tx->retval == SETOPT)
619 {
620 *((char **)(tx->setflag)) = optarg;
621 return NOOP;
622 }
623 if (tx->setflag)
624 {
625 *((int *)(tx->setflag)) = 1;
626 return NOOP;
627 }
628 return tx->retval;
40675e7c 629}