]> git.saurik.com Git - bison.git/blob - src/lex.c
* src/lalr.h: New file.
[bison.git] / src / lex.c
1 /* Token-reader for Bison's input parser,
2 Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
3
4 This file is part of Bison, the GNU Compiler Compiler.
5
6 Bison is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 Bison is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Bison; see the file COPYING. If not, write to
18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
20
21
22 /*
23 lex is the entry point. It is called from reader.c.
24 It returns one of the token-type codes defined in lex.h.
25 When an identifier is seen, the code IDENTIFIER is returned
26 and the name is looked up in the symbol table using symtab.c;
27 symval is set to a pointer to the entry found. */
28
29 #include "system.h"
30 #include "getargs.h"
31 #include "files.h"
32 #include "getopt.h" /* for optarg */
33 #include "symtab.h"
34 #include "lex.h"
35 #include "alloc.h"
36 #include "complain.h"
37
38 /*spec_outfile is declared in files.h, for -o */
39
/* Defined outside this file.  lex stores -1 here each time it reads a
   quoted token ('...' or "...").  */
40 extern int translations;
41
/* Functions defined in this file that have external linkage.  */
42 extern void init_lex PARAMS((void));
43 extern char *grow_token_buffer PARAMS((char *));
44 extern int skip_white_space PARAMS((void));
45 extern void unlex PARAMS((int));
46 extern int lex PARAMS((void));
47 extern int parse_percent_token PARAMS((void));
48
/* Helpers private to this file.  */
49 static int safegetc PARAMS((FILE *));
50 static int literalchar PARAMS((char **, int *, char));
51
52 /* functions from main.c */
53 extern char *printable_version PARAMS((int));
54
55 /* Buffer for storing the current token.  Allocated by init_lex and
   possibly moved by grow_token_buffer, so pointers into it must be
   refreshed after a grow.  */
56 char *token_buffer;
57
58 /* Allocated size of token_buffer, not including space for terminator.
   init_lex sets it to 100; grow_token_buffer doubles it.  */
59 int maxtoken;
60
/* Symbol-table entry (from getsym) for the identifier or quoted token
   most recently read by lex.  */
61 bucket *symval;
/* Decimal value accumulated by lex for the most recent NUMBER token.  */
62 int numval;
63
/* Pushed-back token state: unlex records the token code and the current
   symval here; lex hands them back on its next call.  unlexed is -1
   when nothing is pending.  */
64 static int unlexed; /* these two describe a token to be reread */
65 static bucket *unlexed_symval; /* by the next call to lex */
66
67
68 void
69 init_lex (void)
70 {
71 maxtoken = 100;
72 token_buffer = NEW2 (maxtoken + 1, char);
73 unlexed = -1;
74 }
75
76
77 char *
78 grow_token_buffer (char *p)
79 {
80 int offset = p - token_buffer;
81 maxtoken *= 2;
82 token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
83 return token_buffer + offset;
84 }
85
86
87 int
88 skip_white_space (void)
89 {
90 register int c;
91 register int inside;
92
93 c = getc(finput);
94
95 for (;;)
96 {
97 int cplus_comment;
98
99 switch (c)
100 {
101 case '/':
102 c = getc(finput);
103 if (c != '*' && c != '/')
104 {
105 complain (_("unexpected `/' found and ignored"));
106 break;
107 }
108 cplus_comment = (c == '/');
109
110 c = getc(finput);
111
112 inside = 1;
113 while (inside)
114 {
115 if (!cplus_comment && c == '*')
116 {
117 while (c == '*')
118 c = getc(finput);
119
120 if (c == '/')
121 {
122 inside = 0;
123 c = getc(finput);
124 }
125 }
126 else if (c == '\n')
127 {
128 lineno++;
129 if (cplus_comment)
130 inside = 0;
131 c = getc(finput);
132 }
133 else if (c == EOF)
134 fatal (_("unterminated comment"));
135 else
136 c = getc(finput);
137 }
138
139 break;
140
141 case '\n':
142 lineno++;
143
144 case ' ':
145 case '\t':
146 case '\f':
147 c = getc(finput);
148 break;
149
150 default:
151 return c;
152 }
153 }
154 }
155
/* Read one character from F and return it.  Reaching end of file is
   reported through fatal with the message "unexpected end of file".  */
static int
safegetc (FILE *f)
{
  int ch = getc (f);

  if (ch != EOF)
    return ch;

  fatal (_("unexpected end of file"));
  return ch;
}
165
166 /* read one literal character from finput. process \ escapes.
167 append the normalized string version of the char to *pp.
168 assign the character code to *pcode
169 return 1 unless the character is an unescaped `term' or \n
170 report error for \n
171 */
172 static int
173 literalchar (char **pp, int *pcode, char term)
174 {
175 register int c;
176 register char *p;
177 register int code;
178 int wasquote = 0;
179
180 c = safegetc(finput);
181 if (c == '\n')
182 {
183 complain (_("unescaped newline in constant"));
184 ungetc(c, finput);
185 code = '?';
186 wasquote = 1;
187 }
188 else if (c != '\\')
189 {
190 code = c;
191 if (c == term)
192 wasquote = 1;
193 }
194 else
195 {
196 c = safegetc(finput);
197 if (c == 't') code = '\t';
198 else if (c == 'n') code = '\n';
199 else if (c == 'a') code = '\007';
200 else if (c == 'r') code = '\r';
201 else if (c == 'f') code = '\f';
202 else if (c == 'b') code = '\b';
203 else if (c == 'v') code = '\013';
204 else if (c == '\\') code = '\\';
205 else if (c == '\'') code = '\'';
206 else if (c == '\"') code = '\"';
207 else if (c <= '7' && c >= '0')
208 {
209 code = 0;
210 while (c <= '7' && c >= '0')
211 {
212 code = (code * 8) + (c - '0');
213 if (code >= 256 || code < 0)
214 {
215 complain (_("octal value outside range 0...255: `\\%o'"),
216 code);
217 code &= 0xFF;
218 break;
219 }
220 c = safegetc(finput);
221 }
222 ungetc(c, finput);
223 }
224 else if (c == 'x')
225 {
226 c = safegetc(finput);
227 code = 0;
228 while (1)
229 {
230 if (c >= '0' && c <= '9')
231 code *= 16, code += c - '0';
232 else if (c >= 'a' && c <= 'f')
233 code *= 16, code += c - 'a' + 10;
234 else if (c >= 'A' && c <= 'F')
235 code *= 16, code += c - 'A' + 10;
236 else
237 break;
238 if (code >= 256 || code<0)
239 {
240 complain (_("hexadecimal value above 255: `\\x%x'"),
241 code);
242 code &= 0xFF;
243 break;
244 }
245 c = safegetc(finput);
246 }
247 ungetc(c, finput);
248 }
249 else
250 {
251 complain (_("unknown escape sequence: `\\' followed by `%s'"),
252 printable_version(c));
253 code = '?';
254 }
255 } /* has \ */
256
257 /* now fill token_buffer with the canonical name for this character
258 as a literal token. Do not use what the user typed,
259 so that `\012' and `\n' can be interchangeable. */
260
261 p = *pp;
262 if (code == term && wasquote)
263 *p++ = code;
264 else if (code == '\\') {*p++ = '\\'; *p++ = '\\';}
265 else if (code == '\'') {*p++ = '\\'; *p++ = '\'';}
266 else if (code == '\"') {*p++ = '\\'; *p++ = '\"';}
267 else if (code >= 040 && code < 0177)
268 *p++ = code;
269 else if (code == '\t') {*p++ = '\\'; *p++ = 't';}
270 else if (code == '\n') {*p++ = '\\'; *p++ = 'n';}
271 else if (code == '\r') {*p++ = '\\'; *p++ = 'r';}
272 else if (code == '\v') {*p++ = '\\'; *p++ = 'v';}
273 else if (code == '\b') {*p++ = '\\'; *p++ = 'b';}
274 else if (code == '\f') {*p++ = '\\'; *p++ = 'f';}
275 else
276 {
277 *p++ = '\\';
278 *p++ = code / 0100 + '0';
279 *p++ = ((code / 010) & 07) + '0';
280 *p++ = (code & 07) + '0';
281 }
282 *pp = p;
283 *pcode = code;
284 return ! wasquote;
285 }
286
287
288 void
289 unlex (int token)
290 {
291 unlexed = token;
292 unlexed_symval = symval;
293 }
294
295
296 int
297 lex (void)
298 {
299 register int c;
300 char *p;
301
302 if (unlexed >= 0)
303 {
304 symval = unlexed_symval;
305 c = unlexed;
306 unlexed = -1;
307 return c;
308 }
309
310 c = skip_white_space();
311 *token_buffer = c; /* for error messages (token buffer always valid) */
312 token_buffer[1] = 0;
313
314 switch (c)
315 {
316 case EOF:
317 strcpy(token_buffer, "EOF");
318 return ENDFILE;
319
320 case 'A': case 'B': case 'C': case 'D': case 'E':
321 case 'F': case 'G': case 'H': case 'I': case 'J':
322 case 'K': case 'L': case 'M': case 'N': case 'O':
323 case 'P': case 'Q': case 'R': case 'S': case 'T':
324 case 'U': case 'V': case 'W': case 'X': case 'Y':
325 case 'Z':
326 case 'a': case 'b': case 'c': case 'd': case 'e':
327 case 'f': case 'g': case 'h': case 'i': case 'j':
328 case 'k': case 'l': case 'm': case 'n': case 'o':
329 case 'p': case 'q': case 'r': case 's': case 't':
330 case 'u': case 'v': case 'w': case 'x': case 'y':
331 case 'z':
332 case '.': case '_':
333 p = token_buffer;
334 while (isalnum(c) || c == '_' || c == '.')
335 {
336 if (p == token_buffer + maxtoken)
337 p = grow_token_buffer(p);
338
339 *p++ = c;
340 c = getc(finput);
341 }
342
343 *p = 0;
344 ungetc(c, finput);
345 symval = getsym(token_buffer);
346 return IDENTIFIER;
347
348 case '0': case '1': case '2': case '3': case '4':
349 case '5': case '6': case '7': case '8': case '9':
350 {
351 numval = 0;
352
353 p = token_buffer;
354 while (isdigit(c))
355 {
356 if (p == token_buffer + maxtoken)
357 p = grow_token_buffer(p);
358
359 *p++ = c;
360 numval = numval*10 + c - '0';
361 c = getc(finput);
362 }
363 *p = 0;
364 ungetc(c, finput);
365 return NUMBER;
366 }
367
368 case '\'':
369
370 /* parse the literal token and compute character code in code */
371
372 translations = -1;
373 {
374 int code, discode;
375 char discard[10], *dp;
376
377 p = token_buffer;
378 *p++ = '\'';
379 literalchar(&p, &code, '\'');
380
381 c = getc(finput);
382 if (c != '\'')
383 {
384 complain (_("use \"...\" for multi-character literal tokens"));
385 while (1)
386 {
387 dp = discard;
388 if (! literalchar(&dp, &discode, '\''))
389 break;
390 }
391 }
392 *p++ = '\'';
393 *p = 0;
394 symval = getsym(token_buffer);
395 symval->class = STOKEN;
396 if (! symval->user_token_number)
397 symval->user_token_number = code;
398 return IDENTIFIER;
399 }
400
401 case '\"':
402
403 /* parse the literal string token and treat as an identifier */
404
405 translations = -1;
406 {
407 int code; /* ignored here */
408 p = token_buffer;
409 *p++ = '\"';
410 while (literalchar(&p, &code, '\"')) /* read up to and including " */
411 {
412 if (p >= token_buffer + maxtoken - 4)
413 p = grow_token_buffer(p);
414 }
415 *p = 0;
416
417 symval = getsym(token_buffer);
418 symval->class = STOKEN;
419
420 return IDENTIFIER;
421 }
422
423 case ',':
424 return COMMA;
425
426 case ':':
427 return COLON;
428
429 case ';':
430 return SEMICOLON;
431
432 case '|':
433 return BAR;
434
435 case '{':
436 return LEFT_CURLY;
437
438 case '=':
439 do
440 {
441 c = getc(finput);
442 if (c == '\n') lineno++;
443 }
444 while(c==' ' || c=='\n' || c=='\t');
445
446 if (c == '{')
447 {
448 strcpy(token_buffer, "={");
449 return LEFT_CURLY;
450 }
451 else
452 {
453 ungetc(c, finput);
454 return ILLEGAL;
455 }
456
457 case '<':
458 p = token_buffer;
459 c = getc(finput);
460 while (c != '>')
461 {
462 if (c == EOF)
463 fatal (_("unterminated type name at end of file"));
464 if (c == '\n')
465 {
466 complain (_("unterminated type name"));
467 ungetc(c, finput);
468 break;
469 }
470
471 if (p == token_buffer + maxtoken)
472 p = grow_token_buffer(p);
473
474 *p++ = c;
475 c = getc(finput);
476 }
477 *p = 0;
478 return TYPENAME;
479
480
481 case '%':
482 return parse_percent_token();
483
484 default:
485 return ILLEGAL;
486 }
487 }
488
489 /* the following table dictates the action taken for the various
490 % directives. A setflag value causes the named flag to be
491 set. A retval action returns the code.
492 */
493 struct percent_table_struct {
494 const char *name;
495 void *setflag;
496 int retval;
497 } percent_table[] =
498 {
499 {"token", NULL, TOKEN},
500 {"term", NULL, TOKEN},
501 {"nterm", NULL, NTERM},
502 {"type", NULL, TYPE},
503 {"guard", NULL, GUARD},
504 {"union", NULL, UNION},
505 {"expect", NULL, EXPECT},
506 {"thong", NULL, THONG},
507 {"start", NULL, START},
508 {"left", NULL, LEFT},
509 {"right", NULL, RIGHT},
510 {"nonassoc", NULL, NONASSOC},
511 {"binary", NULL, NONASSOC},
512 {"semantic_parser", NULL, SEMANTIC_PARSER},
513 {"pure_parser", NULL, PURE_PARSER},
514 {"prec", NULL, PREC},
515
516 {"no_lines", &nolinesflag, NOOP}, /* -l */
517 {"raw", &rawtoknumflag, NOOP}, /* -r */
518 {"token_table", &toknumflag, NOOP}, /* -k */
519
520 #if 0
521 /* These can be utilized after main is reoganized so
522 open_files() is deferred 'til after read_declarations().
523 But %{ and %union both put information into files
524 that have to be opened before read_declarations().
525 */
526 {"yacc", &fixed_outfiles, NOOP}, /* -y */
527 {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
528 {"defines", &definesflag, NOOP}, /* -d */
529 {"no_parser", &noparserflag, NOOP}, /* -n */
530 {"output_file", &spec_outfile, SETOPT}, /* -o */
531 {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
532 {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
533
534 /* These would be acceptable, but they do not affect processing */
535 {"verbose", &verboseflag, NOOP}, /* -v */
536 {"debug", &debugflag, NOOP}, /* -t */
537 /* {"help", <print usage stmt>, NOOP},*/ /* -h */
538 /* {"version", <print version number> , NOOP},*/ /* -V */
539 #endif
540
541 {NULL, NULL, ILLEGAL}
542 };
543
544 /* Parse a token which starts with %.
545 Assumes the % has already been read and discarded. */
546
547 int
548 parse_percent_token (void)
549 {
550 register int c;
551 register char *p;
552 register struct percent_table_struct *tx;
553
554 p = token_buffer;
555 c = getc(finput);
556 *p++ = '%';
557 *p++ = c; /* for error msg */
558 *p = 0;
559
560 switch (c)
561 {
562 case '%':
563 return TWO_PERCENTS;
564
565 case '{':
566 return PERCENT_LEFT_CURLY;
567
568 case '<':
569 return LEFT;
570
571 case '>':
572 return RIGHT;
573
574 case '2':
575 return NONASSOC;
576
577 case '0':
578 return TOKEN;
579
580 case '=':
581 return PREC;
582 }
583 if (!isalpha(c))
584 return ILLEGAL;
585
586 p = token_buffer;
587 *p++ = '%';
588 while (isalpha(c) || c == '_' || c == '-')
589 {
590 if (p == token_buffer + maxtoken)
591 p = grow_token_buffer(p);
592
593 if (c == '-') c = '_';
594 *p++ = c;
595 c = getc(finput);
596 }
597
598 ungetc(c, finput);
599
600 *p = 0;
601
602 /* table lookup % directive */
603 for (tx = percent_table; tx->name; tx++)
604 if (strcmp(token_buffer+1, tx->name) == 0)
605 break;
606 if (tx->retval == SETOPT)
607 {
608 *((char **)(tx->setflag)) = optarg;
609 return NOOP;
610 }
611 if (tx->setflag)
612 {
613 *((int *)(tx->setflag)) = 1;
614 return NOOP;
615 }
616 return tx->retval;
617 }