]> git.saurik.com Git - bison.git/blob - src/lex.c
* src/reader.c (copy_comment2): New function, same as former
[bison.git] / src / lex.c
1 /* Token-reader for Bison's input parser,
2 Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
3
4 This file is part of Bison, the GNU Compiler Compiler.
5
6 Bison is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 Bison is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Bison; see the file COPYING. If not, write to
18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
20
21
22 /*
23 lex is the entry point. It is called from reader.c.
24 It returns one of the token-type codes defined in lex.h.
25 When an identifier is seen, the code IDENTIFIER is returned
26 and the name is looked up in the symbol table using symtab.c;
27 symval is set to a pointer to the entry found. */
28
#include <limits.h>
#include <stdio.h>
#include "system.h"
#include "files.h"
#include "getopt.h"		/* for optarg */
#include "symtab.h"
#include "lex.h"
#include "alloc.h"
#include "complain.h"
37
38 /* flags set by % directives */
39 extern int definesflag; /* for -d */
40 extern int toknumflag; /* for -k */
41 extern int noparserflag; /* for -n */
42 extern int fixed_outfiles; /* for -y */
43 extern int nolinesflag; /* for -l */
44 extern int rawtoknumflag; /* for -r */
45 extern int verboseflag; /* for -v */
46 extern int debugflag; /* for -t */
47 extern char *spec_name_prefix; /* for -p */
48 extern char *spec_file_prefix; /* for -b */
49 /*spec_outfile is declared in files.h, for -o */
50
51 extern int translations;
52
53 void init_lex PARAMS((void));
54 char *grow_token_buffer PARAMS((char *));
55 int skip_white_space PARAMS((void));
56 int safegetc PARAMS((FILE *));
57 int literalchar PARAMS((char **, int *, char));
58 void unlex PARAMS((int));
59 int lex PARAMS((void));
60 int parse_percent_token PARAMS((void));
61
62 /* functions from main.c */
63 extern char *printable_version PARAMS((int));
64
65 /* Buffer for storing the current token. */
66 char *token_buffer;
67
68 /* Allocated size of token_buffer, not including space for terminator. */
69 int maxtoken;
70
71 bucket *symval;
72 int numval;
73
74 static int unlexed; /* these two describe a token to be reread */
75 static bucket *unlexed_symval; /* by the next call to lex */
76
77
78 void
79 init_lex (void)
80 {
81 maxtoken = 100;
82 token_buffer = NEW2 (maxtoken + 1, char);
83 unlexed = -1;
84 }
85
86
87 char *
88 grow_token_buffer (char *p)
89 {
90 int offset = p - token_buffer;
91 maxtoken *= 2;
92 token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
93 return token_buffer + offset;
94 }
95
96
97 int
98 skip_white_space (void)
99 {
100 register int c;
101 register int inside;
102
103 c = getc(finput);
104
105 for (;;)
106 {
107 int cplus_comment;
108
109 switch (c)
110 {
111 case '/':
112 c = getc(finput);
113 if (c != '*' && c != '/')
114 {
115 complain (_("unexpected `/' found and ignored"));
116 break;
117 }
118 cplus_comment = (c == '/');
119
120 c = getc(finput);
121
122 inside = 1;
123 while (inside)
124 {
125 if (!cplus_comment && c == '*')
126 {
127 while (c == '*')
128 c = getc(finput);
129
130 if (c == '/')
131 {
132 inside = 0;
133 c = getc(finput);
134 }
135 }
136 else if (c == '\n')
137 {
138 lineno++;
139 if (cplus_comment)
140 inside = 0;
141 c = getc(finput);
142 }
143 else if (c == EOF)
144 fatal (_("unterminated comment"));
145 else
146 c = getc(finput);
147 }
148
149 break;
150
151 case '\n':
152 lineno++;
153
154 case ' ':
155 case '\t':
156 case '\f':
157 c = getc(finput);
158 break;
159
160 default:
161 return c;
162 }
163 }
164 }
165
/* Read one character from F and return it.  Hitting end of file is
   reported through fatal with an "unexpected end of file" message.  */
int
safegetc (FILE *f)
{
  int ch = getc (f);

  if (ch != EOF)
    return ch;

  fatal (_("unexpected end of file"));
  return ch;
}
175
176 /* read one literal character from finput. process \ escapes.
177 append the normalized string version of the char to *pp.
178 assign the character code to *pcode
179 return 1 unless the character is an unescaped `term' or \n
180 report error for \n
181 */
182 int
183 literalchar (char **pp, int *pcode, char term)
184 {
185 register int c;
186 register char *p;
187 register int code;
188 int wasquote = 0;
189
190 c = safegetc(finput);
191 if (c == '\n')
192 {
193 complain (_("unescaped newline in constant"));
194 ungetc(c, finput);
195 code = '?';
196 wasquote = 1;
197 }
198 else if (c != '\\')
199 {
200 code = c;
201 if (c == term)
202 wasquote = 1;
203 }
204 else
205 {
206 c = safegetc(finput);
207 if (c == 't') code = '\t';
208 else if (c == 'n') code = '\n';
209 else if (c == 'a') code = '\007';
210 else if (c == 'r') code = '\r';
211 else if (c == 'f') code = '\f';
212 else if (c == 'b') code = '\b';
213 else if (c == 'v') code = '\013';
214 else if (c == '\\') code = '\\';
215 else if (c == '\'') code = '\'';
216 else if (c == '\"') code = '\"';
217 else if (c <= '7' && c >= '0')
218 {
219 code = 0;
220 while (c <= '7' && c >= '0')
221 {
222 code = (code * 8) + (c - '0');
223 if (code >= 256 || code < 0)
224 {
225 complain (_("octal value outside range 0...255: `\\%o'"),
226 code);
227 code &= 0xFF;
228 break;
229 }
230 c = safegetc(finput);
231 }
232 ungetc(c, finput);
233 }
234 else if (c == 'x')
235 {
236 c = safegetc(finput);
237 code = 0;
238 while (1)
239 {
240 if (c >= '0' && c <= '9')
241 code *= 16, code += c - '0';
242 else if (c >= 'a' && c <= 'f')
243 code *= 16, code += c - 'a' + 10;
244 else if (c >= 'A' && c <= 'F')
245 code *= 16, code += c - 'A' + 10;
246 else
247 break;
248 if (code >= 256 || code<0)
249 {
250 complain (_("hexadecimal value above 255: `\\x%x'"),
251 code);
252 code &= 0xFF;
253 break;
254 }
255 c = safegetc(finput);
256 }
257 ungetc(c, finput);
258 }
259 else
260 {
261 complain (_("unknown escape sequence: `\\' followed by `%s'"),
262 printable_version(c));
263 code = '?';
264 }
265 } /* has \ */
266
267 /* now fill token_buffer with the canonical name for this character
268 as a literal token. Do not use what the user typed,
269 so that `\012' and `\n' can be interchangeable. */
270
271 p = *pp;
272 if (code == term && wasquote)
273 *p++ = code;
274 else if (code == '\\') {*p++ = '\\'; *p++ = '\\';}
275 else if (code == '\'') {*p++ = '\\'; *p++ = '\'';}
276 else if (code == '\"') {*p++ = '\\'; *p++ = '\"';}
277 else if (code >= 040 && code < 0177)
278 *p++ = code;
279 else if (code == '\t') {*p++ = '\\'; *p++ = 't';}
280 else if (code == '\n') {*p++ = '\\'; *p++ = 'n';}
281 else if (code == '\r') {*p++ = '\\'; *p++ = 'r';}
282 else if (code == '\v') {*p++ = '\\'; *p++ = 'v';}
283 else if (code == '\b') {*p++ = '\\'; *p++ = 'b';}
284 else if (code == '\f') {*p++ = '\\'; *p++ = 'f';}
285 else
286 {
287 *p++ = '\\';
288 *p++ = code / 0100 + '0';
289 *p++ = ((code / 010) & 07) + '0';
290 *p++ = (code & 07) + '0';
291 }
292 *pp = p;
293 *pcode = code;
294 return ! wasquote;
295 }
296
297
298 void
299 unlex (int token)
300 {
301 unlexed = token;
302 unlexed_symval = symval;
303 }
304
305
306 int
307 lex (void)
308 {
309 register int c;
310 char *p;
311
312 if (unlexed >= 0)
313 {
314 symval = unlexed_symval;
315 c = unlexed;
316 unlexed = -1;
317 return c;
318 }
319
320 c = skip_white_space();
321 *token_buffer = c; /* for error messages (token buffer always valid) */
322 token_buffer[1] = 0;
323
324 switch (c)
325 {
326 case EOF:
327 strcpy(token_buffer, "EOF");
328 return ENDFILE;
329
330 case 'A': case 'B': case 'C': case 'D': case 'E':
331 case 'F': case 'G': case 'H': case 'I': case 'J':
332 case 'K': case 'L': case 'M': case 'N': case 'O':
333 case 'P': case 'Q': case 'R': case 'S': case 'T':
334 case 'U': case 'V': case 'W': case 'X': case 'Y':
335 case 'Z':
336 case 'a': case 'b': case 'c': case 'd': case 'e':
337 case 'f': case 'g': case 'h': case 'i': case 'j':
338 case 'k': case 'l': case 'm': case 'n': case 'o':
339 case 'p': case 'q': case 'r': case 's': case 't':
340 case 'u': case 'v': case 'w': case 'x': case 'y':
341 case 'z':
342 case '.': case '_':
343 p = token_buffer;
344 while (isalnum(c) || c == '_' || c == '.')
345 {
346 if (p == token_buffer + maxtoken)
347 p = grow_token_buffer(p);
348
349 *p++ = c;
350 c = getc(finput);
351 }
352
353 *p = 0;
354 ungetc(c, finput);
355 symval = getsym(token_buffer);
356 return IDENTIFIER;
357
358 case '0': case '1': case '2': case '3': case '4':
359 case '5': case '6': case '7': case '8': case '9':
360 {
361 numval = 0;
362
363 p = token_buffer;
364 while (isdigit(c))
365 {
366 if (p == token_buffer + maxtoken)
367 p = grow_token_buffer(p);
368
369 *p++ = c;
370 numval = numval*10 + c - '0';
371 c = getc(finput);
372 }
373 *p = 0;
374 ungetc(c, finput);
375 return NUMBER;
376 }
377
378 case '\'':
379
380 /* parse the literal token and compute character code in code */
381
382 translations = -1;
383 {
384 int code, discode;
385 char discard[10], *dp;
386
387 p = token_buffer;
388 *p++ = '\'';
389 literalchar(&p, &code, '\'');
390
391 c = getc(finput);
392 if (c != '\'')
393 {
394 complain (_("use \"...\" for multi-character literal tokens"));
395 while (1)
396 {
397 dp = discard;
398 if (! literalchar(&dp, &discode, '\''))
399 break;
400 }
401 }
402 *p++ = '\'';
403 *p = 0;
404 symval = getsym(token_buffer);
405 symval->class = STOKEN;
406 if (! symval->user_token_number)
407 symval->user_token_number = code;
408 return IDENTIFIER;
409 }
410
411 case '\"':
412
413 /* parse the literal string token and treat as an identifier */
414
415 translations = -1;
416 {
417 int code; /* ignored here */
418 p = token_buffer;
419 *p++ = '\"';
420 while (literalchar(&p, &code, '\"')) /* read up to and including " */
421 {
422 if (p >= token_buffer + maxtoken - 4)
423 p = grow_token_buffer(p);
424 }
425 *p = 0;
426
427 symval = getsym(token_buffer);
428 symval->class = STOKEN;
429
430 return IDENTIFIER;
431 }
432
433 case ',':
434 return COMMA;
435
436 case ':':
437 return COLON;
438
439 case ';':
440 return SEMICOLON;
441
442 case '|':
443 return BAR;
444
445 case '{':
446 return LEFT_CURLY;
447
448 case '=':
449 do
450 {
451 c = getc(finput);
452 if (c == '\n') lineno++;
453 }
454 while(c==' ' || c=='\n' || c=='\t');
455
456 if (c == '{')
457 {
458 strcpy(token_buffer, "={");
459 return LEFT_CURLY;
460 }
461 else
462 {
463 ungetc(c, finput);
464 return ILLEGAL;
465 }
466
467 case '<':
468 p = token_buffer;
469 c = getc(finput);
470 while (c != '>')
471 {
472 if (c == EOF)
473 fatal (_("unterminated type name at end of file"));
474 if (c == '\n')
475 {
476 complain (_("unterminated type name"));
477 ungetc(c, finput);
478 break;
479 }
480
481 if (p == token_buffer + maxtoken)
482 p = grow_token_buffer(p);
483
484 *p++ = c;
485 c = getc(finput);
486 }
487 *p = 0;
488 return TYPENAME;
489
490
491 case '%':
492 return parse_percent_token();
493
494 default:
495 return ILLEGAL;
496 }
497 }
498
499 /* the following table dictates the action taken for the various
500 % directives. A setflag value causes the named flag to be
501 set. A retval action returns the code.
502 */
503 struct percent_table_struct {
504 char *name;
505 void *setflag;
506 int retval;
507 } percent_table[] =
508 {
509 {"token", NULL, TOKEN},
510 {"term", NULL, TOKEN},
511 {"nterm", NULL, NTERM},
512 {"type", NULL, TYPE},
513 {"guard", NULL, GUARD},
514 {"union", NULL, UNION},
515 {"expect", NULL, EXPECT},
516 {"thong", NULL, THONG},
517 {"start", NULL, START},
518 {"left", NULL, LEFT},
519 {"right", NULL, RIGHT},
520 {"nonassoc", NULL, NONASSOC},
521 {"binary", NULL, NONASSOC},
522 {"semantic_parser", NULL, SEMANTIC_PARSER},
523 {"pure_parser", NULL, PURE_PARSER},
524 {"prec", NULL, PREC},
525
526 {"no_lines", &nolinesflag, NOOP}, /* -l */
527 {"raw", &rawtoknumflag, NOOP}, /* -r */
528 {"token_table", &toknumflag, NOOP}, /* -k */
529
530 #if 0
531 /* These can be utilized after main is reoganized so
532 open_files() is deferred 'til after read_declarations().
533 But %{ and %union both put information into files
534 that have to be opened before read_declarations().
535 */
536 {"yacc", &fixed_outfiles, NOOP}, /* -y */
537 {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
538 {"defines", &definesflag, NOOP}, /* -d */
539 {"no_parser", &noparserflag, NOOP}, /* -n */
540 {"output_file", &spec_outfile, SETOPT}, /* -o */
541 {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
542 {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
543
544 /* These would be acceptable, but they do not affect processing */
545 {"verbose", &verboseflag, NOOP}, /* -v */
546 {"debug", &debugflag, NOOP}, /* -t */
547 /* {"help", <print usage stmt>, NOOP},*/ /* -h */
548 /* {"version", <print version number> , NOOP},*/ /* -V */
549 #endif
550
551 {NULL, NULL, ILLEGAL}
552 };
553
554 /* Parse a token which starts with %.
555 Assumes the % has already been read and discarded. */
556
557 int
558 parse_percent_token (void)
559 {
560 register int c;
561 register char *p;
562 register struct percent_table_struct *tx;
563
564 p = token_buffer;
565 c = getc(finput);
566 *p++ = '%';
567 *p++ = c; /* for error msg */
568 *p = 0;
569
570 switch (c)
571 {
572 case '%':
573 return TWO_PERCENTS;
574
575 case '{':
576 return PERCENT_LEFT_CURLY;
577
578 case '<':
579 return LEFT;
580
581 case '>':
582 return RIGHT;
583
584 case '2':
585 return NONASSOC;
586
587 case '0':
588 return TOKEN;
589
590 case '=':
591 return PREC;
592 }
593 if (!isalpha(c))
594 return ILLEGAL;
595
596 p = token_buffer;
597 *p++ = '%';
598 while (isalpha(c) || c == '_' || c == '-')
599 {
600 if (p == token_buffer + maxtoken)
601 p = grow_token_buffer(p);
602
603 if (c == '-') c = '_';
604 *p++ = c;
605 c = getc(finput);
606 }
607
608 ungetc(c, finput);
609
610 *p = 0;
611
612 /* table lookup % directive */
613 for (tx = percent_table; tx->name; tx++)
614 if (strcmp(token_buffer+1, tx->name) == 0)
615 break;
616 if (tx->retval == SETOPT)
617 {
618 *((char **)(tx->setflag)) = optarg;
619 return NOOP;
620 }
621 if (tx->setflag)
622 {
623 *((int *)(tx->setflag)) = 1;
624 return NOOP;
625 }
626 return tx->retval;
627 }