]> git.saurik.com Git - bison.git/blob - src/lex.c
entered into RCS
[bison.git] / src / lex.c
1 /* Token-reader for Bison's input parser,
2 Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc.
3
4 This file is part of Bison, the GNU Compiler Compiler.
5
6 Bison is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 Bison is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Bison; see the file COPYING. If not, write to
18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
19
20
21 /*
22 lex() is the entry point. It is called from reader.c.
23 It returns one of the token-type codes defined in lex.h.
24 When an identifier is seen, the code IDENTIFIER is returned
25 and the name is looked up in the symbol table using symtab.c;
26 symval is set to a pointer to the entry found. */
27
28 #include <stdio.h>
29 #include <ctype.h>
30 #include "system.h"
31 #include "files.h"
32 #include "symtab.h"
33 #include "lex.h"
34 #include "new.h"
35
36
37 extern int lineno;
38 extern int translations;
39
40 int parse_percent_token();
41
42 extern void fatals();
43 extern void fatal();
44
45 /* Buffer for storing the current token. */
46 char *token_buffer;
47
48 /* Allocated size of token_buffer, not including space for terminator. */
49 static int maxtoken;
50
51 bucket *symval;
52 int numval;
53
54 static int unlexed; /* these two describe a token to be reread */
55 static bucket *unlexed_symval; /* by the next call to lex */
56
57
58 void
59 init_lex()
60 {
61 maxtoken = 100;
62 token_buffer = NEW2 (maxtoken + 1, char);
63 unlexed = -1;
64 }
65
66
67 static char *
68 grow_token_buffer (p)
69 char *p;
70 {
71 int offset = p - token_buffer;
72 maxtoken *= 2;
73 token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
74 return token_buffer + offset;
75 }
76
77
78 int
79 skip_white_space()
80 {
81 register int c;
82 register int inside;
83
84 c = getc(finput);
85
86 for (;;)
87 {
88 int cplus_comment;
89
90 switch (c)
91 {
92 case '/':
93 c = getc(finput);
94 if (c != '*' && c != '/')
95 fatals("unexpected `/%c' found",c);
96 cplus_comment = (c == '/');
97
98 c = getc(finput);
99
100 inside = 1;
101 while (inside)
102 {
103 if (!cplus_comment && c == '*')
104 {
105 while (c == '*')
106 c = getc(finput);
107
108 if (c == '/')
109 {
110 inside = 0;
111 c = getc(finput);
112 }
113 }
114 else if (c == '\n')
115 {
116 lineno++;
117 if (cplus_comment)
118 inside = 0;
119 c = getc(finput);
120 }
121 else if (c == EOF)
122 fatal("unterminated comment");
123 else
124 c = getc(finput);
125 }
126
127 break;
128
129 case '\n':
130 lineno++;
131
132 case ' ':
133 case '\t':
134 case '\f':
135 c = getc(finput);
136 break;
137
138 default:
139 return (c);
140 }
141 }
142 }
143
144
145 void
146 unlex(token)
147 int token;
148 {
149 unlexed = token;
150 unlexed_symval = symval;
151 }
152
153
154
155 int
156 lex()
157 {
158 register int c;
159 register char *p;
160
161 if (unlexed >= 0)
162 {
163 symval = unlexed_symval;
164 c = unlexed;
165 unlexed = -1;
166 return (c);
167 }
168
169 c = skip_white_space();
170
171 switch (c)
172 {
173 case EOF:
174 return (ENDFILE);
175
176 case 'A': case 'B': case 'C': case 'D': case 'E':
177 case 'F': case 'G': case 'H': case 'I': case 'J':
178 case 'K': case 'L': case 'M': case 'N': case 'O':
179 case 'P': case 'Q': case 'R': case 'S': case 'T':
180 case 'U': case 'V': case 'W': case 'X': case 'Y':
181 case 'Z':
182 case 'a': case 'b': case 'c': case 'd': case 'e':
183 case 'f': case 'g': case 'h': case 'i': case 'j':
184 case 'k': case 'l': case 'm': case 'n': case 'o':
185 case 'p': case 'q': case 'r': case 's': case 't':
186 case 'u': case 'v': case 'w': case 'x': case 'y':
187 case 'z':
188 case '.': case '_':
189 p = token_buffer;
190 while (isalnum(c) || c == '_' || c == '.')
191 {
192 if (p == token_buffer + maxtoken)
193 p = grow_token_buffer(p);
194
195 *p++ = c;
196 c = getc(finput);
197 }
198
199 *p = 0;
200 ungetc(c, finput);
201 symval = getsym(token_buffer);
202 return (IDENTIFIER);
203
204 case '0': case '1': case '2': case '3': case '4':
205 case '5': case '6': case '7': case '8': case '9':
206 {
207 numval = 0;
208
209 while (isdigit(c))
210 {
211 numval = numval*10 + c - '0';
212 c = getc(finput);
213 }
214 ungetc(c, finput);
215 return (NUMBER);
216 }
217
218 case '\'':
219 translations = -1;
220
221 /* parse the literal token and compute character code in code */
222
223 c = getc(finput);
224 {
225 register int code = 0;
226
227 if (c == '\\')
228 {
229 c = getc(finput);
230
231 if (c <= '7' && c >= '0')
232 {
233 while (c <= '7' && c >= '0')
234 {
235 code = (code * 8) + (c - '0');
236 c = getc(finput);
237 if (code >= 256 || code < 0)
238 fatals("malformatted literal token `\\%03o'", code);
239 }
240 }
241 else
242 {
243 if (c == 't')
244 code = '\t';
245 else if (c == 'n')
246 code = '\n';
247 else if (c == 'a')
248 code = '\007';
249 else if (c == 'r')
250 code = '\r';
251 else if (c == 'f')
252 code = '\f';
253 else if (c == 'b')
254 code = '\b';
255 else if (c == 'v')
256 code = 013;
257 else if (c == 'x')
258 {
259 c = getc(finput);
260 while ((c <= '9' && c >= '0')
261 || (c >= 'a' && c <= 'z')
262 || (c >= 'A' && c <= 'Z'))
263 {
264 code *= 16;
265 if (c <= '9' && c >= '0')
266 code += c - '0';
267 else if (c >= 'a' && c <= 'z')
268 code += c - 'a' + 10;
269 else if (c >= 'A' && c <= 'Z')
270 code += c - 'A' + 10;
271 if (code >= 256 || code<0)/* JF this said if(c>=128) */
272 fatals("malformatted literal token `\\x%x'",code);
273 c = getc(finput);
274 }
275 ungetc(c, finput);
276 }
277 else if (c == '\\')
278 code = '\\';
279 else if (c == '\'')
280 code = '\'';
281 else if (c == '\"') /* JF this is a good idea */
282 code = '\"';
283 else
284 {
285 if (c >= 040 && c <= 0177)
286 fatals ("unknown escape sequence `\\%c'", c);
287 else
288 fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c);
289 }
290
291 c = getc(finput);
292 }
293 }
294 else
295 {
296 code = c;
297 c = getc(finput);
298 }
299 if (c != '\'')
300 fatal("multicharacter literal tokens not supported");
301
302 /* now fill token_buffer with the canonical name for this character
303 as a literal token. Do not use what the user typed,
304 so that '\012' and '\n' can be interchangeable. */
305
306 p = token_buffer;
307 *p++ = '\'';
308 if (code == '\\')
309 {
310 *p++ = '\\';
311 *p++ = '\\';
312 }
313 else if (code == '\'')
314 {
315 *p++ = '\\';
316 *p++ = '\'';
317 }
318 else if (code >= 040 && code != 0177)
319 *p++ = code;
320 else if (code == '\t')
321 {
322 *p++ = '\\';
323 *p++ = 't';
324 }
325 else if (code == '\n')
326 {
327 *p++ = '\\';
328 *p++ = 'n';
329 }
330 else if (code == '\r')
331 {
332 *p++ = '\\';
333 *p++ = 'r';
334 }
335 else if (code == '\v')
336 {
337 *p++ = '\\';
338 *p++ = 'v';
339 }
340 else if (code == '\b')
341 {
342 *p++ = '\\';
343 *p++ = 'b';
344 }
345 else if (code == '\f')
346 {
347 *p++ = '\\';
348 *p++ = 'f';
349 }
350 else
351 {
352 *p++ = code / 0100 + '0';
353 *p++ = ((code / 010) & 07) + '0';
354 *p++ = (code & 07) + '0';
355 }
356 *p++ = '\'';
357 *p = 0;
358 symval = getsym(token_buffer);
359 symval->class = STOKEN;
360 if (! symval->user_token_number)
361 symval->user_token_number = code;
362 return (IDENTIFIER);
363 }
364
365 case ',':
366 return (COMMA);
367
368 case ':':
369 return (COLON);
370
371 case ';':
372 return (SEMICOLON);
373
374 case '|':
375 return (BAR);
376
377 case '{':
378 return (LEFT_CURLY);
379
380 case '=':
381 do
382 {
383 c = getc(finput);
384 if (c == '\n') lineno++;
385 }
386 while(c==' ' || c=='\n' || c=='\t');
387
388 if (c == '{')
389 return(LEFT_CURLY);
390 else
391 {
392 ungetc(c, finput);
393 return(ILLEGAL);
394 }
395
396 case '<':
397 p = token_buffer;
398 c = getc(finput);
399 while (c != '>')
400 {
401 if (c == '\n' || c == EOF)
402 fatal("unterminated type name");
403
404 if (p == token_buffer + maxtoken)
405 p = grow_token_buffer(p);
406
407 *p++ = c;
408 c = getc(finput);
409 }
410 *p = 0;
411 return (TYPENAME);
412
413
414 case '%':
415 return (parse_percent_token());
416
417 default:
418 return (ILLEGAL);
419 }
420 }
421
422
423 /* parse a token which starts with %. Assumes the % has already been read and discarded. */
424
425 int
426 parse_percent_token ()
427 {
428 register int c;
429 register char *p;
430
431 p = token_buffer;
432 c = getc(finput);
433
434 switch (c)
435 {
436 case '%':
437 return (TWO_PERCENTS);
438
439 case '{':
440 return (PERCENT_LEFT_CURLY);
441
442 case '<':
443 return (LEFT);
444
445 case '>':
446 return (RIGHT);
447
448 case '2':
449 return (NONASSOC);
450
451 case '0':
452 return (TOKEN);
453
454 case '=':
455 return (PREC);
456 }
457 if (!isalpha(c))
458 return (ILLEGAL);
459
460 while (isalpha(c) || c == '_')
461 {
462 if (p == token_buffer + maxtoken)
463 p = grow_token_buffer(p);
464
465 *p++ = c;
466 c = getc(finput);
467 }
468
469 ungetc(c, finput);
470
471 *p = 0;
472
473 if (strcmp(token_buffer, "token") == 0
474 ||
475 strcmp(token_buffer, "term") == 0)
476 return (TOKEN);
477 else if (strcmp(token_buffer, "nterm") == 0)
478 return (NTERM);
479 else if (strcmp(token_buffer, "type") == 0)
480 return (TYPE);
481 else if (strcmp(token_buffer, "guard") == 0)
482 return (GUARD);
483 else if (strcmp(token_buffer, "union") == 0)
484 return (UNION);
485 else if (strcmp(token_buffer, "expect") == 0)
486 return (EXPECT);
487 else if (strcmp(token_buffer, "start") == 0)
488 return (START);
489 else if (strcmp(token_buffer, "left") == 0)
490 return (LEFT);
491 else if (strcmp(token_buffer, "right") == 0)
492 return (RIGHT);
493 else if (strcmp(token_buffer, "nonassoc") == 0
494 ||
495 strcmp(token_buffer, "binary") == 0)
496 return (NONASSOC);
497 else if (strcmp(token_buffer, "semantic_parser") == 0)
498 return (SEMANTIC_PARSER);
499 else if (strcmp(token_buffer, "pure_parser") == 0)
500 return (PURE_PARSER);
501 else if (strcmp(token_buffer, "prec") == 0)
502 return (PREC);
503 else return (ILLEGAL);
504 }