]>
Commit | Line | Data |
---|---|---|
40675e7c DM |
1 | /* Token-reader for Bison's input parser, |
2 | Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc. | |
3 | ||
4 | This file is part of Bison, the GNU Compiler Compiler. | |
5 | ||
6 | Bison is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 2, or (at your option) | |
9 | any later version. | |
10 | ||
11 | Bison is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with Bison; see the file COPYING. If not, write to | |
18 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ | |
19 | ||
20 | ||
21 | /* | |
22 | lex() is the entry point. It is called from reader.c. | |
23 | It returns one of the token-type codes defined in lex.h. | |
24 | When an identifier is seen, the code IDENTIFIER is returned | |
25 | and the name is looked up in the symbol table using symtab.c; | |
26 | symval is set to a pointer to the entry found. */ | |
27 | ||
28 | #include <stdio.h> | |
29 | #include <ctype.h> | |
30 | #include "system.h" | |
31 | #include "files.h" | |
32 | #include "symtab.h" | |
33 | #include "lex.h" | |
34 | #include "new.h" | |
35 | ||
36 | ||
37 | extern int lineno; | |
38 | extern int translations; | |
39 | ||
40 | int parse_percent_token(); | |
41 | ||
42 | extern void fatals(); | |
43 | extern void fatal(); | |
44 | ||
45 | /* Buffer for storing the current token. */ | |
46 | char *token_buffer; | |
47 | ||
48 | /* Allocated size of token_buffer, not including space for terminator. */ | |
49 | static int maxtoken; | |
50 | ||
51 | bucket *symval; | |
52 | int numval; | |
53 | ||
54 | static int unlexed; /* these two describe a token to be reread */ | |
55 | static bucket *unlexed_symval; /* by the next call to lex */ | |
56 | ||
57 | ||
58 | void | |
59 | init_lex() | |
60 | { | |
61 | maxtoken = 100; | |
62 | token_buffer = NEW2 (maxtoken + 1, char); | |
63 | unlexed = -1; | |
64 | } | |
65 | ||
66 | ||
67 | static char * | |
68 | grow_token_buffer (p) | |
69 | char *p; | |
70 | { | |
71 | int offset = p - token_buffer; | |
72 | maxtoken *= 2; | |
73 | token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1); | |
74 | return token_buffer + offset; | |
75 | } | |
76 | ||
77 | ||
78 | int | |
79 | skip_white_space() | |
80 | { | |
81 | register int c; | |
82 | register int inside; | |
83 | ||
84 | c = getc(finput); | |
85 | ||
86 | for (;;) | |
87 | { | |
88 | int cplus_comment; | |
89 | ||
90 | switch (c) | |
91 | { | |
92 | case '/': | |
93 | c = getc(finput); | |
94 | if (c != '*' && c != '/') | |
95 | fatals("unexpected `/%c' found",c); | |
96 | cplus_comment = (c == '/'); | |
97 | ||
98 | c = getc(finput); | |
99 | ||
100 | inside = 1; | |
101 | while (inside) | |
102 | { | |
103 | if (!cplus_comment && c == '*') | |
104 | { | |
105 | while (c == '*') | |
106 | c = getc(finput); | |
107 | ||
108 | if (c == '/') | |
109 | { | |
110 | inside = 0; | |
111 | c = getc(finput); | |
112 | } | |
113 | } | |
114 | else if (c == '\n') | |
115 | { | |
116 | lineno++; | |
117 | if (cplus_comment) | |
118 | inside = 0; | |
119 | c = getc(finput); | |
120 | } | |
121 | else if (c == EOF) | |
122 | fatal("unterminated comment"); | |
123 | else | |
124 | c = getc(finput); | |
125 | } | |
126 | ||
127 | break; | |
128 | ||
129 | case '\n': | |
130 | lineno++; | |
131 | ||
132 | case ' ': | |
133 | case '\t': | |
134 | case '\f': | |
135 | c = getc(finput); | |
136 | break; | |
137 | ||
138 | default: | |
139 | return (c); | |
140 | } | |
141 | } | |
142 | } | |
143 | ||
144 | ||
145 | void | |
146 | unlex(token) | |
147 | int token; | |
148 | { | |
149 | unlexed = token; | |
150 | unlexed_symval = symval; | |
151 | } | |
152 | ||
153 | ||
154 | ||
155 | int | |
156 | lex() | |
157 | { | |
158 | register int c; | |
159 | register char *p; | |
160 | ||
161 | if (unlexed >= 0) | |
162 | { | |
163 | symval = unlexed_symval; | |
164 | c = unlexed; | |
165 | unlexed = -1; | |
166 | return (c); | |
167 | } | |
168 | ||
169 | c = skip_white_space(); | |
170 | ||
171 | switch (c) | |
172 | { | |
173 | case EOF: | |
174 | return (ENDFILE); | |
175 | ||
176 | case 'A': case 'B': case 'C': case 'D': case 'E': | |
177 | case 'F': case 'G': case 'H': case 'I': case 'J': | |
178 | case 'K': case 'L': case 'M': case 'N': case 'O': | |
179 | case 'P': case 'Q': case 'R': case 'S': case 'T': | |
180 | case 'U': case 'V': case 'W': case 'X': case 'Y': | |
181 | case 'Z': | |
182 | case 'a': case 'b': case 'c': case 'd': case 'e': | |
183 | case 'f': case 'g': case 'h': case 'i': case 'j': | |
184 | case 'k': case 'l': case 'm': case 'n': case 'o': | |
185 | case 'p': case 'q': case 'r': case 's': case 't': | |
186 | case 'u': case 'v': case 'w': case 'x': case 'y': | |
187 | case 'z': | |
188 | case '.': case '_': | |
189 | p = token_buffer; | |
190 | while (isalnum(c) || c == '_' || c == '.') | |
191 | { | |
192 | if (p == token_buffer + maxtoken) | |
193 | p = grow_token_buffer(p); | |
194 | ||
195 | *p++ = c; | |
196 | c = getc(finput); | |
197 | } | |
198 | ||
199 | *p = 0; | |
200 | ungetc(c, finput); | |
201 | symval = getsym(token_buffer); | |
202 | return (IDENTIFIER); | |
203 | ||
204 | case '0': case '1': case '2': case '3': case '4': | |
205 | case '5': case '6': case '7': case '8': case '9': | |
206 | { | |
207 | numval = 0; | |
208 | ||
209 | while (isdigit(c)) | |
210 | { | |
211 | numval = numval*10 + c - '0'; | |
212 | c = getc(finput); | |
213 | } | |
214 | ungetc(c, finput); | |
215 | return (NUMBER); | |
216 | } | |
217 | ||
218 | case '\'': | |
219 | translations = -1; | |
220 | ||
221 | /* parse the literal token and compute character code in code */ | |
222 | ||
223 | c = getc(finput); | |
224 | { | |
225 | register int code = 0; | |
226 | ||
227 | if (c == '\\') | |
228 | { | |
229 | c = getc(finput); | |
230 | ||
231 | if (c <= '7' && c >= '0') | |
232 | { | |
233 | while (c <= '7' && c >= '0') | |
234 | { | |
235 | code = (code * 8) + (c - '0'); | |
236 | c = getc(finput); | |
237 | if (code >= 256 || code < 0) | |
238 | fatals("malformatted literal token `\\%03o'", code); | |
239 | } | |
240 | } | |
241 | else | |
242 | { | |
243 | if (c == 't') | |
244 | code = '\t'; | |
245 | else if (c == 'n') | |
246 | code = '\n'; | |
247 | else if (c == 'a') | |
248 | code = '\007'; | |
249 | else if (c == 'r') | |
250 | code = '\r'; | |
251 | else if (c == 'f') | |
252 | code = '\f'; | |
253 | else if (c == 'b') | |
254 | code = '\b'; | |
255 | else if (c == 'v') | |
256 | code = 013; | |
257 | else if (c == 'x') | |
258 | { | |
259 | c = getc(finput); | |
260 | while ((c <= '9' && c >= '0') | |
261 | || (c >= 'a' && c <= 'z') | |
262 | || (c >= 'A' && c <= 'Z')) | |
263 | { | |
264 | code *= 16; | |
265 | if (c <= '9' && c >= '0') | |
266 | code += c - '0'; | |
267 | else if (c >= 'a' && c <= 'z') | |
268 | code += c - 'a' + 10; | |
269 | else if (c >= 'A' && c <= 'Z') | |
270 | code += c - 'A' + 10; | |
271 | if (code >= 256 || code<0)/* JF this said if(c>=128) */ | |
272 | fatals("malformatted literal token `\\x%x'",code); | |
273 | c = getc(finput); | |
274 | } | |
275 | ungetc(c, finput); | |
276 | } | |
277 | else if (c == '\\') | |
278 | code = '\\'; | |
279 | else if (c == '\'') | |
280 | code = '\''; | |
281 | else if (c == '\"') /* JF this is a good idea */ | |
282 | code = '\"'; | |
283 | else | |
284 | { | |
285 | if (c >= 040 && c <= 0177) | |
286 | fatals ("unknown escape sequence `\\%c'", c); | |
287 | else | |
288 | fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c); | |
289 | } | |
290 | ||
291 | c = getc(finput); | |
292 | } | |
293 | } | |
294 | else | |
295 | { | |
296 | code = c; | |
297 | c = getc(finput); | |
298 | } | |
299 | if (c != '\'') | |
300 | fatal("multicharacter literal tokens not supported"); | |
301 | ||
302 | /* now fill token_buffer with the canonical name for this character | |
303 | as a literal token. Do not use what the user typed, | |
304 | so that '\012' and '\n' can be interchangeable. */ | |
305 | ||
306 | p = token_buffer; | |
307 | *p++ = '\''; | |
308 | if (code == '\\') | |
309 | { | |
310 | *p++ = '\\'; | |
311 | *p++ = '\\'; | |
312 | } | |
313 | else if (code == '\'') | |
314 | { | |
315 | *p++ = '\\'; | |
316 | *p++ = '\''; | |
317 | } | |
318 | else if (code >= 040 && code != 0177) | |
319 | *p++ = code; | |
320 | else if (code == '\t') | |
321 | { | |
322 | *p++ = '\\'; | |
323 | *p++ = 't'; | |
324 | } | |
325 | else if (code == '\n') | |
326 | { | |
327 | *p++ = '\\'; | |
328 | *p++ = 'n'; | |
329 | } | |
330 | else if (code == '\r') | |
331 | { | |
332 | *p++ = '\\'; | |
333 | *p++ = 'r'; | |
334 | } | |
335 | else if (code == '\v') | |
336 | { | |
337 | *p++ = '\\'; | |
338 | *p++ = 'v'; | |
339 | } | |
340 | else if (code == '\b') | |
341 | { | |
342 | *p++ = '\\'; | |
343 | *p++ = 'b'; | |
344 | } | |
345 | else if (code == '\f') | |
346 | { | |
347 | *p++ = '\\'; | |
348 | *p++ = 'f'; | |
349 | } | |
350 | else | |
351 | { | |
352 | *p++ = code / 0100 + '0'; | |
353 | *p++ = ((code / 010) & 07) + '0'; | |
354 | *p++ = (code & 07) + '0'; | |
355 | } | |
356 | *p++ = '\''; | |
357 | *p = 0; | |
358 | symval = getsym(token_buffer); | |
359 | symval->class = STOKEN; | |
360 | if (! symval->user_token_number) | |
361 | symval->user_token_number = code; | |
362 | return (IDENTIFIER); | |
363 | } | |
364 | ||
365 | case ',': | |
366 | return (COMMA); | |
367 | ||
368 | case ':': | |
369 | return (COLON); | |
370 | ||
371 | case ';': | |
372 | return (SEMICOLON); | |
373 | ||
374 | case '|': | |
375 | return (BAR); | |
376 | ||
377 | case '{': | |
378 | return (LEFT_CURLY); | |
379 | ||
380 | case '=': | |
381 | do | |
382 | { | |
383 | c = getc(finput); | |
384 | if (c == '\n') lineno++; | |
385 | } | |
386 | while(c==' ' || c=='\n' || c=='\t'); | |
387 | ||
388 | if (c == '{') | |
389 | return(LEFT_CURLY); | |
390 | else | |
391 | { | |
392 | ungetc(c, finput); | |
393 | return(ILLEGAL); | |
394 | } | |
395 | ||
396 | case '<': | |
397 | p = token_buffer; | |
398 | c = getc(finput); | |
399 | while (c != '>') | |
400 | { | |
401 | if (c == '\n' || c == EOF) | |
402 | fatal("unterminated type name"); | |
403 | ||
404 | if (p == token_buffer + maxtoken) | |
405 | p = grow_token_buffer(p); | |
406 | ||
407 | *p++ = c; | |
408 | c = getc(finput); | |
409 | } | |
410 | *p = 0; | |
411 | return (TYPENAME); | |
412 | ||
413 | ||
414 | case '%': | |
415 | return (parse_percent_token()); | |
416 | ||
417 | default: | |
418 | return (ILLEGAL); | |
419 | } | |
420 | } | |
421 | ||
422 | ||
423 | /* parse a token which starts with %. Assumes the % has already been read and discarded. */ | |
424 | ||
425 | int | |
426 | parse_percent_token () | |
427 | { | |
428 | register int c; | |
429 | register char *p; | |
430 | ||
431 | p = token_buffer; | |
432 | c = getc(finput); | |
433 | ||
434 | switch (c) | |
435 | { | |
436 | case '%': | |
437 | return (TWO_PERCENTS); | |
438 | ||
439 | case '{': | |
440 | return (PERCENT_LEFT_CURLY); | |
441 | ||
442 | case '<': | |
443 | return (LEFT); | |
444 | ||
445 | case '>': | |
446 | return (RIGHT); | |
447 | ||
448 | case '2': | |
449 | return (NONASSOC); | |
450 | ||
451 | case '0': | |
452 | return (TOKEN); | |
453 | ||
454 | case '=': | |
455 | return (PREC); | |
456 | } | |
457 | if (!isalpha(c)) | |
458 | return (ILLEGAL); | |
459 | ||
460 | while (isalpha(c) || c == '_') | |
461 | { | |
462 | if (p == token_buffer + maxtoken) | |
463 | p = grow_token_buffer(p); | |
464 | ||
465 | *p++ = c; | |
466 | c = getc(finput); | |
467 | } | |
468 | ||
469 | ungetc(c, finput); | |
470 | ||
471 | *p = 0; | |
472 | ||
473 | if (strcmp(token_buffer, "token") == 0 | |
474 | || | |
475 | strcmp(token_buffer, "term") == 0) | |
476 | return (TOKEN); | |
477 | else if (strcmp(token_buffer, "nterm") == 0) | |
478 | return (NTERM); | |
479 | else if (strcmp(token_buffer, "type") == 0) | |
480 | return (TYPE); | |
481 | else if (strcmp(token_buffer, "guard") == 0) | |
482 | return (GUARD); | |
483 | else if (strcmp(token_buffer, "union") == 0) | |
484 | return (UNION); | |
485 | else if (strcmp(token_buffer, "expect") == 0) | |
486 | return (EXPECT); | |
487 | else if (strcmp(token_buffer, "start") == 0) | |
488 | return (START); | |
489 | else if (strcmp(token_buffer, "left") == 0) | |
490 | return (LEFT); | |
491 | else if (strcmp(token_buffer, "right") == 0) | |
492 | return (RIGHT); | |
493 | else if (strcmp(token_buffer, "nonassoc") == 0 | |
494 | || | |
495 | strcmp(token_buffer, "binary") == 0) | |
496 | return (NONASSOC); | |
497 | else if (strcmp(token_buffer, "semantic_parser") == 0) | |
498 | return (SEMANTIC_PARSER); | |
499 | else if (strcmp(token_buffer, "pure_parser") == 0) | |
500 | return (PURE_PARSER); | |
501 | else if (strcmp(token_buffer, "prec") == 0) | |
502 | return (PREC); | |
503 | else return (ILLEGAL); | |
504 | } |