]> git.saurik.com Git - bison.git/blob - src/scan-gram.l
* bootstrap: Comment out the AM_CPPFLAGS line, since we don't use
[bison.git] / src / scan-gram.l
1 /* Bison Grammar Scanner -*- C -*-
2
3 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
4
5 This file is part of Bison, the GNU Compiler Compiler.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 02110-1301 USA
21 */
22
23 %option debug nodefault nounput noyywrap never-interactive
24 %option prefix="gram_" outfile="lex.yy.c"
25
26 %{
27 /* Work around a bug in flex 2.5.31. See Debian bug 333231
28 <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=333231>. */
29 #undef gram_wrap
30 #define gram_wrap() 1
31
32 #include "system.h"
33
34 #include <mbswidth.h>
35 #include <quote.h>
36
37 #include "complain.h"
38 #include "files.h"
39 #include "getargs.h"
40 #include "gram.h"
41 #include "quotearg.h"
42 #include "reader.h"
43 #include "uniqstr.h"
44
45 #define YY_USER_INIT \
46 do \
47 { \
48 scanner_cursor.file = current_file; \
49 scanner_cursor.line = 1; \
50 scanner_cursor.column = 1; \
51 code_start = scanner_cursor; \
52 } \
53 while (0)
54
55 /* Pacify "gcc -Wmissing-prototypes" when flex 2.5.31 is used. */
56 int gram_get_lineno (void);
57 FILE *gram_get_in (void);
58 FILE *gram_get_out (void);
59 int gram_get_leng (void);
60 char *gram_get_text (void);
61 void gram_set_lineno (int);
62 void gram_set_in (FILE *);
63 void gram_set_out (FILE *);
64 int gram_get_debug (void);
65 void gram_set_debug (int);
66 int gram_lex_destroy (void);
67
68 /* Location of scanner cursor. */
69 boundary scanner_cursor;
70
71 static void adjust_location (location *, char const *, size_t);
72 #define YY_USER_ACTION adjust_location (loc, yytext, yyleng);
73
74 static size_t no_cr_read (FILE *, char *, size_t);
75 #define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
76
77
78 /* OBSTACK_FOR_STRING -- Used to store all the characters that we need to
79 keep (to construct ID, STRINGS etc.). Use the following macros to
80 use it.
81
82 Use STRING_GROW to append what has just been matched, and
83 STRING_FINISH to end the string (it puts the ending 0).
84 STRING_FINISH also stores this string in LAST_STRING, which can be
85 used, and which is used by STRING_FREE to free the last string. */
86
87 static struct obstack obstack_for_string;
88
89 /* A string representing the most recently saved token. */
90 char *last_string;
91
92 /* The location of the most recently saved token, if it was a
93 BRACED_CODE token; otherwise, this has an unspecified value. */
94 location last_braced_code_loc;
95
96 #define STRING_GROW \
97 obstack_grow (&obstack_for_string, yytext, yyleng)
98
99 #define STRING_FINISH \
100 do { \
101 obstack_1grow (&obstack_for_string, '\0'); \
102 last_string = obstack_finish (&obstack_for_string); \
103 } while (0)
104
105 #define STRING_FREE \
106 obstack_free (&obstack_for_string, last_string)
107
108 void
109 scanner_last_string_free (void)
110 {
111 STRING_FREE;
112 }
113
114 /* Within well-formed rules, RULE_LENGTH is the number of values in
115 the current rule so far, which says where to find `$0' with respect
116 to the top of the stack. It is not the same as the rule->length in
117 the case of mid rule actions.
118
119 Outside of well-formed rules, RULE_LENGTH has an undefined value. */
120 static int rule_length;
121
122 static void rule_length_overflow (location) __attribute__ ((__noreturn__));
123
124 /* Increment the rule length by one, checking for overflow. */
125 static inline void
126 increment_rule_length (location loc)
127 {
128 rule_length++;
129
130 /* Don't allow rule_length == INT_MAX, since that might cause
131 confusion with strtol if INT_MAX == LONG_MAX. */
132 if (rule_length == INT_MAX)
133 rule_length_overflow (loc);
134 }
135
136 static void handle_dollar (int token_type, char *cp, location loc);
137 static void handle_at (int token_type, char *cp, location loc);
138 static void handle_syncline (char *, location);
139 static unsigned long int scan_integer (char const *p, int base, location loc);
140 static int convert_ucn_to_byte (char const *hex_text);
141 static void unexpected_eof (boundary, char const *);
142 static void unexpected_newline (boundary, char const *);
143
144 %}
145 %x SC_COMMENT SC_LINE_COMMENT SC_YACC_COMMENT
146 %x SC_STRING SC_CHARACTER
147 %x SC_AFTER_IDENTIFIER
148 %x SC_ESCAPED_STRING SC_ESCAPED_CHARACTER
149 %x SC_PRE_CODE SC_BRACED_CODE SC_PROLOGUE SC_EPILOGUE
150
151 letter [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
152 id {letter}({letter}|[0-9])*
153 directive %{letter}({letter}|[0-9]|-)*
154 int [0-9]+
155
156 /* POSIX says that a tag must be both an id and a C union member, but
157 historically almost any character is allowed in a tag. We disallow
158 NUL and newline, as this simplifies our implementation. */
159 tag [^\0\n>]+
160
161 /* Zero or more instances of backslash-newline. Following GCC, allow
162 white space between the backslash and the newline. */
163 splice (\\[ \f\t\v]*\n)*
164
165 %%
166 %{
167 /* Nesting level of the current code in braces. */
168 int braces_level IF_LINT (= 0);
169
170 /* Parent context state, when applicable. */
171 int context_state IF_LINT (= 0);
172
173 /* Token type to return, when applicable. */
174 int token_type IF_LINT (= 0);
175
176 /* Location of most recent identifier, when applicable. */
177 location id_loc IF_LINT (= empty_location);
178
179 /* Where containing code started, when applicable. Its initial
180 value is relevant only when yylex is invoked in the SC_EPILOGUE
181 start condition. */
182 boundary code_start = scanner_cursor;
183
184 /* Where containing comment or string or character literal started,
185 when applicable. */
186 boundary token_start IF_LINT (= scanner_cursor);
187 %}
188
189
190 /*-----------------------.
191 | Scanning white space. |
192 `-----------------------*/
193
194 <INITIAL,SC_AFTER_IDENTIFIER,SC_PRE_CODE>
195 {
196 /* Comments and white space. */
197 "," warn_at (*loc, _("stray `,' treated as white space"));
198 [ \f\n\t\v] |
199 "//".* ;
200 "/*" {
201 token_start = loc->start;
202 context_state = YY_START;
203 BEGIN SC_YACC_COMMENT;
204 }
205
206 /* #line directives are not documented, and may be withdrawn or
207 modified in future versions of Bison. */
208 ^"#line "{int}" \"".*"\"\n" {
209 handle_syncline (yytext + sizeof "#line " - 1, *loc);
210 }
211 }
212
213
214 /*----------------------------.
215 | Scanning Bison directives. |
216 `----------------------------*/
217 <INITIAL>
218 {
219 "%binary" return PERCENT_NONASSOC;
220 "%debug" return PERCENT_DEBUG;
221 "%default"[-_]"prec" return PERCENT_DEFAULT_PREC;
222 "%define" return PERCENT_DEFINE;
223 "%defines" return PERCENT_DEFINES;
224 "%destructor" token_type = PERCENT_DESTRUCTOR; BEGIN SC_PRE_CODE;
225 "%dprec" return PERCENT_DPREC;
226 "%error"[-_]"verbose" return PERCENT_ERROR_VERBOSE;
227 "%expect" return PERCENT_EXPECT;
228 "%expect"[-_]"rr" return PERCENT_EXPECT_RR;
229 "%file-prefix" return PERCENT_FILE_PREFIX;
230 "%fixed"[-_]"output"[-_]"files" return PERCENT_YACC;
231 "%initial-action" token_type = PERCENT_INITIAL_ACTION; BEGIN SC_PRE_CODE;
232 "%glr-parser" return PERCENT_GLR_PARSER;
233 "%left" return PERCENT_LEFT;
234 "%lex-param" token_type = PERCENT_LEX_PARAM; BEGIN SC_PRE_CODE;
235 "%locations" return PERCENT_LOCATIONS;
236 "%merge" return PERCENT_MERGE;
237 "%name"[-_]"prefix" return PERCENT_NAME_PREFIX;
238 "%no"[-_]"default"[-_]"prec" return PERCENT_NO_DEFAULT_PREC;
239 "%no"[-_]"lines" return PERCENT_NO_LINES;
240 "%nonassoc" return PERCENT_NONASSOC;
241 "%nondeterministic-parser" return PERCENT_NONDETERMINISTIC_PARSER;
242 "%nterm" return PERCENT_NTERM;
243 "%output" return PERCENT_OUTPUT;
244 "%parse-param" token_type = PERCENT_PARSE_PARAM; BEGIN SC_PRE_CODE;
245 "%prec" rule_length--; return PERCENT_PREC;
246 "%printer" token_type = PERCENT_PRINTER; BEGIN SC_PRE_CODE;
247 "%pure"[-_]"parser" return PERCENT_PURE_PARSER;
248 "%require" return PERCENT_REQUIRE;
249 "%right" return PERCENT_RIGHT;
250 "%skeleton" return PERCENT_SKELETON;
251 "%start" return PERCENT_START;
252 "%term" return PERCENT_TOKEN;
253 "%token" return PERCENT_TOKEN;
254 "%token"[-_]"table" return PERCENT_TOKEN_TABLE;
255 "%type" return PERCENT_TYPE;
256 "%union" token_type = PERCENT_UNION; BEGIN SC_PRE_CODE;
257 "%verbose" return PERCENT_VERBOSE;
258 "%yacc" return PERCENT_YACC;
259
260 {directive} {
261 complain_at (*loc, _("invalid directive: %s"), quote (yytext));
262 }
263
264 "=" return EQUAL;
265 "|" rule_length = 0; return PIPE;
266 ";" return SEMICOLON;
267
268 {id} {
269 val->symbol = symbol_get (yytext, *loc);
270 id_loc = *loc;
271 increment_rule_length (*loc);
272 BEGIN SC_AFTER_IDENTIFIER;
273 }
274
275 {int} {
276 val->integer = scan_integer (yytext, 10, *loc);
277 return INT;
278 }
279 0[xX][0-9abcdefABCDEF]+ {
280 val->integer = scan_integer (yytext, 16, *loc);
281 return INT;
282 }
283
284 /* Characters. We don't check there is only one. */
285 "'" STRING_GROW; token_start = loc->start; BEGIN SC_ESCAPED_CHARACTER;
286
287 /* Strings. */
288 "\"" token_start = loc->start; BEGIN SC_ESCAPED_STRING;
289
290 /* Prologue. */
291 "%{" code_start = loc->start; BEGIN SC_PROLOGUE;
292
293 /* Code in between braces. */
294 "{" {
295 if (current_rule && current_rule->action)
296 grammar_midrule_action ();
297 STRING_GROW;
298 token_type = BRACED_CODE;
299 braces_level = 0;
300 code_start = loc->start;
301 BEGIN SC_BRACED_CODE;
302 }
303
304 /* A type. */
305 "<"{tag}">" {
306 obstack_grow (&obstack_for_string, yytext + 1, yyleng - 2);
307 STRING_FINISH;
308 val->uniqstr = uniqstr_new (last_string);
309 STRING_FREE;
310 return TYPE;
311 }
312
313 "%%" {
314 static int percent_percent_count;
315 if (++percent_percent_count == 2)
316 BEGIN SC_EPILOGUE;
317 return PERCENT_PERCENT;
318 }
319
320 . {
321 complain_at (*loc, _("invalid character: %s"), quote (yytext));
322 }
323
324 <<EOF>> {
325 loc->start = loc->end = scanner_cursor;
326 yyterminate ();
327 }
328 }
329
330
331 /*-----------------------------------------------------------------.
332 | Scanning after an identifier, checking whether a colon is next. |
333 `-----------------------------------------------------------------*/
334
335 <SC_AFTER_IDENTIFIER>
336 {
337 ":" {
338 rule_length = 0;
339 *loc = id_loc;
340 BEGIN INITIAL;
341 return ID_COLON;
342 }
343 . {
344 scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0);
345 yyless (0);
346 *loc = id_loc;
347 BEGIN INITIAL;
348 return ID;
349 }
350 <<EOF>> {
351 *loc = id_loc;
352 BEGIN INITIAL;
353 return ID;
354 }
355 }
356
357
358 /*---------------------------------------------------------------.
359 | Scanning a Yacc comment. The initial `/ *' is already eaten. |
360 `---------------------------------------------------------------*/
361
362 <SC_YACC_COMMENT>
363 {
364 "*/" BEGIN context_state;
365 .|\n ;
366 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
367 }
368
369
370 /*------------------------------------------------------------.
371 | Scanning a C comment. The initial `/ *' is already eaten. |
372 `------------------------------------------------------------*/
373
374 <SC_COMMENT>
375 {
376 "*"{splice}"/" STRING_GROW; BEGIN context_state;
377 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
378 }
379
380
381 /*--------------------------------------------------------------.
382 | Scanning a line comment. The initial `//' is already eaten. |
383 `--------------------------------------------------------------*/
384
385 <SC_LINE_COMMENT>
386 {
387 "\n" STRING_GROW; BEGIN context_state;
388 {splice} STRING_GROW;
389 <<EOF>> BEGIN context_state;
390 }
391
392
393 /*------------------------------------------------.
394 | Scanning a Bison string, including its escapes. |
395 | The initial quote is already eaten. |
396 `------------------------------------------------*/
397
398 <SC_ESCAPED_STRING>
399 {
400 "\"" {
401 STRING_FINISH;
402 loc->start = token_start;
403 val->chars = last_string;
404 increment_rule_length (*loc);
405 BEGIN INITIAL;
406 return STRING;
407 }
408 \n unexpected_newline (token_start, "\""); BEGIN INITIAL;
409 <<EOF>> unexpected_eof (token_start, "\""); BEGIN INITIAL;
410 }
411
412 /*----------------------------------------------------------.
413 | Scanning a Bison character literal, decoding its escapes. |
414 | The initial quote is already eaten. |
415 `----------------------------------------------------------*/
416
417 <SC_ESCAPED_CHARACTER>
418 {
419 "'" {
420 unsigned char last_string_1;
421 STRING_GROW;
422 STRING_FINISH;
423 loc->start = token_start;
424 val->symbol = symbol_get (quotearg_style (escape_quoting_style,
425 last_string),
426 *loc);
427 symbol_class_set (val->symbol, token_sym, *loc, false);
428 last_string_1 = last_string[1];
429 symbol_user_token_number_set (val->symbol, last_string_1, *loc);
430 STRING_FREE;
431 increment_rule_length (*loc);
432 BEGIN INITIAL;
433 return ID;
434 }
435 \n unexpected_newline (token_start, "'"); BEGIN INITIAL;
436 <<EOF>> unexpected_eof (token_start, "'"); BEGIN INITIAL;
437 }
438
439 <SC_ESCAPED_CHARACTER,SC_ESCAPED_STRING>
440 {
441 \0 complain_at (*loc, _("invalid null character"));
442 }
443
444
445 /*----------------------------.
446 | Decode escaped characters. |
447 `----------------------------*/
448
449 <SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>
450 {
451 \\[0-7]{1,3} {
452 unsigned long int c = strtoul (yytext + 1, NULL, 8);
453 if (UCHAR_MAX < c)
454 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
455 else if (! c)
456 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
457 else
458 obstack_1grow (&obstack_for_string, c);
459 }
460
461 \\x[0-9abcdefABCDEF]+ {
462 verify (UCHAR_MAX < ULONG_MAX);
463 unsigned long int c = strtoul (yytext + 2, NULL, 16);
464 if (UCHAR_MAX < c)
465 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
466 else if (! c)
467 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
468 else
469 obstack_1grow (&obstack_for_string, c);
470 }
471
472 \\a obstack_1grow (&obstack_for_string, '\a');
473 \\b obstack_1grow (&obstack_for_string, '\b');
474 \\f obstack_1grow (&obstack_for_string, '\f');
475 \\n obstack_1grow (&obstack_for_string, '\n');
476 \\r obstack_1grow (&obstack_for_string, '\r');
477 \\t obstack_1grow (&obstack_for_string, '\t');
478 \\v obstack_1grow (&obstack_for_string, '\v');
479
480 /* \\[\"\'?\\] would be shorter, but it confuses xgettext. */
481 \\("\""|"'"|"?"|"\\") obstack_1grow (&obstack_for_string, yytext[1]);
482
483 \\(u|U[0-9abcdefABCDEF]{4})[0-9abcdefABCDEF]{4} {
484 int c = convert_ucn_to_byte (yytext);
485 if (c < 0)
486 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
487 else if (! c)
488 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
489 else
490 obstack_1grow (&obstack_for_string, c);
491 }
492 \\(.|\n) {
493 complain_at (*loc, _("unrecognized escape sequence: %s"), quote (yytext));
494 STRING_GROW;
495 }
496 }
497
498 /*--------------------------------------------.
499 | Scanning user-code characters and strings. |
500 `--------------------------------------------*/
501
502 <SC_CHARACTER,SC_STRING>
503 {
504 {splice}|\\{splice}[^\n$@\[\]] STRING_GROW;
505 }
506
507 <SC_CHARACTER>
508 {
509 "'" STRING_GROW; BEGIN context_state;
510 \n unexpected_newline (token_start, "'"); BEGIN context_state;
511 <<EOF>> unexpected_eof (token_start, "'"); BEGIN context_state;
512 }
513
514 <SC_STRING>
515 {
516 "\"" STRING_GROW; BEGIN context_state;
517 \n unexpected_newline (token_start, "\""); BEGIN context_state;
518 <<EOF>> unexpected_eof (token_start, "\""); BEGIN context_state;
519 }
520
521
522 /*---------------------------------------------------.
523 | Strings, comments etc. can be found in user code. |
524 `---------------------------------------------------*/
525
526 <SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
527 {
528 "'" {
529 STRING_GROW;
530 context_state = YY_START;
531 token_start = loc->start;
532 BEGIN SC_CHARACTER;
533 }
534 "\"" {
535 STRING_GROW;
536 context_state = YY_START;
537 token_start = loc->start;
538 BEGIN SC_STRING;
539 }
540 "/"{splice}"*" {
541 STRING_GROW;
542 context_state = YY_START;
543 token_start = loc->start;
544 BEGIN SC_COMMENT;
545 }
546 "/"{splice}"/" {
547 STRING_GROW;
548 context_state = YY_START;
549 BEGIN SC_LINE_COMMENT;
550 }
551 }
552
553
554 /*---------------------------------------------------------------.
555 | Scanning after %union etc., possibly followed by white space. |
556 | For %union only, allow arbitrary C code to appear before the |
557 | following brace, as an extension to POSIX. |
558 `---------------------------------------------------------------*/
559
560 <SC_PRE_CODE>
561 {
562 . {
563 bool valid = yytext[0] == '{' || token_type == PERCENT_UNION;
564 scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0);
565 yyless (0);
566
567 if (valid)
568 {
569 braces_level = -1;
570 code_start = loc->start;
571 BEGIN SC_BRACED_CODE;
572 }
573 else
574 {
575 complain_at (*loc, _("missing `{' in %s"),
576 token_name (token_type));
577 obstack_sgrow (&obstack_for_string, "{}");
578 STRING_FINISH;
579 val->chars = last_string;
580 BEGIN INITIAL;
581 return token_type;
582 }
583 }
584
585 <<EOF>> unexpected_eof (scanner_cursor, "{}"); BEGIN INITIAL;
586 }
587
588
589 /*---------------------------------------------------------------.
590 | Scanning some code in braces (%union and actions). The initial |
591 | "{" is already eaten. |
592 `---------------------------------------------------------------*/
593
594 <SC_BRACED_CODE>
595 {
596 "{"|"<"{splice}"%" STRING_GROW; braces_level++;
597 "%"{splice}">" STRING_GROW; braces_level--;
598 "}" {
599 bool outer_brace = --braces_level < 0;
600
601 /* As an undocumented Bison extension, append `;' before the last
602 brace in braced code, so that the user code can omit trailing
603 `;'. But do not append `;' if emulating Yacc, since Yacc does
604 not append one.
605
606 FIXME: Bison should warn if a semicolon seems to be necessary
607 here, and should omit the semicolon if it seems unnecessary
608 (e.g., after ';', '{', or '}', each followed by comments or
609 white space). Such a warning shouldn't depend on --yacc; it
610 should depend on a new --pedantic option, which would cause
611 Bison to warn if it detects an extension to POSIX. --pedantic
612 should also diagnose other Bison extensions like %yacc.
613 Perhaps there should also be a GCC-style --pedantic-errors
614 option, so that such warnings are diagnosed as errors. */
615 if (outer_brace && token_type == BRACED_CODE && ! yacc_flag)
616 obstack_1grow (&obstack_for_string, ';');
617
618 obstack_1grow (&obstack_for_string, '}');
619
620 if (outer_brace)
621 {
622 STRING_FINISH;
623 loc->start = code_start;
624 val->chars = last_string;
625 increment_rule_length (*loc);
626 last_braced_code_loc = *loc;
627 BEGIN INITIAL;
628 return token_type;
629 }
630 }
631
  /* Tokenize `<<%' correctly (as `<<' `%') rather than incorrectly
     (as `<' `<%').  */
634 "<"{splice}"<" STRING_GROW;
635
636 "$"("<"{tag}">")?(-?[0-9]+|"$") handle_dollar (token_type, yytext, *loc);
637 "@"(-?[0-9]+|"$") handle_at (token_type, yytext, *loc);
638
639 "$" {
640 warn_at (*loc, _("stray `$'"));
641 obstack_sgrow (&obstack_for_string, "$][");
642 }
643 "@" {
644 warn_at (*loc, _("stray `@'"));
645 obstack_sgrow (&obstack_for_string, "@@");
646 }
647
648 <<EOF>> unexpected_eof (code_start, "}"); BEGIN INITIAL;
649 }
650
651
652 /*--------------------------------------------------------------.
653 | Scanning some prologue: from "%{" (already scanned) to "%}". |
654 `--------------------------------------------------------------*/
655
656 <SC_PROLOGUE>
657 {
658 "%}" {
659 STRING_FINISH;
660 loc->start = code_start;
661 val->chars = last_string;
662 BEGIN INITIAL;
663 return PROLOGUE;
664 }
665
666 <<EOF>> unexpected_eof (code_start, "%}"); BEGIN INITIAL;
667 }
668
669
670 /*---------------------------------------------------------------.
671 | Scanning the epilogue (everything after the second "%%", which |
672 | has already been eaten). |
673 `---------------------------------------------------------------*/
674
675 <SC_EPILOGUE>
676 {
677 <<EOF>> {
678 STRING_FINISH;
679 loc->start = code_start;
680 val->chars = last_string;
681 BEGIN INITIAL;
682 return EPILOGUE;
683 }
684 }
685
686
687 /*-----------------------------------------.
688 | Escape M4 quoting characters in C code. |
689 `-----------------------------------------*/
690
691 <SC_COMMENT,SC_LINE_COMMENT,SC_STRING,SC_CHARACTER,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
692 {
693 \$ obstack_sgrow (&obstack_for_string, "$][");
694 \@ obstack_sgrow (&obstack_for_string, "@@");
695 \[ obstack_sgrow (&obstack_for_string, "@{");
696 \] obstack_sgrow (&obstack_for_string, "@}");
697 }
698
699
700 /*-----------------------------------------------------.
701 | By default, grow the string obstack with the input. |
702 `-----------------------------------------------------*/
703
704 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>. |
705 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>\n STRING_GROW;
706
707 %%
708
/* The maximum number of semantic values to the left of a handle
   (those referenced by $0, $-1, etc.) that are required by the
   semantic actions of this grammar.  */
712 int max_left_semantic_context = 0;
713
/* Return COLUMN advanced by the width of some text, saturating at
   INT_MAX.  When BUF is null the width is BUFSIZE itself (which must
   then be less than INT_MAX); otherwise it is
   mbsnwidth (BUF, BUFSIZE, 0).  INT_MAX is returned when an overflow
   occurs, or might occur without being detectable.  COLUMN is assumed
   to be nonnegative.  */

static inline int
add_column_width (int column, char const *buf, size_t bufsize)
{
  /* How far COLUMN can still grow before hitting INT_MAX.  */
  unsigned int room = INT_MAX - column;
  size_t extra = bufsize;

  if (buf != NULL)
    {
      /* Refuse buffers so large that their width cannot be trusted.  */
      if (bufsize >= INT_MAX / 2)
        return INT_MAX;
      extra = mbsnwidth (buf, bufsize, 0);
    }

  if (room < extra)
    return INT_MAX;
  return column + extra;
}
736
737 /* Set *LOC and adjust scanner cursor to account for token TOKEN of
738 size SIZE. */
739
740 static void
741 adjust_location (location *loc, char const *token, size_t size)
742 {
743 int line = scanner_cursor.line;
744 int column = scanner_cursor.column;
745 char const *p0 = token;
746 char const *p = token;
747 char const *lim = token + size;
748
749 loc->start = scanner_cursor;
750
751 for (p = token; p < lim; p++)
752 switch (*p)
753 {
754 case '\n':
755 line += line < INT_MAX;
756 column = 1;
757 p0 = p + 1;
758 break;
759
760 case '\t':
761 column = add_column_width (column, p0, p - p0);
762 column = add_column_width (column, NULL, 8 - ((column - 1) & 7));
763 p0 = p + 1;
764 break;
765
766 default:
767 break;
768 }
769
770 scanner_cursor.line = line;
771 scanner_cursor.column = column = add_column_width (column, p0, p - p0);
772
773 loc->end = scanner_cursor;
774
775 if (line == INT_MAX && loc->start.line != INT_MAX)
776 warn_at (*loc, _("line number overflow"));
777 if (column == INT_MAX && loc->start.column != INT_MAX)
778 warn_at (*loc, _("column number overflow"));
779 }
780
781
/* Fill BUF with up to SIZE bytes read from FP and return how many
   bytes BUF holds afterwards.  Carriage returns never reach the
   caller: both "\r\n" and a lone '\r' are delivered as a single
   '\n'.  */

static size_t
no_cr_read (FILE *fp, char *buf, size_t size)
{
  size_t const nread = fread (buf, 1, size, fp);
  char const *const end = buf + nread;
  char const *src;
  char *dst;

  if (nread == 0)
    return nread;

  /* Nothing to rewrite unless the chunk contains a '\r'.  */
  dst = memchr (buf, '\r', nread);
  if (dst == NULL)
    return nread;

  /* Compact the buffer in place, starting at the first '\r'.  */
  for (src = dst; src < end; )
    {
      char c = *src++;

      if (c == '\r')
        {
          c = '\n';
          if (src == end)
            {
              /* The '\r' is the last byte of this chunk, so look one
                 byte ahead in the stream: swallow a following '\n',
                 push anything else back.  */
              int next = getc (fp);
              if (next != '\n')
                ungetc (next, fp);
            }
          else if (*src == '\n')
            src++;
        }

      *dst++ = c;
    }

  return dst - buf;
}
827
828
829 /*------------------------------------------------------------------.
830 | TEXT is pointing to a wannabee semantic value (i.e., a `$'). |
831 | |
832 | Possible inputs: $[<TYPENAME>]($|integer) |
833 | |
834 | Output to OBSTACK_FOR_STRING a reference to this semantic value. |
835 `------------------------------------------------------------------*/
836
837 static inline bool
838 handle_action_dollar (char *text, location loc)
839 {
840 const char *type_name = NULL;
841 char *cp = text + 1;
842
843 if (! current_rule)
844 return false;
845
846 /* Get the type name if explicit. */
847 if (*cp == '<')
848 {
849 type_name = ++cp;
850 while (*cp != '>')
851 ++cp;
852 *cp = '\0';
853 ++cp;
854 }
855
856 if (*cp == '$')
857 {
858 if (!type_name)
859 type_name = symbol_list_n_type_name_get (current_rule, loc, 0);
860 if (!type_name && typed)
861 complain_at (loc, _("$$ of `%s' has no declared type"),
862 current_rule->sym->tag);
863 if (!type_name)
864 type_name = "";
865 obstack_fgrow1 (&obstack_for_string,
866 "]b4_lhs_value([%s])[", type_name);
867 current_rule->used = true;
868 }
869 else
870 {
871 long int num = strtol (cp, NULL, 10);
872
873 if (1 - INT_MAX + rule_length <= num && num <= rule_length)
874 {
875 int n = num;
876 if (max_left_semantic_context < 1 - n)
877 max_left_semantic_context = 1 - n;
878 if (!type_name && 0 < n)
879 type_name = symbol_list_n_type_name_get (current_rule, loc, n);
880 if (!type_name && typed)
881 complain_at (loc, _("$%d of `%s' has no declared type"),
882 n, current_rule->sym->tag);
883 if (!type_name)
884 type_name = "";
885 obstack_fgrow3 (&obstack_for_string,
886 "]b4_rhs_value(%d, %d, [%s])[",
887 rule_length, n, type_name);
888 symbol_list_n_used_set (current_rule, n, true);
889 }
890 else
891 complain_at (loc, _("integer out of range: %s"), quote (text));
892 }
893
894 return true;
895 }
896
897
898 /*----------------------------------------------------------------.
899 | Map `$?' onto the proper M4 symbol, depending on its TOKEN_TYPE |
900 | (are we in an action?). |
901 `----------------------------------------------------------------*/
902
903 static void
904 handle_dollar (int token_type, char *text, location loc)
905 {
906 switch (token_type)
907 {
908 case BRACED_CODE:
909 if (handle_action_dollar (text, loc))
910 return;
911 break;
912
913 case PERCENT_DESTRUCTOR:
914 case PERCENT_INITIAL_ACTION:
915 case PERCENT_PRINTER:
916 if (text[1] == '$')
917 {
918 obstack_sgrow (&obstack_for_string, "]b4_dollar_dollar[");
919 return;
920 }
921 break;
922
923 default:
924 break;
925 }
926
927 complain_at (loc, _("invalid value: %s"), quote (text));
928 }
929
930
931 /*------------------------------------------------------.
932 | TEXT is a location token (i.e., a `@...'). Output to |
933 | OBSTACK_FOR_STRING a reference to this location. |
934 `------------------------------------------------------*/
935
936 static inline bool
937 handle_action_at (char *text, location loc)
938 {
939 char *cp = text + 1;
940 locations_flag = true;
941
942 if (! current_rule)
943 return false;
944
945 if (*cp == '$')
946 obstack_sgrow (&obstack_for_string, "]b4_lhs_location[");
947 else
948 {
949 long int num = strtol (cp, NULL, 10);
950
951 if (1 - INT_MAX + rule_length <= num && num <= rule_length)
952 {
953 int n = num;
954 obstack_fgrow2 (&obstack_for_string, "]b4_rhs_location(%d, %d)[",
955 rule_length, n);
956 }
957 else
958 complain_at (loc, _("integer out of range: %s"), quote (text));
959 }
960
961 return true;
962 }
963
964
965 /*----------------------------------------------------------------.
966 | Map `@?' onto the proper M4 symbol, depending on its TOKEN_TYPE |
967 | (are we in an action?). |
968 `----------------------------------------------------------------*/
969
970 static void
971 handle_at (int token_type, char *text, location loc)
972 {
973 switch (token_type)
974 {
975 case BRACED_CODE:
976 handle_action_at (text, loc);
977 return;
978
979 case PERCENT_INITIAL_ACTION:
980 case PERCENT_DESTRUCTOR:
981 case PERCENT_PRINTER:
982 if (text[1] == '$')
983 {
984 obstack_sgrow (&obstack_for_string, "]b4_at_dollar[");
985 return;
986 }
987 break;
988
989 default:
990 break;
991 }
992
993 complain_at (loc, _("invalid value: %s"), quote (text));
994 }
995
996
997 /*------------------------------------------------------.
998 | Scan NUMBER for a base-BASE integer at location LOC. |
999 `------------------------------------------------------*/
1000
1001 static unsigned long int
1002 scan_integer (char const *number, int base, location loc)
1003 {
1004 verify (INT_MAX < ULONG_MAX);
1005 unsigned long int num = strtoul (number, NULL, base);
1006
1007 if (INT_MAX < num)
1008 {
1009 complain_at (loc, _("integer out of range: %s"), quote (number));
1010 num = INT_MAX;
1011 }
1012
1013 return num;
1014 }
1015
1016
/* Convert the universal character name UCN (`\uXXXX' or
   `\UXXXXXXXX'; the hexadecimal digits start at UCN + 2) to a
   single-byte character and return that character.  Return -1 if
   UCN does not correspond to a single-byte character.  */

static int
convert_ucn_to_byte (char const *ucn)
{
  verify (UCHAR_MAX <= INT_MAX);
  unsigned long int code = strtoul (ucn + 2, NULL, 16);
  /* The result is kept in an int so that the -1 "no such byte"
     marker never has to round-trip through an unsigned type.  */
  int byte;

  /* FIXME: Currently we assume Unicode-compatible unibyte characters
     on ASCII hosts (i.e., Latin-1 on hosts with 8-bit bytes).  On
     non-ASCII hosts we support only the portable C character set.
     These limitations should be removed once we add support for
     multibyte characters.  */

  if (UCHAR_MAX < code)
    return -1;

  byte = code;

#if ! ('$' == 0x24 && '@' == 0x40 && '`' == 0x60 && '~' == 0x7e)
  {
    /* A non-ASCII host.  Use CODE to index into a table of the C
       basic execution character set, which is guaranteed to exist on
       all Standard C platforms.  This table also includes '$', '@',
       and '`', which are not in the basic execution character set but
       which are unibyte characters on all the platforms that we know
       about.  */
    static signed char const table[] =
      {
        '\0',   -1,   -1,   -1,   -1,   -1,   -1, '\a',
        '\b', '\t', '\n', '\v', '\f', '\r',   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
         ' ',  '!',  '"',  '#',  '$',  '%',  '&', '\'',
         '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
         '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
         '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
         '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
         'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
         'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
         'X',  'Y',  'Z',  '[', '\\',  ']',  '^',  '_',
         '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
         'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
         'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
         'x',  'y',  'z',  '{',  '|',  '}',  '~'
      };

    byte = code < sizeof table ? table[code] : -1;
  }
#endif

  return byte;
}
1072
1073
1074 /*----------------------------------------------------------------.
1075 | Handle `#line INT "FILE"'. ARGS has already skipped `#line '. |
1076 `----------------------------------------------------------------*/
1077
1078 static void
1079 handle_syncline (char *args, location loc)
1080 {
1081 char *after_num;
1082 unsigned long int lineno = strtoul (args, &after_num, 10);
1083 char *file = strchr (after_num, '"') + 1;
1084 *strchr (file, '"') = '\0';
1085 if (INT_MAX <= lineno)
1086 {
1087 warn_at (loc, _("line number overflow"));
1088 lineno = INT_MAX;
1089 }
1090 scanner_cursor.file = current_file = uniqstr_new (file);
1091 scanner_cursor.line = lineno;
1092 scanner_cursor.column = 1;
1093 }
1094
1095
1096 /*---------------------------------.
1097 | Report a rule that is too long. |
1098 `---------------------------------*/
1099
1100 static void
1101 rule_length_overflow (location loc)
1102 {
1103 fatal_at (loc, _("rule is too long"));
1104 }
1105
1106
1107 /*----------------------------------------------------------------.
1108 | For a token or comment starting at START, report message MSGID, |
1109 | which should say that an end marker was found before |
1110 | the expected TOKEN_END. |
1111 `----------------------------------------------------------------*/
1112
1113 static void
1114 unexpected_end (boundary start, char const *msgid, char const *token_end)
1115 {
1116 location loc;
1117 loc.start = start;
1118 loc.end = scanner_cursor;
1119 complain_at (loc, _(msgid), token_end);
1120 }
1121
1122
1123 /*------------------------------------------------------------------------.
1124 | Report an unexpected EOF in a token or comment starting at START. |
1125 | An end of file was encountered and the expected TOKEN_END was missing. |
1126 `------------------------------------------------------------------------*/
1127
1128 static void
1129 unexpected_eof (boundary start, char const *token_end)
1130 {
1131 unexpected_end (start, N_("missing `%s' at end of file"), token_end);
1132 }
1133
1134
1135 /*----------------------------------------.
1136 | Likewise, but for unexpected newlines. |
1137 `----------------------------------------*/
1138
1139 static void
1140 unexpected_newline (boundary start, char const *token_end)
1141 {
1142 unexpected_end (start, N_("missing `%s' at end of line"), token_end);
1143 }
1144
1145
1146 /*-------------------------.
1147 | Initialize the scanner. |
1148 `-------------------------*/
1149
1150 void
1151 scanner_initialize (void)
1152 {
1153 obstack_init (&obstack_for_string);
1154 }
1155
1156
1157 /*-----------------------------------------------.
1158 | Free all the memory allocated to the scanner. |
1159 `-----------------------------------------------*/
1160
1161 void
1162 scanner_free (void)
1163 {
1164 obstack_free (&obstack_for_string, 0);
1165 /* Reclaim Flex's buffers. */
1166 yy_delete_buffer (YY_CURRENT_BUFFER);
1167 }