1 /* Bison Grammar Scanner -*- C -*-
3 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
4 Free Software Foundation, Inc.
6 This file is part of Bison, the GNU Compiler Compiler.
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 %option debug nodefault nounput noyywrap never-interactive
22 %option prefix="gram_" outfile="lex.yy.c"
25 /* Work around a bug in flex 2.5.31. See Debian bug 333231
26 <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=333231>. */
30 #define FLEX_PREFIX(Id) gram_ ## Id
31 #include <src/flex-scanner.h>
33 #include <src/complain.h>
34 #include <src/files.h>
37 #include <src/reader.h>
38 #include <src/uniqstr.h>
43 #include <src/scan-gram.h>
45 #define YY_DECL GRAM_LEX_DECL
47 #define YY_USER_INIT \
48 code_start = scanner_cursor = loc->start; \
50 /* Location of scanner cursor. */
51 static boundary scanner_cursor;
53 #define YY_USER_ACTION location_compute (loc, &scanner_cursor, yytext, yyleng);
55 static size_t no_cr_read (FILE *, char *, size_t);
56 #define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
58 #define RETURN_PERCENT_FLAG(Value) \
60 val->uniqstr = uniqstr_new (Value); \
61 return PERCENT_FLAG; \
65 /* A string representing the most recently saved token. */
66 static char *last_string;
69 gram_scanner_last_string_free (void)
74 static void handle_syncline (char *, location);
75 static unsigned long int scan_integer (char const *p, int base, location loc);
76 static int convert_ucn_to_byte (char const *hex_text);
77 static void unexpected_eof (boundary, char const *);
78 static void unexpected_newline (boundary, char const *);
81 /* A C-like comment in directives/rules. */
83 /* Strings and characters in directives/rules. */
84 %x SC_ESCAPED_STRING SC_ESCAPED_CHARACTER
85 /* An identifier was just read in directives/rules. Special state
86 to capture the sequence `identifier :'. */
87 %x SC_AFTER_IDENTIFIER
88 /* A complex tag, with nested angle brackets. */
91 /* Three types of user code:
92 - prologue (code between `%{' `%}' in the first section, before %%);
93 - actions, printers, union, etc. (between braces in the middle section);
94 - epilogue (everything after the second %%). */
95 %x SC_PROLOGUE SC_BRACED_CODE SC_EPILOGUE
96 /* C and C++ comments in code. */
97 %x SC_COMMENT SC_LINE_COMMENT
98 /* Strings and characters in code. */
99 %x SC_STRING SC_CHARACTER
101 letter [-.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
102 id {letter}({letter}|[0-9])*
106 /* POSIX says that a tag must be both an id and a C union member, but
107 historically almost any character is allowed in a tag. We disallow
108 NUL, as this simplifies our implementation. We disallow angle
109 brackets so that we can match them in nested pairs: several
110 languages use them for generics/template types. */
113 /* Zero or more instances of backslash-newline. Following GCC, allow
114 white space between the backslash and the newline. */
115 splice (\\[ \f\t\v]*\n)*
119 /* Nesting level. Either for nested braces, or nested angle brackets
121 int nesting IF_LINT (= 0);
123 /* Parent context state, when applicable. */
124 int context_state IF_LINT (= 0);
126 /* Location of most recent identifier, when applicable. */
127 location id_loc IF_LINT (= empty_location);
129 /* Where containing code started, when applicable. Its initial
130 value is relevant only when yylex is invoked in the SC_EPILOGUE
132 boundary code_start = scanner_cursor;
134 /* Where containing comment or string or character literal started,
136 boundary token_start IF_LINT (= scanner_cursor);
140 /*-----------------------.
141 | Scanning white space. |
142 `-----------------------*/
144 <INITIAL,SC_AFTER_IDENTIFIER>
146 /* Comments and white space. */
147 "," warn_at (*loc, _("stray `,' treated as white space"));
151 token_start = loc->start;
152 context_state = YY_START;
153 BEGIN SC_YACC_COMMENT;
156 /* #line directives are not documented, and may be withdrawn or
157 modified in future versions of Bison. */
158 ^"#line "{int}" \"".*"\"\n" {
159 handle_syncline (yytext + sizeof "#line " - 1, *loc);
164 /*----------------------------.
165 | Scanning Bison directives. |
166 `----------------------------*/
168 /* For directives that are also command line options, the regex must be
170 after "[-_]"s are removed, and the directive must match the --long
171 option name, with a single string argument. Otherwise, add exceptions
172 to ../build-aux/cross-options.pl. */
176 "%binary" return PERCENT_NONASSOC;
177 "%code" return PERCENT_CODE;
178 "%debug" RETURN_PERCENT_FLAG("parse.trace");
179 "%default"[-_]"prec" return PERCENT_DEFAULT_PREC;
180 "%define" return PERCENT_DEFINE;
181 "%defines" return PERCENT_DEFINES;
182 "%destructor" return PERCENT_DESTRUCTOR;
183 "%dprec" return PERCENT_DPREC;
184 "%error"[-_]"verbose" RETURN_PERCENT_FLAG("error-verbose");
185 "%expect" return PERCENT_EXPECT;
186 "%expect"[-_]"rr" return PERCENT_EXPECT_RR;
187 "%file-prefix" return PERCENT_FILE_PREFIX;
188 "%fixed"[-_]"output"[-_]"files" return PERCENT_YACC;
189 "%initial-action" return PERCENT_INITIAL_ACTION;
190 "%glr-parser" return PERCENT_GLR_PARSER;
191 "%language" return PERCENT_LANGUAGE;
192 "%left" return PERCENT_LEFT;
193 "%lex-param" return PERCENT_LEX_PARAM;
194 "%locations" RETURN_PERCENT_FLAG("locations");
195 "%merge" return PERCENT_MERGE;
196 "%name"[-_]"prefix" return PERCENT_NAME_PREFIX;
197 "%no"[-_]"default"[-_]"prec" return PERCENT_NO_DEFAULT_PREC;
198 "%no"[-_]"lines" return PERCENT_NO_LINES;
199 "%nonassoc" return PERCENT_NONASSOC;
200 "%nondeterministic-parser" return PERCENT_NONDETERMINISTIC_PARSER;
201 "%nterm" return PERCENT_NTERM;
202 "%output" return PERCENT_OUTPUT;
203 "%parse-param" return PERCENT_PARSE_PARAM;
204 "%prec" return PERCENT_PREC;
205 "%precedence" return PERCENT_PRECEDENCE;
206 "%printer" return PERCENT_PRINTER;
207 "%pure"[-_]"parser" RETURN_PERCENT_FLAG("api.pure");
208 "%require" return PERCENT_REQUIRE;
209 "%right" return PERCENT_RIGHT;
210 "%skeleton" return PERCENT_SKELETON;
211 "%start" return PERCENT_START;
212 "%term" return PERCENT_TOKEN;
213 "%token" return PERCENT_TOKEN;
214 "%token"[-_]"table" return PERCENT_TOKEN_TABLE;
215 "%type" return PERCENT_TYPE;
216 "%union" return PERCENT_UNION;
217 "%verbose" return PERCENT_VERBOSE;
218 "%yacc" return PERCENT_YACC;
221 complain_at (*loc, _("invalid directive: %s"), quote (yytext));
226 ";" return SEMICOLON;
229 val->uniqstr = uniqstr_new (yytext);
231 BEGIN SC_AFTER_IDENTIFIER;
235 val->integer = scan_integer (yytext, 10, *loc);
238 0[xX][0-9abcdefABCDEF]+ {
239 val->integer = scan_integer (yytext, 16, *loc);
243 /* Identifiers may not start with a digit. Yet, don't silently
244 accept "1FOO" as "1 FOO". */
246 complain_at (*loc, _("invalid identifier: %s"), quote (yytext));
249 /* Characters. We don't check there is only one. */
250 "'" STRING_GROW; token_start = loc->start; BEGIN SC_ESCAPED_CHARACTER;
253 "\"" token_start = loc->start; BEGIN SC_ESCAPED_STRING;
256 "%{" code_start = loc->start; BEGIN SC_PROLOGUE;
258 /* Code in between braces. */
262 code_start = loc->start;
263 BEGIN SC_BRACED_CODE;
267 "<*>" return TAG_ANY;
268 "<>" return TAG_NONE;
270 obstack_grow (&obstack_for_string, yytext + 1, yyleng - 2);
272 val->uniqstr = uniqstr_new (last_string);
278 token_start = loc->start;
283 static int percent_percent_count;
284 if (++percent_percent_count == 2)
286 return PERCENT_PERCENT;
290 complain_at (*loc, _("invalid character: %s"), quote (yytext));
294 loc->start = loc->end = scanner_cursor;
300 /*--------------------------------------------------------------.
301 | Supporting \0 complexifies our implementation for no expected |
303 `--------------------------------------------------------------*/
305 <SC_ESCAPED_CHARACTER,SC_ESCAPED_STRING,SC_TAG>
307 \0 complain_at (*loc, _("invalid null character"));
311 /*-----------------------------------------------------------------.
312 | Scanning after an identifier, checking whether a colon is next. |
313 `-----------------------------------------------------------------*/
315 <SC_AFTER_IDENTIFIER>
323 scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0);
337 /*---------------------------------------------------------------.
338 | Scanning a Yacc comment. The initial `/ *' is already eaten. |
339 `---------------------------------------------------------------*/
343 "*/" BEGIN context_state;
345 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
349 /*------------------------------------------------------------.
350 | Scanning a C comment. The initial `/ *' is already eaten. |
351 `------------------------------------------------------------*/
355 "*"{splice}"/" STRING_GROW; BEGIN context_state;
356 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
360 /*--------------------------------------------------------------.
361 | Scanning a line comment. The initial `//' is already eaten. |
362 `--------------------------------------------------------------*/
366 "\n" STRING_GROW; BEGIN context_state;
367 {splice} STRING_GROW;
368 <<EOF>> BEGIN context_state;
372 /*------------------------------------------------.
373 | Scanning a Bison string, including its escapes. |
374 | The initial quote is already eaten. |
375 `------------------------------------------------*/
380 if (yytext[0] == '\n')
381 unexpected_newline (token_start, "\"");
383 loc->start = token_start;
384 val->chars = last_string;
389 unexpected_eof (token_start, "\"");
391 loc->start = token_start;
392 val->chars = last_string;
398 /*----------------------------------------------------------.
399 | Scanning a Bison character literal, decoding its escapes. |
400 | The initial quote is already eaten. |
401 `----------------------------------------------------------*/
403 <SC_ESCAPED_CHARACTER>
406 if (yytext[0] == '\n')
407 unexpected_newline (token_start, "'");
410 loc->start = token_start;
411 val->character = last_string[1];
417 unexpected_eof (token_start, "'");
419 loc->start = token_start;
420 if (strlen (last_string) > 1)
421 val->character = last_string[1];
423 val->character = last_string[0];
430 /*-----------------------------------------------------------.
431 | Scanning a Bison nested tag. The initial angle bracket is |
433 `-----------------------------------------------------------*/
442 loc->start = token_start;
443 val->uniqstr = uniqstr_new (last_string);
452 "<"+ STRING_GROW; nesting += yyleng;
455 unexpected_eof (token_start, ">");
457 loc->start = token_start;
458 val->uniqstr = uniqstr_new (last_string);
465 /*----------------------------.
466 | Decode escaped characters. |
467 `----------------------------*/
469 <SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>
472 unsigned long int c = strtoul (yytext + 1, NULL, 8);
474 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
476 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
478 obstack_1grow (&obstack_for_string, c);
481 \\x[0-9abcdefABCDEF]+ {
482 verify (UCHAR_MAX < ULONG_MAX);
483 unsigned long int c = strtoul (yytext + 2, NULL, 16);
485 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
487 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
489 obstack_1grow (&obstack_for_string, c);
492 \\a obstack_1grow (&obstack_for_string, '\a');
493 \\b obstack_1grow (&obstack_for_string, '\b');
494 \\f obstack_1grow (&obstack_for_string, '\f');
495 \\n obstack_1grow (&obstack_for_string, '\n');
496 \\r obstack_1grow (&obstack_for_string, '\r');
497 \\t obstack_1grow (&obstack_for_string, '\t');
498 \\v obstack_1grow (&obstack_for_string, '\v');
500 /* \\[\"\'?\\] would be shorter, but it confuses xgettext. */
501 \\("\""|"'"|"?"|"\\") obstack_1grow (&obstack_for_string, yytext[1]);
503 \\(u|U[0-9abcdefABCDEF]{4})[0-9abcdefABCDEF]{4} {
504 int c = convert_ucn_to_byte (yytext);
506 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
508 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
510 obstack_1grow (&obstack_for_string, c);
513 complain_at (*loc, _("unrecognized escape sequence: %s"), quote (yytext));
518 /*--------------------------------------------.
519 | Scanning user-code characters and strings. |
520 `--------------------------------------------*/
522 <SC_CHARACTER,SC_STRING>
524 {splice}|\\{splice}[^\n\[\]] STRING_GROW;
529 "'" STRING_GROW; BEGIN context_state;
530 \n unexpected_newline (token_start, "'"); BEGIN context_state;
531 <<EOF>> unexpected_eof (token_start, "'"); BEGIN context_state;
536 "\"" STRING_GROW; BEGIN context_state;
537 \n unexpected_newline (token_start, "\""); BEGIN context_state;
538 <<EOF>> unexpected_eof (token_start, "\""); BEGIN context_state;
542 /*---------------------------------------------------.
543 | Strings, comments etc. can be found in user code. |
544 `---------------------------------------------------*/
546 <SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
550 context_state = YY_START;
551 token_start = loc->start;
556 context_state = YY_START;
557 token_start = loc->start;
562 context_state = YY_START;
563 token_start = loc->start;
568 context_state = YY_START;
569 BEGIN SC_LINE_COMMENT;
575 /*-----------------------------------------------------------.
576 | Scanning some code in braces (actions). The initial "{" is |
578 `-----------------------------------------------------------*/
582 "{"|"<"{splice}"%" STRING_GROW; nesting++;
583 "%"{splice}">" STRING_GROW; nesting--;
585 obstack_1grow (&obstack_for_string, '}');
591 loc->start = code_start;
592 val->code = last_string;
598 /* Tokenize `<<%' correctly (as `<<' `%') rather than incorrectly
600 "<"{splice}"<" STRING_GROW;
603 unexpected_eof (code_start, "}");
605 loc->start = code_start;
606 val->code = last_string;
613 /*--------------------------------------------------------------.
614 | Scanning some prologue: from "%{" (already scanned) to "%}". |
615 `--------------------------------------------------------------*/
621 loc->start = code_start;
622 val->chars = last_string;
628 unexpected_eof (code_start, "%}");
630 loc->start = code_start;
631 val->chars = last_string;
638 /*---------------------------------------------------------------.
639 | Scanning the epilogue (everything after the second "%%", which |
640 | has already been eaten). |
641 `---------------------------------------------------------------*/
647 loc->start = code_start;
648 val->chars = last_string;
655 /*-----------------------------------------------------.
656 | By default, grow the string obstack with the input. |
657 `-----------------------------------------------------*/
659 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>. |
660 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>\n STRING_GROW;
664 /* Read bytes from FP into buffer BUF of size SIZE. Return the
665 number of bytes read. Remove '\r' from input, treating \r\n
666 and isolated \r as \n. */
669 no_cr_read (FILE *fp, char *buf, size_t size)
671 size_t bytes_read = fread (buf, 1, size, fp);
674 char *w = memchr (buf, '\r', bytes_read);
678 char const *lim = buf + bytes_read;
682 /* Found an '\r'. Treat it like '\n', but ignore any
683 '\n' that immediately follows. */
688 if (ch != '\n' && ungetc (ch, fp) != ch)
694 /* Copy until the next '\r'. */
700 while ((*w++ = *r++) != '\r');
712 /*------------------------------------------------------.
713 | Scan NUMBER for a base-BASE integer at location LOC. |
714 `------------------------------------------------------*/
716 static unsigned long int
717 scan_integer (char const *number, int base, location loc)
719 verify (INT_MAX < ULONG_MAX);
720 unsigned long int num = strtoul (number, NULL, base);
724 complain_at (loc, _("integer out of range: %s"), quote (number));
732 /*------------------------------------------------------------------.
733 | Convert universal character name UCN to a single-byte character, |
734 | and return that character. Return -1 if UCN does not correspond |
735 | to a single-byte character. |
736 `------------------------------------------------------------------*/
739 convert_ucn_to_byte (char const *ucn)
741 verify (UCHAR_MAX <= INT_MAX);
742 unsigned long int code = strtoul (ucn + 2, NULL, 16);
744 /* FIXME: Currently we assume Unicode-compatible unibyte characters
745 on ASCII hosts (i.e., Latin-1 on hosts with 8-bit bytes). On
746 non-ASCII hosts we support only the portable C character set.
747 These limitations should be removed once we add support for
748 multibyte characters. */
750 if (UCHAR_MAX < code)
753 #if ! ('$' == 0x24 && '@' == 0x40 && '`' == 0x60 && '~' == 0x7e)
755 /* A non-ASCII host. Use CODE to index into a table of the C
756 basic execution character set, which is guaranteed to exist on
757 all Standard C platforms. This table also includes '$', '@',
758 and '`', which are not in the basic execution character set but
759 which are unibyte characters on all the platforms that we know
761 static signed char const table[] =
763 '\0', -1, -1, -1, -1, -1, -1, '\a',
764 '\b', '\t', '\n', '\v', '\f', '\r', -1, -1,
765 -1, -1, -1, -1, -1, -1, -1, -1,
766 -1, -1, -1, -1, -1, -1, -1, -1,
767 ' ', '!', '"', '#', '$', '%', '&', '\'',
768 '(', ')', '*', '+', ',', '-', '.', '/',
769 '0', '1', '2', '3', '4', '5', '6', '7',
770 '8', '9', ':', ';', '<', '=', '>', '?',
771 '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
772 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
773 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
774 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
775 '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
776 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
777 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
778 'x', 'y', 'z', '{', '|', '}', '~'
781 code = code < sizeof table ? table[code] : -1;
789 /*----------------------------------------------------------------.
790 | Handle `#line INT "FILE"'. ARGS has already skipped `#line '. |
791 `----------------------------------------------------------------*/
794 handle_syncline (char *args, location loc)
797 unsigned long int lineno = strtoul (args, &after_num, 10);
798 char *file = strchr (after_num, '"') + 1;
799 *strchr (file, '"') = '\0';
800 if (INT_MAX <= lineno)
802 warn_at (loc, _("line number overflow"));
805 current_file = uniqstr_new (file);
806 boundary_set (&scanner_cursor, current_file, lineno, 1);
810 /*----------------------------------------------------------------.
811 | For a token or comment starting at START, report message MSGID, |
812 | which should say that an end marker was found before |
813 | the expected TOKEN_END. |
814 `----------------------------------------------------------------*/
817 unexpected_end (boundary start, char const *msgid, char const *token_end)
821 loc.end = scanner_cursor;
822 complain_at (loc, _(msgid), token_end);
826 /*------------------------------------------------------------------------.
827 | Report an unexpected EOF in a token or comment starting at START. |
828 | An end of file was encountered and the expected TOKEN_END was missing. |
829 `------------------------------------------------------------------------*/
832 unexpected_eof (boundary start, char const *token_end)
834 unexpected_end (start, N_("missing `%s' at end of file"), token_end);
838 /*----------------------------------------.
839 | Likewise, but for unexpected newlines. |
840 `----------------------------------------*/
843 unexpected_newline (boundary start, char const *token_end)
845 unexpected_end (start, N_("missing `%s' at end of line"), token_end);
849 /*-------------------------.
850 | Initialize the scanner. |
851 `-------------------------*/
854 gram_scanner_initialize (void)
856 obstack_init (&obstack_for_string);
860 /*-----------------------------------------------.
861 | Free all the memory allocated to the scanner. |
862 `-----------------------------------------------*/
865 gram_scanner_free (void)
867 obstack_free (&obstack_for_string, 0);
868 /* Reclaim Flex's buffers. */