]> git.saurik.com Git - bison.git/blame - src/scan-gram.l
* m4/cxx.m4 (BISON_TEST_FOR_WORKING_CXX_COMPILER): Check that
[bison.git] / src / scan-gram.l
CommitLineData
e9955c83 1/* Bison Grammar Scanner -*- C -*-
3b1e470c 2
073f9288 3 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
e9955c83
AD
4
5 This file is part of Bison, the GNU Compiler Compiler.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
0fb669f9
PE
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 02110-1301 USA
e9955c83
AD
21*/
22
aa418041 23%option debug nodefault nounput noyywrap never-interactive
e9955c83
AD
24%option prefix="gram_" outfile="lex.yy.c"
25
26%{
4f6e011e
PE
27/* Work around a bug in flex 2.5.31. See Debian bug 333231
28 <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=333231>. */
29#undef gram_wrap
30#define gram_wrap() 1
31
e9955c83 32#include "system.h"
223ff46e
PE
33
34#include <mbswidth.h>
223ff46e
PE
35#include <quote.h>
36
e9955c83 37#include "complain.h"
3f2d73f1 38#include "files.h"
e9955c83
AD
39#include "getargs.h"
40#include "gram.h"
ca407bdf 41#include "quotearg.h"
e9955c83 42#include "reader.h"
223ff46e 43#include "uniqstr.h"
e9955c83 44
3f2d73f1
PE
45#define YY_USER_INIT \
46 do \
47 { \
48 scanner_cursor.file = current_file; \
49 scanner_cursor.line = 1; \
50 scanner_cursor.column = 1; \
379f0ac8 51 code_start = scanner_cursor; \
3f2d73f1
PE
52 } \
53 while (0)
8efe435c 54
dc9701e8
PE
55/* Pacify "gcc -Wmissing-prototypes" when flex 2.5.31 is used. */
56int gram_get_lineno (void);
57FILE *gram_get_in (void);
58FILE *gram_get_out (void);
59int gram_get_leng (void);
60char *gram_get_text (void);
61void gram_set_lineno (int);
62void gram_set_in (FILE *);
63void gram_set_out (FILE *);
64int gram_get_debug (void);
65void gram_set_debug (int);
66int gram_lex_destroy (void);
67
3f2d73f1
PE
68/* Location of scanner cursor. */
69boundary scanner_cursor;
41141c56 70
223ff46e 71static void adjust_location (location *, char const *, size_t);
3f2d73f1 72#define YY_USER_ACTION adjust_location (loc, yytext, yyleng);
d8d3f94a 73
6c30d641 74static size_t no_cr_read (FILE *, char *, size_t);
d8d3f94a
PE
75#define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
76
77
223ff46e 78/* OBSTACK_FOR_STRING -- Used to store all the characters that we need to
44995b2e
AD
79 keep (to construct ID, STRINGS etc.). Use the following macros to
80 use it.
81
41141c56
PE
82 Use STRING_GROW to append what has just been matched, and
83 STRING_FINISH to end the string (it puts the ending 0).
84 STRING_FINISH also stores this string in LAST_STRING, which can be
85 used, and which is used by STRING_FREE to free the last string. */
44995b2e 86
223ff46e 87static struct obstack obstack_for_string;
44995b2e 88
7ec2d4cd 89/* A string representing the most recently saved token. */
6b702268 90char *last_string;
7ec2d4cd 91
6b702268
PE
92/* The location of the most recently saved token, if it was a
93 BRACED_CODE token; otherwise, this has an unspecified value. */
94location last_braced_code_loc;
7ec2d4cd 95
41141c56 96#define STRING_GROW \
223ff46e 97 obstack_grow (&obstack_for_string, yytext, yyleng)
44995b2e 98
41141c56 99#define STRING_FINISH \
44995b2e 100 do { \
223ff46e
PE
101 obstack_1grow (&obstack_for_string, '\0'); \
102 last_string = obstack_finish (&obstack_for_string); \
44995b2e
AD
103 } while (0)
104
41141c56 105#define STRING_FREE \
223ff46e 106 obstack_free (&obstack_for_string, last_string)
e9955c83 107
7ec2d4cd
AD
/* Release the most recently saved token string.  This simply expands
   STRING_FREE, i.e. it frees LAST_STRING from OBSTACK_FOR_STRING.  */
108void
109scanner_last_string_free (void)
110{
41141c56 111 STRING_FREE;
7ec2d4cd 112}
e9955c83 113
efcb44dd
PE
114/* Within well-formed rules, RULE_LENGTH is the number of values in
115 the current rule so far, which says where to find `$0' with respect
116 to the top of the stack. It is not the same as the rule->length in
117 the case of mid rule actions.
118
119 Outside of well-formed rules, RULE_LENGTH has an undefined value. */
120static int rule_length;
121
4517da37
PE
122static void rule_length_overflow (location) __attribute__ ((__noreturn__));
123
124/* Increment the rule length by one, checking for overflow. */
125static inline void
126increment_rule_length (location loc)
127{
128 rule_length++;
129
130 /* Don't allow rule_length == INT_MAX, since that might cause
131 confusion with strtol if INT_MAX == LONG_MAX. */
132 if (rule_length == INT_MAX)
133 rule_length_overflow (loc);
134}
135
624a35e2
PE
136static void handle_dollar (int token_type, char *cp, location loc);
137static void handle_at (int token_type, char *cp, location loc);
4517da37 138static void handle_syncline (char *, location);
1452af69 139static unsigned long int scan_integer (char const *p, int base, location loc);
d8d3f94a 140static int convert_ucn_to_byte (char const *hex_text);
aa418041 141static void unexpected_eof (boundary, char const *);
4febdd96 142static void unexpected_newline (boundary, char const *);
e9955c83
AD
143
144%}
d8d3f94a 145%x SC_COMMENT SC_LINE_COMMENT SC_YACC_COMMENT
e9955c83 146%x SC_STRING SC_CHARACTER
3f2d73f1 147%x SC_AFTER_IDENTIFIER
e9955c83 148%x SC_ESCAPED_STRING SC_ESCAPED_CHARACTER
624a35e2 149%x SC_PRE_CODE SC_BRACED_CODE SC_PROLOGUE SC_EPILOGUE
e9955c83 150
29c01725
AD
151letter [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
152id {letter}({letter}|[0-9])*
153directive %{letter}({letter}|[0-9]|-)*
624a35e2 154int [0-9]+
d8d3f94a
PE
155
156/* POSIX says that a tag must be both an id and a C union member, but
157 historically almost any character is allowed in a tag. We disallow
158 NUL and newline, as this simplifies our implementation. */
159tag [^\0\n>]+
160
161/* Zero or more instances of backslash-newline. Following GCC, allow
162 white space between the backslash and the newline. */
163splice (\\[ \f\t\v]*\n)*
e9955c83
AD
164
165%%
166%{
a706a1cc 167 /* Nesting level of the current code in braces. */
1a9e39f1
PE
168 int braces_level IF_LINT (= 0);
169
3f2d73f1
PE
170 /* Parent context state, when applicable. */
171 int context_state IF_LINT (= 0);
a706a1cc 172
624a35e2
PE
173 /* Token type to return, when applicable. */
174 int token_type IF_LINT (= 0);
175
3f2d73f1 176 /* Location of most recent identifier, when applicable. */
a2bc9dbc 177 location id_loc IF_LINT (= empty_location);
3f2d73f1 178
a2bc9dbc
PE
179 /* Where containing code started, when applicable. Its initial
180 value is relevant only when yylex is invoked in the SC_EPILOGUE
181 start condition. */
182 boundary code_start = scanner_cursor;
3f2d73f1 183
223ff46e
PE
184 /* Where containing comment or string or character literal started,
185 when applicable. */
a2bc9dbc 186 boundary token_start IF_LINT (= scanner_cursor);
e9955c83
AD
187%}
188
189
3f2d73f1
PE
190 /*-----------------------.
191 | Scanning white space. |
192 `-----------------------*/
193
624a35e2 194<INITIAL,SC_AFTER_IDENTIFIER,SC_PRE_CODE>
3f2d73f1 195{
4febdd96 196 /* Comments and white space. */
83adb046 197 "," warn_at (*loc, _("stray `,' treated as white space"));
4febdd96 198 [ \f\n\t\v] |
3f2d73f1 199 "//".* ;
83adb046
PE
200 "/*" {
201 token_start = loc->start;
202 context_state = YY_START;
203 BEGIN SC_YACC_COMMENT;
204 }
3f2d73f1
PE
205
206 /* #line directives are not documented, and may be withdrawn or
207 modified in future versions of Bison. */
208 ^"#line "{int}" \"".*"\"\n" {
4517da37 209 handle_syncline (yytext + sizeof "#line " - 1, *loc);
3f2d73f1
PE
210 }
211}
212
213
e9955c83
AD
214 /*----------------------------.
215 | Scanning Bison directives. |
216 `----------------------------*/
217<INITIAL>
218{
219 "%binary" return PERCENT_NONASSOC;
220 "%debug" return PERCENT_DEBUG;
39a06c25 221 "%default"[-_]"prec" return PERCENT_DEFAULT_PREC;
e9955c83
AD
222 "%define" return PERCENT_DEFINE;
223 "%defines" return PERCENT_DEFINES;
624a35e2 224 "%destructor" token_type = PERCENT_DESTRUCTOR; BEGIN SC_PRE_CODE;
676385e2 225 "%dprec" return PERCENT_DPREC;
e9955c83
AD
226 "%error"[-_]"verbose" return PERCENT_ERROR_VERBOSE;
227 "%expect" return PERCENT_EXPECT;
d6328241 228 "%expect"[-_]"rr" return PERCENT_EXPECT_RR;
e9955c83
AD
229 "%file-prefix" return PERCENT_FILE_PREFIX;
230 "%fixed"[-_]"output"[-_]"files" return PERCENT_YACC;
cd3684cf 231 "%initial-action" token_type = PERCENT_INITIAL_ACTION; BEGIN SC_PRE_CODE;
ae7453f2 232 "%glr-parser" return PERCENT_GLR_PARSER;
e9955c83 233 "%left" return PERCENT_LEFT;
624a35e2 234 "%lex-param" token_type = PERCENT_LEX_PARAM; BEGIN SC_PRE_CODE;
e9955c83 235 "%locations" return PERCENT_LOCATIONS;
676385e2 236 "%merge" return PERCENT_MERGE;
e9955c83 237 "%name"[-_]"prefix" return PERCENT_NAME_PREFIX;
22fccf95 238 "%no"[-_]"default"[-_]"prec" return PERCENT_NO_DEFAULT_PREC;
e9955c83
AD
239 "%no"[-_]"lines" return PERCENT_NO_LINES;
240 "%nonassoc" return PERCENT_NONASSOC;
916708d5 241 "%nondeterministic-parser" return PERCENT_NONDETERMINISTIC_PARSER;
e9955c83
AD
242 "%nterm" return PERCENT_NTERM;
243 "%output" return PERCENT_OUTPUT;
624a35e2 244 "%parse-param" token_type = PERCENT_PARSE_PARAM; BEGIN SC_PRE_CODE;
d8d3f94a 245 "%prec" rule_length--; return PERCENT_PREC;
624a35e2 246 "%printer" token_type = PERCENT_PRINTER; BEGIN SC_PRE_CODE;
e9955c83 247 "%pure"[-_]"parser" return PERCENT_PURE_PARSER;
b50d2359 248 "%require" return PERCENT_REQUIRE;
e9955c83
AD
249 "%right" return PERCENT_RIGHT;
250 "%skeleton" return PERCENT_SKELETON;
251 "%start" return PERCENT_START;
252 "%term" return PERCENT_TOKEN;
253 "%token" return PERCENT_TOKEN;
254 "%token"[-_]"table" return PERCENT_TOKEN_TABLE;
255 "%type" return PERCENT_TYPE;
624a35e2 256 "%union" token_type = PERCENT_UNION; BEGIN SC_PRE_CODE;
e9955c83
AD
257 "%verbose" return PERCENT_VERBOSE;
258 "%yacc" return PERCENT_YACC;
259
3f2d73f1 260 {directive} {
41141c56 261 complain_at (*loc, _("invalid directive: %s"), quote (yytext));
412f8a59 262 }
900c5db5 263
e9955c83 264 "=" return EQUAL;
d8d3f94a 265 "|" rule_length = 0; return PIPE;
e9955c83
AD
266 ";" return SEMICOLON;
267
3f2d73f1 268 {id} {
41141c56 269 val->symbol = symbol_get (yytext, *loc);
3f2d73f1 270 id_loc = *loc;
4517da37 271 increment_rule_length (*loc);
3f2d73f1 272 BEGIN SC_AFTER_IDENTIFIER;
e9955c83
AD
273 }
274
d8d3f94a 275 {int} {
1452af69
PE
276 val->integer = scan_integer (yytext, 10, *loc);
277 return INT;
278 }
279 0[xX][0-9abcdefABCDEF]+ {
280 val->integer = scan_integer (yytext, 16, *loc);
d8d3f94a
PE
281 return INT;
282 }
e9955c83
AD
283
284 /* Characters. We don't check there is only one. */
3f2d73f1 285 "'" STRING_GROW; token_start = loc->start; BEGIN SC_ESCAPED_CHARACTER;
e9955c83
AD
286
287 /* Strings. */
ca407bdf 288 "\"" token_start = loc->start; BEGIN SC_ESCAPED_STRING;
e9955c83
AD
289
290 /* Prologue. */
3f2d73f1 291 "%{" code_start = loc->start; BEGIN SC_PROLOGUE;
e9955c83
AD
292
293 /* Code in between braces. */
3f2d73f1 294 "{" {
6b702268
PE
295 if (current_rule->action)
296 grammar_midrule_action ();
3f2d73f1 297 STRING_GROW;
624a35e2 298 token_type = BRACED_CODE;
3f2d73f1
PE
299 braces_level = 0;
300 code_start = loc->start;
301 BEGIN SC_BRACED_CODE;
302 }
e9955c83
AD
303
304 /* A type. */
d8d3f94a 305 "<"{tag}">" {
223ff46e 306 obstack_grow (&obstack_for_string, yytext + 1, yyleng - 2);
41141c56 307 STRING_FINISH;
223ff46e 308 val->uniqstr = uniqstr_new (last_string);
41141c56 309 STRING_FREE;
4cdb01db
AD
310 return TYPE;
311 }
312
a706a1cc
PE
313 "%%" {
314 static int percent_percent_count;
e9955c83 315 if (++percent_percent_count == 2)
a2bc9dbc 316 BEGIN SC_EPILOGUE;
e9955c83
AD
317 return PERCENT_PERCENT;
318 }
319
a706a1cc 320 . {
41141c56 321 complain_at (*loc, _("invalid character: %s"), quote (yytext));
3f2d73f1 322 }
379f0ac8
PE
323
324 <<EOF>> {
325 loc->start = loc->end = scanner_cursor;
326 yyterminate ();
327 }
3f2d73f1
PE
328}
329
330
331 /*-----------------------------------------------------------------.
332 | Scanning after an identifier, checking whether a colon is next. |
333 `-----------------------------------------------------------------*/
334
335<SC_AFTER_IDENTIFIER>
336{
337 ":" {
338 rule_length = 0;
339 *loc = id_loc;
340 BEGIN INITIAL;
341 return ID_COLON;
342 }
343 . {
344 scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0);
345 yyless (0);
346 *loc = id_loc;
347 BEGIN INITIAL;
348 return ID;
349 }
350 <<EOF>> {
351 *loc = id_loc;
352 BEGIN INITIAL;
353 return ID;
e9955c83
AD
354 }
355}
356
357
d8d3f94a
PE
358 /*---------------------------------------------------------------.
359 | Scanning a Yacc comment. The initial `/ *' is already eaten. |
360 `---------------------------------------------------------------*/
e9955c83 361
d8d3f94a 362<SC_YACC_COMMENT>
e9955c83 363{
3f2d73f1 364 "*/" BEGIN context_state;
a706a1cc 365 .|\n ;
aa418041 366 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
d8d3f94a
PE
367}
368
369
370 /*------------------------------------------------------------.
371 | Scanning a C comment. The initial `/ *' is already eaten. |
372 `------------------------------------------------------------*/
373
374<SC_COMMENT>
375{
3f2d73f1 376 "*"{splice}"/" STRING_GROW; BEGIN context_state;
aa418041 377 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
e9955c83
AD
378}
379
380
d8d3f94a
PE
381 /*--------------------------------------------------------------.
382 | Scanning a line comment. The initial `//' is already eaten. |
383 `--------------------------------------------------------------*/
384
385<SC_LINE_COMMENT>
386{
3f2d73f1 387 "\n" STRING_GROW; BEGIN context_state;
41141c56 388 {splice} STRING_GROW;
3f2d73f1 389 <<EOF>> BEGIN context_state;
d8d3f94a
PE
390}
391
392
4febdd96
PE
393 /*------------------------------------------------.
394 | Scanning a Bison string, including its escapes. |
395 | The initial quote is already eaten. |
396 `------------------------------------------------*/
e9955c83
AD
397
398<SC_ESCAPED_STRING>
399{
db2cc12f 400 "\"" {
41141c56 401 STRING_FINISH;
3f2d73f1 402 loc->start = token_start;
223ff46e 403 val->chars = last_string;
4517da37 404 increment_rule_length (*loc);
a706a1cc 405 BEGIN INITIAL;
e9955c83
AD
406 return STRING;
407 }
4febdd96
PE
408 \n unexpected_newline (token_start, "\""); BEGIN INITIAL;
409 <<EOF>> unexpected_eof (token_start, "\""); BEGIN INITIAL;
e9955c83
AD
410}
411
4febdd96
PE
412 /*----------------------------------------------------------.
413 | Scanning a Bison character literal, decoding its escapes. |
414 | The initial quote is already eaten. |
415 `----------------------------------------------------------*/
e9955c83
AD
416
417<SC_ESCAPED_CHARACTER>
418{
db2cc12f 419 "'" {
3b1e470c 420 unsigned char last_string_1;
41141c56
PE
421 STRING_GROW;
422 STRING_FINISH;
3f2d73f1 423 loc->start = token_start;
ca407bdf
PE
424 val->symbol = symbol_get (quotearg_style (escape_quoting_style,
425 last_string),
426 *loc);
073f9288 427 symbol_class_set (val->symbol, token_sym, *loc, false);
3b1e470c
PE
428 last_string_1 = last_string[1];
429 symbol_user_token_number_set (val->symbol, last_string_1, *loc);
41141c56 430 STRING_FREE;
4517da37 431 increment_rule_length (*loc);
a706a1cc
PE
432 BEGIN INITIAL;
433 return ID;
e9955c83 434 }
4febdd96
PE
435 \n unexpected_newline (token_start, "'"); BEGIN INITIAL;
436 <<EOF>> unexpected_eof (token_start, "'"); BEGIN INITIAL;
437}
a706a1cc 438
4febdd96
PE
439<SC_ESCAPED_CHARACTER,SC_ESCAPED_STRING>
440{
92ac3705 441 \0 complain_at (*loc, _("invalid null character"));
e9955c83
AD
442}
443
444
445 /*----------------------------.
446 | Decode escaped characters. |
447 `----------------------------*/
448
449<SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>
450{
d8d3f94a 451 \\[0-7]{1,3} {
4517da37 452 unsigned long int c = strtoul (yytext + 1, NULL, 8);
d8d3f94a 453 if (UCHAR_MAX < c)
3f2d73f1 454 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
05ac60f3 455 else if (! c)
92ac3705 456 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
e9955c83 457 else
223ff46e 458 obstack_1grow (&obstack_for_string, c);
e9955c83
AD
459 }
460
6b0d38ab 461 \\x[0-9abcdefABCDEF]+ {
4517da37
PE
462 verify (UCHAR_MAX < ULONG_MAX);
463 unsigned long int c = strtoul (yytext + 2, NULL, 16);
464 if (UCHAR_MAX < c)
3f2d73f1 465 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
92ac3705
PE
466 else if (! c)
467 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
d8d3f94a 468 else
223ff46e 469 obstack_1grow (&obstack_for_string, c);
e9955c83
AD
470 }
471
223ff46e
PE
472 \\a obstack_1grow (&obstack_for_string, '\a');
473 \\b obstack_1grow (&obstack_for_string, '\b');
474 \\f obstack_1grow (&obstack_for_string, '\f');
475 \\n obstack_1grow (&obstack_for_string, '\n');
476 \\r obstack_1grow (&obstack_for_string, '\r');
477 \\t obstack_1grow (&obstack_for_string, '\t');
478 \\v obstack_1grow (&obstack_for_string, '\v');
412f8a59
PE
479
480 /* \\[\"\'?\\] would be shorter, but it confuses xgettext. */
223ff46e 481 \\("\""|"'"|"?"|"\\") obstack_1grow (&obstack_for_string, yytext[1]);
412f8a59 482
6b0d38ab 483 \\(u|U[0-9abcdefABCDEF]{4})[0-9abcdefABCDEF]{4} {
d8d3f94a
PE
484 int c = convert_ucn_to_byte (yytext);
485 if (c < 0)
3f2d73f1 486 complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
92ac3705
PE
487 else if (! c)
488 complain_at (*loc, _("invalid null character: %s"), quote (yytext));
d8d3f94a 489 else
223ff46e 490 obstack_1grow (&obstack_for_string, c);
d8d3f94a 491 }
4f25ebb0 492 \\(.|\n) {
3f2d73f1 493 complain_at (*loc, _("unrecognized escape sequence: %s"), quote (yytext));
41141c56 494 STRING_GROW;
e9955c83
AD
495 }
496}
497
4febdd96
PE
498 /*--------------------------------------------.
499 | Scanning user-code characters and strings. |
500 `--------------------------------------------*/
e9955c83 501
4febdd96
PE
502<SC_CHARACTER,SC_STRING>
503{
504 {splice}|\\{splice}[^\n$@\[\]] STRING_GROW;
505}
e9955c83
AD
506
507<SC_CHARACTER>
508{
4febdd96
PE
509 "'" STRING_GROW; BEGIN context_state;
510 \n unexpected_newline (token_start, "'"); BEGIN context_state;
511 <<EOF>> unexpected_eof (token_start, "'"); BEGIN context_state;
e9955c83
AD
512}
513
e9955c83
AD
514<SC_STRING>
515{
4febdd96
PE
516 "\"" STRING_GROW; BEGIN context_state;
517 \n unexpected_newline (token_start, "\""); BEGIN context_state;
518 <<EOF>> unexpected_eof (token_start, "\""); BEGIN context_state;
e9955c83
AD
519}
520
521
522 /*---------------------------------------------------.
523 | Strings, comments etc. can be found in user code. |
524 `---------------------------------------------------*/
525
526<SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
527{
3f2d73f1
PE
528 "'" {
529 STRING_GROW;
530 context_state = YY_START;
531 token_start = loc->start;
532 BEGIN SC_CHARACTER;
533 }
534 "\"" {
535 STRING_GROW;
536 context_state = YY_START;
537 token_start = loc->start;
538 BEGIN SC_STRING;
539 }
540 "/"{splice}"*" {
541 STRING_GROW;
542 context_state = YY_START;
543 token_start = loc->start;
544 BEGIN SC_COMMENT;
545 }
546 "/"{splice}"/" {
547 STRING_GROW;
548 context_state = YY_START;
549 BEGIN SC_LINE_COMMENT;
550 }
e9955c83
AD
551}
552
553
624a35e2
PE
554 /*---------------------------------------------------------------.
555 | Scanning after %union etc., possibly followed by white space. |
556 | For %union only, allow arbitrary C code to appear before the |
557 | following brace, as an extension to POSIX. |
558 `---------------------------------------------------------------*/
559
560<SC_PRE_CODE>
561{
562 . {
563 bool valid = yytext[0] == '{' || token_type == PERCENT_UNION;
564 scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0);
565 yyless (0);
566
567 if (valid)
568 {
569 braces_level = -1;
570 code_start = loc->start;
571 BEGIN SC_BRACED_CODE;
572 }
573 else
574 {
6d07bacf 575 complain_at (*loc, _("missing `{' in %s"),
624a35e2
PE
576 token_name (token_type));
577 obstack_sgrow (&obstack_for_string, "{}");
578 STRING_FINISH;
579 val->chars = last_string;
580 BEGIN INITIAL;
581 return token_type;
582 }
583 }
379f0ac8 584
aa418041 585 <<EOF>> unexpected_eof (scanner_cursor, "{}"); BEGIN INITIAL;
624a35e2
PE
586}
587
588
e9955c83
AD
589 /*---------------------------------------------------------------.
590 | Scanning some code in braces (%union and actions). The initial |
591 | "{" is already eaten. |
592 `---------------------------------------------------------------*/
593
594<SC_BRACED_CODE>
595{
41141c56
PE
596 "{"|"<"{splice}"%" STRING_GROW; braces_level++;
597 "%"{splice}">" STRING_GROW; braces_level--;
e9955c83 598 "}" {
25522739
PE
599 bool outer_brace = --braces_level < 0;
600
601 /* As an undocumented Bison extension, append `;' before the last
602 brace in braced code, so that the user code can omit trailing
603 `;'. But do not append `;' if emulating Yacc, since Yacc does
604 not append one.
605
606 FIXME: Bison should warn if a semicolon seems to be necessary
607 here, and should omit the semicolon if it seems unnecessary
608 (e.g., after ';', '{', or '}', each followed by comments or
609 white space). Such a warning shouldn't depend on --yacc; it
610 should depend on a new --pedantic option, which would cause
611 Bison to warn if it detects an extension to POSIX. --pedantic
612 should also diagnose other Bison extensions like %yacc.
613 Perhaps there should also be a GCC-style --pedantic-errors
614 option, so that such warnings are diagnosed as errors. */
1deb9bdc 615 if (outer_brace && token_type == BRACED_CODE && ! yacc_flag)
25522739
PE
616 obstack_1grow (&obstack_for_string, ';');
617
618 obstack_1grow (&obstack_for_string, '}');
619
620 if (outer_brace)
e9955c83 621 {
41141c56 622 STRING_FINISH;
3f2d73f1 623 loc->start = code_start;
223ff46e 624 val->chars = last_string;
4517da37 625 increment_rule_length (*loc);
6b702268 626 last_braced_code_loc = *loc;
a706a1cc 627 BEGIN INITIAL;
624a35e2 628 return token_type;
e9955c83
AD
629 }
630 }
631
a706a1cc
PE
632 /* Tokenize `<<%' correctly (as `<<' `%') rather than incorrrectly
633 (as `<' `<%'). */
41141c56 634 "<"{splice}"<" STRING_GROW;
a706a1cc 635
624a35e2
PE
636 "$"("<"{tag}">")?(-?[0-9]+|"$") handle_dollar (token_type, yytext, *loc);
637 "@"(-?[0-9]+|"$") handle_at (token_type, yytext, *loc);
e9955c83 638
302c0aee
PE
639 "$" {
640 warn_at (*loc, _("stray `$'"));
641 obstack_sgrow (&obstack_for_string, "$][");
642 }
643 "@" {
644 warn_at (*loc, _("stray `@'"));
645 obstack_sgrow (&obstack_for_string, "@@");
646 }
647
aa418041 648 <<EOF>> unexpected_eof (code_start, "}"); BEGIN INITIAL;
e9955c83
AD
649}
650
651
652 /*--------------------------------------------------------------.
653 | Scanning some prologue: from "%{" (already scanned) to "%}". |
654 `--------------------------------------------------------------*/
655
656<SC_PROLOGUE>
657{
658 "%}" {
41141c56 659 STRING_FINISH;
3f2d73f1 660 loc->start = code_start;
223ff46e 661 val->chars = last_string;
a706a1cc 662 BEGIN INITIAL;
e9955c83
AD
663 return PROLOGUE;
664 }
665
aa418041 666 <<EOF>> unexpected_eof (code_start, "%}"); BEGIN INITIAL;
e9955c83
AD
667}
668
669
670 /*---------------------------------------------------------------.
671 | Scanning the epilogue (everything after the second "%%", which |
d8d3f94a 672 | has already been eaten). |
e9955c83
AD
673 `---------------------------------------------------------------*/
674
675<SC_EPILOGUE>
676{
e9955c83 677 <<EOF>> {
41141c56 678 STRING_FINISH;
3f2d73f1 679 loc->start = code_start;
223ff46e 680 val->chars = last_string;
a706a1cc 681 BEGIN INITIAL;
e9955c83
AD
682 return EPILOGUE;
683 }
684}
685
686
4febdd96
PE
687 /*-----------------------------------------.
688 | Escape M4 quoting characters in C code. |
689 `-----------------------------------------*/
a706a1cc
PE
690
691<SC_COMMENT,SC_LINE_COMMENT,SC_STRING,SC_CHARACTER,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
692{
223ff46e
PE
693 \$ obstack_sgrow (&obstack_for_string, "$][");
694 \@ obstack_sgrow (&obstack_for_string, "@@");
695 \[ obstack_sgrow (&obstack_for_string, "@{");
696 \] obstack_sgrow (&obstack_for_string, "@}");
a706a1cc
PE
697}
698
699
4febdd96
PE
700 /*-----------------------------------------------------.
701 | By default, grow the string obstack with the input. |
702 `-----------------------------------------------------*/
703
704<SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>. |
705<SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>\n STRING_GROW;
706
e9955c83
AD
707%%
708
cd3684cf
AD
709/* The maximum number of semantic values to the left of a handle
710 (those referenced by $0, $-1, etc.) that are required by the
25005f6a
PH
711 semantic actions of this grammar. */
712int max_left_semantic_context = 0;
713
4517da37
PE
/* Return COLUMN advanced past some text, saturating at INT_MAX.

   If BUF is null, the text is BUFSIZE columns wide (BUFSIZE must then
   be less than INT_MAX).  Otherwise the width is that of the BUFSIZE
   bytes at BUF as computed by mbsnwidth.  INT_MAX is returned when the
   sum overflows, or when an overflow could not be detected reliably.
   COLUMN is assumed to be nonnegative.  */

static inline int
add_column_width (int column, char const *buf, size_t bufsize)
{
  unsigned int room = INT_MAX - column;
  size_t width = bufsize;

  if (buf)
    {
      /* Refuse inputs so large that the width computation itself
	 might overflow.  */
      if (INT_MAX / 2 <= bufsize)
	return INT_MAX;
      width = mbsnwidth (buf, bufsize, 0);
    }

  if (room < width)
    return INT_MAX;
  return column + width;
}
736
3f2d73f1
PE
737/* Set *LOC and adjust scanner cursor to account for token TOKEN of
738 size SIZE. */
6c30d641
PE
739
740static void
223ff46e 741adjust_location (location *loc, char const *token, size_t size)
6c30d641 742{
3f2d73f1
PE
743 int line = scanner_cursor.line;
744 int column = scanner_cursor.column;
6c30d641
PE
745 char const *p0 = token;
746 char const *p = token;
747 char const *lim = token + size;
748
3f2d73f1
PE
749 loc->start = scanner_cursor;
750
6c30d641
PE
751 for (p = token; p < lim; p++)
752 switch (*p)
753 {
6c30d641 754 case '\n':
4517da37 755 line += line < INT_MAX;
6c30d641
PE
756 column = 1;
757 p0 = p + 1;
758 break;
759
760 case '\t':
06f01bc4
PE
761 column = add_column_width (column, p0, p - p0);
762 column = add_column_width (column, NULL, 8 - ((column - 1) & 7));
763 p0 = p + 1;
764 break;
765
766 default:
767 break;
6c30d641
PE
768 }
769
3f2d73f1 770 scanner_cursor.line = line;
4517da37 771 scanner_cursor.column = column = add_column_width (column, p0, p - p0);
3f2d73f1
PE
772
773 loc->end = scanner_cursor;
4517da37
PE
774
775 if (line == INT_MAX && loc->start.line != INT_MAX)
776 warn_at (*loc, _("line number overflow"));
777 if (column == INT_MAX && loc->start.column != INT_MAX)
778 warn_at (*loc, _("column number overflow"));
6c30d641
PE
779}
780
781
/* Read up to SIZE bytes from FP into BUF and return how many bytes
   BUF holds afterwards.  Carriage returns never reach the caller:
   both "\r\n" and a lone '\r' are delivered as a single '\n', so the
   result may be smaller than the number of bytes consumed from FP.  */

static size_t
no_cr_read (FILE *fp, char *buf, size_t size)
{
  size_t const nread = fread (buf, 1, size, fp);
  char *dst = nread ? memchr (buf, '\r', nread) : NULL;

  /* Nothing read, or nothing to translate.  */
  if (!dst)
    return nread;

  char const *const end = buf + nread;
  char const *src = dst + 1;

  for (;;)
    {
      /* DST is the slot of the '\r' just seen: store '\n' there and
	 swallow a '\n' that immediately follows.  */
      *dst++ = '\n';
      if (src != end)
	{
	  if (*src == '\n')
	    src++;
	}
      else
	{
	  /* The '\r' was the last byte read, so the '\n' to swallow,
	     if any, is still in the stream.  Peek at it, and push
	     back anything else.  */
	  int next = getc (fp);
	  if (next != '\n' && ungetc (next, fp) != next)
	    return dst - buf;
	}

      /* Compact the following bytes until the next '\r'.  */
      for (;;)
	{
	  char c;
	  if (src == end)
	    return dst - buf;
	  c = *src++;
	  if (c == '\r')
	    break;
	  *dst++ = c;
	}
    }
}
827
828
e9955c83 829/*------------------------------------------------------------------.
366eea36 830| TEXT is pointing to a wannabee semantic value (i.e., a `$'). |
e9955c83
AD
831| |
832| Possible inputs: $[<TYPENAME>]($|integer) |
833| |
223ff46e 834| Output to OBSTACK_FOR_STRING a reference to this semantic value. |
e9955c83
AD
835`------------------------------------------------------------------*/
836
624a35e2 837static inline bool
223ff46e 838handle_action_dollar (char *text, location loc)
e9955c83
AD
839{
840 const char *type_name = NULL;
366eea36 841 char *cp = text + 1;
e9955c83 842
624a35e2
PE
843 if (! current_rule)
844 return false;
845
e9955c83
AD
846 /* Get the type name if explicit. */
847 if (*cp == '<')
848 {
849 type_name = ++cp;
850 while (*cp != '>')
851 ++cp;
852 *cp = '\0';
853 ++cp;
854 }
855
856 if (*cp == '$')
857 {
858 if (!type_name)
223ff46e 859 type_name = symbol_list_n_type_name_get (current_rule, loc, 0);
e9955c83 860 if (!type_name && typed)
223ff46e 861 complain_at (loc, _("$$ of `%s' has no declared type"),
97650f4e 862 current_rule->sym->tag);
e9955c83
AD
863 if (!type_name)
864 type_name = "";
223ff46e 865 obstack_fgrow1 (&obstack_for_string,
e9955c83 866 "]b4_lhs_value([%s])[", type_name);
8f3596a6 867 current_rule->used = true;
e9955c83 868 }
d8d3f94a 869 else
e9955c83 870 {
4517da37 871 long int num = strtol (cp, NULL, 10);
e9955c83 872
4517da37 873 if (1 - INT_MAX + rule_length <= num && num <= rule_length)
e9955c83 874 {
d8d3f94a 875 int n = num;
affac613
AD
876 if (max_left_semantic_context < 1 - n)
877 max_left_semantic_context = 1 - n;
878 if (!type_name && 0 < n)
223ff46e 879 type_name = symbol_list_n_type_name_get (current_rule, loc, n);
e9955c83 880 if (!type_name && typed)
223ff46e
PE
881 complain_at (loc, _("$%d of `%s' has no declared type"),
882 n, current_rule->sym->tag);
e9955c83
AD
883 if (!type_name)
884 type_name = "";
223ff46e 885 obstack_fgrow3 (&obstack_for_string,
05ac60f3 886 "]b4_rhs_value(%d, %d, [%s])[",
e9955c83 887 rule_length, n, type_name);
8f3596a6 888 symbol_list_n_used_set (current_rule, n, true);
e9955c83 889 }
d8d3f94a 890 else
223ff46e 891 complain_at (loc, _("integer out of range: %s"), quote (text));
9280d3ef 892 }
9280d3ef 893
624a35e2 894 return true;
e9955c83
AD
895}
896
f25bfb75 897
cd3684cf
AD
898/*----------------------------------------------------------------.
899| Map `$?' onto the proper M4 symbol, depending on its TOKEN_TYPE |
900| (are we in an action?). |
901`----------------------------------------------------------------*/
e9955c83
AD
902
903static void
624a35e2 904handle_dollar (int token_type, char *text, location loc)
f25bfb75 905{
624a35e2 906 switch (token_type)
f25bfb75 907 {
624a35e2
PE
908 case BRACED_CODE:
909 if (handle_action_dollar (text, loc))
910 return;
f25bfb75
AD
911 break;
912
624a35e2 913 case PERCENT_DESTRUCTOR:
cd3684cf 914 case PERCENT_INITIAL_ACTION:
624a35e2
PE
915 case PERCENT_PRINTER:
916 if (text[1] == '$')
917 {
918 obstack_sgrow (&obstack_for_string, "]b4_dollar_dollar[");
919 return;
920 }
921 break;
922
923 default:
f25bfb75
AD
924 break;
925 }
624a35e2
PE
926
927 complain_at (loc, _("invalid value: %s"), quote (text));
f25bfb75
AD
928}
929
930
931/*------------------------------------------------------.
932| TEXT is a location token (i.e., a `@...'). Output to |
223ff46e 933| OBSTACK_FOR_STRING a reference to this location. |
f25bfb75
AD
934`------------------------------------------------------*/
935
624a35e2 936static inline bool
223ff46e 937handle_action_at (char *text, location loc)
e9955c83 938{
366eea36 939 char *cp = text + 1;
d0829076 940 locations_flag = true;
e9955c83 941
624a35e2
PE
942 if (! current_rule)
943 return false;
944
366eea36 945 if (*cp == '$')
624a35e2 946 obstack_sgrow (&obstack_for_string, "]b4_lhs_location[");
d8d3f94a 947 else
e9955c83 948 {
4517da37 949 long int num = strtol (cp, NULL, 10);
dafdc66f 950
4517da37 951 if (1 - INT_MAX + rule_length <= num && num <= rule_length)
d8d3f94a
PE
952 {
953 int n = num;
05ac60f3 954 obstack_fgrow2 (&obstack_for_string, "]b4_rhs_location(%d, %d)[",
d8d3f94a
PE
955 rule_length, n);
956 }
e9955c83 957 else
223ff46e 958 complain_at (loc, _("integer out of range: %s"), quote (text));
f25bfb75 959 }
f25bfb75 960
624a35e2 961 return true;
e9955c83 962}
4cdb01db 963
f25bfb75 964
cd3684cf
AD
965/*----------------------------------------------------------------.
966| Map `@?' onto the proper M4 symbol, depending on its TOKEN_TYPE |
967| (are we in an action?). |
968`----------------------------------------------------------------*/
f25bfb75
AD
969
970static void
624a35e2 971handle_at (int token_type, char *text, location loc)
f25bfb75 972{
624a35e2 973 switch (token_type)
f25bfb75 974 {
624a35e2 975 case BRACED_CODE:
223ff46e 976 handle_action_at (text, loc);
624a35e2
PE
977 return;
978
cd3684cf 979 case PERCENT_INITIAL_ACTION:
624a35e2
PE
980 case PERCENT_DESTRUCTOR:
981 case PERCENT_PRINTER:
982 if (text[1] == '$')
983 {
984 obstack_sgrow (&obstack_for_string, "]b4_at_dollar[");
985 return;
986 }
f25bfb75
AD
987 break;
988
624a35e2 989 default:
f25bfb75
AD
990 break;
991 }
624a35e2
PE
992
993 complain_at (loc, _("invalid value: %s"), quote (text));
f25bfb75
AD
994}
995
996
1452af69
PE
997/*------------------------------------------------------.
998| Scan NUMBER for a base-BASE integer at location LOC. |
999`------------------------------------------------------*/
1000
1001static unsigned long int
1002scan_integer (char const *number, int base, location loc)
1003{
4517da37
PE
1004 verify (INT_MAX < ULONG_MAX);
1005 unsigned long int num = strtoul (number, NULL, base);
1006
1007 if (INT_MAX < num)
1452af69
PE
1008 {
1009 complain_at (loc, _("integer out of range: %s"), quote (number));
1010 num = INT_MAX;
1011 }
4517da37 1012
1452af69
PE
1013 return num;
1014}
1015
1016
d8d3f94a
PE
/*----------------------------------------------------------.
| Return the single-byte character denoted by the universal |
| character name UCN, or -1 when UCN has no single-byte     |
| equivalent.                                               |
`----------------------------------------------------------*/

static int
convert_ucn_to_byte (char const *ucn)
{
  verify (UCHAR_MAX <= INT_MAX);
  /* The first two characters of UCN are skipped (presumably the `\u'
     or `\U' prefix); the rest is read as a hexadecimal number.  */
  unsigned long int value = strtoul (ucn + 2, NULL, 16);

  /* FIXME: For now, unibyte characters are assumed to be
     Unicode-compatible on ASCII hosts (i.e., Latin-1 on hosts with
     8-bit bytes), and only the portable C character set is supported
     on non-ASCII hosts.  These restrictions should go away once
     multibyte characters are supported.  */

  if (UCHAR_MAX < value)
    return -1;

#if ! ('$' == 0x24 && '@' == 0x40 && '`' == 0x60 && '~' == 0x7e)
  {
    /* Non-ASCII host: translate VALUE through a table of the C basic
       execution character set, which every Standard C platform must
       provide.  '$', '@' and '`' are listed too: they are outside the
       basic execution character set, but are unibyte characters on
       every platform we know about.  */
    static signed char const table[] =
      {
        '\0',   -1,   -1,   -1,   -1,   -1,   -1, '\a',
        '\b', '\t', '\n', '\v', '\f', '\r',   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
         ' ',  '!',  '"',  '#',  '$',  '%',  '&', '\'',
         '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
         '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
         '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
         '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
         'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
         'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
         'X',  'Y',  'Z',  '[', '\\',  ']',  '^',  '_',
         '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
         'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
         'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
         'x',  'y',  'z',  '{',  '|',  '}',  '~'
      };

    /* Codes past the end of the table have no translation.  */
    if (value < sizeof table)
      value = table[value];
    else
      value = -1;
  }
#endif

  return value;
}
1072
1073
900c5db5
AD
1074/*----------------------------------------------------------------.
1075| Handle `#line INT "FILE"'. ARGS has already skipped `#line '. |
1076`----------------------------------------------------------------*/
1077
1078static void
4517da37 1079handle_syncline (char *args, location loc)
900c5db5 1080{
4517da37
PE
1081 char *after_num;
1082 unsigned long int lineno = strtoul (args, &after_num, 10);
1083 char *file = strchr (after_num, '"') + 1;
1084 *strchr (file, '"') = '\0';
1085 if (INT_MAX <= lineno)
1086 {
1087 warn_at (loc, _("line number overflow"));
1088 lineno = INT_MAX;
1089 }
dca81a78 1090 scanner_cursor.file = current_file = uniqstr_new (file);
3f2d73f1
PE
1091 scanner_cursor.line = lineno;
1092 scanner_cursor.column = 1;
900c5db5
AD
1093}
1094
a706a1cc 1095
4517da37
PE
1096/*---------------------------------.
1097| Report a rule that is too long. |
1098`---------------------------------*/
1099
1100static void
1101rule_length_overflow (location loc)
1102{
1103 fatal_at (loc, _("rule is too long"));
1104}
1105
1106
4febdd96
PE
1107/*----------------------------------------------------------------.
1108| For a token or comment starting at START, report message MSGID, |
1109| which should say that an end marker was found before |
1110| the expected TOKEN_END. |
1111`----------------------------------------------------------------*/
1112
1113static void
1114unexpected_end (boundary start, char const *msgid, char const *token_end)
1115{
1116 location loc;
1117 loc.start = start;
1118 loc.end = scanner_cursor;
1119 complain_at (loc, _(msgid), token_end);
1120}
1121
1122
3f2d73f1
PE
1123/*------------------------------------------------------------------------.
1124| Report an unexpected EOF in a token or comment starting at START. |
1125| An end of file was encountered and the expected TOKEN_END was missing. |
3f2d73f1 1126`------------------------------------------------------------------------*/
a706a1cc
PE
1127
1128static void
aa418041 1129unexpected_eof (boundary start, char const *token_end)
a706a1cc 1130{
4febdd96
PE
1131 unexpected_end (start, N_("missing `%s' at end of file"), token_end);
1132}
1133
1134
1135/*----------------------------------------.
1136| Likewise, but for unexpected newlines. |
1137`----------------------------------------*/
1138
1139static void
1140unexpected_newline (boundary start, char const *token_end)
1141{
1142 unexpected_end (start, N_("missing `%s' at end of line"), token_end);
a706a1cc
PE
1143}
1144
1145
f25bfb75
AD
1146/*-------------------------.
1147| Initialize the scanner. |
1148`-------------------------*/
1149
1d6412ad
AD
1150void
1151scanner_initialize (void)
1152{
223ff46e 1153 obstack_init (&obstack_for_string);
1d6412ad
AD
1154}
1155
1156
f25bfb75
AD
1157/*-----------------------------------------------.
1158| Free all the memory allocated to the scanner. |
1159`-----------------------------------------------*/
1160
4cdb01db
AD
1161void
1162scanner_free (void)
1163{
223ff46e 1164 obstack_free (&obstack_for_string, 0);
536545f3
AD
1165 /* Reclaim Flex's buffers. */
1166 yy_delete_buffer (YY_CURRENT_BUFFER);
4cdb01db 1167}