]> git.saurik.com Git - bison.git/blob - src/scan-gram.l
Fix handling of yychar manipulation in user semantic actions.
[bison.git] / src / scan-gram.l
1 /* Bison Grammar Scanner -*- C -*-
2
3 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
4 Free Software Foundation, Inc.
5
6 This file is part of Bison, the GNU Compiler Compiler.
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20
21 %option debug nodefault noinput nounput noyywrap never-interactive
22 %option prefix="gram_" outfile="lex.yy.c"
23
24 %{
25 /* Work around a bug in flex 2.5.31. See Debian bug 333231
26 <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=333231>. */
27 #undef gram_wrap
28 #define gram_wrap() 1
29
30 #define FLEX_PREFIX(Id) gram_ ## Id
31 #include <src/flex-scanner.h>
32
33 #include <src/complain.h>
34 #include <src/files.h>
35 #include <src/gram.h>
36 #include <quotearg.h>
37 #include <src/reader.h>
38 #include <src/uniqstr.h>
39
40 #include <ctype.h>
41 #include <mbswidth.h>
42 #include <quote.h>
43
44 #include <src/scan-gram.h>
45
46 #define YY_DECL GRAM_LEX_DECL
47
48 #define YY_USER_INIT \
49 code_start = scanner_cursor = loc->start; \
50
51 /* Location of scanner cursor. */
52 static boundary scanner_cursor;
53
54 #define YY_USER_ACTION location_compute (loc, &scanner_cursor, yytext, yyleng);
55
56 static size_t no_cr_read (FILE *, char *, size_t);
57 #define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
58
59 #define RETURN_PERCENT_PARAM(Value) \
60 RETURN_VALUE(PERCENT_PARAM, param, param_ ## Value)
61
62 #define RETURN_PERCENT_FLAG(Value) \
63 RETURN_VALUE(PERCENT_FLAG, uniqstr, uniqstr_new (Value))
64
65 #define RETURN_VALUE(Token, Field, Value) \
66 do { \
67 val->Field = Value; \
68 return Token; \
69 } while (0)
70
71 #define ROLLBACK_CURRENT_TOKEN \
72 do { \
73 scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0); \
74 yyless (0); \
75 } while (0)
76
77 /* A string representing the most recently saved token. */
78 static char *last_string;
79
80 /* Bracketed identifier. */
81 static uniqstr bracketed_id_str = 0;
82 static location bracketed_id_loc;
83 static boundary bracketed_id_start;
84 static int bracketed_id_context_state = 0;
85
86 void
87 gram_scanner_last_string_free (void)
88 {
89 STRING_FREE;
90 }
91
92 static void handle_syncline (char *, location);
93 static unsigned long int scan_integer (char const *p, int base, location loc);
94 static int convert_ucn_to_byte (char const *hex_text);
95 static void unexpected_eof (boundary, char const *);
96 static void unexpected_newline (boundary, char const *);
97
98 %}
99 /* A C-like comment in directives/rules. */
100 %x SC_YACC_COMMENT
101 /* Strings and characters in directives/rules. */
102 %x SC_ESCAPED_STRING SC_ESCAPED_CHARACTER
103 /* An identifier was just read in directives/rules. Special state
104 to capture the sequence `identifier :'. */
105 %x SC_AFTER_IDENTIFIER
106 /* A complex tag, with nested angle brackets. */
107 %x SC_TAG
108
109 /* Three types of user code:
110 - prologue (code between `%{' `%}' in the first section, before %%);
111 - actions, printers, union, etc. (between braces in the middle section);
112 - epilogue (everything after the second %%). */
113 %x SC_PROLOGUE SC_BRACED_CODE SC_EPILOGUE
114 /* C and C++ comments in code. */
115 %x SC_COMMENT SC_LINE_COMMENT
116 /* Strings and characters in code. */
117 %x SC_STRING SC_CHARACTER
118 /* Bracketed identifiers support. */
119 %x SC_BRACKETED_ID SC_RETURN_BRACKETED_ID
120
121 letter [-.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
122 id {letter}({letter}|[0-9])*
123 directive %{id}
124 int [0-9]+
125
126 /* POSIX says that a tag must be both an id and a C union member, but
127 historically almost any character is allowed in a tag. We disallow
128 NUL, as this simplifies our implementation. We disallow angle
129 brackets so that we can match them in nested pairs: several languages use them
130 for generics/template types. */
131 tag [^\0<>]+
132
133 /* Zero or more instances of backslash-newline. Following GCC, allow
134 white space between the backslash and the newline. */
135 splice (\\[ \f\t\v]*\n)*
136
137 %%
138 %{
139 /* Nesting level. Either for nested braces, or nested angle brackets
140 (but not mixed). */
141 int nesting IF_LINT (= 0);
142
143 /* Parent context state, when applicable. */
144 int context_state IF_LINT (= 0);
145
146 /* Location of most recent identifier, when applicable. */
147 location id_loc IF_LINT (= empty_location);
148
149 /* Where containing code started, when applicable. Its initial
150 value is relevant only when yylex is invoked in the SC_EPILOGUE
151 start condition. */
152 boundary code_start = scanner_cursor;
153
154 /* Where containing comment or string or character literal started,
155 when applicable. */
156 boundary token_start IF_LINT (= scanner_cursor);
157 %}
158
159
160 /*-----------------------.
161 | Scanning white space. |
162 `-----------------------*/
163
164 <INITIAL,SC_AFTER_IDENTIFIER,SC_BRACKETED_ID,SC_RETURN_BRACKETED_ID>
165 {
166 /* Comments and white space. */
167 "," warn_at (*loc, _("stray `,' treated as white space"));
168 [ \f\n\t\v] |
169 "//".* ;
170 "/*" {
171 token_start = loc->start;
172 context_state = YY_START;
173 BEGIN SC_YACC_COMMENT;
174 }
175
176 /* #line directives are not documented, and may be withdrawn or
177 modified in future versions of Bison. */
178 ^"#line "{int}" \"".*"\"\n" {
179 handle_syncline (yytext + sizeof "#line " - 1, *loc);
180 }
181 }
182
183
184 /*----------------------------.
185 | Scanning Bison directives. |
186 `----------------------------*/
187
188 /* For directives that are also command line options, the regex must be
189 "%..."
190 after "[-_]"s are removed, and the directive must match the --long
191 option name, with a single string argument. Otherwise, add exceptions
192 to ../build-aux/cross-options.pl. */
193
194 <INITIAL>
195 {
196 "%binary" return PERCENT_NONASSOC;
197 "%code" return PERCENT_CODE;
198 "%debug" RETURN_PERCENT_FLAG("parse.trace");
199 "%default"[-_]"prec" return PERCENT_DEFAULT_PREC;
200 "%define" return PERCENT_DEFINE;
201 "%defines" return PERCENT_DEFINES;
202 "%destructor" return PERCENT_DESTRUCTOR;
203 "%dprec" return PERCENT_DPREC;
204 "%error"[-_]"verbose" return PERCENT_ERROR_VERBOSE;
205 "%expect" return PERCENT_EXPECT;
206 "%expect"[-_]"rr" return PERCENT_EXPECT_RR;
207 "%file-prefix" return PERCENT_FILE_PREFIX;
208 "%fixed"[-_]"output"[-_]"files" return PERCENT_YACC;
209 "%initial-action" return PERCENT_INITIAL_ACTION;
210 "%glr-parser" return PERCENT_GLR_PARSER;
211 "%language" return PERCENT_LANGUAGE;
212 "%left" return PERCENT_LEFT;
213 "%lex-param" RETURN_PERCENT_PARAM(lex);
214 "%locations" RETURN_PERCENT_FLAG("locations");
215 "%merge" return PERCENT_MERGE;
216 "%name"[-_]"prefix" return PERCENT_NAME_PREFIX;
217 "%no"[-_]"default"[-_]"prec" return PERCENT_NO_DEFAULT_PREC;
218 "%no"[-_]"lines" return PERCENT_NO_LINES;
219 "%nonassoc" return PERCENT_NONASSOC;
220 "%nondeterministic-parser" return PERCENT_NONDETERMINISTIC_PARSER;
221 "%nterm" return PERCENT_NTERM;
222 "%output" return PERCENT_OUTPUT;
223 "%param" RETURN_PERCENT_PARAM(both);
224 "%parse-param" RETURN_PERCENT_PARAM(parse);
225 "%prec" return PERCENT_PREC;
226 "%precedence" return PERCENT_PRECEDENCE;
227 "%printer" return PERCENT_PRINTER;
228 "%pure"[-_]"parser" RETURN_PERCENT_FLAG("api.pure");
229 "%require" return PERCENT_REQUIRE;
230 "%right" return PERCENT_RIGHT;
231 "%skeleton" return PERCENT_SKELETON;
232 "%start" return PERCENT_START;
233 "%term" return PERCENT_TOKEN;
234 "%token" return PERCENT_TOKEN;
235 "%token"[-_]"table" return PERCENT_TOKEN_TABLE;
236 "%type" return PERCENT_TYPE;
237 "%union" return PERCENT_UNION;
238 "%verbose" return PERCENT_VERBOSE;
239 "%yacc" return PERCENT_YACC;
240
241 {directive} {
242 complain_at (*loc, _("invalid directive: %s"), quote (yytext));
243 }
244
245 "=" return EQUAL;
246 "|" return PIPE;
247 ";" return SEMICOLON;
248
249 {id} {
250 val->uniqstr = uniqstr_new (yytext);
251 id_loc = *loc;
252 bracketed_id_str = NULL;
253 BEGIN SC_AFTER_IDENTIFIER;
254 }
255
256 {int} {
257 val->integer = scan_integer (yytext, 10, *loc);
258 return INT;
259 }
260 0[xX][0-9abcdefABCDEF]+ {
261 val->integer = scan_integer (yytext, 16, *loc);
262 return INT;
263 }
264
265 /* Identifiers may not start with a digit. Yet, don't silently
266 accept "1FOO" as "1 FOO". */
267 {int}{id} {
268 complain_at (*loc, _("invalid identifier: %s"), quote (yytext));
269 }
270
271 /* Characters. */
272 "'" token_start = loc->start; BEGIN SC_ESCAPED_CHARACTER;
273
274 /* Strings. */
275 "\"" token_start = loc->start; BEGIN SC_ESCAPED_STRING;
276
277 /* Prologue. */
278 "%{" code_start = loc->start; BEGIN SC_PROLOGUE;
279
280 /* Code in between braces. */
281 "{" {
282 STRING_GROW;
283 nesting = 0;
284 code_start = loc->start;
285 BEGIN SC_BRACED_CODE;
286 }
287
288 /* A type. */
289 "<*>" return TAG_ANY;
290 "<>" return TAG_NONE;
291 "<"{tag}">" {
292 obstack_grow (&obstack_for_string, yytext + 1, yyleng - 2);
293 STRING_FINISH;
294 val->uniqstr = uniqstr_new (last_string);
295 STRING_FREE;
296 return TAG;
297 }
298 "<" {
299 nesting = 0;
300 token_start = loc->start;
301 BEGIN SC_TAG;
302 }
303
304 "%%" {
305 static int percent_percent_count;
306 if (++percent_percent_count == 2)
307 BEGIN SC_EPILOGUE;
308 return PERCENT_PERCENT;
309 }
310
311 "[" {
312 bracketed_id_str = NULL;
313 bracketed_id_start = loc->start;
314 bracketed_id_context_state = YY_START;
315 BEGIN SC_BRACKETED_ID;
316 }
317
318 . {
319 complain_at (*loc, _("invalid character: %s"), quote (yytext));
320 }
321
322 <<EOF>> {
323 loc->start = loc->end = scanner_cursor;
324 yyterminate ();
325 }
326 }
327
328
329 /*--------------------------------------------------------------.
330 | Supporting \0 complexifies our implementation for no expected |
331 | added value. |
332 `--------------------------------------------------------------*/
333
334 <SC_ESCAPED_CHARACTER,SC_ESCAPED_STRING,SC_TAG>
335 {
336 \0 complain_at (*loc, _("invalid null character"));
337 }
338
339
340 /*-----------------------------------------------------------------.
341 | Scanning after an identifier, checking whether a colon is next. |
342 `-----------------------------------------------------------------*/
343
344 <SC_AFTER_IDENTIFIER>
345 {
346 "[" {
347 if (bracketed_id_str)
348 {
349 ROLLBACK_CURRENT_TOKEN;
350 BEGIN SC_RETURN_BRACKETED_ID;
351 *loc = id_loc;
352 return ID;
353 }
354 else
355 {
356 bracketed_id_start = loc->start;
357 bracketed_id_context_state = YY_START;
358 BEGIN SC_BRACKETED_ID;
359 }
360 }
361 ":" {
362 BEGIN (bracketed_id_str ? SC_RETURN_BRACKETED_ID : INITIAL);
363 *loc = id_loc;
364 return ID_COLON;
365 }
366 . {
367 ROLLBACK_CURRENT_TOKEN;
368 BEGIN (bracketed_id_str ? SC_RETURN_BRACKETED_ID : INITIAL);
369 *loc = id_loc;
370 return ID;
371 }
372 <<EOF>> {
373 BEGIN (bracketed_id_str ? SC_RETURN_BRACKETED_ID : INITIAL);
374 *loc = id_loc;
375 return ID;
376 }
377 }
378
379 /*--------------------------------.
380 | Scanning bracketed identifiers. |
381 `--------------------------------*/
382
383 <SC_BRACKETED_ID>
384 {
385 {id} {
386 if (bracketed_id_str)
387 {
388 complain_at (*loc, _("unexpected identifier in bracketed name: %s"),
389 quote (yytext));
390 }
391 else
392 {
393 bracketed_id_str = uniqstr_new (yytext);
394 bracketed_id_loc = *loc;
395 }
396 }
397 "]" {
398 BEGIN bracketed_id_context_state;
399 if (bracketed_id_str)
400 {
401 if (INITIAL == bracketed_id_context_state)
402 {
403 val->uniqstr = bracketed_id_str;
404 bracketed_id_str = 0;
405 *loc = bracketed_id_loc;
406 return BRACKETED_ID;
407 }
408 }
409 else
410 complain_at (*loc, _("an identifier expected"));
411 }
412 . {
413 complain_at (*loc, _("invalid character in bracketed name: %s"),
414 quote (yytext));
415 }
416 <<EOF>> {
417 BEGIN bracketed_id_context_state;
418 unexpected_eof (bracketed_id_start, "]");
419 }
420 }
421
422 <SC_RETURN_BRACKETED_ID>
423 {
424 . {
425 ROLLBACK_CURRENT_TOKEN;
426 val->uniqstr = bracketed_id_str;
427 bracketed_id_str = 0;
428 *loc = bracketed_id_loc;
429 BEGIN INITIAL;
430 return BRACKETED_ID;
431 }
432 }
433
434
435 /*---------------------------------------------------------------.
436 | Scanning a Yacc comment. The initial `/ *' is already eaten. |
437 `---------------------------------------------------------------*/
438
439 <SC_YACC_COMMENT>
440 {
441 "*/" BEGIN context_state;
442 .|\n ;
443 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
444 }
445
446
447 /*------------------------------------------------------------.
448 | Scanning a C comment. The initial `/ *' is already eaten. |
449 `------------------------------------------------------------*/
450
451 <SC_COMMENT>
452 {
453 "*"{splice}"/" STRING_GROW; BEGIN context_state;
454 <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
455 }
456
457
458 /*--------------------------------------------------------------.
459 | Scanning a line comment. The initial `//' is already eaten. |
460 `--------------------------------------------------------------*/
461
462 <SC_LINE_COMMENT>
463 {
464 "\n" STRING_GROW; BEGIN context_state;
465 {splice} STRING_GROW;
466 <<EOF>> BEGIN context_state;
467 }
468
469
470 /*------------------------------------------------.
471 | Scanning a Bison string, including its escapes. |
472 | The initial quote is already eaten. |
473 `------------------------------------------------*/
474
475 <SC_ESCAPED_STRING>
476 {
477 "\""|"\n" {
478 if (yytext[0] == '\n')
479 unexpected_newline (token_start, "\"");
480 STRING_FINISH;
481 loc->start = token_start;
482 val->chars = last_string;
483 BEGIN INITIAL;
484 return STRING;
485 }
486 <<EOF>> {
487 unexpected_eof (token_start, "\"");
488 STRING_FINISH;
489 loc->start = token_start;
490 val->chars = last_string;
491 BEGIN INITIAL;
492 return STRING;
493 }
494 }
495
496 /*----------------------------------------------------------.
497 | Scanning a Bison character literal, decoding its escapes. |
498 | The initial quote is already eaten. |
499 `----------------------------------------------------------*/
500
501 <SC_ESCAPED_CHARACTER>
502 {
503 "'"|"\n" {
504 STRING_FINISH;
505 loc->start = token_start;
506 val->character = last_string[0];
507 {
508 /* FIXME: Eventually, make these errors. */
509 if (last_string[0] == '\0')
510 {
511 warn_at (*loc, _("empty character literal"));
512 /* '\0' seems dangerous even if we are about to complain. */
513 val->character = '\'';
514 }
515 else if (last_string[1] != '\0')
516 warn_at (*loc, _("extra characters in character literal"));
517 }
518 if (yytext[0] == '\n')
519 unexpected_newline (token_start, "'");
520 STRING_FREE;
521 BEGIN INITIAL;
522 return CHAR;
523 }
524 <<EOF>> {
525 STRING_FINISH;
526 loc->start = token_start;
527 val->character = last_string[0];
528 {
529 /* FIXME: Eventually, make these errors. */
530 if (last_string[0] == '\0')
531 {
532 warn_at (*loc, _("empty character literal"));
533 /* '\0' seems dangerous even if we are about to complain. */
534 val->character = '\'';
535 }
536 else if (last_string[1] != '\0')
537 warn_at (*loc, _("extra characters in character literal"));
538 }
539 unexpected_eof (token_start, "'");
540 STRING_FREE;
541 BEGIN INITIAL;
542 return CHAR;
543 }
544 }
545
546 /*-----------------------------------------------------------.
547 | Scanning a Bison nested tag. The initial angle bracket is |
548 | already eaten. |
549 `-----------------------------------------------------------*/
550
551 <SC_TAG>
552 {
553 ">" {
554 --nesting;
555 if (nesting < 0)
556 {
557 STRING_FINISH;
558 loc->start = token_start;
559 val->uniqstr = uniqstr_new (last_string);
560 STRING_FREE;
561 BEGIN INITIAL;
562 return TAG;
563 }
564 STRING_GROW;
565 }
566
567 [^<>]+ STRING_GROW;
568 "<"+ STRING_GROW; nesting += yyleng;
569
570 <<EOF>> {
571 unexpected_eof (token_start, ">");
572 STRING_FINISH;
573 loc->start = token_start;
574 val->uniqstr = uniqstr_new (last_string);
575 STRING_FREE;
576 BEGIN INITIAL;
577 return TAG;
578 }
579 }
580
581 /*----------------------------.
582 | Decode escaped characters. |
583 `----------------------------*/
584
585 <SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>
586 {
587 \\[0-7]{1,3} {
588 unsigned long int c = strtoul (yytext + 1, NULL, 8);
589 if (!c || UCHAR_MAX < c)
590 complain_at (*loc, _("invalid number after \\-escape: %s"),
591 yytext+1);
592 else
593 obstack_1grow (&obstack_for_string, c);
594 }
595
596 \\x[0-9abcdefABCDEF]+ {
597 verify (UCHAR_MAX < ULONG_MAX);
598 unsigned long int c = strtoul (yytext + 2, NULL, 16);
599 if (!c || UCHAR_MAX < c)
600 complain_at (*loc, _("invalid number after \\-escape: %s"),
601 yytext+1);
602 else
603 obstack_1grow (&obstack_for_string, c);
604 }
605
606 \\a obstack_1grow (&obstack_for_string, '\a');
607 \\b obstack_1grow (&obstack_for_string, '\b');
608 \\f obstack_1grow (&obstack_for_string, '\f');
609 \\n obstack_1grow (&obstack_for_string, '\n');
610 \\r obstack_1grow (&obstack_for_string, '\r');
611 \\t obstack_1grow (&obstack_for_string, '\t');
612 \\v obstack_1grow (&obstack_for_string, '\v');
613
614 /* \\[\"\'?\\] would be shorter, but it confuses xgettext. */
615 \\("\""|"'"|"?"|"\\") obstack_1grow (&obstack_for_string, yytext[1]);
616
617 \\(u|U[0-9abcdefABCDEF]{4})[0-9abcdefABCDEF]{4} {
618 int c = convert_ucn_to_byte (yytext);
619 if (c <= 0)
620 complain_at (*loc, _("invalid number after \\-escape: %s"),
621 yytext+1);
622 else
623 obstack_1grow (&obstack_for_string, c);
624 }
625 \\(.|\n) {
626 char const *p = yytext + 1;
627 /* Quote only if escaping won't make the character visible. */
628 if (isspace ((unsigned char) *p) && isprint ((unsigned char) *p))
629 p = quote (p);
630 else
631 p = quotearg_style_mem (escape_quoting_style, p, 1);
632 complain_at (*loc, _("invalid character after \\-escape: %s"), p);
633 }
634 }
635
636 /*--------------------------------------------.
637 | Scanning user-code characters and strings. |
638 `--------------------------------------------*/
639
640 <SC_CHARACTER,SC_STRING>
641 {
642 {splice}|\\{splice}[^\n\[\]] STRING_GROW;
643 }
644
645 <SC_CHARACTER>
646 {
647 "'" STRING_GROW; BEGIN context_state;
648 \n unexpected_newline (token_start, "'"); BEGIN context_state;
649 <<EOF>> unexpected_eof (token_start, "'"); BEGIN context_state;
650 }
651
652 <SC_STRING>
653 {
654 "\"" STRING_GROW; BEGIN context_state;
655 \n unexpected_newline (token_start, "\""); BEGIN context_state;
656 <<EOF>> unexpected_eof (token_start, "\""); BEGIN context_state;
657 }
658
659
660 /*---------------------------------------------------.
661 | Strings, comments etc. can be found in user code. |
662 `---------------------------------------------------*/
663
664 <SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
665 {
666 "'" {
667 STRING_GROW;
668 context_state = YY_START;
669 token_start = loc->start;
670 BEGIN SC_CHARACTER;
671 }
672 "\"" {
673 STRING_GROW;
674 context_state = YY_START;
675 token_start = loc->start;
676 BEGIN SC_STRING;
677 }
678 "/"{splice}"*" {
679 STRING_GROW;
680 context_state = YY_START;
681 token_start = loc->start;
682 BEGIN SC_COMMENT;
683 }
684 "/"{splice}"/" {
685 STRING_GROW;
686 context_state = YY_START;
687 BEGIN SC_LINE_COMMENT;
688 }
689 }
690
691
692
693 /*-----------------------------------------------------------.
694 | Scanning some code in braces (actions). The initial "{" is |
695 | already eaten. |
696 `-----------------------------------------------------------*/
697
698 <SC_BRACED_CODE>
699 {
700 "{"|"<"{splice}"%" STRING_GROW; nesting++;
701 "%"{splice}">" STRING_GROW; nesting--;
702 "}" {
703 obstack_1grow (&obstack_for_string, '}');
704
705 --nesting;
706 if (nesting < 0)
707 {
708 STRING_FINISH;
709 loc->start = code_start;
710 val->code = last_string;
711 BEGIN INITIAL;
712 return BRACED_CODE;
713 }
714 }
715
716 /* Tokenize `<<%' correctly (as `<<' `%') rather than incorrectly
717 (as `<' `<%'). */
718 "<"{splice}"<" STRING_GROW;
719
720 <<EOF>> {
721 unexpected_eof (code_start, "}");
722 STRING_FINISH;
723 loc->start = code_start;
724 val->code = last_string;
725 BEGIN INITIAL;
726 return BRACED_CODE;
727 }
728 }
729
730
731 /*--------------------------------------------------------------.
732 | Scanning some prologue: from "%{" (already scanned) to "%}". |
733 `--------------------------------------------------------------*/
734
735 <SC_PROLOGUE>
736 {
737 "%}" {
738 STRING_FINISH;
739 loc->start = code_start;
740 val->chars = last_string;
741 BEGIN INITIAL;
742 return PROLOGUE;
743 }
744
745 <<EOF>> {
746 unexpected_eof (code_start, "%}");
747 STRING_FINISH;
748 loc->start = code_start;
749 val->chars = last_string;
750 BEGIN INITIAL;
751 return PROLOGUE;
752 }
753 }
754
755
756 /*---------------------------------------------------------------.
757 | Scanning the epilogue (everything after the second "%%", which |
758 | has already been eaten). |
759 `---------------------------------------------------------------*/
760
761 <SC_EPILOGUE>
762 {
763 <<EOF>> {
764 STRING_FINISH;
765 loc->start = code_start;
766 val->chars = last_string;
767 BEGIN INITIAL;
768 return EPILOGUE;
769 }
770 }
771
772
773 /*-----------------------------------------------------.
774 | By default, grow the string obstack with the input. |
775 `-----------------------------------------------------*/
776
777 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>. |
778 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>\n STRING_GROW;
779
780 %%
781
/* Fill BUF with at most SIZE bytes read from FP and return how many
   bytes BUF then holds.  Carriage returns never reach the caller:
   each "\r\n" pair and each lone '\r' is delivered as a single
   '\n'.  */

static size_t
no_cr_read (FILE *fp, char *buf, size_t size)
{
  size_t const count = fread (buf, 1, size, fp);
  char *cr = count ? memchr (buf, '\r', count) : NULL;

  if (cr)
    {
      /* One past the last byte that fread delivered.  */
      char const *const end = buf + count;
      /* Next unread input byte; starts just after the first '\r'.  */
      char const *in = cr + 1;
      /* Next output slot; starts on the first '\r' itself.  */
      char *out = cr;

      for (;;)
        {
          /* A '\r' was just consumed: emit '\n' in its place, then
             swallow a '\n' that directly follows it.  */
          *out++ = '\n';
          if (in == end)
            {
              /* The '\r' was the last byte of this chunk, so peek at
                 the stream to see whether a '\n' comes next.  Any
                 other byte is pushed back; give up if that fails.  */
              int next = getc (fp);
              if (next != '\n' && ungetc (next, fp) != next)
                break;
            }
          else if (*in == '\n')
            in++;

          /* Compact the following bytes down until the next '\r'.  */
          for (;;)
            {
              char c;
              if (in == end)
                return out - buf;
              c = *in++;
              if (c == '\r')
                break;
              *out++ = c;
            }
        }

      return out - buf;
    }

  return count;
}
827
828
829
830 /*------------------------------------------------------.
831 | Scan NUMBER for a base-BASE integer at location LOC. |
832 `------------------------------------------------------*/
833
834 static unsigned long int
835 scan_integer (char const *number, int base, location loc)
836 {
837 verify (INT_MAX < ULONG_MAX);
838 unsigned long int num = strtoul (number, NULL, base);
839
840 if (INT_MAX < num)
841 {
842 complain_at (loc, _("integer out of range: %s"), quote (number));
843 num = INT_MAX;
844 }
845
846 return num;
847 }
848
849
/*---------------------------------------------------------------.
| Convert the universal character name UCN to the single byte    |
| that encodes it and return that byte.  Return -1 when UCN has  |
| no single-byte equivalent.  The hex digits are read starting   |
| at UCN + 2, i.e. after the backslash and the `u' or `U'.       |
`---------------------------------------------------------------*/

static int
convert_ucn_to_byte (char const *ucn)
{
  /* Every byte value must be representable in the int result.  */
  verify (UCHAR_MAX <= INT_MAX);
  unsigned long int code = strtoul (ucn + 2, NULL, 16);

  /* FIXME: Currently we assume Unicode-compatible unibyte characters
     on ASCII hosts (i.e., Latin-1 on hosts with 8-bit bytes).  On
     non-ASCII hosts we support only the portable C character set.
     These limitations should be removed once we add support for
     multibyte characters.  */

  if (UCHAR_MAX < code)
    return -1;

#if ! ('$' == 0x24 && '@' == 0x40 && '`' == 0x60 && '~' == 0x7e)
  {
    /* A non-ASCII host.  Use CODE to index into a table of the C
       basic execution character set, which is guaranteed to exist on
       all Standard C platforms.  This table also includes '$', '@',
       and '`', which are not in the basic execution character set but
       which are unibyte characters on all the platforms that we know
       about.  */
    static signed char const table[] =
      {
        '\0',   -1,   -1,   -1,   -1,   -1,   -1, '\a',
        '\b', '\t', '\n', '\v', '\f', '\r',   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
         ' ',  '!',  '"',  '#',  '$',  '%',  '&', '\'',
         '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
         '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
         '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
         '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
         'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
         'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
         'X',  'Y',  'Z',  '[', '\\',  ']',  '^',  '_',
         '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
         'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
         'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
         'x',  'y',  'z',  '{',  '|',  '}',  '~'
      };

    /* Return the table entry as an int directly.  Storing the -1
       sentinel in the unsigned CODE and converting it back to int on
       return would rely on an implementation-defined conversion.  */
    if (sizeof table <= code)
      return -1;
    return table[code];
  }
#else
  /* An ASCII host: CODE is at most UCHAR_MAX here, so it fits in an
     int and is used as the byte value unchanged.  */
  return code;
#endif
}
905
906
907 /*----------------------------------------------------------------.
908 | Handle `#line INT "FILE"'. ARGS has already skipped `#line '. |
909 `----------------------------------------------------------------*/
910
911 static void
912 handle_syncline (char *args, location loc)
913 {
914 char *after_num;
915 unsigned long int lineno = strtoul (args, &after_num, 10);
916 char *file = strchr (after_num, '"') + 1;
917 *strchr (file, '"') = '\0';
918 if (INT_MAX <= lineno)
919 {
920 warn_at (loc, _("line number overflow"));
921 lineno = INT_MAX;
922 }
923 current_file = uniqstr_new (file);
924 boundary_set (&scanner_cursor, current_file, lineno, 1);
925 }
926
927
928 /*----------------------------------------------------------------.
929 | For a token or comment starting at START, report message MSGID, |
930 | which should say that an end marker was found before |
931 | the expected TOKEN_END. |
932 `----------------------------------------------------------------*/
933
934 static void
935 unexpected_end (boundary start, char const *msgid, char const *token_end)
936 {
937 location loc;
938 loc.start = start;
939 loc.end = scanner_cursor;
940 complain_at (loc, _(msgid), token_end);
941 }
942
943
944 /*------------------------------------------------------------------------.
945 | Report an unexpected EOF in a token or comment starting at START. |
946 | An end of file was encountered and the expected TOKEN_END was missing. |
947 `------------------------------------------------------------------------*/
948
949 static void
950 unexpected_eof (boundary start, char const *token_end)
951 {
952 unexpected_end (start, N_("missing `%s' at end of file"), token_end);
953 }
954
955
956 /*----------------------------------------.
957 | Likewise, but for unexpected newlines. |
958 `----------------------------------------*/
959
960 static void
961 unexpected_newline (boundary start, char const *token_end)
962 {
963 unexpected_end (start, N_("missing `%s' at end of line"), token_end);
964 }
965
966
967 /*-------------------------.
968 | Initialize the scanner. |
969 `-------------------------*/
970
971 void
972 gram_scanner_initialize (void)
973 {
974 obstack_init (&obstack_for_string);
975 }
976
977
978 /*-----------------------------------------------.
979 | Free all the memory allocated to the scanner. |
980 `-----------------------------------------------*/
981
982 void
983 gram_scanner_free (void)
984 {
985 obstack_free (&obstack_for_string, 0);
986 /* Reclaim Flex's buffers. */
987 yylex_destroy ();
988 }