src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
#include "system.h"
#include <limits.h>
#include "getargs.h"
#include "files.h"
#include "symtab.h"
#include "lex.h"
#include "complain.h"
#include "gram.h"
#include "quote.h"
  29
  30 /* Buffer for storing the current token.  */
  31 struct obstack token_obstack;
  32 const char *token_buffer = NULL;
  33
  34 bucket *symval = NULL;
  35 int numval;
  36
  37 /* A token to be reread, see unlex and lex. */
  38 static token_t unlexed = tok_undef;
  39 static bucket *unlexed_symval = NULL;
  40 static const char *unlexed_token_buffer = NULL;
  41
  42 void
  43 lex_init (void)
  44 {
  45   obstack_init (&token_obstack);
  46   unlexed = tok_undef;
  47 }
  48
  49
  50 void
  51 lex_free (void)
  52 {
  53   obstack_free (&token_obstack, NULL);
  54 }
  55
  56
  57 int
  58 skip_white_space (void)
  59 {
  60   int c;
  61   int inside;
  62
  63   c = getc (finput);
  64
  65   for (;;)
  66     {
  67       int cplus_comment;
  68
  69       switch (c)
  70         {
  71         case '/':
  72           /* FIXME: Should probably be merged with copy_comment.  */
  73           c = getc (finput);
  74           if (c != '*' && c != '/')
  75             {
  76               complain (_("unexpected `/' found and ignored"));
  77               break;
  78             }
  79           cplus_comment = (c == '/');
  80
  81           c = getc (finput);
  82
  83           inside = 1;
  84           while (inside)
  85             {
  86               if (!cplus_comment && c == '*')
  87                 {
  88                   while (c == '*')
  89                     c = getc (finput);
  90
  91                   if (c == '/')
  92                     {
  93                       inside = 0;
  94                       c = getc (finput);
  95                     }
  96                 }
  97               else if (c == '\n')
  98                 {
  99                   lineno++;
 100                   if (cplus_comment)
 101                     inside = 0;
 102                   c = getc (finput);
 103                 }
 104               else if (c == EOF)
 105                 fatal (_("unterminated comment"));
 106               else
 107                 c = getc (finput);
 108             }
 109
 110           break;
 111
 112         case '\n':
 113           lineno++;
 114
 115         case ' ':
 116         case '\t':
 117         case '\f':
 118           c = getc (finput);
 119           break;
 120
 121         default:
 122           return c;
 123         }
 124     }
 125 }
 126
 127
 128 /*-----------------------------------------------------.
 129 | Do a getc, but give error message if EOF encountered |
 130 `-----------------------------------------------------*/
 131
 132 static int
 133 xgetc (FILE *f)
 134 {
 135   int c = getc (f);
 136   if (c == EOF)
 137     fatal (_("unexpected end of file"));
 138   return c;
 139 }
 140
 141
 142 /*------------------------------------------------------------------.
 143 | Read one literal character from finput.  Process \ escapes.       |
 144 | Append the normalized string version of the char to OUT.  Assign  |
 145 | the character code to *PCODE. Return 1 unless the character is an |
 146 | unescaped `term' or \n report error for \n.                       |
 147 `------------------------------------------------------------------*/
 148
 149 /* FIXME: We could directly work in the obstack, but that would make
 150    it more difficult to move to quotearg some day.  So for the time
 151    being, I prefer have literalchar behave like quotearg, and change
 152    my mind later if I was wrong.  */
 153
 154 static int
 155 literalchar (struct obstack *out, int *pcode, char term)
 156 {
 157   int c;
 158   char buf[4096];
 159   char *cp;
 160   int code;
 161   int wasquote = 0;
 162
 163   c = xgetc (finput);
 164   if (c == '\n')
 165     {
 166       complain (_("unescaped newline in constant"));
 167       ungetc (c, finput);
 168       code = '?';
 169       wasquote = 1;
 170     }
 171   else if (c != '\\')
 172     {
 173       code = c;
 174       if (c == term)
 175         wasquote = 1;
 176     }
 177   else
 178     {
 179       c = xgetc (finput);
 180       if (c == 't')
 181         code = '\t';
 182       else if (c == 'n')
 183         code = '\n';
 184       else if (c == 'a')
 185         code = '\007';
 186       else if (c == 'r')
 187         code = '\r';
 188       else if (c == 'f')
 189         code = '\f';
 190       else if (c == 'b')
 191         code = '\b';
 192       else if (c == 'v')
 193         code = '\013';
 194       else if (c == '\\')
 195         code = '\\';
 196       else if (c == '\'')
 197         code = '\'';
 198       else if (c == '\"')
 199         code = '\"';
 200       else if (c <= '7' && c >= '0')
 201         {
 202           code = 0;
 203           while (c <= '7' && c >= '0')
 204             {
 205               code = (code * 8) + (c - '0');
 206               if (code >= 256 || code < 0)
 207                 {
 208                   complain (_("octal value outside range 0...255: `\\%o'"),
 209                             code);
 210                   code &= 0xFF;
 211                   break;
 212                 }
 213               c = xgetc (finput);
 214             }
 215           ungetc (c, finput);
 216         }
 217       else if (c == 'x')
 218         {
 219           c = xgetc (finput);
 220           code = 0;
 221           while (1)
 222             {
 223               if (c >= '0' && c <= '9')
 224                 code *= 16, code += c - '0';
 225               else if (c >= 'a' && c <= 'f')
 226                 code *= 16, code += c - 'a' + 10;
 227               else if (c >= 'A' && c <= 'F')
 228                 code *= 16, code += c - 'A' + 10;
 229               else
 230                 break;
 231               if (code >= 256 || code < 0)
 232                 {
 233                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 234                   code &= 0xFF;
 235                   break;
 236                 }
 237               c = xgetc (finput);
 238             }
 239           ungetc (c, finput);
 240         }
 241       else
 242         {
 243           char badchar [] = "c";
 244           badchar[0] = c;
 245           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 246                     quote (badchar));
 247           code = '?';
 248         }
 249     }                           /* has \ */
 250
 251   /* now fill BUF with the canonical name for this character as a
 252      literal token.  Do not use what the user typed, so that `\012'
 253      and `\n' can be interchangeable.  */
 254
 255   cp = buf;
 256   if (code == term && wasquote)
 257     *cp++ = code;
 258   else if (code == '\\')
 259     {
 260       *cp++ = '\\';
 261       *cp++ = '\\';
 262     }
 263   else if (code == '\'')
 264     {
 265       *cp++ = '\\';
 266       *cp++ = '\'';
 267     }
 268   else if (code == '\"')
 269     {
 270       *cp++ = '\\';
 271       *cp++ = '\"';
 272     }
 273   else if (code >= 040 && code < 0177)
 274     *cp++ = code;
 275   else if (code == '\t')
 276     {
 277       *cp++ = '\\';
 278       *cp++ = 't';
 279     }
 280   else if (code == '\n')
 281     {
 282       *cp++ = '\\';
 283       *cp++ = 'n';
 284     }
 285   else if (code == '\r')
 286     {
 287       *cp++ = '\\';
 288       *cp++ = 'r';
 289     }
 290   else if (code == '\v')
 291     {
 292       *cp++ = '\\';
 293       *cp++ = 'v';
 294     }
 295   else if (code == '\b')
 296     {
 297       *cp++ = '\\';
 298       *cp++ = 'b';
 299     }
 300   else if (code == '\f')
 301     {
 302       *cp++ = '\\';
 303       *cp++ = 'f';
 304     }
 305   else
 306     {
 307       *cp++ = '\\';
 308       *cp++ = code / 0100 + '0';
 309       *cp++ = ((code / 010) & 07) + '0';
 310       *cp++ = (code & 07) + '0';
 311     }
 312   *cp = '\0';
 313
 314   if (out)
 315     obstack_sgrow (out, buf);
 316   *pcode = code;
 317   return !wasquote;
 318 }
 319
 320
 321 void
 322 unlex (token_t token)
 323 {
 324   unlexed = token;
 325   unlexed_token_buffer = token_buffer;
 326   unlexed_symval = symval;
 327 }
 328
 329 /*-----------------------------------------------------------------.
 330 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 331 | specified between the `<...>'.                                   |
 332 `-----------------------------------------------------------------*/
 333
 334 void
 335 read_type_name (FILE *fin)
 336 {
 337   int c = getc (fin);
 338
 339   while (c != '>')
 340     {
 341       if (c == EOF)
 342         fatal (_("unterminated type name at end of file"));
 343       if (c == '\n')
 344         {
 345           complain (_("unterminated type name"));
 346           ungetc (c, fin);
 347           break;
 348         }
 349
 350       obstack_1grow (&token_obstack, c);
 351       c = getc (fin);
 352     }
 353   obstack_1grow (&token_obstack, '\0');
 354   token_buffer = obstack_finish (&token_obstack);
 355 }
 356
 357
 358 token_t
 359 lex (void)
 360 {
 361   int c;
 362
 363   /* Just to make sure. */
 364   token_buffer = NULL;
 365
 366   if (unlexed != tok_undef)
 367     {
 368       token_t res = unlexed;
 369       symval = unlexed_symval;
 370       token_buffer = unlexed_token_buffer;
 371       unlexed = tok_undef;
 372       return res;
 373     }
 374
 375   c = skip_white_space ();
 376
 377   switch (c)
 378     {
 379     case EOF:
 380       token_buffer = "EOF";
 381       return tok_eof;
 382
 383     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 384     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 385     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 386     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 387     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 388     case 'Z':
 389     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 390     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 391     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 392     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 393     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 394     case 'z':
 395     case '.':    case '_':
 396
 397       while (isalnum (c) || c == '_' || c == '.')
 398         {
 399           obstack_1grow (&token_obstack, c);
 400           c = getc (finput);
 401         }
 402       obstack_1grow (&token_obstack, '\0');
 403       token_buffer = obstack_finish (&token_obstack);
 404       ungetc (c, finput);
 405       symval = getsym (token_buffer);
 406       return tok_identifier;
 407
 408     case '0':    case '1':    case '2':    case '3':    case '4':
 409     case '5':    case '6':    case '7':    case '8':    case '9':
 410       {
 411         numval = 0;
 412
 413         while (isdigit (c))
 414           {
 415             obstack_1grow (&token_obstack, c);
 416             numval = numval * 10 + c - '0';
 417             c = getc (finput);
 418           }
 419         obstack_1grow (&token_obstack, '\0');
 420         token_buffer = obstack_finish (&token_obstack);
 421         ungetc (c, finput);
 422         return tok_number;
 423       }
 424
 425     case '\'':
 426       /* parse the literal token and compute character code in  code  */
 427
 428       {
 429         int code;
 430
 431         obstack_1grow (&token_obstack, '\'');
 432         literalchar (&token_obstack, &code, '\'');
 433
 434         c = getc (finput);
 435         if (c != '\'')
 436           {
 437             int discode;
 438             complain (_("use \"...\" for multi-character literal tokens"));
 439             while (1)
 440               if (!literalchar (0, &discode, '\''))
 441                 break;
 442           }
 443         obstack_1grow (&token_obstack, '\'');
 444         obstack_1grow (&token_obstack, '\0');
 445         token_buffer = obstack_finish (&token_obstack);
 446         symval = getsym (token_buffer);
 447         symval->class = token_sym;
 448         if (symval->user_token_number == SUNDEF)
 449           symval->user_token_number = code;
 450         return tok_identifier;
 451       }
 452
 453     case '\"':
 454       /* parse the literal string token and treat as an identifier */
 455
 456       {
 457         int code;               /* ignored here */
 458
 459         obstack_1grow (&token_obstack, '\"');
 460         /* Read up to and including ".  */
 461         while (literalchar (&token_obstack, &code, '\"'))
 462           /* nothing */;
 463         obstack_1grow (&token_obstack, '\0');
 464         token_buffer = obstack_finish (&token_obstack);
 465
 466         symval = getsym (token_buffer);
 467         symval->class = token_sym;
 468
 469         return tok_identifier;
 470       }
 471
 472     case ',':
 473       token_buffer = ",";
 474       return tok_comma;
 475
 476     case ':':
 477       token_buffer = ":";
 478       return tok_colon;
 479
 480     case ';':
 481       token_buffer = ";";
 482       return tok_semicolon;
 483
 484     case '|':
 485       token_buffer = "|";
 486       return tok_bar;
 487
 488     case '{':
 489       token_buffer = "{";
 490       return tok_left_curly;
 491
 492     case '=':
 493       obstack_1grow (&token_obstack, c);
 494       do
 495         {
 496           c = getc (finput);
 497           obstack_1grow (&token_obstack, c);
 498           if (c == '\n')
 499             lineno++;
 500         }
 501       while (c == ' ' || c == '\n' || c == '\t');
 502       obstack_1grow (&token_obstack, '\0');
 503       token_buffer = obstack_finish (&token_obstack);
 504
 505       if (c == '{')
 506         {
 507           return tok_left_curly;
 508         }
 509       else
 510         {
 511           ungetc (c, finput);
 512           return tok_illegal;
 513         }
 514
 515     case '<':
 516       read_type_name (finput);
 517       return tok_typename;
 518
 519     case '%':
 520       return parse_percent_token ();
 521
 522     default:
 523       obstack_1grow (&token_obstack, c);
 524       obstack_1grow (&token_obstack, '\0');
 525       token_buffer = obstack_finish (&token_obstack);
 526       return tok_illegal;
 527     }
 528 }
 529
 530 /* the following table dictates the action taken for the various %
 531    directives.  A set_flag value causes the named flag to be set.  A
 532    retval action returns the code.  */
 533 struct percent_table_struct
 534 {
 535   const char *name;
 536   void *set_flag;
 537   token_t retval;
 538 };
 539
 540 struct percent_table_struct percent_table[] =
 541 {
 542   { "token",            NULL,                   tok_token },
 543   { "term",             NULL,                   tok_token },
 544   { "nterm",            NULL,                   tok_nterm },
 545   { "type",             NULL,                   tok_type },
 546   { "guard",            NULL,                   tok_guard },
 547   { "union",            NULL,                   tok_union },
 548   { "expect",           NULL,                   tok_expect },
 549   { "thong",            NULL,                   tok_thong },
 550   { "start",            NULL,                   tok_start },
 551   { "left",             NULL,                   tok_left },
 552   { "right",            NULL,                   tok_right },
 553   { "nonassoc",         NULL,                   tok_nonassoc },
 554   { "binary",           NULL,                   tok_nonassoc },
 555   { "prec",             NULL,                   tok_prec },
 556   { "locations",        &locations_flag,        tok_intopt },   /* -l */
 557   { "no-lines",         &no_lines_flag,         tok_intopt },   /* -l */
 558   { "raw",              NULL,                   tok_obsolete }, /* -r */
 559   { "token-table",      &token_table_flag,      tok_intopt },   /* -k */
 560   { "yacc",             &yacc_flag,             tok_intopt },   /* -y */
 561   { "fixed-output-files",&yacc_flag,            tok_intopt },   /* -y */
 562   { "defines",          &defines_flag,          tok_intopt },   /* -d */
 563   { "no-parser",        &no_parser_flag,        tok_intopt },   /* -n */
 564   { "graph",            &graph_flag,            tok_intopt },   /* -g */
 565
 566   /* FIXME: semantic parsers which will output an `include' of an
 567      output file: be sure that the name included is indeed the name of
 568      the output file.  */
 569   { "output",           &spec_outfile,          tok_stropt },   /* -o */
 570   { "file-prefix",      &spec_file_prefix,      tok_stropt },   /* -b */
 571   { "name-prefix",      &spec_name_prefix,      tok_stropt },   /* -p */
 572
 573   { "verbose",          &verbose_flag,          tok_intopt },   /* -v */
 574   { "debug",            &debug_flag,            tok_intopt },   /* -t */
 575   { "semantic-parser",  &semantic_parser,       tok_intopt },
 576   { "pure-parser",      &pure_parser,           tok_intopt },
 577
 578   { NULL, NULL, tok_illegal}
 579 };
 580
 581 /* Parse a token which starts with %.
 582    Assumes the % has already been read and discarded.  */
 583
 584 token_t
 585 parse_percent_token (void)
 586 {
 587   struct percent_table_struct *tx = NULL;
 588   const char *arg = NULL;
 589   /* Where the ARG was found in token_buffer. */
 590   size_t arg_offset = 0;
 591
 592   int c = getc (finput);
 593
 594   switch (c)
 595     {
 596     case '%':
 597       return tok_two_percents;
 598
 599     case '{':
 600       return tok_percent_left_curly;
 601
 602       /* FIXME: Who the heck are those 5 guys!?! `%<' = `%left'!!!
 603          Let's ask for there removal.  */
 604     case '<':
 605       return tok_left;
 606
 607     case '>':
 608       return tok_right;
 609
 610     case '2':
 611       return tok_nonassoc;
 612
 613     case '0':
 614       return tok_token;
 615
 616     case '=':
 617       return tok_prec;
 618     }
 619
 620   if (!isalpha (c))
 621     return tok_illegal;
 622
 623   obstack_1grow (&token_obstack, '%');
 624   while (isalpha (c) || c == '_' || c == '-')
 625     {
 626       if (c == '_')
 627         c = '-';
 628       obstack_1grow (&token_obstack, c);
 629       c = getc (finput);
 630     }
 631
 632   /* %DIRECTIVE="ARG".  Separate into
 633      TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'.
 634      This is a bit hackish, but once we move to a Bison parser,
 635      things will be cleaned up.  */
 636   if (c == '=')
 637     {
 638       /* End of the directive.  We skip the `='. */
 639       obstack_1grow (&token_obstack, '\0');
 640       /* Fetch the ARG if present. */
 641       c = getc (finput);
 642       if (c == '"')
 643         {
 644           int code;
 645           arg_offset = obstack_object_size (&token_obstack);
 646           /* Read up to and including `"'.  Do not append the closing
 647              `"' in the output: it's not part of the ARG.  */
 648           while (literalchar (NULL, &code, '"'))
 649             obstack_1grow (&token_obstack, code);
 650         }
 651       /* else: should be an error. */
 652     }
 653   else
 654     ungetc (c, finput);
 655
 656   obstack_1grow (&token_obstack, '\0');
 657   token_buffer = obstack_finish (&token_obstack);
 658   if (arg_offset)
 659     arg = token_buffer + arg_offset;
 660
 661   /* table lookup % directive */
 662   for (tx = percent_table; tx->name; tx++)
 663     if (strcmp (token_buffer + 1, tx->name) == 0)
 664       break;
 665
 666   if (arg && tx->retval != tok_stropt)
 667     fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg));
 668
 669   switch (tx->retval)
 670     {
 671     case tok_stropt:
 672       assert (tx->set_flag);
 673       if (arg)
 674         {
 675           /* Keep only the first assignment: command line options have
 676              already been processed, and we want them to have
 677              precedence.  Side effect: if this %-option is used
 678              several times, only the first is honored.  Bah.  */
 679           if (!*((char **) (tx->set_flag)))
 680             *((char **) (tx->set_flag)) = xstrdup (arg);
 681         }
 682       else
 683         fatal (_("`%s' requires an argument"), token_buffer);
 684       return tok_noop;
 685       break;
 686
 687     case tok_intopt:
 688       assert (tx->set_flag);
 689       *((int *) (tx->set_flag)) = 1;
 690       return tok_noop;
 691       break;
 692
 693     case tok_obsolete:
 694       fatal (_("`%s' is no longer supported"), token_buffer);
 695       return tok_noop;
 696       break;
 697
 698     default:
 699       return tx->retval;
 700       break;
 701     }
 702   abort ();
 703 }