src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "lex.h"
  27 #include "complain.h"
  28 #include "gram.h"
  29 #include "quote.h"
  30
/* Obstack in which the text of the current token is accumulated.  */
struct obstack token_obstack;
/* Text of the token most recently read by lex.  lex resets it to NULL
   on entry, then points it either at a finished object of
   TOKEN_OBSTACK or at a string literal.  */
const char *token_buffer = NULL;

/* Symbol returned by getsym for the last identifier or literal token
   read by lex.  */
bucket *symval;
/* Decimal value of the last tok_number token read by lex.  */
int numval;

/* These two describe a token to be reread by the next call to lex.
   UNLEXED is the token code, or tok_undef when nothing is pending.  */
static token_t unlexed = tok_undef;
/* Value SYMVAL had when unlex was called; lex restores it together
   with UNLEXED.  */
static bucket *unlexed_symval = NULL;
  42
  43
  44 void
  45 lex_init (void)
  46 {
  47   obstack_init (&token_obstack);
  48   unlexed = tok_undef;
  49 }
  50
  51
/* Release the storage held by TOKEN_OBSTACK.  Passing NULL as the
   object asks obstack_free to free everything in the obstack, so any
   TOKEN_BUFFER value that pointed into it must not be used
   afterwards.  */
void
lex_free (void)
{
  obstack_free (&token_obstack, NULL);
}
  57
  58
  59 int
  60 skip_white_space (void)
  61 {
  62   int c;
  63   int inside;
  64
  65   c = getc (finput);
  66
  67   for (;;)
  68     {
  69       int cplus_comment;
  70
  71       switch (c)
  72         {
  73         case '/':
  74           /* FIXME: Should probably be merged with copy_comment.  */
  75           c = getc (finput);
  76           if (c != '*' && c != '/')
  77             {
  78               complain (_("unexpected `/' found and ignored"));
  79               break;
  80             }
  81           cplus_comment = (c == '/');
  82
  83           c = getc (finput);
  84
  85           inside = 1;
  86           while (inside)
  87             {
  88               if (!cplus_comment && c == '*')
  89                 {
  90                   while (c == '*')
  91                     c = getc (finput);
  92
  93                   if (c == '/')
  94                     {
  95                       inside = 0;
  96                       c = getc (finput);
  97                     }
  98                 }
  99               else if (c == '\n')
 100                 {
 101                   lineno++;
 102                   if (cplus_comment)
 103                     inside = 0;
 104                   c = getc (finput);
 105                 }
 106               else if (c == EOF)
 107                 fatal (_("unterminated comment"));
 108               else
 109                 c = getc (finput);
 110             }
 111
 112           break;
 113
 114         case '\n':
 115           lineno++;
 116
 117         case ' ':
 118         case '\t':
 119         case '\f':
 120           c = getc (finput);
 121           break;
 122
 123         default:
 124           return c;
 125         }
 126     }
 127 }
 128
 129
 130 /*-----------------------------------------------------.
 131 | Do a getc, but give error message if EOF encountered |
 132 `-----------------------------------------------------*/
 133
 134 static int
 135 xgetc (FILE *f)
 136 {
 137   int c = getc (f);
 138   if (c == EOF)
 139     fatal (_("unexpected end of file"));
 140   return c;
 141 }
 142
 143
 144 /*------------------------------------------------------------------.
 145 | Read one literal character from finput.  Process \ escapes.       |
 146 | Append the normalized string version of the char to OUT.  Assign  |
 147 | the character code to *PCODE. Return 1 unless the character is an |
 148 | unescaped `term' or \n report error for \n.                       |
 149 `------------------------------------------------------------------*/
 150
 151 /* FIXME: We could directly work in the obstack, but that would make
 152    it more difficult to move to quotearg some day.  So for the time
 153    being, I prefer have literalchar behave like quotearg, and change
 154    my mind later if I was wrong.  */
 155
 156 static int
 157 literalchar (struct obstack *out, int *pcode, char term)
 158 {
 159   int c;
 160   char buf[4096];
 161   char *cp;
 162   int code;
 163   int wasquote = 0;
 164
 165   c = xgetc (finput);
 166   if (c == '\n')
 167     {
 168       complain (_("unescaped newline in constant"));
 169       ungetc (c, finput);
 170       code = '?';
 171       wasquote = 1;
 172     }
 173   else if (c != '\\')
 174     {
 175       code = c;
 176       if (c == term)
 177         wasquote = 1;
 178     }
 179   else
 180     {
 181       c = xgetc (finput);
 182       if (c == 't')
 183         code = '\t';
 184       else if (c == 'n')
 185         code = '\n';
 186       else if (c == 'a')
 187         code = '\007';
 188       else if (c == 'r')
 189         code = '\r';
 190       else if (c == 'f')
 191         code = '\f';
 192       else if (c == 'b')
 193         code = '\b';
 194       else if (c == 'v')
 195         code = '\013';
 196       else if (c == '\\')
 197         code = '\\';
 198       else if (c == '\'')
 199         code = '\'';
 200       else if (c == '\"')
 201         code = '\"';
 202       else if (c <= '7' && c >= '0')
 203         {
 204           code = 0;
 205           while (c <= '7' && c >= '0')
 206             {
 207               code = (code * 8) + (c - '0');
 208               if (code >= 256 || code < 0)
 209                 {
 210                   complain (_("octal value outside range 0...255: `\\%o'"),
 211                             code);
 212                   code &= 0xFF;
 213                   break;
 214                 }
 215               c = xgetc (finput);
 216             }
 217           ungetc (c, finput);
 218         }
 219       else if (c == 'x')
 220         {
 221           c = xgetc (finput);
 222           code = 0;
 223           while (1)
 224             {
 225               if (c >= '0' && c <= '9')
 226                 code *= 16, code += c - '0';
 227               else if (c >= 'a' && c <= 'f')
 228                 code *= 16, code += c - 'a' + 10;
 229               else if (c >= 'A' && c <= 'F')
 230                 code *= 16, code += c - 'A' + 10;
 231               else
 232                 break;
 233               if (code >= 256 || code < 0)
 234                 {
 235                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 236                   code &= 0xFF;
 237                   break;
 238                 }
 239               c = xgetc (finput);
 240             }
 241           ungetc (c, finput);
 242         }
 243       else
 244         {
 245           char badchar [] = "c";
 246           badchar[0] = c;
 247           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 248                     quote (badchar));
 249           code = '?';
 250         }
 251     }                           /* has \ */
 252
 253   /* now fill BUF with the canonical name for this character as a
 254      literal token.  Do not use what the user typed, so that `\012'
 255      and `\n' can be interchangeable.  */
 256
 257   cp = buf;
 258   if (code == term && wasquote)
 259     *cp++ = code;
 260   else if (code == '\\')
 261     {
 262       *cp++ = '\\';
 263       *cp++ = '\\';
 264     }
 265   else if (code == '\'')
 266     {
 267       *cp++ = '\\';
 268       *cp++ = '\'';
 269     }
 270   else if (code == '\"')
 271     {
 272       *cp++ = '\\';
 273       *cp++ = '\"';
 274     }
 275   else if (code >= 040 && code < 0177)
 276     *cp++ = code;
 277   else if (code == '\t')
 278     {
 279       *cp++ = '\\';
 280       *cp++ = 't';
 281     }
 282   else if (code == '\n')
 283     {
 284       *cp++ = '\\';
 285       *cp++ = 'n';
 286     }
 287   else if (code == '\r')
 288     {
 289       *cp++ = '\\';
 290       *cp++ = 'r';
 291     }
 292   else if (code == '\v')
 293     {
 294       *cp++ = '\\';
 295       *cp++ = 'v';
 296     }
 297   else if (code == '\b')
 298     {
 299       *cp++ = '\\';
 300       *cp++ = 'b';
 301     }
 302   else if (code == '\f')
 303     {
 304       *cp++ = '\\';
 305       *cp++ = 'f';
 306     }
 307   else
 308     {
 309       *cp++ = '\\';
 310       *cp++ = code / 0100 + '0';
 311       *cp++ = ((code / 010) & 07) + '0';
 312       *cp++ = (code & 07) + '0';
 313     }
 314   *cp = '\0';
 315
 316   if (out)
 317     obstack_sgrow (out, buf);
 318   *pcode = code;
 319   return !wasquote;
 320 }
 321
 322
 323 void
 324 unlex (int token)
 325 {
 326   unlexed = token;
 327   unlexed_symval = symval;
 328 }
 329
 330 /*-----------------------------------------------------------------.
 331 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 332 | specified between the `<...>'.                                   |
 333 `-----------------------------------------------------------------*/
 334
 335 void
 336 read_type_name (FILE *fin)
 337 {
 338   int c = getc (fin);
 339
 340   while (c != '>')
 341     {
 342       if (c == EOF)
 343         fatal (_("unterminated type name at end of file"));
 344       if (c == '\n')
 345         {
 346           complain (_("unterminated type name"));
 347           ungetc (c, fin);
 348           break;
 349         }
 350
 351       obstack_1grow (&token_obstack, c);
 352       c = getc (fin);
 353     }
 354   obstack_1grow (&token_obstack, '\0');
 355   token_buffer = obstack_finish (&token_obstack);
 356 }
 357
 358
 359 token_t
 360 lex (void)
 361 {
 362   int c;
 363
 364   /* Just to make sure. */
 365   token_buffer = NULL;
 366
 367   if (unlexed != tok_undef)
 368     {
 369       token_t res = unlexed;
 370       symval = unlexed_symval;
 371       unlexed = tok_undef;
 372       return res;
 373     }
 374
 375   c = skip_white_space ();
 376
 377   switch (c)
 378     {
 379     case EOF:
 380       token_buffer = "EOF";
 381       return tok_eof;
 382
 383     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 384     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 385     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 386     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 387     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 388     case 'Z':
 389     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 390     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 391     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 392     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 393     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 394     case 'z':
 395     case '.':    case '_':
 396
 397       while (isalnum (c) || c == '_' || c == '.')
 398         {
 399           obstack_1grow (&token_obstack, c);
 400           c = getc (finput);
 401         }
 402       obstack_1grow (&token_obstack, '\0');
 403       token_buffer = obstack_finish (&token_obstack);
 404       ungetc (c, finput);
 405       symval = getsym (token_buffer);
 406       return tok_identifier;
 407
 408     case '0':    case '1':    case '2':    case '3':    case '4':
 409     case '5':    case '6':    case '7':    case '8':    case '9':
 410       {
 411         numval = 0;
 412
 413         while (isdigit (c))
 414           {
 415             obstack_1grow (&token_obstack, c);
 416             numval = numval * 10 + c - '0';
 417             c = getc (finput);
 418           }
 419         obstack_1grow (&token_obstack, '\0');
 420         token_buffer = obstack_finish (&token_obstack);
 421         ungetc (c, finput);
 422         return tok_number;
 423       }
 424
 425     case '\'':
 426       /* parse the literal token and compute character code in  code  */
 427
 428       {
 429         int code, discode;
 430
 431         obstack_1grow (&token_obstack, '\'');
 432         literalchar (&token_obstack, &code, '\'');
 433
 434         c = getc (finput);
 435         if (c != '\'')
 436           {
 437             complain (_("use \"...\" for multi-character literal tokens"));
 438             while (1)
 439               if (!literalchar (0, &discode, '\''))
 440                 break;
 441           }
 442         obstack_1grow (&token_obstack, '\'');
 443         obstack_1grow (&token_obstack, '\0');
 444         token_buffer = obstack_finish (&token_obstack);
 445         symval = getsym (token_buffer);
 446         symval->class = token_sym;
 447         if (!symval->user_token_number)
 448           symval->user_token_number = code;
 449         return tok_identifier;
 450       }
 451
 452     case '\"':
 453       /* parse the literal string token and treat as an identifier */
 454
 455       {
 456         int code;               /* ignored here */
 457
 458         obstack_1grow (&token_obstack, '\"');
 459         /* Read up to and including ".  */
 460         while (literalchar (&token_obstack, &code, '\"'))
 461           /* nothing */;
 462         obstack_1grow (&token_obstack, '\0');
 463         token_buffer = obstack_finish (&token_obstack);
 464
 465         symval = getsym (token_buffer);
 466         symval->class = token_sym;
 467
 468         return tok_identifier;
 469       }
 470
 471     case ',':
 472       return tok_comma;
 473
 474     case ':':
 475       return tok_colon;
 476
 477     case ';':
 478       return tok_semicolon;
 479
 480     case '|':
 481       return tok_bar;
 482
 483     case '{':
 484       return tok_left_curly;
 485
 486     case '=':
 487       do
 488         {
 489           c = getc (finput);
 490           if (c == '\n')
 491             lineno++;
 492         }
 493       while (c == ' ' || c == '\n' || c == '\t');
 494
 495       if (c == '{')
 496         {
 497           token_buffer = "={";
 498           return tok_left_curly;
 499         }
 500       else
 501         {
 502           ungetc (c, finput);
 503           return tok_illegal;
 504         }
 505
 506     case '<':
 507       read_type_name (finput);
 508       return tok_typename;
 509
 510     case '%':
 511       return parse_percent_token ();
 512
 513     default:
 514       return tok_illegal;
 515     }
 516 }
 517
/* The following table dictates the action taken for the various %
   directives.  parse_percent_token looks a directive up by NAME (the
   text after the `%', with every `-' already turned into `_').  A
   non-null SET_FLAG causes the named flag to be set: it is treated as
   a pointer to an int which receives 1.  Otherwise RETVAL is the
   token code handed back to the caller.  */
struct percent_table_struct
{
  /* Directive name, without the leading `%'.  */
  const char *name;
  /* Address of the variable to set, or NULL.  */
  void *set_flag;
  /* Token code associated with the directive.  */
  token_t retval;
};

/* The table itself, terminated by an entry whose NAME is NULL; that
   sentinel carries tok_illegal, which is what an unknown directive
   yields.  The trailing comments appear to name the matching
   command line option.  NOTE(review): "locations" and "no_lines" are
   both tagged -l, which looks like a copy/paste slip -- confirm
   against getargs.  */
struct percent_table_struct percent_table[] =
{
  { "token",            NULL,                   tok_token },
  { "term",             NULL,                   tok_token },
  { "nterm",            NULL,                   tok_nterm },
  { "type",             NULL,                   tok_type },
  { "guard",            NULL,                   tok_guard },
  { "union",            NULL,                   tok_union },
  { "expect",           NULL,                   tok_expect },
  { "thong",            NULL,                   tok_thong },
  { "start",            NULL,                   tok_start },
  { "left",             NULL,                   tok_left },
  { "right",            NULL,                   tok_right },
  { "nonassoc",         NULL,                   tok_nonassoc },
  { "binary",           NULL,                   tok_nonassoc },
  { "prec",             NULL,                   tok_prec },
  { "locations",        &locations_flag,        tok_noop },     /* -l */
  { "no_lines",         &no_lines_flag,         tok_noop },     /* -l */
  { "raw",              NULL,                   tok_obsolete }, /* -r */
  { "token_table",      &token_table_flag,      tok_noop },     /* -k */
  { "yacc",             &yacc_flag,             tok_noop },     /* -y */
  { "fixed_output_files",&yacc_flag,            tok_noop },     /* -y */
  { "defines",          &defines_flag,          tok_noop },     /* -d */
  { "no_parser",        &no_parser_flag,        tok_noop },     /* -n */
  { "graph",            &graph_flag,            tok_noop },     /* -g */
#if 0
  /* For the time being, this is not enabled yet, while it's possible
     though, since we use obstacks.  The only risk is with semantic
     parsers which will output an `include' of an output file: be sure
     that the name included is indeed the name of the output file.  */
  { "output_file",      &spec_outfile,          tok_setopt },   /* -o */
  { "file_prefix",      &spec_file_prefix,      tok_setopt },   /* -b */
  { "name_prefix",      &spec_name_prefix,      tok_setopt },   /* -p */
#endif
  { "verbose",          &verbose_flag,          tok_noop },     /* -v */
  { "debug",            &debug_flag,            tok_noop },     /* -t */
  { "semantic_parser",  &semantic_parser,       tok_noop },
  { "pure_parser",      &pure_parser,           tok_noop },

  { NULL, NULL, tok_illegal}
};
 569
 570 /* Parse a token which starts with %.
 571    Assumes the % has already been read and discarded.  */
 572
 573 token_t
 574 parse_percent_token (void)
 575 {
 576   struct percent_table_struct *tx;
 577
 578   int c = getc (finput);
 579
 580   switch (c)
 581     {
 582     case '%':
 583       return tok_two_percents;
 584
 585     case '{':
 586       return tok_percent_left_curly;
 587
 588     case '<':
 589       return tok_left;
 590
 591     case '>':
 592       return tok_right;
 593
 594     case '2':
 595       return tok_nonassoc;
 596
 597     case '0':
 598       return tok_token;
 599
 600     case '=':
 601       return tok_prec;
 602     }
 603
 604   if (!isalpha (c))
 605     return tok_illegal;
 606
 607   obstack_1grow (&token_obstack, '%');
 608   while (isalpha (c) || c == '_' || c == '-')
 609     {
 610       if (c == '-')
 611         c = '_';
 612       obstack_1grow (&token_obstack, c);
 613       c = getc (finput);
 614     }
 615
 616   ungetc (c, finput);
 617   obstack_1grow (&token_obstack, '\0');
 618   token_buffer = obstack_finish (&token_obstack);
 619
 620   /* table lookup % directive */
 621   for (tx = percent_table; tx->name; tx++)
 622     if (strcmp (token_buffer + 1, tx->name) == 0)
 623       break;
 624
 625   if (tx->set_flag)
 626     {
 627       *((int *) (tx->set_flag)) = 1;
 628       return tok_noop;
 629     }
 630
 631   switch (tx->retval)
 632     {
 633     case tok_setopt:
 634       *((char **) (tx->set_flag)) = optarg;
 635       return tok_noop;
 636       break;
 637
 638     case tok_obsolete:
 639       fatal (_("`%s' is no longer supported"), token_buffer);
 640       break;
 641
 642     default:
 643       /* Other cases do not apply here. */
 644       break;
 645     }
 646
 647   return tx->retval;
 648 }