src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
#include "system.h"
#include <limits.h>
#include "getargs.h"
#include "files.h"
#include "symtab.h"
#include "options.h"
#include "lex.h"
#include "complain.h"
#include "gram.h"
#include "quote.h"
  30
  31 /* Buffer for storing the current token.  */
  32 static struct obstack token_obstack;
  33 const char *token_buffer = NULL;
  34
  35 bucket *symval = NULL;
  36 int numval;
  37
  38 /* A token to be reread, see unlex and lex. */
  39 static token_t unlexed = tok_undef;
  40 static bucket *unlexed_symval = NULL;
  41 static const char *unlexed_token_buffer = NULL;
  42
  43 void
  44 lex_init (void)
  45 {
  46   obstack_init (&token_obstack);
  47   unlexed = tok_undef;
  48 }
  49
  50
  51 void
  52 lex_free (void)
  53 {
  54   obstack_free (&token_obstack, NULL);
  55 }
  56
  57
  58 int
  59 skip_white_space (void)
  60 {
  61   int c;
  62   int inside;
  63
  64   c = getc (finput);
  65
  66   for (;;)
  67     {
  68       int cplus_comment;
  69
  70       switch (c)
  71         {
  72         case '/':
  73           /* FIXME: Should probably be merged with copy_comment.  */
  74           c = getc (finput);
  75           if (c != '*' && c != '/')
  76             {
  77               complain (_("unexpected `/' found and ignored"));
  78               break;
  79             }
  80           cplus_comment = (c == '/');
  81
  82           c = getc (finput);
  83
  84           inside = 1;
  85           while (inside)
  86             {
  87               if (!cplus_comment && c == '*')
  88                 {
  89                   while (c == '*')
  90                     c = getc (finput);
  91
  92                   if (c == '/')
  93                     {
  94                       inside = 0;
  95                       c = getc (finput);
  96                     }
  97                 }
  98               else if (c == '\n')
  99                 {
 100                   lineno++;
 101                   if (cplus_comment)
 102                     inside = 0;
 103                   c = getc (finput);
 104                 }
 105               else if (c == EOF)
 106                 fatal (_("unterminated comment"));
 107               else
 108                 c = getc (finput);
 109             }
 110
 111           break;
 112
 113         case '\n':
 114           lineno++;
 115
 116         case ' ':
 117         case '\t':
 118         case '\f':
 119           c = getc (finput);
 120           break;
 121
 122         default:
 123           return c;
 124         }
 125     }
 126 }
 127
 128
 129 /*-----------------------------------------------------.
 130 | Do a getc, but give error message if EOF encountered |
 131 `-----------------------------------------------------*/
 132
 133 static int
 134 xgetc (FILE *f)
 135 {
 136   int c = getc (f);
 137   if (c == EOF)
 138     fatal (_("unexpected end of file"));
 139   return c;
 140 }
 141
 142
 143 /*------------------------------------------------------------------.
 144 | Read one literal character from finput.  Process \ escapes.       |
 145 | Append the normalized string version of the char to OUT.  Assign  |
 146 | the character code to *PCODE. Return 1 unless the character is an |
 147 | unescaped `term' or \n report error for \n.                       |
 148 `------------------------------------------------------------------*/
 149
 150 /* FIXME: We could directly work in the obstack, but that would make
 151    it more difficult to move to quotearg some day.  So for the time
 152    being, I prefer have literalchar behave like quotearg, and change
 153    my mind later if I was wrong.  */
 154
 155 static int
 156 literalchar (struct obstack *out, int *pcode, char term)
 157 {
 158   int c;
 159   char buf[4096];
 160   char *cp;
 161   int code;
 162   int wasquote = 0;
 163
 164   c = xgetc (finput);
 165   if (c == '\n')
 166     {
 167       complain (_("unescaped newline in constant"));
 168       ungetc (c, finput);
 169       code = '?';
 170       wasquote = 1;
 171     }
 172   else if (c != '\\')
 173     {
 174       code = c;
 175       if (c == term)
 176         wasquote = 1;
 177     }
 178   else
 179     {
 180       c = xgetc (finput);
 181       if (c == 't')
 182         code = '\t';
 183       else if (c == 'n')
 184         code = '\n';
 185       else if (c == 'a')
 186         code = '\007';
 187       else if (c == 'r')
 188         code = '\r';
 189       else if (c == 'f')
 190         code = '\f';
 191       else if (c == 'b')
 192         code = '\b';
 193       else if (c == 'v')
 194         code = '\013';
 195       else if (c == '\\')
 196         code = '\\';
 197       else if (c == '\'')
 198         code = '\'';
 199       else if (c == '\"')
 200         code = '\"';
 201       else if (c <= '7' && c >= '0')
 202         {
 203           code = 0;
 204           while (c <= '7' && c >= '0')
 205             {
 206               code = (code * 8) + (c - '0');
 207               if (code >= 256 || code < 0)
 208                 {
 209                   complain (_("octal value outside range 0...255: `\\%o'"),
 210                             code);
 211                   code &= 0xFF;
 212                   break;
 213                 }
 214               c = xgetc (finput);
 215             }
 216           ungetc (c, finput);
 217         }
 218       else if (c == 'x')
 219         {
 220           c = xgetc (finput);
 221           code = 0;
 222           while (1)
 223             {
 224               if (c >= '0' && c <= '9')
 225                 code *= 16, code += c - '0';
 226               else if (c >= 'a' && c <= 'f')
 227                 code *= 16, code += c - 'a' + 10;
 228               else if (c >= 'A' && c <= 'F')
 229                 code *= 16, code += c - 'A' + 10;
 230               else
 231                 break;
 232               if (code >= 256 || code < 0)
 233                 {
 234                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 235                   code &= 0xFF;
 236                   break;
 237                 }
 238               c = xgetc (finput);
 239             }
 240           ungetc (c, finput);
 241         }
 242       else
 243         {
 244           char badchar [] = "c";
 245           badchar[0] = c;
 246           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 247                     quote (badchar));
 248           code = '?';
 249         }
 250     }                           /* has \ */
 251
 252   /* now fill BUF with the canonical name for this character as a
 253      literal token.  Do not use what the user typed, so that `\012'
 254      and `\n' can be interchangeable.  */
 255
 256   cp = buf;
 257   if (code == term && wasquote)
 258     *cp++ = code;
 259   else if (code == '\\')
 260     {
 261       *cp++ = '\\';
 262       *cp++ = '\\';
 263     }
 264   else if (code == '\'')
 265     {
 266       *cp++ = '\\';
 267       *cp++ = '\'';
 268     }
 269   else if (code == '\"')
 270     {
 271       *cp++ = '\\';
 272       *cp++ = '\"';
 273     }
 274   else if (code >= 040 && code < 0177)
 275     *cp++ = code;
 276   else if (code == '\t')
 277     {
 278       *cp++ = '\\';
 279       *cp++ = 't';
 280     }
 281   else if (code == '\n')
 282     {
 283       *cp++ = '\\';
 284       *cp++ = 'n';
 285     }
 286   else if (code == '\r')
 287     {
 288       *cp++ = '\\';
 289       *cp++ = 'r';
 290     }
 291   else if (code == '\v')
 292     {
 293       *cp++ = '\\';
 294       *cp++ = 'v';
 295     }
 296   else if (code == '\b')
 297     {
 298       *cp++ = '\\';
 299       *cp++ = 'b';
 300     }
 301   else if (code == '\f')
 302     {
 303       *cp++ = '\\';
 304       *cp++ = 'f';
 305     }
 306   else
 307     {
 308       *cp++ = '\\';
 309       *cp++ = code / 0100 + '0';
 310       *cp++ = ((code / 010) & 07) + '0';
 311       *cp++ = (code & 07) + '0';
 312     }
 313   *cp = '\0';
 314
 315   if (out)
 316     obstack_sgrow (out, buf);
 317   *pcode = code;
 318   return !wasquote;
 319 }
 320
 321
 322 void
 323 unlex (token_t token)
 324 {
 325   unlexed = token;
 326   unlexed_token_buffer = token_buffer;
 327   unlexed_symval = symval;
 328 }
 329
 330 /*-----------------------------------------------------------------.
 331 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 332 | specified between the `<...>'.                                   |
 333 `-----------------------------------------------------------------*/
 334
 335 void
 336 read_type_name (FILE *fin)
 337 {
 338   int c = getc (fin);
 339
 340   while (c != '>')
 341     {
 342       if (c == EOF)
 343         fatal (_("unterminated type name at end of file"));
 344       if (c == '\n')
 345         {
 346           complain (_("unterminated type name"));
 347           ungetc (c, fin);
 348           break;
 349         }
 350
 351       obstack_1grow (&token_obstack, c);
 352       c = getc (fin);
 353     }
 354   obstack_1grow (&token_obstack, '\0');
 355   token_buffer = obstack_finish (&token_obstack);
 356 }
 357
 358
 359 token_t
 360 lex (void)
 361 {
 362   int c;
 363
 364   /* Just to make sure. */
 365   token_buffer = NULL;
 366
 367   if (unlexed != tok_undef)
 368     {
 369       token_t res = unlexed;
 370       symval = unlexed_symval;
 371       token_buffer = unlexed_token_buffer;
 372       unlexed = tok_undef;
 373       return res;
 374     }
 375
 376   c = skip_white_space ();
 377
 378   switch (c)
 379     {
 380     case EOF:
 381       token_buffer = "EOF";
 382       return tok_eof;
 383
 384     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 385     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 386     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 387     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 388     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 389     case 'Z':
 390     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 391     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 392     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 393     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 394     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 395     case 'z':
 396     case '.':    case '_':
 397
 398       while (isalnum (c) || c == '_' || c == '.')
 399         {
 400           obstack_1grow (&token_obstack, c);
 401           c = getc (finput);
 402         }
 403       obstack_1grow (&token_obstack, '\0');
 404       token_buffer = obstack_finish (&token_obstack);
 405       ungetc (c, finput);
 406       symval = getsym (token_buffer);
 407       return tok_identifier;
 408
 409     case '0':    case '1':    case '2':    case '3':    case '4':
 410     case '5':    case '6':    case '7':    case '8':    case '9':
 411       {
 412         numval = 0;
 413
 414         while (isdigit (c))
 415           {
 416             obstack_1grow (&token_obstack, c);
 417             numval = numval * 10 + c - '0';
 418             c = getc (finput);
 419           }
 420         obstack_1grow (&token_obstack, '\0');
 421         token_buffer = obstack_finish (&token_obstack);
 422         ungetc (c, finput);
 423         return tok_number;
 424       }
 425
 426     case '\'':
 427       /* parse the literal token and compute character code in  code  */
 428
 429       {
 430         int code;
 431
 432         obstack_1grow (&token_obstack, '\'');
 433         literalchar (&token_obstack, &code, '\'');
 434
 435         c = getc (finput);
 436         if (c != '\'')
 437           {
 438             int discode;
 439             complain (_("use \"...\" for multi-character literal tokens"));
 440             while (1)
 441               if (!literalchar (0, &discode, '\''))
 442                 break;
 443           }
 444         obstack_1grow (&token_obstack, '\'');
 445         obstack_1grow (&token_obstack, '\0');
 446         token_buffer = obstack_finish (&token_obstack);
 447         symval = getsym (token_buffer);
 448         symval->class = token_sym;
 449         if (symval->user_token_number == SUNDEF)
 450           symval->user_token_number = code;
 451         return tok_identifier;
 452       }
 453
 454     case '\"':
 455       /* parse the literal string token and treat as an identifier */
 456
 457       {
 458         int code;               /* ignored here */
 459
 460         obstack_1grow (&token_obstack, '\"');
 461         /* Read up to and including ".  */
 462         while (literalchar (&token_obstack, &code, '\"'))
 463           /* nothing */;
 464         obstack_1grow (&token_obstack, '\0');
 465         token_buffer = obstack_finish (&token_obstack);
 466
 467         symval = getsym (token_buffer);
 468         symval->class = token_sym;
 469
 470         return tok_identifier;
 471       }
 472
 473     case ',':
 474       token_buffer = ",";
 475       return tok_comma;
 476
 477     case ':':
 478       token_buffer = ":";
 479       return tok_colon;
 480
 481     case ';':
 482       token_buffer = ";";
 483       return tok_semicolon;
 484
 485     case '|':
 486       token_buffer = "|";
 487       return tok_bar;
 488
 489     case '{':
 490       token_buffer = "{";
 491       return tok_left_curly;
 492
 493     case '=':
 494       obstack_1grow (&token_obstack, c);
 495       do
 496         {
 497           c = getc (finput);
 498           obstack_1grow (&token_obstack, c);
 499           if (c == '\n')
 500             lineno++;
 501         }
 502       while (c == ' ' || c == '\n' || c == '\t');
 503       obstack_1grow (&token_obstack, '\0');
 504       token_buffer = obstack_finish (&token_obstack);
 505
 506       if (c == '{')
 507         {
 508           return tok_left_curly;
 509         }
 510       else
 511         {
 512           ungetc (c, finput);
 513           return tok_illegal;
 514         }
 515
 516     case '<':
 517       read_type_name (finput);
 518       return tok_typename;
 519
 520     case '%':
 521       return parse_percent_token ();
 522
 523     default:
 524       obstack_1grow (&token_obstack, c);
 525       obstack_1grow (&token_obstack, '\0');
 526       token_buffer = obstack_finish (&token_obstack);
 527       return tok_illegal;
 528     }
 529 }
 530
 531 /* This function is a strcmp, which doesn't differentiate `-' and `_'
 532    chars.  */
 533
 534 static int
 535 option_strcmp (const char *left, const char *right)
 536 {
 537   const unsigned char *l, *r;
 538   int c;
 539
 540   assert (left);
 541   assert (right);
 542   l = (const unsigned char *)left;
 543   r = (const unsigned char *)right;
 544   while (((c = *l - *r++) == 0 && *l != '\0')
 545          || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-')))
 546     l++;
 547   return c;
 548 }
 549
 550 /* Parse a token which starts with %.
 551    Assumes the % has already been read and discarded.  */
 552
 553 token_t
 554 parse_percent_token (void)
 555 {
 556   const struct option_table_struct *tx = NULL;
 557   const char *arg = NULL;
 558   /* Where the ARG was found in token_buffer. */
 559   size_t arg_offset = 0;
 560
 561   int c = getc (finput);
 562
 563   switch (c)
 564     {
 565     case '%':
 566       return tok_two_percents;
 567
 568     case '{':
 569       return tok_percent_left_curly;
 570
 571       /* FIXME: Who the heck are those 5 guys!?! `%<' = `%left'!!!
 572          Let's ask for there removal.  */
 573     case '<':
 574       return tok_left;
 575
 576     case '>':
 577       return tok_right;
 578
 579     case '2':
 580       return tok_nonassoc;
 581
 582     case '0':
 583       return tok_token;
 584
 585     case '=':
 586       return tok_prec;
 587     }
 588
 589   if (!isalpha (c))
 590     return tok_illegal;
 591
 592   obstack_1grow (&token_obstack, '%');
 593   while (isalpha (c) || c == '_' || c == '-')
 594     {
 595       if (c == '_')
 596         c = '-';
 597       obstack_1grow (&token_obstack, c);
 598       c = getc (finput);
 599     }
 600
 601   /* %DIRECTIVE="ARG".  Separate into
 602      TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'.
 603      This is a bit hackish, but once we move to a Bison parser,
 604      things will be cleaned up.  */
 605   if (c == '=')
 606     {
 607       /* End of the directive.  We skip the `='. */
 608       obstack_1grow (&token_obstack, '\0');
 609       /* Fetch the ARG if present. */
 610       c = getc (finput);
 611       if (c == '"')
 612         {
 613           int code;
 614           arg_offset = obstack_object_size (&token_obstack);
 615           /* Read up to and including `"'.  Do not append the closing
 616              `"' in the output: it's not part of the ARG.  */
 617           while (literalchar (NULL, &code, '"'))
 618             obstack_1grow (&token_obstack, code);
 619         }
 620       /* else: should be an error. */
 621     }
 622   else
 623     ungetc (c, finput);
 624
 625   obstack_1grow (&token_obstack, '\0');
 626   token_buffer = obstack_finish (&token_obstack);
 627   if (arg_offset)
 628     arg = token_buffer + arg_offset;
 629
 630   /* table lookup % directive */
 631   for (tx = option_table; tx->name; tx++)
 632     if ((tx->access == opt_percent || tx->access == opt_both)
 633         && option_strcmp (token_buffer + 1, tx->name) == 0)
 634       break;
 635
 636   if (arg && tx->ret_val != tok_stropt)
 637     fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg));
 638
 639
 640   switch (tx->ret_val)
 641     {
 642     case tok_stropt:
 643       assert (tx->set_flag);
 644       if (arg)
 645         {
 646           /* Keep only the first assignment: command line options have
 647              already been processed, and we want them to have
 648              precedence.  Side effect: if this %-option is used
 649              several times, only the first is honored.  Bah.  */
 650           if (!*((char **) (tx->set_flag)))
 651             *((char **) (tx->set_flag)) = xstrdup (arg);
 652         }
 653       else
 654         fatal (_("`%s' requires an argument"), token_buffer);
 655       return tok_noop;
 656       break;
 657
 658     case tok_intopt:
 659       assert (tx->set_flag);
 660       *((int *) (tx->set_flag)) = 1;
 661       return tok_noop;
 662       break;
 663
 664     case tok_obsolete:
 665       fatal (_("`%s' is no longer supported"), token_buffer);
 666       return tok_noop;
 667       break;
 668
 669     default:
 670       return tx->ret_val;
 671       break;
 672     }
 673   abort ();
 674 }