src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "symtab.h"
  25 #include "options.h"
  26 #include "lex.h"
  27 #include "complain.h"
  28 #include "gram.h"
  29 #include "quote.h"
  30
/* Buffer for storing the current token.  Token text is accumulated
   here character by character and then finished into a string.  */
static struct obstack token_obstack;
/* Text of the token most recently read.  Reset to NULL on entry to
   lex, then set either to a string literal (punctuation, "EOF") or
   to a string finished in TOKEN_OBSTACK.  */
const char *token_buffer = NULL;

/* Symbol looked up by lex for the last identifier, character literal
   or string literal token.  */
symbol_t *symval = NULL;
/* Decimal value accumulated by lex for the last number token.  */
int numval;

/* A token to be reread, see unlex and lex.  UNLEXED is tok_undef when
   nothing is pending; the two variables below hold the SYMVAL and
   TOKEN_BUFFER that were current when unlex was called.  */
static token_t unlexed = tok_undef;
static symbol_t *unlexed_symval = NULL;
static const char *unlexed_token_buffer = NULL;
  42
  43 void
  44 lex_init (void)
  45 {
  46   obstack_init (&token_obstack);
  47   unlexed = tok_undef;
  48 }
  49
  50
  51 void
  52 lex_free (void)
  53 {
  54   obstack_free (&token_obstack, NULL);
  55 }
  56
  57
  58 int
  59 skip_white_space (void)
  60 {
  61   int c;
  62   int inside;
  63
  64   c = getc (finput);
  65
  66   for (;;)
  67     {
  68       int cplus_comment;
  69
  70       switch (c)
  71         {
  72         case '/':
  73           /* FIXME: Should probably be merged with copy_comment.  */
  74           c = getc (finput);
  75           if (c != '*' && c != '/')
  76             {
  77               complain (_("unexpected `/' found and ignored"));
  78               break;
  79             }
  80           cplus_comment = (c == '/');
  81
  82           c = getc (finput);
  83
  84           inside = 1;
  85           while (inside)
  86             {
  87               if (!cplus_comment && c == '*')
  88                 {
  89                   while (c == '*')
  90                     c = getc (finput);
  91
  92                   if (c == '/')
  93                     {
  94                       inside = 0;
  95                       c = getc (finput);
  96                     }
  97                 }
  98               else if (c == '\n')
  99                 {
 100                   lineno++;
 101                   if (cplus_comment)
 102                     inside = 0;
 103                   c = getc (finput);
 104                 }
 105               else if (c == EOF)
 106                 fatal (_("unterminated comment"));
 107               else
 108                 c = getc (finput);
 109             }
 110
 111           break;
 112
 113         case '\n':
 114           lineno++;
 115
 116         case ' ':
 117         case '\t':
 118         case '\f':
 119           c = getc (finput);
 120           break;
 121
 122         default:
 123           return c;
 124         }
 125     }
 126 }
 127
 128
 129 /*-----------------------------------------------------.
 130 | Do a getc, but give error message if EOF encountered |
 131 `-----------------------------------------------------*/
 132
 133 int
 134 xgetc (FILE *f)
 135 {
 136   int c = getc (f);
 137   if (c == EOF)
 138     fatal (_("unexpected end of file"));
 139   return c;
 140 }
 141
 142
 143 /*---------------------------------------------------------------.
 144 | Read one literal character from FINPUT, process \-escapes, and |
 145 | return the character.                                          |
 146 `---------------------------------------------------------------*/
 147
 148 char
 149 literalchar (void)
 150 {
 151   int c;
 152   int res;
 153
 154   c = xgetc (finput);
 155   if (c == '\n')
 156     {
 157       complain (_("unescaped newline in constant"));
 158       ungetc (c, finput);
 159       res = '?';
 160     }
 161   else if (c != '\\')
 162     {
 163       res = c;
 164     }
 165   else
 166     {
 167       c = xgetc (finput);
 168       if (c == 't')
 169         res = '\t';
 170       else if (c == 'n')
 171         res = '\n';
 172       else if (c == 'a')
 173         res = '\007';
 174       else if (c == 'r')
 175         res = '\r';
 176       else if (c == 'f')
 177         res = '\f';
 178       else if (c == 'b')
 179         res = '\b';
 180       else if (c == 'v')
 181         res = '\013';
 182       else if (c == '\\')
 183         res = '\\';
 184       else if (c == '\'')
 185         res = '\'';
 186       else if (c == '\"')
 187         res = '\"';
 188       else if (c <= '7' && c >= '0')
 189         {
 190           res = 0;
 191           while (c <= '7' && c >= '0')
 192             {
 193               res = (res * 8) + (c - '0');
 194               if (res >= 256 || res < 0)
 195                 {
 196                   complain (_("octal value outside range 0...255: `\\%o'"),
 197                             res);
 198                   res &= 0xFF;
 199                   break;
 200                 }
 201               c = xgetc (finput);
 202             }
 203           ungetc (c, finput);
 204         }
 205       else if (c == 'x')
 206         {
 207           c = xgetc (finput);
 208           res = 0;
 209           while (1)
 210             {
 211               if (c >= '0' && c <= '9')
 212                 res *= 16, res += c - '0';
 213               else if (c >= 'a' && c <= 'f')
 214                 res *= 16, res += c - 'a' + 10;
 215               else if (c >= 'A' && c <= 'F')
 216                 res *= 16, res += c - 'A' + 10;
 217               else
 218                 break;
 219               if (res >= 256 || res < 0)
 220                 {
 221                   complain (_("hexadecimal value above 255: `\\x%x'"), res);
 222                   res &= 0xFF;
 223                   break;
 224                 }
 225               c = xgetc (finput);
 226             }
 227           ungetc (c, finput);
 228         }
 229       else
 230         {
 231           char badchar [] = "c";
 232           badchar[0] = c;
 233           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 234                     quote (badchar));
 235           res = '?';
 236         }
 237     }                           /* has \ */
 238
 239   return res;
 240 }
 241
 242
 243 void
 244 unlex (token_t token)
 245 {
 246   unlexed = token;
 247   unlexed_token_buffer = token_buffer;
 248   unlexed_symval = symval;
 249 }
 250
 251 /*-----------------------------------------------------------------.
 252 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 253 | specified between the `<...>'.                                   |
 254 `-----------------------------------------------------------------*/
 255
 256 void
 257 read_type_name (FILE *fin)
 258 {
 259   int c = getc (fin);
 260
 261   while (c != '>')
 262     {
 263       if (c == EOF)
 264         fatal (_("unterminated type name at end of file"));
 265       if (c == '\n')
 266         {
 267           complain (_("unterminated type name"));
 268           ungetc (c, fin);
 269           break;
 270         }
 271
 272       obstack_1grow (&token_obstack, c);
 273       c = getc (fin);
 274     }
 275   obstack_1grow (&token_obstack, '\0');
 276   token_buffer = obstack_finish (&token_obstack);
 277 }
 278
 279
 280 token_t
 281 lex (void)
 282 {
 283   int c;
 284
 285   /* Just to make sure. */
 286   token_buffer = NULL;
 287
 288   if (unlexed != tok_undef)
 289     {
 290       token_t res = unlexed;
 291       symval = unlexed_symval;
 292       token_buffer = unlexed_token_buffer;
 293       unlexed = tok_undef;
 294       return res;
 295     }
 296
 297   c = skip_white_space ();
 298
 299   switch (c)
 300     {
 301     case EOF:
 302       token_buffer = "EOF";
 303       return tok_eof;
 304
 305     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 306     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 307     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 308     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 309     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 310     case 'Z':
 311     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 312     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 313     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 314     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 315     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 316     case 'z':
 317     case '.':    case '_':
 318
 319       while (isalnum (c) || c == '_' || c == '.')
 320         {
 321           obstack_1grow (&token_obstack, c);
 322           c = getc (finput);
 323         }
 324       obstack_1grow (&token_obstack, '\0');
 325       token_buffer = obstack_finish (&token_obstack);
 326       ungetc (c, finput);
 327       symval = getsym (token_buffer);
 328       return tok_identifier;
 329
 330     case '0':    case '1':    case '2':    case '3':    case '4':
 331     case '5':    case '6':    case '7':    case '8':    case '9':
 332       {
 333         numval = 0;
 334
 335         while (isdigit (c))
 336           {
 337             obstack_1grow (&token_obstack, c);
 338             numval = numval * 10 + c - '0';
 339             c = getc (finput);
 340           }
 341         obstack_1grow (&token_obstack, '\0');
 342         token_buffer = obstack_finish (&token_obstack);
 343         ungetc (c, finput);
 344         return tok_number;
 345       }
 346
 347     case '\'':
 348       /* parse the literal token and compute character code in  code  */
 349
 350       {
 351         int code = literalchar ();
 352
 353         obstack_1grow (&token_obstack, '\'');
 354         obstack_1grow (&token_obstack, code);
 355
 356         c = getc (finput);
 357         if (c != '\'')
 358           {
 359             complain (_("use \"...\" for multi-character literal tokens"));
 360             while (literalchar () != '\'')
 361               /* Skip. */;
 362           }
 363         obstack_1grow (&token_obstack, '\'');
 364         obstack_1grow (&token_obstack, '\0');
 365         token_buffer = obstack_finish (&token_obstack);
 366         symval = getsym (token_buffer);
 367         if (symval->number == NUMBER_UNDEFINED)
 368           {
 369             symval->number = ntokens++;
 370             symval->class = token_sym;
 371             if (symval->user_token_number == USER_NUMBER_UNDEFINED)
 372               symval->user_token_number = code;
 373           }
 374         return tok_identifier;
 375       }
 376
 377     case '\"':
 378       /* parse the literal string token and treat as an identifier */
 379
 380       {
 381         int code;               /* ignored here */
 382
 383         obstack_1grow (&token_obstack, '\"');
 384         /* Read up to and including ".  */
 385         do
 386           {
 387             code = literalchar ();
 388             obstack_1grow (&token_obstack, code);
 389           }
 390         while (code != '\"');
 391         obstack_1grow (&token_obstack, '\0');
 392         token_buffer = obstack_finish (&token_obstack);
 393
 394         symval = getsym (token_buffer);
 395         if (symval->number == NUMBER_UNDEFINED)
 396           {
 397             symval->number = ntokens++;
 398             symval->class = token_sym;
 399           }
 400
 401         return tok_identifier;
 402       }
 403
 404     case ',':
 405       token_buffer = ",";
 406       return tok_comma;
 407
 408     case ':':
 409       token_buffer = ":";
 410       return tok_colon;
 411
 412     case ';':
 413       token_buffer = ";";
 414       return tok_semicolon;
 415
 416     case '|':
 417       token_buffer = "|";
 418       return tok_bar;
 419
 420     case '{':
 421       token_buffer = "{";
 422       return tok_left_curly;
 423
 424     case '=':
 425       obstack_1grow (&token_obstack, c);
 426       do
 427         {
 428           c = getc (finput);
 429           obstack_1grow (&token_obstack, c);
 430           if (c == '\n')
 431             lineno++;
 432         }
 433       while (c == ' ' || c == '\n' || c == '\t');
 434       obstack_1grow (&token_obstack, '\0');
 435       token_buffer = obstack_finish (&token_obstack);
 436
 437       if (c == '{')
 438         {
 439           return tok_left_curly;
 440         }
 441       else
 442         {
 443           ungetc (c, finput);
 444           return tok_illegal;
 445         }
 446
 447     case '<':
 448       read_type_name (finput);
 449       return tok_typename;
 450
 451     case '%':
 452       return parse_percent_token ();
 453
 454     default:
 455       obstack_1grow (&token_obstack, c);
 456       obstack_1grow (&token_obstack, '\0');
 457       token_buffer = obstack_finish (&token_obstack);
 458       return tok_illegal;
 459     }
 460 }
 461
 462 /* This function is a strcmp, which doesn't differentiate `-' and `_'
 463    chars.  */
 464
 465 static int
 466 option_strcmp (const char *left, const char *right)
 467 {
 468   const unsigned char *l, *r;
 469   int c;
 470
 471   assert (left);
 472   assert (right);
 473   l = (const unsigned char *)left;
 474   r = (const unsigned char *)right;
 475   while (((c = *l - *r++) == 0 && *l != '\0')
 476          || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-')))
 477     l++;
 478   return c;
 479 }
 480
 481 /* Parse a token which starts with %.
 482    Assumes the % has already been read and discarded.  */
 483
 484 token_t
 485 parse_percent_token (void)
 486 {
 487   const struct option_table_struct *tx = NULL;
 488   const char *arg = NULL;
 489   /* Where the ARG was found in token_buffer. */
 490   size_t arg_offset = 0;
 491
 492   int c = getc (finput);
 493   obstack_1grow (&token_obstack, '%');
 494   obstack_1grow (&token_obstack, c);
 495
 496   if (!isalpha (c))
 497     {
 498       obstack_1grow (&token_obstack, '\0');
 499       token_buffer = obstack_finish (&token_obstack);
 500
 501       switch (c)
 502         {
 503         case '%':
 504           return tok_two_percents;
 505
 506         case '{':
 507           return tok_percent_left_curly;
 508
 509           /* The following guys are here for backward compatibility with
 510              very ancient Yacc versions.  The paper of Johnson mentions
 511              them (as ancient :).  */
 512         case '<':
 513           return tok_left;
 514
 515         case '>':
 516           return tok_right;
 517
 518         case '2':
 519           return tok_nonassoc;
 520
 521         case '0':
 522           return tok_token;
 523
 524         case '=':
 525           return tok_prec;
 526
 527         default:
 528           return tok_illegal;
 529         }
 530     }
 531
 532   while (c = getc (finput), isalpha (c) || c == '_' || c == '-')
 533     {
 534       if (c == '_')
 535         c = '-';
 536       obstack_1grow (&token_obstack, c);
 537     }
 538
 539   /* %DIRECTIVE="ARG".  Separate into
 540      TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'.
 541      This is a bit hackish, but once we move to a Bison parser,
 542      things will be cleaned up.  */
 543   if (c == '=')
 544     {
 545       /* End of the directive.  We skip the `='. */
 546       obstack_1grow (&token_obstack, '\0');
 547       /* Fetch the ARG if present. */
 548       c = getc (finput);
 549       if (c == '"')
 550         {
 551           int code;
 552           arg_offset = obstack_object_size (&token_obstack);
 553           /* Read up to and including `"'.  Do not append the closing
 554              `"' in the output: it's not part of the ARG.  */
 555           while ((code = literalchar ()) != '"')
 556             obstack_1grow (&token_obstack, code);
 557         }
 558       /* else: should be an error. */
 559     }
 560   else
 561     ungetc (c, finput);
 562
 563   obstack_1grow (&token_obstack, '\0');
 564   token_buffer = obstack_finish (&token_obstack);
 565   if (arg_offset)
 566     arg = token_buffer + arg_offset;
 567
 568   /* table lookup % directive */
 569   for (tx = option_table; tx->name; tx++)
 570     if ((tx->access == opt_percent || tx->access == opt_both)
 571         && option_strcmp (token_buffer + 1, tx->name) == 0)
 572       break;
 573
 574   if (arg && tx->ret_val != tok_stropt)
 575     fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg));
 576
 577
 578   switch (tx->ret_val)
 579     {
 580     case tok_stropt:
 581       assert (tx->set_flag);
 582       if (arg)
 583         {
 584           /* Keep only the first assignment: command line options have
 585              already been processed, and we want them to have
 586              precedence.  Side effect: if this %-option is used
 587              several times, only the first is honored.  Bah.  */
 588           if (!*((char **) (tx->set_flag)))
 589             *((char **) (tx->set_flag)) = xstrdup (arg);
 590         }
 591       else
 592         fatal (_("`%s' requires an argument"), token_buffer);
 593       return tok_noop;
 594       break;
 595
 596     case tok_intopt:
 597       assert (tx->set_flag);
 598       *((int *) (tx->set_flag)) = 1;
 599       return tok_noop;
 600       break;
 601
 602     case tok_obsolete:
 603       fatal (_("`%s' is no longer supported"), token_buffer);
 604       return tok_noop;
 605       break;
 606
 607     default:
 608       return tx->ret_val;
 609       break;
 610     }
 611   abort ();
 612 }