src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "symtab.h"
  25 #include "options.h"
  26 #include "lex.h"
  27 #include "complain.h"
  28 #include "gram.h"
  29 #include "quote.h"
  30
/* Buffer for storing the current token.  */
static struct obstack token_obstack;
/* Text of the current token.  lex resets it to NULL on entry, then
   points it either at a string literal or at an object finished on
   TOKEN_OBSTACK.  */
const char *token_buffer = NULL;

/* Symbol (result of getsym) for the last identifier or literal token
   read by lex.  */
bucket *symval = NULL;
/* Decimal value of the last number token read by lex.  */
int numval;

/* A token to be reread, see unlex and lex. */
static token_t unlexed = tok_undef;
/* Values of SYMVAL and TOKEN_BUFFER saved by unlex; lex restores them
   together with UNLEXED.  */
static bucket *unlexed_symval = NULL;
static const char *unlexed_token_buffer = NULL;
  42
  43 void
  44 lex_init (void)
  45 {
  46   obstack_init (&token_obstack);
  47   unlexed = tok_undef;
  48 }
  49
  50
  51 void
  52 lex_free (void)
  53 {
  54   obstack_free (&token_obstack, NULL);
  55 }
  56
  57
  58 int
  59 skip_white_space (void)
  60 {
  61   int c;
  62   int inside;
  63
  64   c = getc (finput);
  65
  66   for (;;)
  67     {
  68       int cplus_comment;
  69
  70       switch (c)
  71         {
  72         case '/':
  73           /* FIXME: Should probably be merged with copy_comment.  */
  74           c = getc (finput);
  75           if (c != '*' && c != '/')
  76             {
  77               complain (_("unexpected `/' found and ignored"));
  78               break;
  79             }
  80           cplus_comment = (c == '/');
  81
  82           c = getc (finput);
  83
  84           inside = 1;
  85           while (inside)
  86             {
  87               if (!cplus_comment && c == '*')
  88                 {
  89                   while (c == '*')
  90                     c = getc (finput);
  91
  92                   if (c == '/')
  93                     {
  94                       inside = 0;
  95                       c = getc (finput);
  96                     }
  97                 }
  98               else if (c == '\n')
  99                 {
 100                   lineno++;
 101                   if (cplus_comment)
 102                     inside = 0;
 103                   c = getc (finput);
 104                 }
 105               else if (c == EOF)
 106                 fatal (_("unterminated comment"));
 107               else
 108                 c = getc (finput);
 109             }
 110
 111           break;
 112
 113         case '\n':
 114           lineno++;
 115
 116         case ' ':
 117         case '\t':
 118         case '\f':
 119           c = getc (finput);
 120           break;
 121
 122         default:
 123           return c;
 124         }
 125     }
 126 }
 127
 128
 129 /*-----------------------------------------------------.
 130 | Do a getc, but give error message if EOF encountered |
 131 `-----------------------------------------------------*/
 132
 133 int
 134 xgetc (FILE *f)
 135 {
 136   int c = getc (f);
 137   if (c == EOF)
 138     fatal (_("unexpected end of file"));
 139   return c;
 140 }
 141
 142
 143 /*---------------------------------------------------------------.
 144 | Read one literal character from FINPUT, process \-escapes, and |
 145 | return the character.                                          |
 146 `---------------------------------------------------------------*/
 147
 148 char
 149 literalchar (void)
 150 {
 151   int c;
 152   int res;
 153
 154   c = xgetc (finput);
 155   if (c == '\n')
 156     {
 157       complain (_("unescaped newline in constant"));
 158       ungetc (c, finput);
 159       res = '?';
 160     }
 161   else if (c != '\\')
 162     {
 163       res = c;
 164     }
 165   else
 166     {
 167       c = xgetc (finput);
 168       if (c == 't')
 169         res = '\t';
 170       else if (c == 'n')
 171         res = '\n';
 172       else if (c == 'a')
 173         res = '\007';
 174       else if (c == 'r')
 175         res = '\r';
 176       else if (c == 'f')
 177         res = '\f';
 178       else if (c == 'b')
 179         res = '\b';
 180       else if (c == 'v')
 181         res = '\013';
 182       else if (c == '\\')
 183         res = '\\';
 184       else if (c == '\'')
 185         res = '\'';
 186       else if (c == '\"')
 187         res = '\"';
 188       else if (c <= '7' && c >= '0')
 189         {
 190           res = 0;
 191           while (c <= '7' && c >= '0')
 192             {
 193               res = (res * 8) + (c - '0');
 194               if (res >= 256 || res < 0)
 195                 {
 196                   complain (_("octal value outside range 0...255: `\\%o'"),
 197                             res);
 198                   res &= 0xFF;
 199                   break;
 200                 }
 201               c = xgetc (finput);
 202             }
 203           ungetc (c, finput);
 204         }
 205       else if (c == 'x')
 206         {
 207           c = xgetc (finput);
 208           res = 0;
 209           while (1)
 210             {
 211               if (c >= '0' && c <= '9')
 212                 res *= 16, res += c - '0';
 213               else if (c >= 'a' && c <= 'f')
 214                 res *= 16, res += c - 'a' + 10;
 215               else if (c >= 'A' && c <= 'F')
 216                 res *= 16, res += c - 'A' + 10;
 217               else
 218                 break;
 219               if (res >= 256 || res < 0)
 220                 {
 221                   complain (_("hexadecimal value above 255: `\\x%x'"), res);
 222                   res &= 0xFF;
 223                   break;
 224                 }
 225               c = xgetc (finput);
 226             }
 227           ungetc (c, finput);
 228         }
 229       else
 230         {
 231           char badchar [] = "c";
 232           badchar[0] = c;
 233           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 234                     quote (badchar));
 235           res = '?';
 236         }
 237     }                           /* has \ */
 238
 239   return res;
 240 }
 241
 242
 243 void
 244 unlex (token_t token)
 245 {
 246   unlexed = token;
 247   unlexed_token_buffer = token_buffer;
 248   unlexed_symval = symval;
 249 }
 250
 251 /*-----------------------------------------------------------------.
 252 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 253 | specified between the `<...>'.                                   |
 254 `-----------------------------------------------------------------*/
 255
 256 void
 257 read_type_name (FILE *fin)
 258 {
 259   int c = getc (fin);
 260
 261   while (c != '>')
 262     {
 263       if (c == EOF)
 264         fatal (_("unterminated type name at end of file"));
 265       if (c == '\n')
 266         {
 267           complain (_("unterminated type name"));
 268           ungetc (c, fin);
 269           break;
 270         }
 271
 272       obstack_1grow (&token_obstack, c);
 273       c = getc (fin);
 274     }
 275   obstack_1grow (&token_obstack, '\0');
 276   token_buffer = obstack_finish (&token_obstack);
 277 }
 278
 279
 280 token_t
 281 lex (void)
 282 {
 283   int c;
 284
 285   /* Just to make sure. */
 286   token_buffer = NULL;
 287
 288   if (unlexed != tok_undef)
 289     {
 290       token_t res = unlexed;
 291       symval = unlexed_symval;
 292       token_buffer = unlexed_token_buffer;
 293       unlexed = tok_undef;
 294       return res;
 295     }
 296
 297   c = skip_white_space ();
 298
 299   switch (c)
 300     {
 301     case EOF:
 302       token_buffer = "EOF";
 303       return tok_eof;
 304
 305     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 306     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 307     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 308     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 309     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 310     case 'Z':
 311     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 312     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 313     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 314     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 315     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 316     case 'z':
 317     case '.':    case '_':
 318
 319       while (isalnum (c) || c == '_' || c == '.')
 320         {
 321           obstack_1grow (&token_obstack, c);
 322           c = getc (finput);
 323         }
 324       obstack_1grow (&token_obstack, '\0');
 325       token_buffer = obstack_finish (&token_obstack);
 326       ungetc (c, finput);
 327       symval = getsym (token_buffer);
 328       return tok_identifier;
 329
 330     case '0':    case '1':    case '2':    case '3':    case '4':
 331     case '5':    case '6':    case '7':    case '8':    case '9':
 332       {
 333         numval = 0;
 334
 335         while (isdigit (c))
 336           {
 337             obstack_1grow (&token_obstack, c);
 338             numval = numval * 10 + c - '0';
 339             c = getc (finput);
 340           }
 341         obstack_1grow (&token_obstack, '\0');
 342         token_buffer = obstack_finish (&token_obstack);
 343         ungetc (c, finput);
 344         return tok_number;
 345       }
 346
 347     case '\'':
 348       /* parse the literal token and compute character code in  code  */
 349
 350       {
 351         int code = literalchar ();
 352
 353         obstack_1grow (&token_obstack, '\'');
 354         obstack_1grow (&token_obstack, code);
 355
 356         c = getc (finput);
 357         if (c != '\'')
 358           {
 359             complain (_("use \"...\" for multi-character literal tokens"));
 360             while (literalchar () != '\'')
 361               /* Skip. */;
 362           }
 363         obstack_1grow (&token_obstack, '\'');
 364         obstack_1grow (&token_obstack, '\0');
 365         token_buffer = obstack_finish (&token_obstack);
 366         symval = getsym (token_buffer);
 367         symval->class = token_sym;
 368         if (symval->user_token_number == SUNDEF)
 369           symval->user_token_number = code;
 370         return tok_identifier;
 371       }
 372
 373     case '\"':
 374       /* parse the literal string token and treat as an identifier */
 375
 376       {
 377         int code;               /* ignored here */
 378
 379         obstack_1grow (&token_obstack, '\"');
 380         /* Read up to and including ".  */
 381         do
 382           {
 383             code = literalchar ();
 384             obstack_1grow (&token_obstack, code);
 385           }
 386         while (code != '\"');
 387         obstack_1grow (&token_obstack, '\0');
 388         token_buffer = obstack_finish (&token_obstack);
 389
 390         symval = getsym (token_buffer);
 391         symval->class = token_sym;
 392
 393         return tok_identifier;
 394       }
 395
 396     case ',':
 397       token_buffer = ",";
 398       return tok_comma;
 399
 400     case ':':
 401       token_buffer = ":";
 402       return tok_colon;
 403
 404     case ';':
 405       token_buffer = ";";
 406       return tok_semicolon;
 407
 408     case '|':
 409       token_buffer = "|";
 410       return tok_bar;
 411
 412     case '{':
 413       token_buffer = "{";
 414       return tok_left_curly;
 415
 416     case '=':
 417       obstack_1grow (&token_obstack, c);
 418       do
 419         {
 420           c = getc (finput);
 421           obstack_1grow (&token_obstack, c);
 422           if (c == '\n')
 423             lineno++;
 424         }
 425       while (c == ' ' || c == '\n' || c == '\t');
 426       obstack_1grow (&token_obstack, '\0');
 427       token_buffer = obstack_finish (&token_obstack);
 428
 429       if (c == '{')
 430         {
 431           return tok_left_curly;
 432         }
 433       else
 434         {
 435           ungetc (c, finput);
 436           return tok_illegal;
 437         }
 438
 439     case '<':
 440       read_type_name (finput);
 441       return tok_typename;
 442
 443     case '%':
 444       return parse_percent_token ();
 445
 446     default:
 447       obstack_1grow (&token_obstack, c);
 448       obstack_1grow (&token_obstack, '\0');
 449       token_buffer = obstack_finish (&token_obstack);
 450       return tok_illegal;
 451     }
 452 }
 453
 454 /* This function is a strcmp, which doesn't differentiate `-' and `_'
 455    chars.  */
 456
 457 static int
 458 option_strcmp (const char *left, const char *right)
 459 {
 460   const unsigned char *l, *r;
 461   int c;
 462
 463   assert (left);
 464   assert (right);
 465   l = (const unsigned char *)left;
 466   r = (const unsigned char *)right;
 467   while (((c = *l - *r++) == 0 && *l != '\0')
 468          || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-')))
 469     l++;
 470   return c;
 471 }
 472
 473 /* Parse a token which starts with %.
 474    Assumes the % has already been read and discarded.  */
 475
 476 token_t
 477 parse_percent_token (void)
 478 {
 479   const struct option_table_struct *tx = NULL;
 480   const char *arg = NULL;
 481   /* Where the ARG was found in token_buffer. */
 482   size_t arg_offset = 0;
 483
 484   int c = getc (finput);
 485   obstack_1grow (&token_obstack, '%');
 486   obstack_1grow (&token_obstack, c);
 487
 488   switch (c)
 489     {
 490     case '%':
 491       token_buffer = obstack_finish (&token_obstack);
 492       return tok_two_percents;
 493
 494     case '{':
 495       token_buffer = obstack_finish (&token_obstack);
 496       return tok_percent_left_curly;
 497
 498       /* The following guys are here for backward compatibility with
 499          very ancient Yacc versions.  The paper of Johnson mentions
 500          them (as ancient :).  */
 501     case '<':
 502       token_buffer = obstack_finish (&token_obstack);
 503       return tok_left;
 504
 505     case '>':
 506       token_buffer = obstack_finish (&token_obstack);
 507       return tok_right;
 508
 509     case '2':
 510       token_buffer = obstack_finish (&token_obstack);
 511       return tok_nonassoc;
 512
 513     case '0':
 514       token_buffer = obstack_finish (&token_obstack);
 515       return tok_token;
 516
 517     case '=':
 518       token_buffer = obstack_finish (&token_obstack);
 519       return tok_prec;
 520     }
 521
 522   if (!isalpha (c))
 523     {
 524       token_buffer = obstack_finish (&token_obstack);
 525       return tok_illegal;
 526     }
 527
 528   while (c = getc (finput), isalpha (c) || c == '_' || c == '-')
 529     {
 530       if (c == '_')
 531         c = '-';
 532       obstack_1grow (&token_obstack, c);
 533     }
 534
 535   /* %DIRECTIVE="ARG".  Separate into
 536      TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'.
 537      This is a bit hackish, but once we move to a Bison parser,
 538      things will be cleaned up.  */
 539   if (c == '=')
 540     {
 541       /* End of the directive.  We skip the `='. */
 542       obstack_1grow (&token_obstack, '\0');
 543       /* Fetch the ARG if present. */
 544       c = getc (finput);
 545       if (c == '"')
 546         {
 547           int code;
 548           arg_offset = obstack_object_size (&token_obstack);
 549           /* Read up to and including `"'.  Do not append the closing
 550              `"' in the output: it's not part of the ARG.  */
 551           while ((code = literalchar ()) != '"')
 552             obstack_1grow (&token_obstack, code);
 553         }
 554       /* else: should be an error. */
 555     }
 556   else
 557     ungetc (c, finput);
 558
 559   obstack_1grow (&token_obstack, '\0');
 560   token_buffer = obstack_finish (&token_obstack);
 561   if (arg_offset)
 562     arg = token_buffer + arg_offset;
 563
 564   /* table lookup % directive */
 565   for (tx = option_table; tx->name; tx++)
 566     if ((tx->access == opt_percent || tx->access == opt_both)
 567         && option_strcmp (token_buffer + 1, tx->name) == 0)
 568       break;
 569
 570   if (arg && tx->ret_val != tok_stropt)
 571     fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg));
 572
 573
 574   switch (tx->ret_val)
 575     {
 576     case tok_stropt:
 577       assert (tx->set_flag);
 578       if (arg)
 579         {
 580           /* Keep only the first assignment: command line options have
 581              already been processed, and we want them to have
 582              precedence.  Side effect: if this %-option is used
 583              several times, only the first is honored.  Bah.  */
 584           if (!*((char **) (tx->set_flag)))
 585             *((char **) (tx->set_flag)) = xstrdup (arg);
 586         }
 587       else
 588         fatal (_("`%s' requires an argument"), token_buffer);
 589       return tok_noop;
 590       break;
 591
 592     case tok_intopt:
 593       assert (tx->set_flag);
 594       *((int *) (tx->set_flag)) = 1;
 595       return tok_noop;
 596       break;
 597
 598     case tok_obsolete:
 599       fatal (_("`%s' is no longer supported"), token_buffer);
 600       return tok_noop;
 601       break;
 602
 603     default:
 604       return tx->ret_val;
 605       break;
 606     }
 607   abort ();
 608 }