src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000, 2001, 2002
   3    Free Software Foundation, Inc.
   4
   5    This file is part of Bison, the GNU Compiler Compiler.
   6
   7    Bison is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2, or (at your option)
  10    any later version.
  11
  12    Bison is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with Bison; see the file COPYING.  If not, write to
  19    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20    Boston, MA 02111-1307, USA.  */
  21
  22 #include "system.h"
  23 #include "getargs.h"
  24 #include "files.h"
  25 #include "symtab.h"
  26 #include "options.h"
  27 #include "lex.h"
  28 #include "complain.h"
  29 #include "gram.h"
  30 #include "quote.h"
  31
  32 /* Buffer for storing the current token.  */
  33 static struct obstack token_obstack;
  34 const char *token_buffer = NULL;
  35
  36 symbol_t *symval = NULL;
  37 int numval;
  38
  39 /* A token to be reread, see unlex and lex. */
  40 static token_t unlexed = tok_undef;
  41 static symbol_t *unlexed_symval = NULL;
  42 static const char *unlexed_token_buffer = NULL;
  43
  44 void
  45 lex_init (void)
  46 {
  47   obstack_init (&token_obstack);
  48   unlexed = tok_undef;
  49 }
  50
  51
  52 void
  53 lex_free (void)
  54 {
  55   obstack_free (&token_obstack, NULL);
  56 }
  57
  58
  59 int
  60 skip_white_space (void)
  61 {
  62   int c;
  63   int inside;
  64
  65   c = getc (finput);
  66
  67   for (;;)
  68     {
  69       int cplus_comment;
  70
  71       switch (c)
  72         {
  73         case '/':
  74           /* FIXME: Should probably be merged with copy_comment.  */
  75           c = getc (finput);
  76           if (c != '*' && c != '/')
  77             {
  78               complain (_("unexpected `/' found and ignored"));
  79               break;
  80             }
  81           cplus_comment = (c == '/');
  82
  83           c = getc (finput);
  84
  85           inside = 1;
  86           while (inside)
  87             {
  88               if (!cplus_comment && c == '*')
  89                 {
  90                   while (c == '*')
  91                     c = getc (finput);
  92
  93                   if (c == '/')
  94                     {
  95                       inside = 0;
  96                       c = getc (finput);
  97                     }
  98                 }
  99               else if (c == '\n')
 100                 {
 101                   lineno++;
 102                   if (cplus_comment)
 103                     inside = 0;
 104                   c = getc (finput);
 105                 }
 106               else if (c == EOF)
 107                 fatal (_("unterminated comment"));
 108               else
 109                 c = getc (finput);
 110             }
 111
 112           break;
 113
 114         case '\n':
 115           lineno++;
 116
 117         case ' ':
 118         case '\t':
 119         case '\f':
 120           c = getc (finput);
 121           break;
 122
 123         default:
 124           return c;
 125         }
 126     }
 127 }
 128
 129
 130 /*-----------------------------------------------------.
 131 | Do a getc, but give error message if EOF encountered |
 132 `-----------------------------------------------------*/
 133
 134 int
 135 xgetc (FILE *f)
 136 {
 137   int c = getc (f);
 138   if (c == EOF)
 139     fatal (_("unexpected end of file"));
 140   return c;
 141 }
 142
 143
 144 /*---------------------------------------------------------------.
 145 | Read one literal character from FINPUT, process \-escapes, and |
 146 | return the character.                                          |
 147 `---------------------------------------------------------------*/
 148
 149 char
 150 literalchar (void)
 151 {
 152   int c;
 153   int res;
 154
 155   c = xgetc (finput);
 156   if (c == '\n')
 157     {
 158       complain (_("unescaped newline in constant"));
 159       ungetc (c, finput);
 160       res = '?';
 161     }
 162   else if (c != '\\')
 163     {
 164       res = c;
 165     }
 166   else
 167     {
 168       c = xgetc (finput);
 169       if (c == 't')
 170         res = '\t';
 171       else if (c == 'n')
 172         res = '\n';
 173       else if (c == 'a')
 174         res = '\007';
 175       else if (c == 'r')
 176         res = '\r';
 177       else if (c == 'f')
 178         res = '\f';
 179       else if (c == 'b')
 180         res = '\b';
 181       else if (c == 'v')
 182         res = '\013';
 183       else if (c == '\\')
 184         res = '\\';
 185       else if (c == '\'')
 186         res = '\'';
 187       else if (c == '\"')
 188         res = '\"';
 189       else if (c <= '7' && c >= '0')
 190         {
 191           res = 0;
 192           while (c <= '7' && c >= '0')
 193             {
 194               res = (res * 8) + (c - '0');
 195               if (res >= 256 || res < 0)
 196                 {
 197                   complain (_("octal value outside range 0...255: `\\%o'"),
 198                             res);
 199                   res &= 0xFF;
 200                   break;
 201                 }
 202               c = xgetc (finput);
 203             }
 204           ungetc (c, finput);
 205         }
 206       else if (c == 'x')
 207         {
 208           c = xgetc (finput);
 209           res = 0;
 210           while (1)
 211             {
 212               if (c >= '0' && c <= '9')
 213                 res *= 16, res += c - '0';
 214               else if (c >= 'a' && c <= 'f')
 215                 res *= 16, res += c - 'a' + 10;
 216               else if (c >= 'A' && c <= 'F')
 217                 res *= 16, res += c - 'A' + 10;
 218               else
 219                 break;
 220               if (res >= 256 || res < 0)
 221                 {
 222                   complain (_("hexadecimal value above 255: `\\x%x'"), res);
 223                   res &= 0xFF;
 224                   break;
 225                 }
 226               c = xgetc (finput);
 227             }
 228           ungetc (c, finput);
 229         }
 230       else
 231         {
 232           char badchar [] = "c";
 233           badchar[0] = c;
 234           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 235                     quote (badchar));
 236           res = '?';
 237         }
 238     }                           /* has \ */
 239
 240   return res;
 241 }
 242
 243
 244 void
 245 unlex (token_t token)
 246 {
 247   unlexed = token;
 248   unlexed_token_buffer = token_buffer;
 249   unlexed_symval = symval;
 250 }
 251
 252 /*-----------------------------------------------------------------.
 253 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 254 | specified between the `<...>'.                                   |
 255 `-----------------------------------------------------------------*/
 256
 257 void
 258 read_type_name (FILE *fin)
 259 {
 260   int c = getc (fin);
 261
 262   while (c != '>')
 263     {
 264       if (c == EOF)
 265         fatal (_("unterminated type name at end of file"));
 266       if (c == '\n')
 267         {
 268           complain (_("unterminated type name"));
 269           ungetc (c, fin);
 270           break;
 271         }
 272
 273       obstack_1grow (&token_obstack, c);
 274       c = getc (fin);
 275     }
 276   obstack_1grow (&token_obstack, '\0');
 277   token_buffer = obstack_finish (&token_obstack);
 278 }
 279
 280
 281 token_t
 282 lex (void)
 283 {
 284   int c;
 285
 286   /* Just to make sure. */
 287   token_buffer = NULL;
 288
 289   if (unlexed != tok_undef)
 290     {
 291       token_t res = unlexed;
 292       symval = unlexed_symval;
 293       token_buffer = unlexed_token_buffer;
 294       unlexed = tok_undef;
 295       return res;
 296     }
 297
 298   c = skip_white_space ();
 299
 300   switch (c)
 301     {
 302     case EOF:
 303       token_buffer = "EOF";
 304       return tok_eof;
 305
 306     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 307     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 308     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 309     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 310     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 311     case 'Z':
 312     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 313     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 314     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 315     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 316     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 317     case 'z':
 318     case '.':    case '_':
 319
 320       while (isalnum (c) || c == '_' || c == '.')
 321         {
 322           obstack_1grow (&token_obstack, c);
 323           c = getc (finput);
 324         }
 325       obstack_1grow (&token_obstack, '\0');
 326       token_buffer = obstack_finish (&token_obstack);
 327       ungetc (c, finput);
 328       symval = getsym (token_buffer);
 329       return tok_identifier;
 330
 331     case '0':    case '1':    case '2':    case '3':    case '4':
 332     case '5':    case '6':    case '7':    case '8':    case '9':
 333       {
 334         numval = 0;
 335
 336         while (isdigit (c))
 337           {
 338             obstack_1grow (&token_obstack, c);
 339             numval = numval * 10 + c - '0';
 340             c = getc (finput);
 341           }
 342         obstack_1grow (&token_obstack, '\0');
 343         token_buffer = obstack_finish (&token_obstack);
 344         ungetc (c, finput);
 345         return tok_number;
 346       }
 347
 348     case '\'':
 349       /* parse the literal token and compute character code in  code  */
 350
 351       {
 352         int code = literalchar ();
 353
 354         obstack_1grow (&token_obstack, '\'');
 355         obstack_1grow (&token_obstack, code);
 356
 357         c = getc (finput);
 358         if (c != '\'')
 359           {
 360             complain (_("use \"...\" for multi-character literal tokens"));
 361             while (literalchar () != '\'')
 362               /* Skip. */;
 363           }
 364         obstack_1grow (&token_obstack, '\'');
 365         obstack_1grow (&token_obstack, '\0');
 366         token_buffer = obstack_finish (&token_obstack);
 367         symval = getsym (token_buffer);
 368         symbol_class_set (symval, token_sym);
 369         symbol_user_token_number_set (symval, code);
 370         return tok_identifier;
 371       }
 372
 373     case '\"':
 374       /* parse the literal string token and treat as an identifier */
 375
 376       {
 377         int code;
 378
 379         obstack_1grow (&token_obstack, '\"');
 380         /* Read up to and including ".  */
 381         do
 382           {
 383             code = literalchar ();
 384             obstack_1grow (&token_obstack, code);
 385           }
 386         while (code != '\"');
 387         obstack_1grow (&token_obstack, '\0');
 388         token_buffer = obstack_finish (&token_obstack);
 389
 390         symval = getsym (token_buffer);
 391         symbol_class_set (symval, token_sym);
 392         return tok_identifier;
 393       }
 394
 395     case ',':
 396       token_buffer = ",";
 397       return tok_comma;
 398
 399     case ':':
 400       token_buffer = ":";
 401       return tok_colon;
 402
 403     case ';':
 404       token_buffer = ";";
 405       return tok_semicolon;
 406
 407     case '|':
 408       token_buffer = "|";
 409       return tok_bar;
 410
 411     case '{':
 412       token_buffer = "{";
 413       return tok_left_curly;
 414
 415     case '=':
 416       obstack_1grow (&token_obstack, c);
 417       do
 418         {
 419           c = getc (finput);
 420           obstack_1grow (&token_obstack, c);
 421           if (c == '\n')
 422             lineno++;
 423         }
 424       while (c == ' ' || c == '\n' || c == '\t');
 425       obstack_1grow (&token_obstack, '\0');
 426       token_buffer = obstack_finish (&token_obstack);
 427
 428       if (c == '{')
 429         {
 430           return tok_left_curly;
 431         }
 432       else
 433         {
 434           ungetc (c, finput);
 435           return tok_illegal;
 436         }
 437
 438     case '<':
 439       read_type_name (finput);
 440       return tok_typename;
 441
 442     case '%':
 443       return parse_percent_token ();
 444
 445     default:
 446       obstack_1grow (&token_obstack, c);
 447       obstack_1grow (&token_obstack, '\0');
 448       token_buffer = obstack_finish (&token_obstack);
 449       return tok_illegal;
 450     }
 451 }
 452
 453 /* This function is a strcmp, which doesn't differentiate `-' and `_'
 454    chars.  */
 455
 456 static int
 457 option_strcmp (const char *left, const char *right)
 458 {
 459   const unsigned char *l, *r;
 460   int c;
 461
 462   assert (left);
 463   assert (right);
 464   l = (const unsigned char *)left;
 465   r = (const unsigned char *)right;
 466   while (((c = *l - *r++) == 0 && *l != '\0')
 467          || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-')))
 468     l++;
 469   return c;
 470 }
 471
 472 /* Parse a token which starts with %.
 473    Assumes the % has already been read and discarded.  */
 474
 475 token_t
 476 parse_percent_token (void)
 477 {
 478   const struct option_table_s *tx = NULL;
 479   const char *arg = NULL;
 480   /* Where the ARG was found in token_buffer. */
 481   size_t arg_offset = 0;
 482
 483   int c = getc (finput);
 484   obstack_1grow (&token_obstack, '%');
 485   obstack_1grow (&token_obstack, c);
 486
 487   if (!isalpha (c))
 488     {
 489       obstack_1grow (&token_obstack, '\0');
 490       token_buffer = obstack_finish (&token_obstack);
 491
 492       switch (c)
 493         {
 494         case '%':
 495           return tok_two_percents;
 496
 497         case '{':
 498           return tok_percent_left_curly;
 499
 500           /* The following guys are here for backward compatibility with
 501              very ancient Yacc versions.  The paper of Johnson mentions
 502              them (as ancient :).  */
 503         case '<':
 504           return tok_left;
 505
 506         case '>':
 507           return tok_right;
 508
 509         case '2':
 510           return tok_nonassoc;
 511
 512         case '0':
 513           return tok_token;
 514
 515         case '=':
 516           return tok_prec;
 517
 518         default:
 519           return tok_illegal;
 520         }
 521     }
 522
 523   while (c = getc (finput), isalpha (c) || c == '_' || c == '-')
 524     {
 525       if (c == '_')
 526         c = '-';
 527       obstack_1grow (&token_obstack, c);
 528     }
 529
 530   /* %DIRECTIVE="ARG".  Separate into
 531      TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'.
 532      This is a bit hackish, but once we move to a Bison parser,
 533      things will be cleaned up.  */
 534   if (c == '=')
 535     {
 536       /* End of the directive.  We skip the `='. */
 537       obstack_1grow (&token_obstack, '\0');
 538       /* Fetch the ARG if present. */
 539       c = getc (finput);
 540       if (c == '"')
 541         {
 542           int code;
 543           arg_offset = obstack_object_size (&token_obstack);
 544           /* Read up to and including `"'.  Do not append the closing
 545              `"' in the output: it's not part of the ARG.  */
 546           while ((code = literalchar ()) != '"')
 547             obstack_1grow (&token_obstack, code);
 548         }
 549       /* else: should be an error. */
 550     }
 551   else
 552     ungetc (c, finput);
 553
 554   obstack_1grow (&token_obstack, '\0');
 555   token_buffer = obstack_finish (&token_obstack);
 556   if (arg_offset)
 557     arg = token_buffer + arg_offset;
 558
 559   /* table lookup % directive */
 560   for (tx = option_table; tx->name; tx++)
 561     if ((tx->access == opt_percent || tx->access == opt_both)
 562         && option_strcmp (token_buffer + 1, tx->name) == 0)
 563       break;
 564
 565   if (arg && tx->ret_val != tok_stropt)
 566     fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg));
 567
 568
 569   switch (tx->ret_val)
 570     {
 571     case tok_stropt:
 572       assert (tx->flag);
 573       if (arg)
 574         {
 575           char **flag = (char **) tx->flag;
 576           /* Keep only the first assignment: command line options have
 577              already been processed, and we want them to have
 578              precedence.  Side effect: if this %-option is used
 579              several times, only the first is honored.  Bah.  */
 580           if (!*flag)
 581             *flag = xstrdup (arg);
 582         }
 583       else
 584         fatal (_("`%s' requires an argument"), token_buffer);
 585       return tok_noop;
 586       break;
 587
 588     case tok_intopt:
 589       assert (tx->flag);
 590       *((int *) (tx->flag)) = 1;
 591       return tok_noop;
 592       break;
 593
 594     case tok_obsolete:
 595       fatal (_("`%s' is no longer supported"), token_buffer);
 596       return tok_noop;
 597       break;
 598
 599     default:
 600       return tx->ret_val;
 601       break;
 602     }
 603   abort ();
 604 }