src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "symtab.h"
  25 #include "options.h"
  26 #include "lex.h"
  27 #include "complain.h"
  28 #include "gram.h"
  29 #include "quote.h"
  30
  31 /* Buffer for storing the current token.  */
  32 static struct obstack token_obstack;
  33 const char *token_buffer = NULL;
  34
  35 bucket *symval = NULL;
  36 int numval;
  37
  38 /* A token to be reread, see unlex and lex. */
  39 static token_t unlexed = tok_undef;
  40 static bucket *unlexed_symval = NULL;
  41 static const char *unlexed_token_buffer = NULL;
  42
  43 void
  44 lex_init (void)
  45 {
  46   obstack_init (&token_obstack);
  47   unlexed = tok_undef;
  48 }
  49
  50
  51 void
  52 lex_free (void)
  53 {
  54   obstack_free (&token_obstack, NULL);
  55 }
  56
  57
  58 int
  59 skip_white_space (void)
  60 {
  61   int c;
  62   int inside;
  63
  64   c = getc (finput);
  65
  66   for (;;)
  67     {
  68       int cplus_comment;
  69
  70       switch (c)
  71         {
  72         case '/':
  73           /* FIXME: Should probably be merged with copy_comment.  */
  74           c = getc (finput);
  75           if (c != '*' && c != '/')
  76             {
  77               complain (_("unexpected `/' found and ignored"));
  78               break;
  79             }
  80           cplus_comment = (c == '/');
  81
  82           c = getc (finput);
  83
  84           inside = 1;
  85           while (inside)
  86             {
  87               if (!cplus_comment && c == '*')
  88                 {
  89                   while (c == '*')
  90                     c = getc (finput);
  91
  92                   if (c == '/')
  93                     {
  94                       inside = 0;
  95                       c = getc (finput);
  96                     }
  97                 }
  98               else if (c == '\n')
  99                 {
 100                   lineno++;
 101                   if (cplus_comment)
 102                     inside = 0;
 103                   c = getc (finput);
 104                 }
 105               else if (c == EOF)
 106                 fatal (_("unterminated comment"));
 107               else
 108                 c = getc (finput);
 109             }
 110
 111           break;
 112
 113         case '\n':
 114           lineno++;
 115
 116         case ' ':
 117         case '\t':
 118         case '\f':
 119           c = getc (finput);
 120           break;
 121
 122         default:
 123           return c;
 124         }
 125     }
 126 }
 127
 128
 129 /*-----------------------------------------------------.
 130 | Do a getc, but give error message if EOF encountered |
 131 `-----------------------------------------------------*/
 132
 133 static int
 134 xgetc (FILE *f)
 135 {
 136   int c = getc (f);
 137   if (c == EOF)
 138     fatal (_("unexpected end of file"));
 139   return c;
 140 }
 141
 142
 143 /*-----------------------------------------------------------------.
 144 | Read one literal character from FINPUT.  Process \-escapes.      |
 145 | Append the char to OUT and assign it *PCODE. Return 1 unless the |
 146 | character is an unescaped `term' or \n report error for \n.      |
 147 `-----------------------------------------------------------------*/
 148
 149 int
 150 literalchar (struct obstack *out, int *pcode, char term)
 151 {
 152   int c;
 153   int code;
 154   int wasquote = 0;
 155
 156   c = xgetc (finput);
 157   if (c == '\n')
 158     {
 159       complain (_("unescaped newline in constant"));
 160       ungetc (c, finput);
 161       code = '?';
 162       wasquote = 1;
 163     }
 164   else if (c != '\\')
 165     {
 166       code = c;
 167       if (c == term)
 168         wasquote = 1;
 169     }
 170   else
 171     {
 172       c = xgetc (finput);
 173       if (c == 't')
 174         code = '\t';
 175       else if (c == 'n')
 176         code = '\n';
 177       else if (c == 'a')
 178         code = '\007';
 179       else if (c == 'r')
 180         code = '\r';
 181       else if (c == 'f')
 182         code = '\f';
 183       else if (c == 'b')
 184         code = '\b';
 185       else if (c == 'v')
 186         code = '\013';
 187       else if (c == '\\')
 188         code = '\\';
 189       else if (c == '\'')
 190         code = '\'';
 191       else if (c == '\"')
 192         code = '\"';
 193       else if (c <= '7' && c >= '0')
 194         {
 195           code = 0;
 196           while (c <= '7' && c >= '0')
 197             {
 198               code = (code * 8) + (c - '0');
 199               if (code >= 256 || code < 0)
 200                 {
 201                   complain (_("octal value outside range 0...255: `\\%o'"),
 202                             code);
 203                   code &= 0xFF;
 204                   break;
 205                 }
 206               c = xgetc (finput);
 207             }
 208           ungetc (c, finput);
 209         }
 210       else if (c == 'x')
 211         {
 212           c = xgetc (finput);
 213           code = 0;
 214           while (1)
 215             {
 216               if (c >= '0' && c <= '9')
 217                 code *= 16, code += c - '0';
 218               else if (c >= 'a' && c <= 'f')
 219                 code *= 16, code += c - 'a' + 10;
 220               else if (c >= 'A' && c <= 'F')
 221                 code *= 16, code += c - 'A' + 10;
 222               else
 223                 break;
 224               if (code >= 256 || code < 0)
 225                 {
 226                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 227                   code &= 0xFF;
 228                   break;
 229                 }
 230               c = xgetc (finput);
 231             }
 232           ungetc (c, finput);
 233         }
 234       else
 235         {
 236           char badchar [] = "c";
 237           badchar[0] = c;
 238           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 239                     quote (badchar));
 240           code = '?';
 241         }
 242     }                           /* has \ */
 243
 244   if (out)
 245     obstack_1grow (out, code);
 246   *pcode = code;
 247   return !wasquote;
 248 }
 249
 250
 251 void
 252 unlex (token_t token)
 253 {
 254   unlexed = token;
 255   unlexed_token_buffer = token_buffer;
 256   unlexed_symval = symval;
 257 }
 258
 259 /*-----------------------------------------------------------------.
 260 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 261 | specified between the `<...>'.                                   |
 262 `-----------------------------------------------------------------*/
 263
 264 void
 265 read_type_name (FILE *fin)
 266 {
 267   int c = getc (fin);
 268
 269   while (c != '>')
 270     {
 271       if (c == EOF)
 272         fatal (_("unterminated type name at end of file"));
 273       if (c == '\n')
 274         {
 275           complain (_("unterminated type name"));
 276           ungetc (c, fin);
 277           break;
 278         }
 279
 280       obstack_1grow (&token_obstack, c);
 281       c = getc (fin);
 282     }
 283   obstack_1grow (&token_obstack, '\0');
 284   token_buffer = obstack_finish (&token_obstack);
 285 }
 286
 287
 288 token_t
 289 lex (void)
 290 {
 291   int c;
 292
 293   /* Just to make sure. */
 294   token_buffer = NULL;
 295
 296   if (unlexed != tok_undef)
 297     {
 298       token_t res = unlexed;
 299       symval = unlexed_symval;
 300       token_buffer = unlexed_token_buffer;
 301       unlexed = tok_undef;
 302       return res;
 303     }
 304
 305   c = skip_white_space ();
 306
 307   switch (c)
 308     {
 309     case EOF:
 310       token_buffer = "EOF";
 311       return tok_eof;
 312
 313     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 314     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 315     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 316     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 317     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 318     case 'Z':
 319     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 320     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 321     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 322     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 323     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 324     case 'z':
 325     case '.':    case '_':
 326
 327       while (isalnum (c) || c == '_' || c == '.')
 328         {
 329           obstack_1grow (&token_obstack, c);
 330           c = getc (finput);
 331         }
 332       obstack_1grow (&token_obstack, '\0');
 333       token_buffer = obstack_finish (&token_obstack);
 334       ungetc (c, finput);
 335       symval = getsym (token_buffer);
 336       return tok_identifier;
 337
 338     case '0':    case '1':    case '2':    case '3':    case '4':
 339     case '5':    case '6':    case '7':    case '8':    case '9':
 340       {
 341         numval = 0;
 342
 343         while (isdigit (c))
 344           {
 345             obstack_1grow (&token_obstack, c);
 346             numval = numval * 10 + c - '0';
 347             c = getc (finput);
 348           }
 349         obstack_1grow (&token_obstack, '\0');
 350         token_buffer = obstack_finish (&token_obstack);
 351         ungetc (c, finput);
 352         return tok_number;
 353       }
 354
 355     case '\'':
 356       /* parse the literal token and compute character code in  code  */
 357
 358       {
 359         int code;
 360
 361         obstack_1grow (&token_obstack, '\'');
 362         literalchar (&token_obstack, &code, '\'');
 363
 364         c = getc (finput);
 365         if (c != '\'')
 366           {
 367             int discode;
 368             complain (_("use \"...\" for multi-character literal tokens"));
 369             while (1)
 370               if (!literalchar (0, &discode, '\''))
 371                 break;
 372           }
 373         obstack_1grow (&token_obstack, '\'');
 374         obstack_1grow (&token_obstack, '\0');
 375         token_buffer = obstack_finish (&token_obstack);
 376         symval = getsym (token_buffer);
 377         symval->class = token_sym;
 378         if (symval->user_token_number == SUNDEF)
 379           symval->user_token_number = code;
 380         return tok_identifier;
 381       }
 382
 383     case '\"':
 384       /* parse the literal string token and treat as an identifier */
 385
 386       {
 387         int code;               /* ignored here */
 388
 389         obstack_1grow (&token_obstack, '\"');
 390         /* Read up to and including ".  */
 391         while (literalchar (&token_obstack, &code, '\"'))
 392           /* nothing */;
 393         obstack_1grow (&token_obstack, '\0');
 394         token_buffer = obstack_finish (&token_obstack);
 395
 396         symval = getsym (token_buffer);
 397         symval->class = token_sym;
 398
 399         return tok_identifier;
 400       }
 401
 402     case ',':
 403       token_buffer = ",";
 404       return tok_comma;
 405
 406     case ':':
 407       token_buffer = ":";
 408       return tok_colon;
 409
 410     case ';':
 411       token_buffer = ";";
 412       return tok_semicolon;
 413
 414     case '|':
 415       token_buffer = "|";
 416       return tok_bar;
 417
 418     case '{':
 419       token_buffer = "{";
 420       return tok_left_curly;
 421
 422     case '=':
 423       obstack_1grow (&token_obstack, c);
 424       do
 425         {
 426           c = getc (finput);
 427           obstack_1grow (&token_obstack, c);
 428           if (c == '\n')
 429             lineno++;
 430         }
 431       while (c == ' ' || c == '\n' || c == '\t');
 432       obstack_1grow (&token_obstack, '\0');
 433       token_buffer = obstack_finish (&token_obstack);
 434
 435       if (c == '{')
 436         {
 437           return tok_left_curly;
 438         }
 439       else
 440         {
 441           ungetc (c, finput);
 442           return tok_illegal;
 443         }
 444
 445     case '<':
 446       read_type_name (finput);
 447       return tok_typename;
 448
 449     case '%':
 450       return parse_percent_token ();
 451
 452     default:
 453       obstack_1grow (&token_obstack, c);
 454       obstack_1grow (&token_obstack, '\0');
 455       token_buffer = obstack_finish (&token_obstack);
 456       return tok_illegal;
 457     }
 458 }
 459
 460 /* This function is a strcmp, which doesn't differentiate `-' and `_'
 461    chars.  */
 462
 463 static int
 464 option_strcmp (const char *left, const char *right)
 465 {
 466   const unsigned char *l, *r;
 467   int c;
 468
 469   assert (left);
 470   assert (right);
 471   l = (const unsigned char *)left;
 472   r = (const unsigned char *)right;
 473   while (((c = *l - *r++) == 0 && *l != '\0')
 474          || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-')))
 475     l++;
 476   return c;
 477 }
 478
 479 /* Parse a token which starts with %.
 480    Assumes the % has already been read and discarded.  */
 481
 482 token_t
 483 parse_percent_token (void)
 484 {
 485   const struct option_table_struct *tx = NULL;
 486   const char *arg = NULL;
 487   /* Where the ARG was found in token_buffer. */
 488   size_t arg_offset = 0;
 489
 490   int c = getc (finput);
 491
 492   switch (c)
 493     {
 494     case '%':
 495       return tok_two_percents;
 496
 497     case '{':
 498       return tok_percent_left_curly;
 499
 500       /* FIXME: Who the heck are those 5 guys!?! `%<' = `%left'!!!
 501          Let's ask for there removal.  */
 502     case '<':
 503       return tok_left;
 504
 505     case '>':
 506       return tok_right;
 507
 508     case '2':
 509       return tok_nonassoc;
 510
 511     case '0':
 512       return tok_token;
 513
 514     case '=':
 515       return tok_prec;
 516     }
 517
 518   if (!isalpha (c))
 519     return tok_illegal;
 520
 521   obstack_1grow (&token_obstack, '%');
 522   while (isalpha (c) || c == '_' || c == '-')
 523     {
 524       if (c == '_')
 525         c = '-';
 526       obstack_1grow (&token_obstack, c);
 527       c = getc (finput);
 528     }
 529
 530   /* %DIRECTIVE="ARG".  Separate into
 531      TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'.
 532      This is a bit hackish, but once we move to a Bison parser,
 533      things will be cleaned up.  */
 534   if (c == '=')
 535     {
 536       /* End of the directive.  We skip the `='. */
 537       obstack_1grow (&token_obstack, '\0');
 538       /* Fetch the ARG if present. */
 539       c = getc (finput);
 540       if (c == '"')
 541         {
 542           int code;
 543           arg_offset = obstack_object_size (&token_obstack);
 544           /* Read up to and including `"'.  Do not append the closing
 545              `"' in the output: it's not part of the ARG.  */
 546           while (literalchar (NULL, &code, '"'))
 547             obstack_1grow (&token_obstack, code);
 548         }
 549       /* else: should be an error. */
 550     }
 551   else
 552     ungetc (c, finput);
 553
 554   obstack_1grow (&token_obstack, '\0');
 555   token_buffer = obstack_finish (&token_obstack);
 556   if (arg_offset)
 557     arg = token_buffer + arg_offset;
 558
 559   /* table lookup % directive */
 560   for (tx = option_table; tx->name; tx++)
 561     if ((tx->access == opt_percent || tx->access == opt_both)
 562         && option_strcmp (token_buffer + 1, tx->name) == 0)
 563       break;
 564
 565   if (arg && tx->ret_val != tok_stropt)
 566     fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg));
 567
 568
 569   switch (tx->ret_val)
 570     {
 571     case tok_stropt:
 572       assert (tx->set_flag);
 573       if (arg)
 574         {
 575           /* Keep only the first assignment: command line options have
 576              already been processed, and we want them to have
 577              precedence.  Side effect: if this %-option is used
 578              several times, only the first is honored.  Bah.  */
 579           if (!*((char **) (tx->set_flag)))
 580             *((char **) (tx->set_flag)) = xstrdup (arg);
 581         }
 582       else
 583         fatal (_("`%s' requires an argument"), token_buffer);
 584       return tok_noop;
 585       break;
 586
 587     case tok_intopt:
 588       assert (tx->set_flag);
 589       *((int *) (tx->set_flag)) = 1;
 590       return tok_noop;
 591       break;
 592
 593     case tok_obsolete:
 594       fatal (_("`%s' is no longer supported"), token_buffer);
 595       return tok_noop;
 596       break;
 597
 598     default:
 599       return tx->ret_val;
 600       break;
 601     }
 602   abort ();
 603 }