src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "options.h"
  27 #include "lex.h"
  28 #include "complain.h"
  29 #include "gram.h"
  30 #include "quote.h"
  31
  32 /* Buffer for storing the current token.  */
  33 struct obstack token_obstack;
  34 const char *token_buffer = NULL;
  35
  36 bucket *symval;
  37 int numval;
  38
  39 /* these two describe a token to be reread */
  40 static token_t unlexed = tok_undef;
  41 /* by the next call to lex */
  42 static bucket *unlexed_symval = NULL;
  43
  44
  45 void
  46 lex_init (void)
  47 {
  48   obstack_init (&token_obstack);
  49   unlexed = tok_undef;
  50 }
  51
  52
  53 void
  54 lex_free (void)
  55 {
  56   obstack_free (&token_obstack, NULL);
  57 }
  58
  59
  60 int
  61 skip_white_space (void)
  62 {
  63   int c;
  64   int inside;
  65
  66   c = getc (finput);
  67
  68   for (;;)
  69     {
  70       int cplus_comment;
  71
  72       switch (c)
  73         {
  74         case '/':
  75           /* FIXME: Should probably be merged with copy_comment.  */
  76           c = getc (finput);
  77           if (c != '*' && c != '/')
  78             {
  79               complain (_("unexpected `/' found and ignored"));
  80               break;
  81             }
  82           cplus_comment = (c == '/');
  83
  84           c = getc (finput);
  85
  86           inside = 1;
  87           while (inside)
  88             {
  89               if (!cplus_comment && c == '*')
  90                 {
  91                   while (c == '*')
  92                     c = getc (finput);
  93
  94                   if (c == '/')
  95                     {
  96                       inside = 0;
  97                       c = getc (finput);
  98                     }
  99                 }
 100               else if (c == '\n')
 101                 {
 102                   lineno++;
 103                   if (cplus_comment)
 104                     inside = 0;
 105                   c = getc (finput);
 106                 }
 107               else if (c == EOF)
 108                 fatal (_("unterminated comment"));
 109               else
 110                 c = getc (finput);
 111             }
 112
 113           break;
 114
 115         case '\n':
 116           lineno++;
 117
 118         case ' ':
 119         case '\t':
 120         case '\f':
 121           c = getc (finput);
 122           break;
 123
 124         default:
 125           return c;
 126         }
 127     }
 128 }
 129
 130
 131 /*-----------------------------------------------------.
 132 | Do a getc, but give error message if EOF encountered |
 133 `-----------------------------------------------------*/
 134
 135 static int
 136 xgetc (FILE *f)
 137 {
 138   int c = getc (f);
 139   if (c == EOF)
 140     fatal (_("unexpected end of file"));
 141   return c;
 142 }
 143
 144
 145 /*------------------------------------------------------------------.
 146 | Read one literal character from finput.  Process \ escapes.       |
 147 | Append the normalized string version of the char to OUT.  Assign  |
 148 | the character code to *PCODE. Return 1 unless the character is an |
 149 | unescaped `term' or \n report error for \n.                       |
 150 `------------------------------------------------------------------*/
 151
 152 /* FIXME: We could directly work in the obstack, but that would make
 153    it more difficult to move to quotearg some day.  So for the time
 154    being, I prefer have literalchar behave like quotearg, and change
 155    my mind later if I was wrong.  */
 156
 157 static int
 158 literalchar (struct obstack *out, int *pcode, char term)
 159 {
 160   int c;
 161   char buf[4096];
 162   char *cp;
 163   int code;
 164   int wasquote = 0;
 165
 166   c = xgetc (finput);
 167   if (c == '\n')
 168     {
 169       complain (_("unescaped newline in constant"));
 170       ungetc (c, finput);
 171       code = '?';
 172       wasquote = 1;
 173     }
 174   else if (c != '\\')
 175     {
 176       code = c;
 177       if (c == term)
 178         wasquote = 1;
 179     }
 180   else
 181     {
 182       c = xgetc (finput);
 183       if (c == 't')
 184         code = '\t';
 185       else if (c == 'n')
 186         code = '\n';
 187       else if (c == 'a')
 188         code = '\007';
 189       else if (c == 'r')
 190         code = '\r';
 191       else if (c == 'f')
 192         code = '\f';
 193       else if (c == 'b')
 194         code = '\b';
 195       else if (c == 'v')
 196         code = '\013';
 197       else if (c == '\\')
 198         code = '\\';
 199       else if (c == '\'')
 200         code = '\'';
 201       else if (c == '\"')
 202         code = '\"';
 203       else if (c <= '7' && c >= '0')
 204         {
 205           code = 0;
 206           while (c <= '7' && c >= '0')
 207             {
 208               code = (code * 8) + (c - '0');
 209               if (code >= 256 || code < 0)
 210                 {
 211                   complain (_("octal value outside range 0...255: `\\%o'"),
 212                             code);
 213                   code &= 0xFF;
 214                   break;
 215                 }
 216               c = xgetc (finput);
 217             }
 218           ungetc (c, finput);
 219         }
 220       else if (c == 'x')
 221         {
 222           c = xgetc (finput);
 223           code = 0;
 224           while (1)
 225             {
 226               if (c >= '0' && c <= '9')
 227                 code *= 16, code += c - '0';
 228               else if (c >= 'a' && c <= 'f')
 229                 code *= 16, code += c - 'a' + 10;
 230               else if (c >= 'A' && c <= 'F')
 231                 code *= 16, code += c - 'A' + 10;
 232               else
 233                 break;
 234               if (code >= 256 || code < 0)
 235                 {
 236                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 237                   code &= 0xFF;
 238                   break;
 239                 }
 240               c = xgetc (finput);
 241             }
 242           ungetc (c, finput);
 243         }
 244       else
 245         {
 246           char badchar [] = "c";
 247           badchar[0] = c;
 248           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 249                     quote (badchar));
 250           code = '?';
 251         }
 252     }                           /* has \ */
 253
 254   /* now fill BUF with the canonical name for this character as a
 255      literal token.  Do not use what the user typed, so that `\012'
 256      and `\n' can be interchangeable.  */
 257
 258   cp = buf;
 259   if (code == term && wasquote)
 260     *cp++ = code;
 261   else if (code == '\\')
 262     {
 263       *cp++ = '\\';
 264       *cp++ = '\\';
 265     }
 266   else if (code == '\'')
 267     {
 268       *cp++ = '\\';
 269       *cp++ = '\'';
 270     }
 271   else if (code == '\"')
 272     {
 273       *cp++ = '\\';
 274       *cp++ = '\"';
 275     }
 276   else if (code >= 040 && code < 0177)
 277     *cp++ = code;
 278   else if (code == '\t')
 279     {
 280       *cp++ = '\\';
 281       *cp++ = 't';
 282     }
 283   else if (code == '\n')
 284     {
 285       *cp++ = '\\';
 286       *cp++ = 'n';
 287     }
 288   else if (code == '\r')
 289     {
 290       *cp++ = '\\';
 291       *cp++ = 'r';
 292     }
 293   else if (code == '\v')
 294     {
 295       *cp++ = '\\';
 296       *cp++ = 'v';
 297     }
 298   else if (code == '\b')
 299     {
 300       *cp++ = '\\';
 301       *cp++ = 'b';
 302     }
 303   else if (code == '\f')
 304     {
 305       *cp++ = '\\';
 306       *cp++ = 'f';
 307     }
 308   else
 309     {
 310       *cp++ = '\\';
 311       *cp++ = code / 0100 + '0';
 312       *cp++ = ((code / 010) & 07) + '0';
 313       *cp++ = (code & 07) + '0';
 314     }
 315   *cp = '\0';
 316
 317   if (out)
 318     obstack_sgrow (out, buf);
 319   *pcode = code;
 320   return !wasquote;
 321 }
 322
 323
 324 void
 325 unlex (token_t token)
 326 {
 327   unlexed = token;
 328   unlexed_symval = symval;
 329 }
 330
 331 /*-----------------------------------------------------------------.
 332 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 333 | specified between the `<...>'.                                   |
 334 `-----------------------------------------------------------------*/
 335
 336 void
 337 read_type_name (FILE *fin)
 338 {
 339   int c = getc (fin);
 340
 341   while (c != '>')
 342     {
 343       if (c == EOF)
 344         fatal (_("unterminated type name at end of file"));
 345       if (c == '\n')
 346         {
 347           complain (_("unterminated type name"));
 348           ungetc (c, fin);
 349           break;
 350         }
 351
 352       obstack_1grow (&token_obstack, c);
 353       c = getc (fin);
 354     }
 355   obstack_1grow (&token_obstack, '\0');
 356   token_buffer = obstack_finish (&token_obstack);
 357 }
 358
 359
 360 token_t
 361 lex (void)
 362 {
 363   int c;
 364
 365   /* Just to make sure. */
 366   token_buffer = NULL;
 367
 368   if (unlexed != tok_undef)
 369     {
 370       token_t res = unlexed;
 371       symval = unlexed_symval;
 372       unlexed = tok_undef;
 373       return res;
 374     }
 375
 376   c = skip_white_space ();
 377
 378   switch (c)
 379     {
 380     case EOF:
 381       token_buffer = "EOF";
 382       return tok_eof;
 383
 384     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 385     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 386     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 387     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 388     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 389     case 'Z':
 390     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 391     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 392     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 393     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 394     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 395     case 'z':
 396     case '.':    case '_':
 397
 398       while (isalnum (c) || c == '_' || c == '.')
 399         {
 400           obstack_1grow (&token_obstack, c);
 401           c = getc (finput);
 402         }
 403       obstack_1grow (&token_obstack, '\0');
 404       token_buffer = obstack_finish (&token_obstack);
 405       ungetc (c, finput);
 406       symval = getsym (token_buffer);
 407       return tok_identifier;
 408
 409     case '0':    case '1':    case '2':    case '3':    case '4':
 410     case '5':    case '6':    case '7':    case '8':    case '9':
 411       {
 412         numval = 0;
 413
 414         while (isdigit (c))
 415           {
 416             obstack_1grow (&token_obstack, c);
 417             numval = numval * 10 + c - '0';
 418             c = getc (finput);
 419           }
 420         obstack_1grow (&token_obstack, '\0');
 421         token_buffer = obstack_finish (&token_obstack);
 422         ungetc (c, finput);
 423         return tok_number;
 424       }
 425
 426     case '\'':
 427       /* parse the literal token and compute character code in  code  */
 428
 429       {
 430         int code, discode;
 431
 432         obstack_1grow (&token_obstack, '\'');
 433         literalchar (&token_obstack, &code, '\'');
 434
 435         c = getc (finput);
 436         if (c != '\'')
 437           {
 438             complain (_("use \"...\" for multi-character literal tokens"));
 439             while (1)
 440               if (!literalchar (0, &discode, '\''))
 441                 break;
 442           }
 443         obstack_1grow (&token_obstack, '\'');
 444         obstack_1grow (&token_obstack, '\0');
 445         token_buffer = obstack_finish (&token_obstack);
 446         symval = getsym (token_buffer);
 447         symval->class = token_sym;
 448         if (!symval->user_token_number)
 449           symval->user_token_number = code;
 450         return tok_identifier;
 451       }
 452
 453     case '\"':
 454       /* parse the literal string token and treat as an identifier */
 455
 456       {
 457         int code;               /* ignored here */
 458
 459         obstack_1grow (&token_obstack, '\"');
 460         /* Read up to and including ".  */
 461         while (literalchar (&token_obstack, &code, '\"'))
 462           /* nothing */;
 463         obstack_1grow (&token_obstack, '\0');
 464         token_buffer = obstack_finish (&token_obstack);
 465
 466         symval = getsym (token_buffer);
 467         symval->class = token_sym;
 468
 469         return tok_identifier;
 470       }
 471
 472     case ',':
 473       token_buffer = ",";
 474       return tok_comma;
 475
 476     case ':':
 477       token_buffer = ":";
 478       return tok_colon;
 479
 480     case ';':
 481       token_buffer = ";";
 482       return tok_semicolon;
 483
 484     case '|':
 485       token_buffer = "|";
 486       return tok_bar;
 487
 488     case '{':
 489       token_buffer = "{";
 490       return tok_left_curly;
 491
 492     case '=':
 493       obstack_1grow (&token_obstack, c);
 494       do
 495         {
 496           c = getc (finput);
 497           obstack_1grow (&token_obstack, c);
 498           if (c == '\n')
 499             lineno++;
 500         }
 501       while (c == ' ' || c == '\n' || c == '\t');
 502       obstack_1grow (&token_obstack, '\0');
 503       token_buffer = obstack_finish (&token_obstack);
 504
 505       if (c == '{')
 506         {
 507           return tok_left_curly;
 508         }
 509       else
 510         {
 511           ungetc (c, finput);
 512           return tok_illegal;
 513         }
 514
 515     case '<':
 516       read_type_name (finput);
 517       return tok_typename;
 518
 519     case '%':
 520       return parse_percent_token ();
 521
 522     default:
 523       obstack_1grow (&token_obstack, c);
 524       obstack_1grow (&token_obstack, '\0');
 525       token_buffer = obstack_finish (&token_obstack);
 526       return tok_illegal;
 527     }
 528 }
 529
 530 /* This function is a strcmp, which doesn't differentiate `-' and `_'
 531    chars.  */
 532
 533 static int
 534 option_strcmp (const char *left, const char *right)
 535 {
 536   const unsigned char *l, *r;
 537   int c;
 538
 539   assert (left);
 540   assert (right);
 541   l = (const unsigned char *)left;
 542   r = (const unsigned char *)right;
 543   while (((c = *l - *r++) == 0 && *l != '\0')
 544          || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-')))
 545     l++;
 546   return c;
 547 }
 548
 549 /* Parse a token which starts with %.
 550    Assumes the % has already been read and discarded.  */
 551
 552 token_t
 553 parse_percent_token (void)
 554 {
 555   const struct option_table_struct *tx;
 556
 557   int c = getc (finput);
 558
 559   switch (c)
 560     {
 561     case '%':
 562       return tok_two_percents;
 563
 564     case '{':
 565       return tok_percent_left_curly;
 566
 567     case '<':
 568       return tok_left;
 569
 570     case '>':
 571       return tok_right;
 572
 573     case '2':
 574       return tok_nonassoc;
 575
 576     case '0':
 577       return tok_token;
 578
 579     case '=':
 580       return tok_prec;
 581     }
 582
 583   if (!isalpha (c))
 584     return tok_illegal;
 585
 586   obstack_1grow (&token_obstack, '%');
 587   while (isalpha (c) || c == '_' || c == '-')
 588     {
 589       if (c == '_')
 590         c = '-';
 591       obstack_1grow (&token_obstack, c);
 592       c = getc (finput);
 593     }
 594
 595   ungetc (c, finput);
 596   obstack_1grow (&token_obstack, '\0');
 597   token_buffer = obstack_finish (&token_obstack);
 598
 599   /* table lookup % directive */
 600   for (tx = option_table; tx->name; tx++)
 601     if ((tx->access == opt_percent || tx->access == opt_both)
 602         && option_strcmp (token_buffer + 1, tx->name) == 0)
 603       break;
 604
 605   if (tx->set_flag)
 606     {
 607       *((int *) (tx->set_flag)) = 1;
 608       return tok_noop;
 609     }
 610
 611   switch (tx->ret_val)
 612     {
 613     case tok_setopt:
 614       *((char **) (tx->set_flag)) = optarg;
 615       return tok_noop;
 616       break;
 617
 618     case tok_obsolete:
 619       fatal (_("`%s' is no longer supported"), token_buffer);
 620       break;
 621
 622     default:
 623       /* Other cases do not apply here. */
 624       break;
 625     }
 626
 627   return tx->ret_val;
 628 }