src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
#include "system.h"
#include <limits.h>             /* for INT_MAX */
#include "getargs.h"
#include "files.h"
#include "getopt.h"             /* for optarg */
#include "symtab.h"
#include "options.h"
#include "lex.h"
#include "xalloc.h"
#include "complain.h"
#include "gram.h"
#include "quote.h"
  32
/* Obstack on which the text of the current token is accumulated, and
   a pointer to the finished, NUL-terminated token text.  lex () resets
   TOKEN_BUFFER to NULL on entry; token kinds that carry no text leave
   it that way.  */
struct obstack token_obstack;
const char *token_buffer = NULL;

/* Symbol table entry of the last identifier or literal token read by
   lex ().  */
bucket *symval;
/* Value of the last number token read by lex ().  */
int numval;

static int unlexed;             /* these two describe a token to be reread */
static bucket *unlexed_symval;  /* by the next call to lex; UNLEXED is -1
                                   when there is no such token */
  42
  43
  44 void
  45 init_lex (void)
  46 {
  47   obstack_init (&token_obstack);
  48   unlexed = -1;
  49 }
  50
  51
  52 int
  53 skip_white_space (void)
  54 {
  55   int c;
  56   int inside;
  57
  58   c = getc (finput);
  59
  60   for (;;)
  61     {
  62       int cplus_comment;
  63
  64       switch (c)
  65         {
  66         case '/':
  67           /* FIXME: Should probably be merged with copy_comment.  */
  68           c = getc (finput);
  69           if (c != '*' && c != '/')
  70             {
  71               complain (_("unexpected `/' found and ignored"));
  72               break;
  73             }
  74           cplus_comment = (c == '/');
  75
  76           c = getc (finput);
  77
  78           inside = 1;
  79           while (inside)
  80             {
  81               if (!cplus_comment && c == '*')
  82                 {
  83                   while (c == '*')
  84                     c = getc (finput);
  85
  86                   if (c == '/')
  87                     {
  88                       inside = 0;
  89                       c = getc (finput);
  90                     }
  91                 }
  92               else if (c == '\n')
  93                 {
  94                   lineno++;
  95                   if (cplus_comment)
  96                     inside = 0;
  97                   c = getc (finput);
  98                 }
  99               else if (c == EOF)
 100                 fatal (_("unterminated comment"));
 101               else
 102                 c = getc (finput);
 103             }
 104
 105           break;
 106
 107         case '\n':
 108           lineno++;
 109
 110         case ' ':
 111         case '\t':
 112         case '\f':
 113           c = getc (finput);
 114           break;
 115
 116         default:
 117           return c;
 118         }
 119     }
 120 }
 121
 122
 123 /*-----------------------------------------------------.
 124 | Do a getc, but give error message if EOF encountered |
 125 `-----------------------------------------------------*/
 126
 127 static int
 128 xgetc (FILE *f)
 129 {
 130   int c = getc (f);
 131   if (c == EOF)
 132     fatal (_("unexpected end of file"));
 133   return c;
 134 }
 135
 136
 137 /*------------------------------------------------------------------.
 138 | Read one literal character from finput.  Process \ escapes.       |
 139 | Append the normalized string version of the char to OUT.  Assign  |
 140 | the character code to *PCODE. Return 1 unless the character is an |
 141 | unescaped `term' or \n report error for \n.                       |
 142 `------------------------------------------------------------------*/
 143
 144 /* FIXME: We could directly work in the obstack, but that would make
 145    it more difficult to move to quotearg some day.  So for the time
 146    being, I prefer have literalchar behave like quotearg, and change
 147    my mind later if I was wrong.  */
 148
 149 static int
 150 literalchar (struct obstack *out, int *pcode, char term)
 151 {
 152   int c;
 153   char buf[4096];
 154   char *cp;
 155   int code;
 156   int wasquote = 0;
 157
 158   c = xgetc (finput);
 159   if (c == '\n')
 160     {
 161       complain (_("unescaped newline in constant"));
 162       ungetc (c, finput);
 163       code = '?';
 164       wasquote = 1;
 165     }
 166   else if (c != '\\')
 167     {
 168       code = c;
 169       if (c == term)
 170         wasquote = 1;
 171     }
 172   else
 173     {
 174       c = xgetc (finput);
 175       if (c == 't')
 176         code = '\t';
 177       else if (c == 'n')
 178         code = '\n';
 179       else if (c == 'a')
 180         code = '\007';
 181       else if (c == 'r')
 182         code = '\r';
 183       else if (c == 'f')
 184         code = '\f';
 185       else if (c == 'b')
 186         code = '\b';
 187       else if (c == 'v')
 188         code = '\013';
 189       else if (c == '\\')
 190         code = '\\';
 191       else if (c == '\'')
 192         code = '\'';
 193       else if (c == '\"')
 194         code = '\"';
 195       else if (c <= '7' && c >= '0')
 196         {
 197           code = 0;
 198           while (c <= '7' && c >= '0')
 199             {
 200               code = (code * 8) + (c - '0');
 201               if (code >= 256 || code < 0)
 202                 {
 203                   complain (_("octal value outside range 0...255: `\\%o'"),
 204                             code);
 205                   code &= 0xFF;
 206                   break;
 207                 }
 208               c = xgetc (finput);
 209             }
 210           ungetc (c, finput);
 211         }
 212       else if (c == 'x')
 213         {
 214           c = xgetc (finput);
 215           code = 0;
 216           while (1)
 217             {
 218               if (c >= '0' && c <= '9')
 219                 code *= 16, code += c - '0';
 220               else if (c >= 'a' && c <= 'f')
 221                 code *= 16, code += c - 'a' + 10;
 222               else if (c >= 'A' && c <= 'F')
 223                 code *= 16, code += c - 'A' + 10;
 224               else
 225                 break;
 226               if (code >= 256 || code < 0)
 227                 {
 228                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 229                   code &= 0xFF;
 230                   break;
 231                 }
 232               c = xgetc (finput);
 233             }
 234           ungetc (c, finput);
 235         }
 236       else
 237         {
 238           char badchar [] = "c";
 239           badchar[0] = c;
 240           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 241                     quote (badchar));
 242           code = '?';
 243         }
 244     }                           /* has \ */
 245
 246   /* now fill BUF with the canonical name for this character as a
 247      literal token.  Do not use what the user typed, so that `\012'
 248      and `\n' can be interchangeable.  */
 249
 250   cp = buf;
 251   if (code == term && wasquote)
 252     *cp++ = code;
 253   else if (code == '\\')
 254     {
 255       *cp++ = '\\';
 256       *cp++ = '\\';
 257     }
 258   else if (code == '\'')
 259     {
 260       *cp++ = '\\';
 261       *cp++ = '\'';
 262     }
 263   else if (code == '\"')
 264     {
 265       *cp++ = '\\';
 266       *cp++ = '\"';
 267     }
 268   else if (code >= 040 && code < 0177)
 269     *cp++ = code;
 270   else if (code == '\t')
 271     {
 272       *cp++ = '\\';
 273       *cp++ = 't';
 274     }
 275   else if (code == '\n')
 276     {
 277       *cp++ = '\\';
 278       *cp++ = 'n';
 279     }
 280   else if (code == '\r')
 281     {
 282       *cp++ = '\\';
 283       *cp++ = 'r';
 284     }
 285   else if (code == '\v')
 286     {
 287       *cp++ = '\\';
 288       *cp++ = 'v';
 289     }
 290   else if (code == '\b')
 291     {
 292       *cp++ = '\\';
 293       *cp++ = 'b';
 294     }
 295   else if (code == '\f')
 296     {
 297       *cp++ = '\\';
 298       *cp++ = 'f';
 299     }
 300   else
 301     {
 302       *cp++ = '\\';
 303       *cp++ = code / 0100 + '0';
 304       *cp++ = ((code / 010) & 07) + '0';
 305       *cp++ = (code & 07) + '0';
 306     }
 307   *cp = '\0';
 308
 309   if (out)
 310     obstack_sgrow (out, buf);
 311   *pcode = code;
 312   return !wasquote;
 313 }
 314
 315
 316 void
 317 unlex (int token)
 318 {
 319   unlexed = token;
 320   unlexed_symval = symval;
 321 }
 322
 323 /*-----------------------------------------------------------------.
 324 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 325 | specified between the `<...>'.                                   |
 326 `-----------------------------------------------------------------*/
 327
 328 void
 329 read_type_name (FILE *fin)
 330 {
 331   int c = getc (fin);
 332
 333   while (c != '>')
 334     {
 335       if (c == EOF)
 336         fatal (_("unterminated type name at end of file"));
 337       if (c == '\n')
 338         {
 339           complain (_("unterminated type name"));
 340           ungetc (c, fin);
 341           break;
 342         }
 343
 344       obstack_1grow (&token_obstack, c);
 345       c = getc (fin);
 346     }
 347   obstack_1grow (&token_obstack, '\0');
 348   token_buffer = obstack_finish (&token_obstack);
 349 }
 350
 351
 352 token_t
 353 lex (void)
 354 {
 355   int c;
 356
 357   /* Just to make sure. */
 358   token_buffer = NULL;
 359
 360   if (unlexed >= 0)
 361     {
 362       symval = unlexed_symval;
 363       c = unlexed;
 364       unlexed = -1;
 365       return c;
 366     }
 367
 368   c = skip_white_space ();
 369
 370   switch (c)
 371     {
 372     case EOF:
 373       token_buffer = "EOF";
 374       return tok_eof;
 375
 376     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 377     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 378     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 379     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 380     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 381     case 'Z':
 382     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 383     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 384     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 385     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 386     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 387     case 'z':
 388     case '.':    case '_':
 389
 390       while (isalnum (c) || c == '_' || c == '.')
 391         {
 392           obstack_1grow (&token_obstack, c);
 393           c = getc (finput);
 394         }
 395       obstack_1grow (&token_obstack, '\0');
 396       token_buffer = obstack_finish (&token_obstack);
 397       ungetc (c, finput);
 398       symval = getsym (token_buffer);
 399       return tok_identifier;
 400
 401     case '0':    case '1':    case '2':    case '3':    case '4':
 402     case '5':    case '6':    case '7':    case '8':    case '9':
 403       {
 404         numval = 0;
 405
 406         while (isdigit (c))
 407           {
 408             obstack_1grow (&token_obstack, c);
 409             numval = numval * 10 + c - '0';
 410             c = getc (finput);
 411           }
 412         obstack_1grow (&token_obstack, '\0');
 413         token_buffer = obstack_finish (&token_obstack);
 414         ungetc (c, finput);
 415         return tok_number;
 416       }
 417
 418     case '\'':
 419       /* parse the literal token and compute character code in  code  */
 420
 421       translations = -1;
 422       {
 423         int code, discode;
 424
 425         obstack_1grow (&token_obstack, '\'');
 426         literalchar (&token_obstack, &code, '\'');
 427
 428         c = getc (finput);
 429         if (c != '\'')
 430           {
 431             complain (_("use \"...\" for multi-character literal tokens"));
 432             while (1)
 433               if (!literalchar (0, &discode, '\''))
 434                 break;
 435           }
 436         obstack_1grow (&token_obstack, '\'');
 437         obstack_1grow (&token_obstack, '\0');
 438         token_buffer = obstack_finish (&token_obstack);
 439         symval = getsym (token_buffer);
 440         symval->class = token_sym;
 441         if (!symval->user_token_number)
 442           symval->user_token_number = code;
 443         return tok_identifier;
 444       }
 445
 446     case '\"':
 447       /* parse the literal string token and treat as an identifier */
 448
 449       translations = -1;
 450       {
 451         int code;               /* ignored here */
 452
 453         obstack_1grow (&token_obstack, '\"');
 454         /* Read up to and including ".  */
 455         while (literalchar (&token_obstack, &code, '\"'))
 456           /* nothing */;
 457         obstack_1grow (&token_obstack, '\0');
 458         token_buffer = obstack_finish (&token_obstack);
 459
 460         symval = getsym (token_buffer);
 461         symval->class = token_sym;
 462
 463         return tok_identifier;
 464       }
 465
 466     case ',':
 467       return tok_comma;
 468
 469     case ':':
 470       return tok_colon;
 471
 472     case ';':
 473       return tok_semicolon;
 474
 475     case '|':
 476       return tok_bar;
 477
 478     case '{':
 479       return tok_left_curly;
 480
 481     case '=':
 482       do
 483         {
 484           c = getc (finput);
 485           if (c == '\n')
 486             lineno++;
 487         }
 488       while (c == ' ' || c == '\n' || c == '\t');
 489
 490       if (c == '{')
 491         {
 492           token_buffer = "={";
 493           return tok_left_curly;
 494         }
 495       else
 496         {
 497           ungetc (c, finput);
 498           return tok_illegal;
 499         }
 500
 501     case '<':
 502       read_type_name (finput);
 503       return tok_typename;
 504
 505     case '%':
 506       return parse_percent_token ();
 507
 508     default:
 509       return tok_illegal;
 510     }
 511 }
 512
 513 /* This function is a strcmp, which doesn't differentiate `-' and `_'
 514    chars.  */
 515
 516 static int
 517 option_strcmp (const char *left, const char *right)
 518 {
 519     const unsigned char *l, *r;
 520     int c;
 521
 522     assert(left != NULL && right != NULL);
 523     l = (const unsigned char *)left;
 524     r = (const unsigned char *)right;
 525     while (((c = *l - *r++) == 0 && *l != '\0')
 526            || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-')))
 527         l++;
 528     return c;
 529 }
 530
 531 /* Parse a token which starts with %.
 532    Assumes the % has already been read and discarded.  */
 533
 534 int
 535 parse_percent_token (void)
 536 {
 537   int c;
 538   const struct option_table_struct *tx;
 539
 540   c = getc (finput);
 541
 542   switch (c)
 543     {
 544     case '%':
 545       return tok_two_percents;
 546
 547     case '{':
 548       return tok_percent_left_curly;
 549
 550     case '<':
 551       return tok_left;
 552
 553     case '>':
 554       return tok_right;
 555
 556     case '2':
 557       return tok_nonassoc;
 558
 559     case '0':
 560       return tok_token;
 561
 562     case '=':
 563       return tok_prec;
 564     }
 565
 566   if (!isalpha (c))
 567     return tok_illegal;
 568
 569   obstack_1grow (&token_obstack, '%');
 570   while (isalpha (c) || c == '_' || c == '-')
 571     {
 572       if (c == '_')
 573         c = '-';
 574       obstack_1grow (&token_obstack, c);
 575       c = getc (finput);
 576     }
 577
 578   ungetc (c, finput);
 579   obstack_1grow (&token_obstack, '\0');
 580   token_buffer = obstack_finish (&token_obstack);
 581
 582   /* table lookup % directive */
 583   for (tx = option_table; tx->name; tx++)
 584     if ((tx->access == opt_percent || tx->access == opt_both)
 585         && option_strcmp (token_buffer + 1, tx->name) == 0)
 586       break;
 587
 588   if (tx->set_flag)
 589     {
 590       *((int *) (tx->set_flag)) = 1;
 591       return tok_noop;
 592     }
 593
 594   switch (tx->ret_val)
 595     {
 596     case tok_setopt:
 597       *((char **) (tx->set_flag)) = optarg;
 598       return tok_noop;
 599       break;
 600
 601     case tok_obsolete:
 602       fatal (_("`%s' is no longer supported"), token_buffer);
 603       break;
 604     }
 605
 606   return tx->ret_val;
 607 }