src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "lex.h"
  27 #include "complain.h"
  28 #include "gram.h"
  29 #include "quote.h"
  30
/* Buffer for storing the current token.  TOKEN_OBSTACK accumulates
   the characters of the token being read; TOKEN_BUFFER points to the
   text of the most recently read token (either a string finished on
   TOKEN_OBSTACK or a string literal), and is reset to NULL at the
   start of each call to lex.  */
struct obstack token_obstack;
const char *token_buffer = NULL;

/* SYMVAL is the symbol table entry (as returned by getsym) of the
   last identifier or literal token read by lex.  NUMVAL is the value
   of the last number token read by lex.  */
bucket *symval;
int numval;

/* A token to be reread, see unlex and lex.  UNLEXED is tok_undef when
   nothing is pending; UNLEXED_SYMVAL and UNLEXED_TOKEN_BUFFER are the
   values of SYMVAL and TOKEN_BUFFER saved by unlex, restored by lex
   when the pending token is delivered.  */
static token_t unlexed = tok_undef;
static bucket *unlexed_symval = NULL;
static const char *unlexed_token_buffer = NULL;
  42
  43 void
  44 lex_init (void)
  45 {
  46   obstack_init (&token_obstack);
  47   unlexed = tok_undef;
  48 }
  49
  50
/* Release all the storage held by TOKEN_OBSTACK (passing NULL to
   obstack_free frees every object allocated in the obstack).  Any
   TOKEN_BUFFER value that pointed into the obstack must not be used
   afterwards.  */
void
lex_free (void)
{
  obstack_free (&token_obstack, NULL);
}
  56
  57
  58 int
  59 skip_white_space (void)
  60 {
  61   int c;
  62   int inside;
  63
  64   c = getc (finput);
  65
  66   for (;;)
  67     {
  68       int cplus_comment;
  69
  70       switch (c)
  71         {
  72         case '/':
  73           /* FIXME: Should probably be merged with copy_comment.  */
  74           c = getc (finput);
  75           if (c != '*' && c != '/')
  76             {
  77               complain (_("unexpected `/' found and ignored"));
  78               break;
  79             }
  80           cplus_comment = (c == '/');
  81
  82           c = getc (finput);
  83
  84           inside = 1;
  85           while (inside)
  86             {
  87               if (!cplus_comment && c == '*')
  88                 {
  89                   while (c == '*')
  90                     c = getc (finput);
  91
  92                   if (c == '/')
  93                     {
  94                       inside = 0;
  95                       c = getc (finput);
  96                     }
  97                 }
  98               else if (c == '\n')
  99                 {
 100                   lineno++;
 101                   if (cplus_comment)
 102                     inside = 0;
 103                   c = getc (finput);
 104                 }
 105               else if (c == EOF)
 106                 fatal (_("unterminated comment"));
 107               else
 108                 c = getc (finput);
 109             }
 110
 111           break;
 112
 113         case '\n':
 114           lineno++;
 115
 116         case ' ':
 117         case '\t':
 118         case '\f':
 119           c = getc (finput);
 120           break;
 121
 122         default:
 123           return c;
 124         }
 125     }
 126 }
 127
 128
 129 /*-----------------------------------------------------.
 130 | Do a getc, but give error message if EOF encountered |
 131 `-----------------------------------------------------*/
 132
 133 static int
 134 xgetc (FILE *f)
 135 {
 136   int c = getc (f);
 137   if (c == EOF)
 138     fatal (_("unexpected end of file"));
 139   return c;
 140 }
 141
 142
 143 /*------------------------------------------------------------------.
 144 | Read one literal character from finput.  Process \ escapes.       |
 145 | Append the normalized string version of the char to OUT.  Assign  |
 146 | the character code to *PCODE. Return 1 unless the character is an |
 147 | unescaped `term' or \n report error for \n.                       |
 148 `------------------------------------------------------------------*/
 149
 150 /* FIXME: We could directly work in the obstack, but that would make
 151    it more difficult to move to quotearg some day.  So for the time
 152    being, I prefer have literalchar behave like quotearg, and change
 153    my mind later if I was wrong.  */
 154
 155 static int
 156 literalchar (struct obstack *out, int *pcode, char term)
 157 {
 158   int c;
 159   char buf[4096];
 160   char *cp;
 161   int code;
 162   int wasquote = 0;
 163
 164   c = xgetc (finput);
 165   if (c == '\n')
 166     {
 167       complain (_("unescaped newline in constant"));
 168       ungetc (c, finput);
 169       code = '?';
 170       wasquote = 1;
 171     }
 172   else if (c != '\\')
 173     {
 174       code = c;
 175       if (c == term)
 176         wasquote = 1;
 177     }
 178   else
 179     {
 180       c = xgetc (finput);
 181       if (c == 't')
 182         code = '\t';
 183       else if (c == 'n')
 184         code = '\n';
 185       else if (c == 'a')
 186         code = '\007';
 187       else if (c == 'r')
 188         code = '\r';
 189       else if (c == 'f')
 190         code = '\f';
 191       else if (c == 'b')
 192         code = '\b';
 193       else if (c == 'v')
 194         code = '\013';
 195       else if (c == '\\')
 196         code = '\\';
 197       else if (c == '\'')
 198         code = '\'';
 199       else if (c == '\"')
 200         code = '\"';
 201       else if (c <= '7' && c >= '0')
 202         {
 203           code = 0;
 204           while (c <= '7' && c >= '0')
 205             {
 206               code = (code * 8) + (c - '0');
 207               if (code >= 256 || code < 0)
 208                 {
 209                   complain (_("octal value outside range 0...255: `\\%o'"),
 210                             code);
 211                   code &= 0xFF;
 212                   break;
 213                 }
 214               c = xgetc (finput);
 215             }
 216           ungetc (c, finput);
 217         }
 218       else if (c == 'x')
 219         {
 220           c = xgetc (finput);
 221           code = 0;
 222           while (1)
 223             {
 224               if (c >= '0' && c <= '9')
 225                 code *= 16, code += c - '0';
 226               else if (c >= 'a' && c <= 'f')
 227                 code *= 16, code += c - 'a' + 10;
 228               else if (c >= 'A' && c <= 'F')
 229                 code *= 16, code += c - 'A' + 10;
 230               else
 231                 break;
 232               if (code >= 256 || code < 0)
 233                 {
 234                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 235                   code &= 0xFF;
 236                   break;
 237                 }
 238               c = xgetc (finput);
 239             }
 240           ungetc (c, finput);
 241         }
 242       else
 243         {
 244           char badchar [] = "c";
 245           badchar[0] = c;
 246           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 247                     quote (badchar));
 248           code = '?';
 249         }
 250     }                           /* has \ */
 251
 252   /* now fill BUF with the canonical name for this character as a
 253      literal token.  Do not use what the user typed, so that `\012'
 254      and `\n' can be interchangeable.  */
 255
 256   cp = buf;
 257   if (code == term && wasquote)
 258     *cp++ = code;
 259   else if (code == '\\')
 260     {
 261       *cp++ = '\\';
 262       *cp++ = '\\';
 263     }
 264   else if (code == '\'')
 265     {
 266       *cp++ = '\\';
 267       *cp++ = '\'';
 268     }
 269   else if (code == '\"')
 270     {
 271       *cp++ = '\\';
 272       *cp++ = '\"';
 273     }
 274   else if (code >= 040 && code < 0177)
 275     *cp++ = code;
 276   else if (code == '\t')
 277     {
 278       *cp++ = '\\';
 279       *cp++ = 't';
 280     }
 281   else if (code == '\n')
 282     {
 283       *cp++ = '\\';
 284       *cp++ = 'n';
 285     }
 286   else if (code == '\r')
 287     {
 288       *cp++ = '\\';
 289       *cp++ = 'r';
 290     }
 291   else if (code == '\v')
 292     {
 293       *cp++ = '\\';
 294       *cp++ = 'v';
 295     }
 296   else if (code == '\b')
 297     {
 298       *cp++ = '\\';
 299       *cp++ = 'b';
 300     }
 301   else if (code == '\f')
 302     {
 303       *cp++ = '\\';
 304       *cp++ = 'f';
 305     }
 306   else
 307     {
 308       *cp++ = '\\';
 309       *cp++ = code / 0100 + '0';
 310       *cp++ = ((code / 010) & 07) + '0';
 311       *cp++ = (code & 07) + '0';
 312     }
 313   *cp = '\0';
 314
 315   if (out)
 316     obstack_sgrow (out, buf);
 317   *pcode = code;
 318   return !wasquote;
 319 }
 320
 321
 322 void
 323 unlex (token_t token)
 324 {
 325   unlexed = token;
 326   unlexed_token_buffer = token_buffer;
 327   unlexed_symval = symval;
 328 }
 329
 330 /*-----------------------------------------------------------------.
 331 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 332 | specified between the `<...>'.                                   |
 333 `-----------------------------------------------------------------*/
 334
 335 void
 336 read_type_name (FILE *fin)
 337 {
 338   int c = getc (fin);
 339
 340   while (c != '>')
 341     {
 342       if (c == EOF)
 343         fatal (_("unterminated type name at end of file"));
 344       if (c == '\n')
 345         {
 346           complain (_("unterminated type name"));
 347           ungetc (c, fin);
 348           break;
 349         }
 350
 351       obstack_1grow (&token_obstack, c);
 352       c = getc (fin);
 353     }
 354   obstack_1grow (&token_obstack, '\0');
 355   token_buffer = obstack_finish (&token_obstack);
 356 }
 357
 358
 359 token_t
 360 lex (void)
 361 {
 362   int c;
 363
 364   /* Just to make sure. */
 365   token_buffer = NULL;
 366
 367   if (unlexed != tok_undef)
 368     {
 369       token_t res = unlexed;
 370       symval = unlexed_symval;
 371       token_buffer = unlexed_token_buffer;
 372       unlexed = tok_undef;
 373       return res;
 374     }
 375
 376   c = skip_white_space ();
 377
 378   switch (c)
 379     {
 380     case EOF:
 381       token_buffer = "EOF";
 382       return tok_eof;
 383
 384     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 385     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 386     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 387     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 388     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 389     case 'Z':
 390     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 391     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 392     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 393     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 394     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 395     case 'z':
 396     case '.':    case '_':
 397
 398       while (isalnum (c) || c == '_' || c == '.')
 399         {
 400           obstack_1grow (&token_obstack, c);
 401           c = getc (finput);
 402         }
 403       obstack_1grow (&token_obstack, '\0');
 404       token_buffer = obstack_finish (&token_obstack);
 405       ungetc (c, finput);
 406       symval = getsym (token_buffer);
 407       return tok_identifier;
 408
 409     case '0':    case '1':    case '2':    case '3':    case '4':
 410     case '5':    case '6':    case '7':    case '8':    case '9':
 411       {
 412         numval = 0;
 413
 414         while (isdigit (c))
 415           {
 416             obstack_1grow (&token_obstack, c);
 417             numval = numval * 10 + c - '0';
 418             c = getc (finput);
 419           }
 420         obstack_1grow (&token_obstack, '\0');
 421         token_buffer = obstack_finish (&token_obstack);
 422         ungetc (c, finput);
 423         return tok_number;
 424       }
 425
 426     case '\'':
 427       /* parse the literal token and compute character code in  code  */
 428
 429       {
 430         int code, discode;
 431
 432         obstack_1grow (&token_obstack, '\'');
 433         literalchar (&token_obstack, &code, '\'');
 434
 435         c = getc (finput);
 436         if (c != '\'')
 437           {
 438             complain (_("use \"...\" for multi-character literal tokens"));
 439             while (1)
 440               if (!literalchar (0, &discode, '\''))
 441                 break;
 442           }
 443         obstack_1grow (&token_obstack, '\'');
 444         obstack_1grow (&token_obstack, '\0');
 445         token_buffer = obstack_finish (&token_obstack);
 446         symval = getsym (token_buffer);
 447         symval->class = token_sym;
 448         if (!symval->user_token_number)
 449           symval->user_token_number = code;
 450         return tok_identifier;
 451       }
 452
 453     case '\"':
 454       /* parse the literal string token and treat as an identifier */
 455
 456       {
 457         int code;               /* ignored here */
 458
 459         obstack_1grow (&token_obstack, '\"');
 460         /* Read up to and including ".  */
 461         while (literalchar (&token_obstack, &code, '\"'))
 462           /* nothing */;
 463         obstack_1grow (&token_obstack, '\0');
 464         token_buffer = obstack_finish (&token_obstack);
 465
 466         symval = getsym (token_buffer);
 467         symval->class = token_sym;
 468
 469         return tok_identifier;
 470       }
 471
 472     case ',':
 473       token_buffer = ",";
 474       return tok_comma;
 475
 476     case ':':
 477       token_buffer = ":";
 478       return tok_colon;
 479
 480     case ';':
 481       token_buffer = ";";
 482       return tok_semicolon;
 483
 484     case '|':
 485       token_buffer = "|";
 486       return tok_bar;
 487
 488     case '{':
 489       token_buffer = "{";
 490       return tok_left_curly;
 491
 492     case '=':
 493       obstack_1grow (&token_obstack, c);
 494       do
 495         {
 496           c = getc (finput);
 497           obstack_1grow (&token_obstack, c);
 498           if (c == '\n')
 499             lineno++;
 500         }
 501       while (c == ' ' || c == '\n' || c == '\t');
 502       obstack_1grow (&token_obstack, '\0');
 503       token_buffer = obstack_finish (&token_obstack);
 504
 505       if (c == '{')
 506         {
 507           return tok_left_curly;
 508         }
 509       else
 510         {
 511           ungetc (c, finput);
 512           return tok_illegal;
 513         }
 514
 515     case '<':
 516       read_type_name (finput);
 517       return tok_typename;
 518
 519     case '%':
 520       return parse_percent_token ();
 521
 522     default:
 523       obstack_1grow (&token_obstack, c);
 524       obstack_1grow (&token_obstack, '\0');
 525       token_buffer = obstack_finish (&token_obstack);
 526       return tok_illegal;
 527     }
 528 }
 529
/* The following table dictates the action taken for the various %
   directives.  Each entry describes one directive:

   NAME      the directive's name, without the leading `%' and with
             `_' where the input may use either `_' or `-';
   SET_FLAG  if not NULL, the address of the variable to set when the
             directive is seen (parse_percent_token stores 1 through
             it as an int);
   RETVAL    the token code associated with the directive, returned
             by parse_percent_token when SET_FLAG is NULL.  */
struct percent_table_struct
{
  const char *name;
  void *set_flag;
  token_t retval;
};
 539
/* The % directives known to parse_percent_token, which scans this
   table linearly and stops at the first matching name.  The trailing
   comments name the equivalent command line option.  The table is
   terminated by an entry with a NULL name, whose RETVAL (tok_illegal)
   is what an unknown directive yields.  */
struct percent_table_struct percent_table[] =
{
  { "token",            NULL,                   tok_token },
  { "term",             NULL,                   tok_token },
  { "nterm",            NULL,                   tok_nterm },
  { "type",             NULL,                   tok_type },
  { "guard",            NULL,                   tok_guard },
  { "union",            NULL,                   tok_union },
  { "expect",           NULL,                   tok_expect },
  { "thong",            NULL,                   tok_thong },
  { "start",            NULL,                   tok_start },
  { "left",             NULL,                   tok_left },
  { "right",            NULL,                   tok_right },
  { "nonassoc",         NULL,                   tok_nonassoc },
  { "binary",           NULL,                   tok_nonassoc },
  { "prec",             NULL,                   tok_prec },
  /* NOTE(review): `locations' is annotated -l, exactly like
     `no_lines' below; presumably only one of them really maps to -l.
     Confirm against the option table in getargs.  */
  { "locations",        &locations_flag,        tok_noop },     /* -l */
  { "no_lines",         &no_lines_flag,         tok_noop },     /* -l */
  { "raw",              NULL,                   tok_obsolete }, /* -r */
  { "token_table",      &token_table_flag,      tok_noop },     /* -k */
  { "yacc",             &yacc_flag,             tok_noop },     /* -y */
  { "fixed_output_files",&yacc_flag,            tok_noop },     /* -y */
  { "defines",          &defines_flag,          tok_noop },     /* -d */
  { "no_parser",        &no_parser_flag,        tok_noop },     /* -n */
  { "graph",            &graph_flag,            tok_noop },     /* -g */
#if 0
  /* For the time being, this is not enabled yet, while it's possible
     though, since we use obstacks.  The only risk is with semantic
     parsers which will output an `include' of an output file: be sure
     that the name included is indeed the name of the output file.  */
  { "output_file",      &spec_outfile,          tok_setopt },   /* -o */
  { "file_prefix",      &spec_file_prefix,      tok_setopt },   /* -b */
  { "name_prefix",      &spec_name_prefix,      tok_setopt },   /* -p */
#endif
  { "verbose",          &verbose_flag,          tok_noop },     /* -v */
  { "debug",            &debug_flag,            tok_noop },     /* -t */
  { "semantic_parser",  &semantic_parser,       tok_noop },
  { "pure_parser",      &pure_parser,           tok_noop },

  /* End marker: also the entry "found" for an unknown directive.  */
  { NULL, NULL, tok_illegal}
};
 581
 582 /* Parse a token which starts with %.
 583    Assumes the % has already been read and discarded.  */
 584
 585 token_t
 586 parse_percent_token (void)
 587 {
 588   struct percent_table_struct *tx;
 589
 590   int c = getc (finput);
 591
 592   switch (c)
 593     {
 594     case '%':
 595       return tok_two_percents;
 596
 597     case '{':
 598       return tok_percent_left_curly;
 599
 600     case '<':
 601       return tok_left;
 602
 603     case '>':
 604       return tok_right;
 605
 606     case '2':
 607       return tok_nonassoc;
 608
 609     case '0':
 610       return tok_token;
 611
 612     case '=':
 613       return tok_prec;
 614     }
 615
 616   if (!isalpha (c))
 617     return tok_illegal;
 618
 619   obstack_1grow (&token_obstack, '%');
 620   while (isalpha (c) || c == '_' || c == '-')
 621     {
 622       if (c == '-')
 623         c = '_';
 624       obstack_1grow (&token_obstack, c);
 625       c = getc (finput);
 626     }
 627
 628   ungetc (c, finput);
 629   obstack_1grow (&token_obstack, '\0');
 630   token_buffer = obstack_finish (&token_obstack);
 631
 632   /* table lookup % directive */
 633   for (tx = percent_table; tx->name; tx++)
 634     if (strcmp (token_buffer + 1, tx->name) == 0)
 635       break;
 636
 637   if (tx->set_flag)
 638     {
 639       *((int *) (tx->set_flag)) = 1;
 640       return tok_noop;
 641     }
 642
 643   switch (tx->retval)
 644     {
 645     case tok_setopt:
 646       *((char **) (tx->set_flag)) = optarg;
 647       return tok_noop;
 648       break;
 649
 650     case tok_obsolete:
 651       fatal (_("`%s' is no longer supported"), token_buffer);
 652       break;
 653
 654     default:
 655       /* Other cases do not apply here. */
 656       break;
 657     }
 658
 659   return tx->retval;
 660 }