src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "lex.h"
  27 #include "xalloc.h"
  28 #include "complain.h"
  29 #include "gram.h"
  30 #include "quote.h"
  31
  32 /* Buffer for storing the current token.  */
  33 struct obstack token_obstack;
  34 const char *token_buffer = NULL;
  35
  36 bucket *symval;
  37 int numval;
  38
  39 /* these two describe a token to be reread */
  40 static token_t unlexed = tok_undef;
  41 /* by the next call to lex */
  42 static bucket *unlexed_symval = NULL;
  43
  44
  45 void
  46 init_lex (void)
  47 {
  48   obstack_init (&token_obstack);
  49   unlexed = tok_undef;
  50 }
  51
  52
  53 int
  54 skip_white_space (void)
  55 {
  56   int c;
  57   int inside;
  58
  59   c = getc (finput);
  60
  61   for (;;)
  62     {
  63       int cplus_comment;
  64
  65       switch (c)
  66         {
  67         case '/':
  68           /* FIXME: Should probably be merged with copy_comment.  */
  69           c = getc (finput);
  70           if (c != '*' && c != '/')
  71             {
  72               complain (_("unexpected `/' found and ignored"));
  73               break;
  74             }
  75           cplus_comment = (c == '/');
  76
  77           c = getc (finput);
  78
  79           inside = 1;
  80           while (inside)
  81             {
  82               if (!cplus_comment && c == '*')
  83                 {
  84                   while (c == '*')
  85                     c = getc (finput);
  86
  87                   if (c == '/')
  88                     {
  89                       inside = 0;
  90                       c = getc (finput);
  91                     }
  92                 }
  93               else if (c == '\n')
  94                 {
  95                   lineno++;
  96                   if (cplus_comment)
  97                     inside = 0;
  98                   c = getc (finput);
  99                 }
 100               else if (c == EOF)
 101                 fatal (_("unterminated comment"));
 102               else
 103                 c = getc (finput);
 104             }
 105
 106           break;
 107
 108         case '\n':
 109           lineno++;
 110
 111         case ' ':
 112         case '\t':
 113         case '\f':
 114           c = getc (finput);
 115           break;
 116
 117         default:
 118           return c;
 119         }
 120     }
 121 }
 122
 123
 /* Read one character from F like getc, but treat end of file as a
    fatal error ("unexpected end of file").  */

 static int
 xgetc (FILE *f)
 {
   int ch = getc (f);

   if (ch != EOF)
     return ch;

   fatal (_("unexpected end of file"));
   return ch;
 }
 136
 137
 138 /*------------------------------------------------------------------.
 139 | Read one literal character from finput.  Process \ escapes.       |
 140 | Append the normalized string version of the char to OUT.  Assign  |
 141 | the character code to *PCODE. Return 1 unless the character is an |
 142 | unescaped `term' or \n report error for \n.                       |
 143 `------------------------------------------------------------------*/
 144
 145 /* FIXME: We could directly work in the obstack, but that would make
 146    it more difficult to move to quotearg some day.  So for the time
 147    being, I prefer have literalchar behave like quotearg, and change
 148    my mind later if I was wrong.  */
 149
 150 static int
 151 literalchar (struct obstack *out, int *pcode, char term)
 152 {
 153   int c;
 154   char buf[4096];
 155   char *cp;
 156   int code;
 157   int wasquote = 0;
 158
 159   c = xgetc (finput);
 160   if (c == '\n')
 161     {
 162       complain (_("unescaped newline in constant"));
 163       ungetc (c, finput);
 164       code = '?';
 165       wasquote = 1;
 166     }
 167   else if (c != '\\')
 168     {
 169       code = c;
 170       if (c == term)
 171         wasquote = 1;
 172     }
 173   else
 174     {
 175       c = xgetc (finput);
 176       if (c == 't')
 177         code = '\t';
 178       else if (c == 'n')
 179         code = '\n';
 180       else if (c == 'a')
 181         code = '\007';
 182       else if (c == 'r')
 183         code = '\r';
 184       else if (c == 'f')
 185         code = '\f';
 186       else if (c == 'b')
 187         code = '\b';
 188       else if (c == 'v')
 189         code = '\013';
 190       else if (c == '\\')
 191         code = '\\';
 192       else if (c == '\'')
 193         code = '\'';
 194       else if (c == '\"')
 195         code = '\"';
 196       else if (c <= '7' && c >= '0')
 197         {
 198           code = 0;
 199           while (c <= '7' && c >= '0')
 200             {
 201               code = (code * 8) + (c - '0');
 202               if (code >= 256 || code < 0)
 203                 {
 204                   complain (_("octal value outside range 0...255: `\\%o'"),
 205                             code);
 206                   code &= 0xFF;
 207                   break;
 208                 }
 209               c = xgetc (finput);
 210             }
 211           ungetc (c, finput);
 212         }
 213       else if (c == 'x')
 214         {
 215           c = xgetc (finput);
 216           code = 0;
 217           while (1)
 218             {
 219               if (c >= '0' && c <= '9')
 220                 code *= 16, code += c - '0';
 221               else if (c >= 'a' && c <= 'f')
 222                 code *= 16, code += c - 'a' + 10;
 223               else if (c >= 'A' && c <= 'F')
 224                 code *= 16, code += c - 'A' + 10;
 225               else
 226                 break;
 227               if (code >= 256 || code < 0)
 228                 {
 229                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 230                   code &= 0xFF;
 231                   break;
 232                 }
 233               c = xgetc (finput);
 234             }
 235           ungetc (c, finput);
 236         }
 237       else
 238         {
 239           char badchar [] = "c";
 240           badchar[0] = c;
 241           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 242                     quote (badchar));
 243           code = '?';
 244         }
 245     }                           /* has \ */
 246
 247   /* now fill BUF with the canonical name for this character as a
 248      literal token.  Do not use what the user typed, so that `\012'
 249      and `\n' can be interchangeable.  */
 250
 251   cp = buf;
 252   if (code == term && wasquote)
 253     *cp++ = code;
 254   else if (code == '\\')
 255     {
 256       *cp++ = '\\';
 257       *cp++ = '\\';
 258     }
 259   else if (code == '\'')
 260     {
 261       *cp++ = '\\';
 262       *cp++ = '\'';
 263     }
 264   else if (code == '\"')
 265     {
 266       *cp++ = '\\';
 267       *cp++ = '\"';
 268     }
 269   else if (code >= 040 && code < 0177)
 270     *cp++ = code;
 271   else if (code == '\t')
 272     {
 273       *cp++ = '\\';
 274       *cp++ = 't';
 275     }
 276   else if (code == '\n')
 277     {
 278       *cp++ = '\\';
 279       *cp++ = 'n';
 280     }
 281   else if (code == '\r')
 282     {
 283       *cp++ = '\\';
 284       *cp++ = 'r';
 285     }
 286   else if (code == '\v')
 287     {
 288       *cp++ = '\\';
 289       *cp++ = 'v';
 290     }
 291   else if (code == '\b')
 292     {
 293       *cp++ = '\\';
 294       *cp++ = 'b';
 295     }
 296   else if (code == '\f')
 297     {
 298       *cp++ = '\\';
 299       *cp++ = 'f';
 300     }
 301   else
 302     {
 303       *cp++ = '\\';
 304       *cp++ = code / 0100 + '0';
 305       *cp++ = ((code / 010) & 07) + '0';
 306       *cp++ = (code & 07) + '0';
 307     }
 308   *cp = '\0';
 309
 310   if (out)
 311     obstack_sgrow (out, buf);
 312   *pcode = code;
 313   return !wasquote;
 314 }
 315
 316
 317 void
 318 unlex (int token)
 319 {
 320   unlexed = token;
 321   unlexed_symval = symval;
 322 }
 323
 324 /*-----------------------------------------------------------------.
 325 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 326 | specified between the `<...>'.                                   |
 327 `-----------------------------------------------------------------*/
 328
 329 void
 330 read_type_name (FILE *fin)
 331 {
 332   int c = getc (fin);
 333
 334   while (c != '>')
 335     {
 336       if (c == EOF)
 337         fatal (_("unterminated type name at end of file"));
 338       if (c == '\n')
 339         {
 340           complain (_("unterminated type name"));
 341           ungetc (c, fin);
 342           break;
 343         }
 344
 345       obstack_1grow (&token_obstack, c);
 346       c = getc (fin);
 347     }
 348   obstack_1grow (&token_obstack, '\0');
 349   token_buffer = obstack_finish (&token_obstack);
 350 }
 351
 352
 353 token_t
 354 lex (void)
 355 {
 356   int c;
 357
 358   /* Just to make sure. */
 359   token_buffer = NULL;
 360
 361   if (unlexed != tok_undef)
 362     {
 363       token_t res = unlexed;
 364       symval = unlexed_symval;
 365       unlexed = tok_undef;
 366       return res;
 367     }
 368
 369   c = skip_white_space ();
 370
 371   switch (c)
 372     {
 373     case EOF:
 374       token_buffer = "EOF";
 375       return tok_eof;
 376
 377     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 378     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 379     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 380     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 381     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 382     case 'Z':
 383     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 384     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 385     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 386     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 387     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 388     case 'z':
 389     case '.':    case '_':
 390
 391       while (isalnum (c) || c == '_' || c == '.')
 392         {
 393           obstack_1grow (&token_obstack, c);
 394           c = getc (finput);
 395         }
 396       obstack_1grow (&token_obstack, '\0');
 397       token_buffer = obstack_finish (&token_obstack);
 398       ungetc (c, finput);
 399       symval = getsym (token_buffer);
 400       return tok_identifier;
 401
 402     case '0':    case '1':    case '2':    case '3':    case '4':
 403     case '5':    case '6':    case '7':    case '8':    case '9':
 404       {
 405         numval = 0;
 406
 407         while (isdigit (c))
 408           {
 409             obstack_1grow (&token_obstack, c);
 410             numval = numval * 10 + c - '0';
 411             c = getc (finput);
 412           }
 413         obstack_1grow (&token_obstack, '\0');
 414         token_buffer = obstack_finish (&token_obstack);
 415         ungetc (c, finput);
 416         return tok_number;
 417       }
 418
 419     case '\'':
 420       /* parse the literal token and compute character code in  code  */
 421
 422       translations = -1;
 423       {
 424         int code, discode;
 425
 426         obstack_1grow (&token_obstack, '\'');
 427         literalchar (&token_obstack, &code, '\'');
 428
 429         c = getc (finput);
 430         if (c != '\'')
 431           {
 432             complain (_("use \"...\" for multi-character literal tokens"));
 433             while (1)
 434               if (!literalchar (0, &discode, '\''))
 435                 break;
 436           }
 437         obstack_1grow (&token_obstack, '\'');
 438         obstack_1grow (&token_obstack, '\0');
 439         token_buffer = obstack_finish (&token_obstack);
 440         symval = getsym (token_buffer);
 441         symval->class = token_sym;
 442         if (!symval->user_token_number)
 443           symval->user_token_number = code;
 444         return tok_identifier;
 445       }
 446
 447     case '\"':
 448       /* parse the literal string token and treat as an identifier */
 449
 450       translations = -1;
 451       {
 452         int code;               /* ignored here */
 453
 454         obstack_1grow (&token_obstack, '\"');
 455         /* Read up to and including ".  */
 456         while (literalchar (&token_obstack, &code, '\"'))
 457           /* nothing */;
 458         obstack_1grow (&token_obstack, '\0');
 459         token_buffer = obstack_finish (&token_obstack);
 460
 461         symval = getsym (token_buffer);
 462         symval->class = token_sym;
 463
 464         return tok_identifier;
 465       }
 466
 467     case ',':
 468       return tok_comma;
 469
 470     case ':':
 471       return tok_colon;
 472
 473     case ';':
 474       return tok_semicolon;
 475
 476     case '|':
 477       return tok_bar;
 478
 479     case '{':
 480       return tok_left_curly;
 481
 482     case '=':
 483       do
 484         {
 485           c = getc (finput);
 486           if (c == '\n')
 487             lineno++;
 488         }
 489       while (c == ' ' || c == '\n' || c == '\t');
 490
 491       if (c == '{')
 492         {
 493           token_buffer = "={";
 494           return tok_left_curly;
 495         }
 496       else
 497         {
 498           ungetc (c, finput);
 499           return tok_illegal;
 500         }
 501
 502     case '<':
 503       read_type_name (finput);
 504       return tok_typename;
 505
 506     case '%':
 507       return parse_percent_token ();
 508
 509     default:
 510       return tok_illegal;
 511     }
 512 }
 513
 514 /* the following table dictates the action taken for the various %
 515    directives.  A set_flag value causes the named flag to be set.  A
 516    retval action returns the code.  */
 517 struct percent_table_struct
 518 {
 519   const char *name;
 520   void *set_flag;
 521   token_t retval;
 522 };
 523
 524 struct percent_table_struct percent_table[] =
 525 {
 526   { "token",            NULL,                   tok_token },
 527   { "term",             NULL,                   tok_token },
 528   { "nterm",            NULL,                   tok_nterm },
 529   { "type",             NULL,                   tok_type },
 530   { "guard",            NULL,                   tok_guard },
 531   { "union",            NULL,                   tok_union },
 532   { "expect",           NULL,                   tok_expect },
 533   { "thong",            NULL,                   tok_thong },
 534   { "start",            NULL,                   tok_start },
 535   { "left",             NULL,                   tok_left },
 536   { "right",            NULL,                   tok_right },
 537   { "nonassoc",         NULL,                   tok_nonassoc },
 538   { "binary",           NULL,                   tok_nonassoc },
 539   { "prec",             NULL,                   tok_prec },
 540   { "locations",        &locations_flag,        tok_noop },     /* -l */
 541   { "no_lines",         &no_lines_flag,         tok_noop },     /* -l */
 542   { "raw",              NULL,                   tok_obsolete }, /* -r */
 543   { "token_table",      &token_table_flag,      tok_noop },     /* -k */
 544   { "yacc",             &yacc_flag,             tok_noop },     /* -y */
 545   { "fixed_output_files",&yacc_flag,            tok_noop },     /* -y */
 546   { "defines",          &defines_flag,          tok_noop },     /* -d */
 547   { "no_parser",        &no_parser_flag,        tok_noop },     /* -n */
 548   { "graph",            &graph_flag,            tok_noop },     /* -g */
 549 #if 0
 550   /* For the time being, this is not enabled yet, while it's possible
 551      though, since we use obstacks.  The only risk is with semantic
 552      parsers which will output an `include' of an output file: be sure
 553      that the name included is indeed the name of the output file.  */
 554   { "output_file",      &spec_outfile,          tok_setopt },   /* -o */
 555   { "file_prefix",      &spec_file_prefix,      tok_setopt },   /* -b */
 556   { "name_prefix",      &spec_name_prefix,      tok_setopt },   /* -p */
 557 #endif
 558   { "header_extension", NULL,                   tok_hdrext },
 559   { "source_extension", NULL,                   tok_srcext },
 560   { "verbose",          &verbose_flag,          tok_noop },     /* -v */
 561   { "debug",            &debug_flag,            tok_noop },     /* -t */
 562   { "semantic_parser",  &semantic_parser,       tok_noop },
 563   { "pure_parser",      &pure_parser,           tok_noop },
 564
 565   { NULL, NULL, tok_illegal}
 566 };
 567
 568 /* Parse a token which starts with %.
 569    Assumes the % has already been read and discarded.  */
 570
 571 token_t
 572 parse_percent_token (void)
 573 {
 574   struct percent_table_struct *tx;
 575
 576   int c = getc (finput);
 577
 578   switch (c)
 579     {
 580     case '%':
 581       return tok_two_percents;
 582
 583     case '{':
 584       return tok_percent_left_curly;
 585
 586     case '<':
 587       return tok_left;
 588
 589     case '>':
 590       return tok_right;
 591
 592     case '2':
 593       return tok_nonassoc;
 594
 595     case '0':
 596       return tok_token;
 597
 598     case '=':
 599       return tok_prec;
 600     }
 601
 602   if (!isalpha (c))
 603     return tok_illegal;
 604
 605   obstack_1grow (&token_obstack, '%');
 606   while (isalpha (c) || c == '_' || c == '-')
 607     {
 608       if (c == '-')
 609         c = '_';
 610       obstack_1grow (&token_obstack, c);
 611       c = getc (finput);
 612     }
 613
 614   ungetc (c, finput);
 615   obstack_1grow (&token_obstack, '\0');
 616   token_buffer = obstack_finish (&token_obstack);
 617
 618   /* table lookup % directive */
 619   for (tx = percent_table; tx->name; tx++)
 620     if (strcmp (token_buffer + 1, tx->name) == 0)
 621       break;
 622
 623   if (tx->set_flag)
 624     {
 625       *((int *) (tx->set_flag)) = 1;
 626       return tok_noop;
 627     }
 628
 629   switch (tx->retval)
 630     {
 631     case tok_setopt:
 632       *((char **) (tx->set_flag)) = optarg;
 633       return tok_noop;
 634       break;
 635
 636     case tok_obsolete:
 637       fatal (_("`%s' is no longer supported"), token_buffer);
 638       break;
 639
 640     default:
 641       /* Other cases do not apply here. */
 642       break;
 643     }
 644
 645   return tx->retval;
 646 }