src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "lex.h"
  27 #include "xalloc.h"
  28 #include "complain.h"
  29 #include "gram.h"
  30 #include "quote.h"
  31
  32 /* Buffer for storing the current token.  */
  33 struct obstack token_obstack;
  34 const char *token_buffer = NULL;
  35
  36 bucket *symval;
  37 int numval;
  38
  39 static int unlexed;             /* these two describe a token to be reread */
  40 static bucket *unlexed_symval;  /* by the next call to lex */
  41
  42
  43 void
  44 init_lex (void)
  45 {
  46   obstack_init (&token_obstack);
  47   unlexed = -1;
  48 }
  49
  50
  51 int
  52 skip_white_space (void)
  53 {
  54   int c;
  55   int inside;
  56
  57   c = getc (finput);
  58
  59   for (;;)
  60     {
  61       int cplus_comment;
  62
  63       switch (c)
  64         {
  65         case '/':
  66           /* FIXME: Should probably be merged with copy_comment.  */
  67           c = getc (finput);
  68           if (c != '*' && c != '/')
  69             {
  70               complain (_("unexpected `/' found and ignored"));
  71               break;
  72             }
  73           cplus_comment = (c == '/');
  74
  75           c = getc (finput);
  76
  77           inside = 1;
  78           while (inside)
  79             {
  80               if (!cplus_comment && c == '*')
  81                 {
  82                   while (c == '*')
  83                     c = getc (finput);
  84
  85                   if (c == '/')
  86                     {
  87                       inside = 0;
  88                       c = getc (finput);
  89                     }
  90                 }
  91               else if (c == '\n')
  92                 {
  93                   lineno++;
  94                   if (cplus_comment)
  95                     inside = 0;
  96                   c = getc (finput);
  97                 }
  98               else if (c == EOF)
  99                 fatal (_("unterminated comment"));
 100               else
 101                 c = getc (finput);
 102             }
 103
 104           break;
 105
 106         case '\n':
 107           lineno++;
 108
 109         case ' ':
 110         case '\t':
 111         case '\f':
 112           c = getc (finput);
 113           break;
 114
 115         default:
 116           return c;
 117         }
 118     }
 119 }
 120
 121
 122 /*-----------------------------------------------------.
 123 | Do a getc, but give error message if EOF encountered |
 124 `-----------------------------------------------------*/
 125
 126 static int
 127 xgetc (FILE *f)
 128 {
 129   int c = getc (f);
 130   if (c == EOF)
 131     fatal (_("unexpected end of file"));
 132   return c;
 133 }
 134
 135
 136 /*------------------------------------------------------------------.
 137 | Read one literal character from finput.  Process \ escapes.       |
 138 | Append the normalized string version of the char to OUT.  Assign  |
 139 | the character code to *PCODE. Return 1 unless the character is an |
 140 | unescaped `term' or \n report error for \n.                       |
 141 `------------------------------------------------------------------*/
 142
 143 /* FIXME: We could directly work in the obstack, but that would make
 144    it more difficult to move to quotearg some day.  So for the time
 145    being, I prefer have literalchar behave like quotearg, and change
 146    my mind later if I was wrong.  */
 147
 148 static int
 149 literalchar (struct obstack *out, int *pcode, char term)
 150 {
 151   int c;
 152   char buf[4096];
 153   char *cp;
 154   int code;
 155   int wasquote = 0;
 156
 157   c = xgetc (finput);
 158   if (c == '\n')
 159     {
 160       complain (_("unescaped newline in constant"));
 161       ungetc (c, finput);
 162       code = '?';
 163       wasquote = 1;
 164     }
 165   else if (c != '\\')
 166     {
 167       code = c;
 168       if (c == term)
 169         wasquote = 1;
 170     }
 171   else
 172     {
 173       c = xgetc (finput);
 174       if (c == 't')
 175         code = '\t';
 176       else if (c == 'n')
 177         code = '\n';
 178       else if (c == 'a')
 179         code = '\007';
 180       else if (c == 'r')
 181         code = '\r';
 182       else if (c == 'f')
 183         code = '\f';
 184       else if (c == 'b')
 185         code = '\b';
 186       else if (c == 'v')
 187         code = '\013';
 188       else if (c == '\\')
 189         code = '\\';
 190       else if (c == '\'')
 191         code = '\'';
 192       else if (c == '\"')
 193         code = '\"';
 194       else if (c <= '7' && c >= '0')
 195         {
 196           code = 0;
 197           while (c <= '7' && c >= '0')
 198             {
 199               code = (code * 8) + (c - '0');
 200               if (code >= 256 || code < 0)
 201                 {
 202                   complain (_("octal value outside range 0...255: `\\%o'"),
 203                             code);
 204                   code &= 0xFF;
 205                   break;
 206                 }
 207               c = xgetc (finput);
 208             }
 209           ungetc (c, finput);
 210         }
 211       else if (c == 'x')
 212         {
 213           c = xgetc (finput);
 214           code = 0;
 215           while (1)
 216             {
 217               if (c >= '0' && c <= '9')
 218                 code *= 16, code += c - '0';
 219               else if (c >= 'a' && c <= 'f')
 220                 code *= 16, code += c - 'a' + 10;
 221               else if (c >= 'A' && c <= 'F')
 222                 code *= 16, code += c - 'A' + 10;
 223               else
 224                 break;
 225               if (code >= 256 || code < 0)
 226                 {
 227                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 228                   code &= 0xFF;
 229                   break;
 230                 }
 231               c = xgetc (finput);
 232             }
 233           ungetc (c, finput);
 234         }
 235       else
 236         {
 237           char badchar [] = "c";
 238           badchar[0] = c;
 239           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 240                     quote (badchar));
 241           code = '?';
 242         }
 243     }                           /* has \ */
 244
 245   /* now fill BUF with the canonical name for this character as a
 246      literal token.  Do not use what the user typed, so that `\012'
 247      and `\n' can be interchangeable.  */
 248
 249   cp = buf;
 250   if (code == term && wasquote)
 251     *cp++ = code;
 252   else if (code == '\\')
 253     {
 254       *cp++ = '\\';
 255       *cp++ = '\\';
 256     }
 257   else if (code == '\'')
 258     {
 259       *cp++ = '\\';
 260       *cp++ = '\'';
 261     }
 262   else if (code == '\"')
 263     {
 264       *cp++ = '\\';
 265       *cp++ = '\"';
 266     }
 267   else if (code >= 040 && code < 0177)
 268     *cp++ = code;
 269   else if (code == '\t')
 270     {
 271       *cp++ = '\\';
 272       *cp++ = 't';
 273     }
 274   else if (code == '\n')
 275     {
 276       *cp++ = '\\';
 277       *cp++ = 'n';
 278     }
 279   else if (code == '\r')
 280     {
 281       *cp++ = '\\';
 282       *cp++ = 'r';
 283     }
 284   else if (code == '\v')
 285     {
 286       *cp++ = '\\';
 287       *cp++ = 'v';
 288     }
 289   else if (code == '\b')
 290     {
 291       *cp++ = '\\';
 292       *cp++ = 'b';
 293     }
 294   else if (code == '\f')
 295     {
 296       *cp++ = '\\';
 297       *cp++ = 'f';
 298     }
 299   else
 300     {
 301       *cp++ = '\\';
 302       *cp++ = code / 0100 + '0';
 303       *cp++ = ((code / 010) & 07) + '0';
 304       *cp++ = (code & 07) + '0';
 305     }
 306   *cp = '\0';
 307
 308   if (out)
 309     obstack_sgrow (out, buf);
 310   *pcode = code;
 311   return !wasquote;
 312 }
 313
 314
 315 void
 316 unlex (int token)
 317 {
 318   unlexed = token;
 319   unlexed_symval = symval;
 320 }
 321
 322 /*-----------------------------------------------------------------.
 323 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 324 | specified between the `<...>'.                                   |
 325 `-----------------------------------------------------------------*/
 326
 327 void
 328 read_type_name (FILE *fin)
 329 {
 330   int c = getc (fin);
 331
 332   while (c != '>')
 333     {
 334       if (c == EOF)
 335         fatal (_("unterminated type name at end of file"));
 336       if (c == '\n')
 337         {
 338           complain (_("unterminated type name"));
 339           ungetc (c, fin);
 340           break;
 341         }
 342
 343       obstack_1grow (&token_obstack, c);
 344       c = getc (fin);
 345     }
 346   obstack_1grow (&token_obstack, '\0');
 347   token_buffer = obstack_finish (&token_obstack);
 348 }
 349
 350
 351 token_t
 352 lex (void)
 353 {
 354   int c;
 355
 356   /* Just to make sure. */
 357   token_buffer = NULL;
 358
 359   if (unlexed >= 0)
 360     {
 361       symval = unlexed_symval;
 362       c = unlexed;
 363       unlexed = -1;
 364       return c;
 365     }
 366
 367   c = skip_white_space ();
 368
 369   switch (c)
 370     {
 371     case EOF:
 372       token_buffer = "EOF";
 373       return tok_eof;
 374
 375     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 376     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 377     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 378     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 379     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 380     case 'Z':
 381     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 382     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 383     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 384     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 385     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 386     case 'z':
 387     case '.':    case '_':
 388
 389       while (isalnum (c) || c == '_' || c == '.')
 390         {
 391           obstack_1grow (&token_obstack, c);
 392           c = getc (finput);
 393         }
 394       obstack_1grow (&token_obstack, '\0');
 395       token_buffer = obstack_finish (&token_obstack);
 396       ungetc (c, finput);
 397       symval = getsym (token_buffer);
 398       return tok_identifier;
 399
 400     case '0':    case '1':    case '2':    case '3':    case '4':
 401     case '5':    case '6':    case '7':    case '8':    case '9':
 402       {
 403         numval = 0;
 404
 405         while (isdigit (c))
 406           {
 407             obstack_1grow (&token_obstack, c);
 408             numval = numval * 10 + c - '0';
 409             c = getc (finput);
 410           }
 411         obstack_1grow (&token_obstack, '\0');
 412         token_buffer = obstack_finish (&token_obstack);
 413         ungetc (c, finput);
 414         return tok_number;
 415       }
 416
 417     case '\'':
 418       /* parse the literal token and compute character code in  code  */
 419
 420       translations = -1;
 421       {
 422         int code, discode;
 423
 424         obstack_1grow (&token_obstack, '\'');
 425         literalchar (&token_obstack, &code, '\'');
 426
 427         c = getc (finput);
 428         if (c != '\'')
 429           {
 430             complain (_("use \"...\" for multi-character literal tokens"));
 431             while (1)
 432               if (!literalchar (0, &discode, '\''))
 433                 break;
 434           }
 435         obstack_1grow (&token_obstack, '\'');
 436         obstack_1grow (&token_obstack, '\0');
 437         token_buffer = obstack_finish (&token_obstack);
 438         symval = getsym (token_buffer);
 439         symval->class = token_sym;
 440         if (!symval->user_token_number)
 441           symval->user_token_number = code;
 442         return tok_identifier;
 443       }
 444
 445     case '\"':
 446       /* parse the literal string token and treat as an identifier */
 447
 448       translations = -1;
 449       {
 450         int code;               /* ignored here */
 451
 452         obstack_1grow (&token_obstack, '\"');
 453         /* Read up to and including ".  */
 454         while (literalchar (&token_obstack, &code, '\"'))
 455           /* nothing */;
 456         obstack_1grow (&token_obstack, '\0');
 457         token_buffer = obstack_finish (&token_obstack);
 458
 459         symval = getsym (token_buffer);
 460         symval->class = token_sym;
 461
 462         return tok_identifier;
 463       }
 464
 465     case ',':
 466       return tok_comma;
 467
 468     case ':':
 469       return tok_colon;
 470
 471     case ';':
 472       return tok_semicolon;
 473
 474     case '|':
 475       return tok_bar;
 476
 477     case '{':
 478       return tok_left_curly;
 479
 480     case '=':
 481       do
 482         {
 483           c = getc (finput);
 484           if (c == '\n')
 485             lineno++;
 486         }
 487       while (c == ' ' || c == '\n' || c == '\t');
 488
 489       if (c == '{')
 490         {
 491           token_buffer = "={";
 492           return tok_left_curly;
 493         }
 494       else
 495         {
 496           ungetc (c, finput);
 497           return tok_illegal;
 498         }
 499
 500     case '<':
 501       read_type_name (finput);
 502       return tok_typename;
 503
 504     case '%':
 505       return parse_percent_token ();
 506
 507     default:
 508       return tok_illegal;
 509     }
 510 }
 511
 512 /* the following table dictates the action taken for the various %
 513    directives.  A set_flag value causes the named flag to be set.  A
 514    retval action returns the code.  */
 515 struct percent_table_struct
 516 {
 517   const char *name;
 518   void *set_flag;
 519   int retval;
 520 };
 521
 522 struct percent_table_struct percent_table[] =
 523 {
 524   { "token",            NULL,                   tok_token },
 525   { "term",             NULL,                   tok_token },
 526   { "nterm",            NULL,                   tok_nterm },
 527   { "type",             NULL,                   tok_type },
 528   { "guard",            NULL,                   tok_guard },
 529   { "union",            NULL,                   tok_union },
 530   { "expect",           NULL,                   tok_expect },
 531   { "thong",            NULL,                   tok_thong },
 532   { "start",            NULL,                   tok_start },
 533   { "left",             NULL,                   tok_left },
 534   { "right",            NULL,                   tok_right },
 535   { "nonassoc",         NULL,                   tok_nonassoc },
 536   { "binary",           NULL,                   tok_nonassoc },
 537   { "prec",             NULL,                   tok_prec },
 538   { "locations",        &locations_flag,        tok_noop },     /* -l */
 539   { "no_lines",         &no_lines_flag,         tok_noop },     /* -l */
 540   { "raw",              NULL,                   tok_obsolete }, /* -r */
 541   { "token_table",      &token_table_flag,      tok_noop },     /* -k */
 542   { "yacc",             &yacc_flag,             tok_noop },     /* -y */
 543   { "fixed_output_files",&yacc_flag,            tok_noop },     /* -y */
 544   { "defines",          &defines_flag,          tok_noop },     /* -d */
 545   { "no_parser",        &no_parser_flag,        tok_noop },     /* -n */
 546 #if 0
 547   /* For the time being, this is not enabled yet, while it's possible
 548      though, since we use obstacks.  The only risk is with semantic
 549      parsers which will output an `include' of an output file: be sure
 550      that the naem included is indeed the name of the output file.  */
 551   { "output_file",      &spec_outfile,          tok_setopt },   /* -o */
 552   { "file_prefix",      &spec_file_prefix,      tok_setopt },   /* -b */
 553   { "name_prefix",      &spec_name_prefix,      tok_setopt },   /* -p */
 554 #endif
 555   { "header_extension", NULL,                   tok_hdrext},
 556   { "source_extension", NULL,                   tok_srcext},
 557   { "verbose",          &verbose_flag,          tok_noop },     /* -v */
 558   { "debug",            &debug_flag,            tok_noop },     /* -t */
 559   { "semantic_parser",  &semantic_parser,       tok_noop },
 560   { "pure_parser",      &pure_parser,           tok_noop },
 561
 562   { NULL, NULL, tok_illegal}
 563 };
 564
 565 /* Parse a token which starts with %.
 566    Assumes the % has already been read and discarded.  */
 567
 568 int
 569 parse_percent_token (void)
 570 {
 571   int c;
 572   struct percent_table_struct *tx;
 573
 574   c = getc (finput);
 575
 576   switch (c)
 577     {
 578     case '%':
 579       return tok_two_percents;
 580
 581     case '{':
 582       return tok_percent_left_curly;
 583
 584     case '<':
 585       return tok_left;
 586
 587     case '>':
 588       return tok_right;
 589
 590     case '2':
 591       return tok_nonassoc;
 592
 593     case '0':
 594       return tok_token;
 595
 596     case '=':
 597       return tok_prec;
 598     }
 599
 600   if (!isalpha (c))
 601     return tok_illegal;
 602
 603   obstack_1grow (&token_obstack, '%');
 604   while (isalpha (c) || c == '_' || c == '-')
 605     {
 606       if (c == '-')
 607         c = '_';
 608       obstack_1grow (&token_obstack, c);
 609       c = getc (finput);
 610     }
 611
 612   ungetc (c, finput);
 613   obstack_1grow (&token_obstack, '\0');
 614   token_buffer = obstack_finish (&token_obstack);
 615
 616   /* table lookup % directive */
 617   for (tx = percent_table; tx->name; tx++)
 618     if (strcmp (token_buffer + 1, tx->name) == 0)
 619       break;
 620
 621   if (tx->set_flag)
 622     {
 623       *((int *) (tx->set_flag)) = 1;
 624       return tok_noop;
 625     }
 626
 627   switch (tx->retval)
 628     {
 629     case tok_setopt:
 630       *((char **) (tx->set_flag)) = optarg;
 631       return tok_noop;
 632       break;
 633
 634     case tok_obsolete:
 635       fatal (_("`%s' is no longer supported"), token_buffer);
 636       break;
 637     }
 638
 639   return tx->retval;
 640 }