src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
#include "system.h"
#include <limits.h>             /* for INT_MAX */
#include "getargs.h"
#include "files.h"
#include "getopt.h"             /* for optarg */
#include "symtab.h"
#include "lex.h"
#include "xalloc.h"
#include "complain.h"
#include "gram.h"
#include "quote.h"
  31
/* Buffer for storing the current token.  Allocated by init_lex and
   enlarged on demand by grow_token_buffer.  */
char *token_buffer;

/* Allocated size of token_buffer, not including space for terminator.  */
int maxtoken;

/* Values attached to the token most recently returned by lex: SYMVAL is
   the symbol-table entry set for IDENTIFIER tokens, NUMVAL the value set
   for NUMBER tokens.  */
bucket *symval;
int numval;

/* These two describe a token to be reread by the next call to lex.
   UNLEXED is negative when no token is pending.  */
static int unlexed;
static bucket *unlexed_symval;
  43
  44
  45 void
  46 init_lex (void)
  47 {
  48   maxtoken = 100;
  49   token_buffer = XCALLOC (char, maxtoken + 1);
  50   unlexed = -1;
  51 }
  52
  53
  54 char *
  55 grow_token_buffer (char *p)
  56 {
  57   int offset = p - token_buffer;
  58   maxtoken *= 2;
  59   token_buffer = XREALLOC (token_buffer, char, maxtoken + 1);
  60   return token_buffer + offset;
  61 }
  62
  63
  64 int
  65 skip_white_space (void)
  66 {
  67   int c;
  68   int inside;
  69
  70   c = getc (finput);
  71
  72   for (;;)
  73     {
  74       int cplus_comment;
  75
  76       switch (c)
  77         {
  78         case '/':
  79           /* FIXME: Should probably be merged with copy_comment.  */
  80           c = getc (finput);
  81           if (c != '*' && c != '/')
  82             {
  83               complain (_("unexpected `/' found and ignored"));
  84               break;
  85             }
  86           cplus_comment = (c == '/');
  87
  88           c = getc (finput);
  89
  90           inside = 1;
  91           while (inside)
  92             {
  93               if (!cplus_comment && c == '*')
  94                 {
  95                   while (c == '*')
  96                     c = getc (finput);
  97
  98                   if (c == '/')
  99                     {
 100                       inside = 0;
 101                       c = getc (finput);
 102                     }
 103                 }
 104               else if (c == '\n')
 105                 {
 106                   lineno++;
 107                   if (cplus_comment)
 108                     inside = 0;
 109                   c = getc (finput);
 110                 }
 111               else if (c == EOF)
 112                 fatal (_("unterminated comment"));
 113               else
 114                 c = getc (finput);
 115             }
 116
 117           break;
 118
 119         case '\n':
 120           lineno++;
 121
 122         case ' ':
 123         case '\t':
 124         case '\f':
 125           c = getc (finput);
 126           break;
 127
 128         default:
 129           return c;
 130         }
 131     }
 132 }
 133
 134
 135 /*-----------------------------------------------------.
 136 | Do a getc, but give error message if EOF encountered |
 137 `-----------------------------------------------------*/
 138
 139 static int
 140 xgetc (FILE *f)
 141 {
 142   int c = getc (f);
 143   if (c == EOF)
 144     fatal (_("unexpected end of file"));
 145   return c;
 146 }
 147
 148
 149 /*------------------------------------------------------------------.
 150 | Read one literal character from finput.  Process \ escapes.       |
 151 | Append the normalized string version of the char to *PP.  Assign  |
 152 | the character code to *PCODE. Return 1 unless the character is an |
 153 | unescaped `term' or \n report error for \n                        |
 154 `------------------------------------------------------------------*/
 155
 156 static int
 157 literalchar (char **pp, int *pcode, char term)
 158 {
 159   int c;
 160   char *p;
 161   int code;
 162   int wasquote = 0;
 163
 164   c = xgetc (finput);
 165   if (c == '\n')
 166     {
 167       complain (_("unescaped newline in constant"));
 168       ungetc (c, finput);
 169       code = '?';
 170       wasquote = 1;
 171     }
 172   else if (c != '\\')
 173     {
 174       code = c;
 175       if (c == term)
 176         wasquote = 1;
 177     }
 178   else
 179     {
 180       c = xgetc (finput);
 181       if (c == 't')
 182         code = '\t';
 183       else if (c == 'n')
 184         code = '\n';
 185       else if (c == 'a')
 186         code = '\007';
 187       else if (c == 'r')
 188         code = '\r';
 189       else if (c == 'f')
 190         code = '\f';
 191       else if (c == 'b')
 192         code = '\b';
 193       else if (c == 'v')
 194         code = '\013';
 195       else if (c == '\\')
 196         code = '\\';
 197       else if (c == '\'')
 198         code = '\'';
 199       else if (c == '\"')
 200         code = '\"';
 201       else if (c <= '7' && c >= '0')
 202         {
 203           code = 0;
 204           while (c <= '7' && c >= '0')
 205             {
 206               code = (code * 8) + (c - '0');
 207               if (code >= 256 || code < 0)
 208                 {
 209                   complain (_("octal value outside range 0...255: `\\%o'"),
 210                             code);
 211                   code &= 0xFF;
 212                   break;
 213                 }
 214               c = xgetc (finput);
 215             }
 216           ungetc (c, finput);
 217         }
 218       else if (c == 'x')
 219         {
 220           c = xgetc (finput);
 221           code = 0;
 222           while (1)
 223             {
 224               if (c >= '0' && c <= '9')
 225                 code *= 16, code += c - '0';
 226               else if (c >= 'a' && c <= 'f')
 227                 code *= 16, code += c - 'a' + 10;
 228               else if (c >= 'A' && c <= 'F')
 229                 code *= 16, code += c - 'A' + 10;
 230               else
 231                 break;
 232               if (code >= 256 || code < 0)
 233                 {
 234                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 235                   code &= 0xFF;
 236                   break;
 237                 }
 238               c = xgetc (finput);
 239             }
 240           ungetc (c, finput);
 241         }
 242       else
 243         {
 244           char buf [] = "c";
 245           buf[0] = c;
 246           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 247                     quote (buf));
 248           code = '?';
 249         }
 250     }                           /* has \ */
 251
 252   /* now fill token_buffer with the canonical name for this character
 253      as a literal token.  Do not use what the user typed,
 254      so that `\012' and `\n' can be interchangeable.  */
 255
 256   p = *pp;
 257   if (code == term && wasquote)
 258     *p++ = code;
 259   else if (code == '\\')
 260     {
 261       *p++ = '\\';
 262       *p++ = '\\';
 263     }
 264   else if (code == '\'')
 265     {
 266       *p++ = '\\';
 267       *p++ = '\'';
 268     }
 269   else if (code == '\"')
 270     {
 271       *p++ = '\\';
 272       *p++ = '\"';
 273     }
 274   else if (code >= 040 && code < 0177)
 275     *p++ = code;
 276   else if (code == '\t')
 277     {
 278       *p++ = '\\';
 279       *p++ = 't';
 280     }
 281   else if (code == '\n')
 282     {
 283       *p++ = '\\';
 284       *p++ = 'n';
 285     }
 286   else if (code == '\r')
 287     {
 288       *p++ = '\\';
 289       *p++ = 'r';
 290     }
 291   else if (code == '\v')
 292     {
 293       *p++ = '\\';
 294       *p++ = 'v';
 295     }
 296   else if (code == '\b')
 297     {
 298       *p++ = '\\';
 299       *p++ = 'b';
 300     }
 301   else if (code == '\f')
 302     {
 303       *p++ = '\\';
 304       *p++ = 'f';
 305     }
 306   else
 307     {
 308       *p++ = '\\';
 309       *p++ = code / 0100 + '0';
 310       *p++ = ((code / 010) & 07) + '0';
 311       *p++ = (code & 07) + '0';
 312     }
 313   *pp = p;
 314   *pcode = code;
 315   return !wasquote;
 316 }
 317
 318
 319 void
 320 unlex (int token)
 321 {
 322   unlexed = token;
 323   unlexed_symval = symval;
 324 }
 325
 326 /*-----------------------------------------------------------------.
 327 | We just read `<' from FIN.  Store in TOKEN_BUFFER, the type name |
 328 | specified between the `<...>'.                                   |
 329 `-----------------------------------------------------------------*/
 330
 331 void
 332 read_type_name (FILE *fin)
 333 {
 334   char *p = token_buffer;
 335   int c = getc (fin);
 336
 337   while (c != '>')
 338     {
 339       if (c == EOF)
 340         fatal (_("unterminated type name at end of file"));
 341       if (c == '\n')
 342         {
 343           complain (_("unterminated type name"));
 344           ungetc (c, fin);
 345           break;
 346         }
 347
 348       if (p == token_buffer + maxtoken)
 349         p = grow_token_buffer (p);
 350
 351       *p++ = c;
 352       c = getc (fin);
 353     }
 354   *p = 0;
 355 }
 356
 357
 358 int
 359 lex (void)
 360 {
 361   int c;
 362   char *p;
 363
 364   if (unlexed >= 0)
 365     {
 366       symval = unlexed_symval;
 367       c = unlexed;
 368       unlexed = -1;
 369       return c;
 370     }
 371
 372   c = skip_white_space ();
 373   /* for error messages (token buffer always valid) */
 374   *token_buffer = c;
 375   token_buffer[1] = 0;
 376
 377   switch (c)
 378     {
 379     case EOF:
 380       strcpy (token_buffer, "EOF");
 381       return ENDFILE;
 382
 383     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 384     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 385     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 386     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 387     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 388     case 'Z':
 389     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 390     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 391     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 392     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 393     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 394     case 'z':
 395     case '.':    case '_':
 396
 397       p = token_buffer;
 398       while (isalnum (c) || c == '_' || c == '.')
 399         {
 400           if (p == token_buffer + maxtoken)
 401             p = grow_token_buffer (p);
 402
 403           *p++ = c;
 404           c = getc (finput);
 405         }
 406
 407       *p = 0;
 408       ungetc (c, finput);
 409       symval = getsym (token_buffer);
 410       return IDENTIFIER;
 411
 412     case '0':    case '1':    case '2':    case '3':    case '4':
 413     case '5':    case '6':    case '7':    case '8':    case '9':
 414       {
 415         numval = 0;
 416
 417         p = token_buffer;
 418         while (isdigit (c))
 419           {
 420             if (p == token_buffer + maxtoken)
 421               p = grow_token_buffer (p);
 422
 423             *p++ = c;
 424             numval = numval * 10 + c - '0';
 425             c = getc (finput);
 426           }
 427         *p = 0;
 428         ungetc (c, finput);
 429         return NUMBER;
 430       }
 431
 432     case '\'':
 433       /* parse the literal token and compute character code in  code  */
 434
 435       translations = -1;
 436       {
 437         int code, discode;
 438         char discard[10], *dp;
 439
 440         p = token_buffer;
 441         *p++ = '\'';
 442         literalchar (&p, &code, '\'');
 443
 444         c = getc (finput);
 445         if (c != '\'')
 446           {
 447             complain (_("use \"...\" for multi-character literal tokens"));
 448             while (1)
 449               {
 450                 dp = discard;
 451                 if (!literalchar (&dp, &discode, '\''))
 452                   break;
 453               }
 454           }
 455         *p++ = '\'';
 456         *p = 0;
 457         symval = getsym (token_buffer);
 458         symval->class = token_sym;
 459         if (!symval->user_token_number)
 460           symval->user_token_number = code;
 461         return IDENTIFIER;
 462       }
 463
 464     case '\"':
 465       /* parse the literal string token and treat as an identifier */
 466
 467       translations = -1;
 468       {
 469         int code;               /* ignored here */
 470         p = token_buffer;
 471         *p++ = '\"';
 472         /* Read up to and including ".  */
 473         while (literalchar (&p, &code, '\"'))
 474           {
 475             if (p >= token_buffer + maxtoken - 4)
 476               p = grow_token_buffer (p);
 477           }
 478         *p = 0;
 479
 480         symval = getsym (token_buffer);
 481         symval->class = token_sym;
 482
 483         return IDENTIFIER;
 484       }
 485
 486     case ',':
 487       return COMMA;
 488
 489     case ':':
 490       return COLON;
 491
 492     case ';':
 493       return SEMICOLON;
 494
 495     case '|':
 496       return BAR;
 497
 498     case '{':
 499       return LEFT_CURLY;
 500
 501     case '=':
 502       do
 503         {
 504           c = getc (finput);
 505           if (c == '\n')
 506             lineno++;
 507         }
 508       while (c == ' ' || c == '\n' || c == '\t');
 509
 510       if (c == '{')
 511         {
 512           strcpy (token_buffer, "={");
 513           return LEFT_CURLY;
 514         }
 515       else
 516         {
 517           ungetc (c, finput);
 518           return ILLEGAL;
 519         }
 520
 521     case '<':
 522       read_type_name (finput);
 523       return TYPENAME;
 524
 525     case '%':
 526       return parse_percent_token ();
 527
 528     default:
 529       return ILLEGAL;
 530     }
 531 }
 532
 533 /* the following table dictates the action taken for the various %
 534    directives.  A set_flag value causes the named flag to be set.  A
 535    retval action returns the code.  */
 536 struct percent_table_struct
 537 {
 538   const char *name;
 539   void *set_flag;
 540   int retval;
 541 }
 542 percent_table[] =
 543 {
 544   { "token", NULL, TOKEN },
 545   { "term", NULL, TOKEN },
 546   { "nterm", NULL, NTERM },
 547   { "type", NULL, TYPE },
 548   { "guard", NULL, GUARD },
 549   { "union", NULL, UNION },
 550   { "expect", NULL, EXPECT },
 551   { "thong", NULL, THONG },
 552   { "start", NULL, START },
 553   { "left", NULL, LEFT },
 554   { "right", NULL, RIGHT },
 555   { "nonassoc", NULL, NONASSOC },
 556   { "binary", NULL, NONASSOC },
 557   { "semantic_parser", NULL, SEMANTIC_PARSER },
 558   { "pure_parser", NULL, PURE_PARSER },
 559   { "prec", NULL, PREC },
 560   { "locations", &locations_flag, NOOP},        /* -l */
 561   { "no_lines", &no_lines_flag, NOOP},  /* -l */
 562   { "raw", &raw_flag, NOOP },   /* -r */
 563   { "token_table", &token_table_flag, NOOP},    /* -k */
 564 #if 0
 565     /* These can be utilized after main is reoganized so
 566        open_files() is deferred 'til after read_declarations().
 567        But %{ and %union both put information into files
 568        that have to be opened before read_declarations().
 569      */
 570   { "yacc", &yacc_flag, NOOP},                          /* -y */
 571   { "fixed_output_files", &yacc_flag, NOOP},            /* -y */
 572   { "defines", &defines_flag, NOOP},                    /* -d */
 573   { "no_parser", &no_parser_flag, NOOP},                /* -n */
 574   { "output_file", &spec_outfile, SETOPT},              /* -o */
 575   { "file_prefix", &spec_file_prefix, SETOPT},          /* -b */
 576   { "name_prefix", &spec_name_prefix, SETOPT},          /* -p */
 577     /* These would be acceptable, but they do not affect processing */
 578   { "verbose", &verbose_flag, NOOP},                    /* -v */
 579   { "debug", &debug_flag, NOOP},                        /* -t */
 580 /*    {"help", <print usage stmt>, NOOP}, *//* -h */
 581 /*    {"version", <print version number> ,  NOOP}, *//* -V */
 582 #endif
 583   { NULL, NULL, ILLEGAL}
 584 };
 585
 586 /* Parse a token which starts with %.
 587    Assumes the % has already been read and discarded.  */
 588
 589 int
 590 parse_percent_token (void)
 591 {
 592   int c;
 593   char *p;
 594   struct percent_table_struct *tx;
 595
 596   p = token_buffer;
 597   c = getc (finput);
 598   *p++ = '%';
 599   *p++ = c;                     /* for error msg */
 600   *p = 0;
 601
 602   switch (c)
 603     {
 604     case '%':
 605       return TWO_PERCENTS;
 606
 607     case '{':
 608       return PERCENT_LEFT_CURLY;
 609
 610     case '<':
 611       return LEFT;
 612
 613     case '>':
 614       return RIGHT;
 615
 616     case '2':
 617       return NONASSOC;
 618
 619     case '0':
 620       return TOKEN;
 621
 622     case '=':
 623       return PREC;
 624     }
 625   if (!isalpha (c))
 626     return ILLEGAL;
 627
 628   p = token_buffer;
 629   *p++ = '%';
 630   while (isalpha (c) || c == '_' || c == '-')
 631     {
 632       if (p == token_buffer + maxtoken)
 633         p = grow_token_buffer (p);
 634
 635       if (c == '-')
 636         c = '_';
 637       *p++ = c;
 638       c = getc (finput);
 639     }
 640
 641   ungetc (c, finput);
 642
 643   *p = 0;
 644
 645   /* table lookup % directive */
 646   for (tx = percent_table; tx->name; tx++)
 647     if (strcmp (token_buffer + 1, tx->name) == 0)
 648       break;
 649   if (tx->retval == SETOPT)
 650     {
 651       *((char **) (tx->set_flag)) = optarg;
 652       return NOOP;
 653     }
 654   if (tx->set_flag)
 655     {
 656       *((int *) (tx->set_flag)) = 1;
 657       return NOOP;
 658     }
 659   return tx->retval;
 660 }