src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "lex.h"
  27 #include "xalloc.h"
  28 #include "complain.h"
  29 #include "gram.h"
  30 #include "quote.h"
  31
  32 /* Buffer for storing the text of the current token (NUL-terminated).  */
  33 char *token_buffer;
  34
  35 /* Allocated size of token_buffer, not including space for terminator.  */
  36 int maxtoken;
  37
  38 bucket *symval;                 /* symbol entry set by lex for identifier and literal tokens */
  39 int numval;                     /* value set by lex when it returns NUMBER */
  40
  41 static int unlexed;             /* these two describe a token to be reread */
  42 static bucket *unlexed_symval;  /* by the next call to lex; unlexed < 0 means none pending */
  43
  44
  45 void
  46 init_lex (void)
  47 {
  48   maxtoken = 100;
  49   token_buffer = XCALLOC (char, maxtoken + 1);
  50   unlexed = -1;
  51 }
  52
  53
  54 char *
  55 grow_token_buffer (char *p)
  56 {
  57   int offset = p - token_buffer;
  58   maxtoken *= 2;
  59   token_buffer = XREALLOC (token_buffer, char, maxtoken + 1);
  60   return token_buffer + offset;
  61 }
  62
  63
  64 int
  65 skip_white_space (void)
  66 {
  67   int c;
  68   int inside;
  69
  70   c = getc (finput);
  71
  72   for (;;)
  73     {
  74       int cplus_comment;
  75
  76       switch (c)
  77         {
  78         case '/':
  79           /* FIXME: Should probably be merged with copy_comment.  */
  80           c = getc (finput);
  81           if (c != '*' && c != '/')
  82             {
  83               complain (_("unexpected `/' found and ignored"));
  84               break;
  85             }
  86           cplus_comment = (c == '/');
  87
  88           c = getc (finput);
  89
  90           inside = 1;
  91           while (inside)
  92             {
  93               if (!cplus_comment && c == '*')
  94                 {
  95                   while (c == '*')
  96                     c = getc (finput);
  97
  98                   if (c == '/')
  99                     {
 100                       inside = 0;
 101                       c = getc (finput);
 102                     }
 103                 }
 104               else if (c == '\n')
 105                 {
 106                   lineno++;
 107                   if (cplus_comment)
 108                     inside = 0;
 109                   c = getc (finput);
 110                 }
 111               else if (c == EOF)
 112                 fatal (_("unterminated comment"));
 113               else
 114                 c = getc (finput);
 115             }
 116
 117           break;
 118
 119         case '\n':
 120           lineno++;
 121
 122         case ' ':
 123         case '\t':
 124         case '\f':
 125           c = getc (finput);
 126           break;
 127
 128         default:
 129           return c;
 130         }
 131     }
 132 }
 133
 134
 /* Read one character from F.  Hitting end of file is reported as a
    fatal error; otherwise return the character read.  */

 static int
 xgetc (FILE *f)
 {
   int ch = getc (f);

   if (ch != EOF)
     return ch;

   fatal (_("unexpected end of file"));
   return ch;
 }
 147
 148
 149 /*------------------------------------------------------------------.
 150 | Read one literal character from finput.  Process \ escapes.       |
 151 | Append the normalized string version of the char to *PP.  Assign  |
 152 | the character code to *PCODE. Return 1 unless the character is an |
 153 | unescaped `term' or \n report error for \n                        |
 154 `------------------------------------------------------------------*/
 155
 156 static int
 157 literalchar (char **pp, int *pcode, char term)
 158 {
 159   int c;
 160   char *p;
 161   int code;
 162   int wasquote = 0;
 163
 164   c = xgetc (finput);
 165   if (c == '\n')
 166     {
 167       complain (_("unescaped newline in constant"));
 168       ungetc (c, finput);
 169       code = '?';
 170       wasquote = 1;
 171     }
 172   else if (c != '\\')
 173     {
 174       code = c;
 175       if (c == term)
 176         wasquote = 1;
 177     }
 178   else
 179     {
 180       c = xgetc (finput);
 181       if (c == 't')
 182         code = '\t';
 183       else if (c == 'n')
 184         code = '\n';
 185       else if (c == 'a')
 186         code = '\007';
 187       else if (c == 'r')
 188         code = '\r';
 189       else if (c == 'f')
 190         code = '\f';
 191       else if (c == 'b')
 192         code = '\b';
 193       else if (c == 'v')
 194         code = '\013';
 195       else if (c == '\\')
 196         code = '\\';
 197       else if (c == '\'')
 198         code = '\'';
 199       else if (c == '\"')
 200         code = '\"';
 201       else if (c <= '7' && c >= '0')
 202         {
 203           code = 0;
 204           while (c <= '7' && c >= '0')
 205             {
 206               code = (code * 8) + (c - '0');
 207               if (code >= 256 || code < 0)
 208                 {
 209                   complain (_("octal value outside range 0...255: `\\%o'"),
 210                             code);
 211                   code &= 0xFF;
 212                   break;
 213                 }
 214               c = xgetc (finput);
 215             }
 216           ungetc (c, finput);
 217         }
 218       else if (c == 'x')
 219         {
 220           c = xgetc (finput);
 221           code = 0;
 222           while (1)
 223             {
 224               if (c >= '0' && c <= '9')
 225                 code *= 16, code += c - '0';
 226               else if (c >= 'a' && c <= 'f')
 227                 code *= 16, code += c - 'a' + 10;
 228               else if (c >= 'A' && c <= 'F')
 229                 code *= 16, code += c - 'A' + 10;
 230               else
 231                 break;
 232               if (code >= 256 || code < 0)
 233                 {
 234                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 235                   code &= 0xFF;
 236                   break;
 237                 }
 238               c = xgetc (finput);
 239             }
 240           ungetc (c, finput);
 241         }
 242       else
 243         {
 244           char buf [] = "c";
 245           buf[0] = c;
 246           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 247                     quote (buf));
 248           code = '?';
 249         }
 250     }                           /* has \ */
 251
 252   /* now fill token_buffer with the canonical name for this character
 253      as a literal token.  Do not use what the user typed,
 254      so that `\012' and `\n' can be interchangeable.  */
 255
 256   p = *pp;
 257   if (code == term && wasquote)
 258     *p++ = code;
 259   else if (code == '\\')
 260     {
 261       *p++ = '\\';
 262       *p++ = '\\';
 263     }
 264   else if (code == '\'')
 265     {
 266       *p++ = '\\';
 267       *p++ = '\'';
 268     }
 269   else if (code == '\"')
 270     {
 271       *p++ = '\\';
 272       *p++ = '\"';
 273     }
 274   else if (code >= 040 && code < 0177)
 275     *p++ = code;
 276   else if (code == '\t')
 277     {
 278       *p++ = '\\';
 279       *p++ = 't';
 280     }
 281   else if (code == '\n')
 282     {
 283       *p++ = '\\';
 284       *p++ = 'n';
 285     }
 286   else if (code == '\r')
 287     {
 288       *p++ = '\\';
 289       *p++ = 'r';
 290     }
 291   else if (code == '\v')
 292     {
 293       *p++ = '\\';
 294       *p++ = 'v';
 295     }
 296   else if (code == '\b')
 297     {
 298       *p++ = '\\';
 299       *p++ = 'b';
 300     }
 301   else if (code == '\f')
 302     {
 303       *p++ = '\\';
 304       *p++ = 'f';
 305     }
 306   else
 307     {
 308       *p++ = '\\';
 309       *p++ = code / 0100 + '0';
 310       *p++ = ((code / 010) & 07) + '0';
 311       *p++ = (code & 07) + '0';
 312     }
 313   *pp = p;
 314   *pcode = code;
 315   return !wasquote;
 316 }
 317
 318
 319 void
 320 unlex (int token)
 321 {
 322   unlexed = token;
 323   unlexed_symval = symval;
 324 }
 325
 326
 327 int
 328 lex (void)
 329 {
 330   int c;
 331   char *p;
 332
 333   if (unlexed >= 0)
 334     {
 335       symval = unlexed_symval;
 336       c = unlexed;
 337       unlexed = -1;
 338       return c;
 339     }
 340
 341   c = skip_white_space ();
 342   /* for error messages (token buffer always valid) */
 343   *token_buffer = c;
 344   token_buffer[1] = 0;
 345
 346   switch (c)
 347     {
 348     case EOF:
 349       strcpy (token_buffer, "EOF");
 350       return ENDFILE;
 351
 352     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 353     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 354     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 355     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 356     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 357     case 'Z':
 358     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 359     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 360     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 361     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 362     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 363     case 'z':
 364     case '.':    case '_':
 365
 366       p = token_buffer;
 367       while (isalnum (c) || c == '_' || c == '.')
 368         {
 369           if (p == token_buffer + maxtoken)
 370             p = grow_token_buffer (p);
 371
 372           *p++ = c;
 373           c = getc (finput);
 374         }
 375
 376       *p = 0;
 377       ungetc (c, finput);
 378       symval = getsym (token_buffer);
 379       return IDENTIFIER;
 380
 381     case '0':    case '1':    case '2':    case '3':    case '4':
 382     case '5':    case '6':    case '7':    case '8':    case '9':
 383       {
 384         numval = 0;
 385
 386         p = token_buffer;
 387         while (isdigit (c))
 388           {
 389             if (p == token_buffer + maxtoken)
 390               p = grow_token_buffer (p);
 391
 392             *p++ = c;
 393             numval = numval * 10 + c - '0';
 394             c = getc (finput);
 395           }
 396         *p = 0;
 397         ungetc (c, finput);
 398         return NUMBER;
 399       }
 400
 401     case '\'':
 402       /* parse the literal token and compute character code in  code  */
 403
 404       translations = -1;
 405       {
 406         int code, discode;
 407         char discard[10], *dp;
 408
 409         p = token_buffer;
 410         *p++ = '\'';
 411         literalchar (&p, &code, '\'');
 412
 413         c = getc (finput);
 414         if (c != '\'')
 415           {
 416             complain (_("use \"...\" for multi-character literal tokens"));
 417             while (1)
 418               {
 419                 dp = discard;
 420                 if (!literalchar (&dp, &discode, '\''))
 421                   break;
 422               }
 423           }
 424         *p++ = '\'';
 425         *p = 0;
 426         symval = getsym (token_buffer);
 427         symval->class = token_sym;
 428         if (!symval->user_token_number)
 429           symval->user_token_number = code;
 430         return IDENTIFIER;
 431       }
 432
 433     case '\"':
 434       /* parse the literal string token and treat as an identifier */
 435
 436       translations = -1;
 437       {
 438         int code;               /* ignored here */
 439         p = token_buffer;
 440         *p++ = '\"';
 441         /* Read up to and including ".  */
 442         while (literalchar (&p, &code, '\"'))
 443           {
 444             if (p >= token_buffer + maxtoken - 4)
 445               p = grow_token_buffer (p);
 446           }
 447         *p = 0;
 448
 449         symval = getsym (token_buffer);
 450         symval->class = token_sym;
 451
 452         return IDENTIFIER;
 453       }
 454
 455     case ',':
 456       return COMMA;
 457
 458     case ':':
 459       return COLON;
 460
 461     case ';':
 462       return SEMICOLON;
 463
 464     case '|':
 465       return BAR;
 466
 467     case '{':
 468       return LEFT_CURLY;
 469
 470     case '=':
 471       do
 472         {
 473           c = getc (finput);
 474           if (c == '\n')
 475             lineno++;
 476         }
 477       while (c == ' ' || c == '\n' || c == '\t');
 478
 479       if (c == '{')
 480         {
 481           strcpy (token_buffer, "={");
 482           return LEFT_CURLY;
 483         }
 484       else
 485         {
 486           ungetc (c, finput);
 487           return ILLEGAL;
 488         }
 489
 490     case '<':
 491       p = token_buffer;
 492       c = getc (finput);
 493       while (c != '>')
 494         {
 495           if (c == EOF)
 496             fatal (_("unterminated type name at end of file"));
 497           if (c == '\n')
 498             {
 499               complain (_("unterminated type name"));
 500               ungetc (c, finput);
 501               break;
 502             }
 503
 504           if (p == token_buffer + maxtoken)
 505             p = grow_token_buffer (p);
 506
 507           *p++ = c;
 508           c = getc (finput);
 509         }
 510       *p = 0;
 511       return TYPENAME;
 512
 513
 514     case '%':
 515       return parse_percent_token ();
 516
 517     default:
 518       return ILLEGAL;
 519     }
 520 }
 521
 522 /* The following table dictates the action taken for the various %
 523    directives.  A non-NULL set_flag causes the named flag to be set.  A
 524    retval action returns the code; the NULL-named last entry ends the table.  */
 525 struct percent_table_struct
 526 {
 527   const char *name;             /* directive name, without the leading `%' */
 528   void *set_flag;               /* int flag to set to 1 (char * option when retval is SETOPT), or NULL */
 529   int retval;                   /* code returned by parse_percent_token when set_flag is NULL */
 530 }
 531 percent_table[] =
 532 {
 533   { "token", NULL, TOKEN },
 534   { "term", NULL, TOKEN },
 535   { "nterm", NULL, NTERM },
 536   { "type", NULL, TYPE },
 537   { "guard", NULL, GUARD },
 538   { "union", NULL, UNION },
 539   { "expect", NULL, EXPECT },
 540   { "thong", NULL, THONG },
 541   { "start", NULL, START },
 542   { "left", NULL, LEFT },
 543   { "right", NULL, RIGHT },
 544   { "nonassoc", NULL, NONASSOC },
 545   { "binary", NULL, NONASSOC },
 546   { "semantic_parser", NULL, SEMANTIC_PARSER },
 547   { "pure_parser", NULL, PURE_PARSER },
 548   { "prec", NULL, PREC },
 549   { "locations", &locations_flag, NOOP},        /* -l  NOTE(review): same letter as no_lines below -- confirm */
 550   { "no_lines", &no_lines_flag, NOOP},  /* -l */
 551   { "raw", &raw_flag, NOOP },   /* -r */
 552   { "token_table", &token_table_flag, NOOP},    /* -k */
 553 #if 0
 554     /* These can be utilized after main is reorganized so
 555        open_files() is deferred 'til after read_declarations().
 556        But %{ and %union both put information into files
 557        that have to be opened before read_declarations().
 558      */
 559   { "yacc", &yacc_flag, NOOP},                          /* -y */
 560   { "fixed_output_files", &yacc_flag, NOOP},            /* -y */
 561   { "defines", &defines_flag, NOOP},                    /* -d */
 562   { "no_parser", &no_parser_flag, NOOP},                /* -n */
 563   { "output_file", &spec_outfile, SETOPT},              /* -o */
 564   { "file_prefix", &spec_file_prefix, SETOPT},          /* -b */
 565   { "name_prefix", &spec_name_prefix, SETOPT},          /* -p */
 566     /* These would be acceptable, but they do not affect processing */
 567   { "verbose", &verbose_flag, NOOP},                    /* -v */
 568   { "debug", &debug_flag, NOOP},                        /* -t */
 569 /*    {"help", <print usage stmt>, NOOP}, *//* -h */
 570 /*    {"version", <print version number> ,  NOOP}, *//* -V */
 571 #endif
 572   { NULL, NULL, ILLEGAL}
 573 };
 574
 575 /* Parse a token which starts with %.
 576    Assumes the % has already been read and discarded.  */
 577
 578 int
 579 parse_percent_token (void)
 580 {
 581   int c;
 582   char *p;
 583   struct percent_table_struct *tx;
 584
 585   p = token_buffer;
 586   c = getc (finput);
 587   *p++ = '%';
 588   *p++ = c;                     /* for error msg */
 589   *p = 0;
 590
 591   switch (c)
 592     {
 593     case '%':
 594       return TWO_PERCENTS;
 595
 596     case '{':
 597       return PERCENT_LEFT_CURLY;
 598
 599     case '<':
 600       return LEFT;
 601
 602     case '>':
 603       return RIGHT;
 604
 605     case '2':
 606       return NONASSOC;
 607
 608     case '0':
 609       return TOKEN;
 610
 611     case '=':
 612       return PREC;
 613     }
 614   if (!isalpha (c))
 615     return ILLEGAL;
 616
 617   p = token_buffer;
 618   *p++ = '%';
 619   while (isalpha (c) || c == '_' || c == '-')
 620     {
 621       if (p == token_buffer + maxtoken)
 622         p = grow_token_buffer (p);
 623
 624       if (c == '-')
 625         c = '_';
 626       *p++ = c;
 627       c = getc (finput);
 628     }
 629
 630   ungetc (c, finput);
 631
 632   *p = 0;
 633
 634   /* table lookup % directive */
 635   for (tx = percent_table; tx->name; tx++)
 636     if (strcmp (token_buffer + 1, tx->name) == 0)
 637       break;
 638   if (tx->retval == SETOPT)
 639     {
 640       *((char **) (tx->set_flag)) = optarg;
 641       return NOOP;
 642     }
 643   if (tx->set_flag)
 644     {
 645       *((int *) (tx->set_flag)) = 1;
 646       return NOOP;
 647     }
 648   return tx->retval;
 649 }