src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "lex.h"
  27 #include "xalloc.h"
  28 #include "complain.h"
  29 #include "gram.h"
  30 #include "quote.h"
  31
  32 /* Buffer holding the NUL-terminated text of the current token.  */
  33 char *token_buffer;
  34
  35 /* Allocated size of token_buffer, not including space for terminator.  */
  36 int maxtoken;
  37
  38 bucket *symval;                 /* symbol set by lex for identifier and literal tokens */
  39 int numval;                     /* decimal value set by lex for NUMBER tokens */
  40
  41 static int unlexed;             /* token code pushed back by unlex, or -1 if none */
  42 static bucket *unlexed_symval;  /* symval saved with it, restored by the next lex */
  43
  44
  45 void
  46 init_lex (void)
  47 {
  48   maxtoken = 100;
  49   token_buffer = XCALLOC (char, maxtoken + 1);
  50   unlexed = -1;
  51 }
  52
  53
  54 char *
  55 grow_token_buffer (char *p)
  56 {
  57   int offset = p - token_buffer;
  58   maxtoken *= 2;
  59   token_buffer = XREALLOC (token_buffer, char, maxtoken + 1);
  60   return token_buffer + offset;
  61 }
  62
  63
  64 int
  65 skip_white_space (void)
  66 {
  67   int c;
  68   int inside;
  69
  70   c = getc (finput);
  71
  72   for (;;)
  73     {
  74       int cplus_comment;
  75
  76       switch (c)
  77         {
  78         case '/':
  79           c = getc (finput);
  80           if (c != '*' && c != '/')
  81             {
  82               complain (_("unexpected `/' found and ignored"));
  83               break;
  84             }
  85           cplus_comment = (c == '/');
  86
  87           c = getc (finput);
  88
  89           inside = 1;
  90           while (inside)
  91             {
  92               if (!cplus_comment && c == '*')
  93                 {
  94                   while (c == '*')
  95                     c = getc (finput);
  96
  97                   if (c == '/')
  98                     {
  99                       inside = 0;
 100                       c = getc (finput);
 101                     }
 102                 }
 103               else if (c == '\n')
 104                 {
 105                   lineno++;
 106                   if (cplus_comment)
 107                     inside = 0;
 108                   c = getc (finput);
 109                 }
 110               else if (c == EOF)
 111                 fatal (_("unterminated comment"));
 112               else
 113                 c = getc (finput);
 114             }
 115
 116           break;
 117
 118         case '\n':
 119           lineno++;
 120
 121         case ' ':
 122         case '\t':
 123         case '\f':
 124           c = getc (finput);
 125           break;
 126
 127         default:
 128           return c;
 129         }
 130     }
 131 }
 132
 133 /* do a getc, but give error message if EOF encountered */
 134 static int
 135 xgetc (FILE *f)
 136 {
 137   int c = getc (f);
 138   if (c == EOF)
 139     fatal (_("unexpected end of file"));
 140   return c;
 141 }
 142
 143
 144 /*------------------------------------------------------------------.
 145 | Read one literal character from finput.  Process \ escapes.       |
 146 | Append the normalized string version of the char to *PP.  Assign  |
 147 | the character code to *PCODE. Return 1 unless the character is an |
 148 | unescaped `term' or \n report error for \n                        |
 149 `------------------------------------------------------------------*/
 150
 151 static int
 152 literalchar (char **pp, int *pcode, char term)
 153 {
 154   int c;
 155   char *p;
 156   int code;
 157   int wasquote = 0;
 158
 159   c = xgetc (finput);
 160   if (c == '\n')
 161     {
 162       complain (_("unescaped newline in constant"));
 163       ungetc (c, finput);
 164       code = '?';
 165       wasquote = 1;
 166     }
 167   else if (c != '\\')
 168     {
 169       code = c;
 170       if (c == term)
 171         wasquote = 1;
 172     }
 173   else
 174     {
 175       c = xgetc (finput);
 176       if (c == 't')
 177         code = '\t';
 178       else if (c == 'n')
 179         code = '\n';
 180       else if (c == 'a')
 181         code = '\007';
 182       else if (c == 'r')
 183         code = '\r';
 184       else if (c == 'f')
 185         code = '\f';
 186       else if (c == 'b')
 187         code = '\b';
 188       else if (c == 'v')
 189         code = '\013';
 190       else if (c == '\\')
 191         code = '\\';
 192       else if (c == '\'')
 193         code = '\'';
 194       else if (c == '\"')
 195         code = '\"';
 196       else if (c <= '7' && c >= '0')
 197         {
 198           code = 0;
 199           while (c <= '7' && c >= '0')
 200             {
 201               code = (code * 8) + (c - '0');
 202               if (code >= 256 || code < 0)
 203                 {
 204                   complain (_("octal value outside range 0...255: `\\%o'"),
 205                             code);
 206                   code &= 0xFF;
 207                   break;
 208                 }
 209               c = xgetc (finput);
 210             }
 211           ungetc (c, finput);
 212         }
 213       else if (c == 'x')
 214         {
 215           c = xgetc (finput);
 216           code = 0;
 217           while (1)
 218             {
 219               if (c >= '0' && c <= '9')
 220                 code *= 16, code += c - '0';
 221               else if (c >= 'a' && c <= 'f')
 222                 code *= 16, code += c - 'a' + 10;
 223               else if (c >= 'A' && c <= 'F')
 224                 code *= 16, code += c - 'A' + 10;
 225               else
 226                 break;
 227               if (code >= 256 || code < 0)
 228                 {
 229                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 230                   code &= 0xFF;
 231                   break;
 232                 }
 233               c = xgetc (finput);
 234             }
 235           ungetc (c, finput);
 236         }
 237       else
 238         {
 239           char buf [] = "c";
 240           buf[0] = c;
 241           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 242                     quote (buf));
 243           code = '?';
 244         }
 245     }                           /* has \ */
 246
 247   /* now fill token_buffer with the canonical name for this character
 248      as a literal token.  Do not use what the user typed,
 249      so that `\012' and `\n' can be interchangeable.  */
 250
 251   p = *pp;
 252   if (code == term && wasquote)
 253     *p++ = code;
 254   else if (code == '\\')
 255     {
 256       *p++ = '\\';
 257       *p++ = '\\';
 258     }
 259   else if (code == '\'')
 260     {
 261       *p++ = '\\';
 262       *p++ = '\'';
 263     }
 264   else if (code == '\"')
 265     {
 266       *p++ = '\\';
 267       *p++ = '\"';
 268     }
 269   else if (code >= 040 && code < 0177)
 270     *p++ = code;
 271   else if (code == '\t')
 272     {
 273       *p++ = '\\';
 274       *p++ = 't';
 275     }
 276   else if (code == '\n')
 277     {
 278       *p++ = '\\';
 279       *p++ = 'n';
 280     }
 281   else if (code == '\r')
 282     {
 283       *p++ = '\\';
 284       *p++ = 'r';
 285     }
 286   else if (code == '\v')
 287     {
 288       *p++ = '\\';
 289       *p++ = 'v';
 290     }
 291   else if (code == '\b')
 292     {
 293       *p++ = '\\';
 294       *p++ = 'b';
 295     }
 296   else if (code == '\f')
 297     {
 298       *p++ = '\\';
 299       *p++ = 'f';
 300     }
 301   else
 302     {
 303       *p++ = '\\';
 304       *p++ = code / 0100 + '0';
 305       *p++ = ((code / 010) & 07) + '0';
 306       *p++ = (code & 07) + '0';
 307     }
 308   *pp = p;
 309   *pcode = code;
 310   return !wasquote;
 311 }
 312
 313
 314 void
 315 unlex (int token)
 316 {
 317   unlexed = token;
 318   unlexed_symval = symval;
 319 }
 320
 321
 322 int
 323 lex (void)
 324 {
 325   int c;
 326   char *p;
 327
 328   if (unlexed >= 0)
 329     {
 330       symval = unlexed_symval;
 331       c = unlexed;
 332       unlexed = -1;
 333       return c;
 334     }
 335
 336   c = skip_white_space ();
 337   *token_buffer = c;            /* for error messages (token buffer always valid) */
 338   token_buffer[1] = 0;
 339
 340   switch (c)
 341     {
 342     case EOF:
 343       strcpy (token_buffer, "EOF");
 344       return ENDFILE;
 345
 346     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 347     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 348     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 349     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 350     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 351     case 'Z':
 352     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 353     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 354     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 355     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 356     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 357     case 'z':
 358     case '.':    case '_':
 359
 360       p = token_buffer;
 361       while (isalnum (c) || c == '_' || c == '.')
 362         {
 363           if (p == token_buffer + maxtoken)
 364             p = grow_token_buffer (p);
 365
 366           *p++ = c;
 367           c = getc (finput);
 368         }
 369
 370       *p = 0;
 371       ungetc (c, finput);
 372       symval = getsym (token_buffer);
 373       return IDENTIFIER;
 374
 375     case '0':    case '1':    case '2':    case '3':    case '4':
 376     case '5':    case '6':    case '7':    case '8':    case '9':
 377       {
 378         numval = 0;
 379
 380         p = token_buffer;
 381         while (isdigit (c))
 382           {
 383             if (p == token_buffer + maxtoken)
 384               p = grow_token_buffer (p);
 385
 386             *p++ = c;
 387             numval = numval * 10 + c - '0';
 388             c = getc (finput);
 389           }
 390         *p = 0;
 391         ungetc (c, finput);
 392         return NUMBER;
 393       }
 394
 395     case '\'':
 396       /* parse the literal token and compute character code in  code  */
 397
 398       translations = -1;
 399       {
 400         int code, discode;
 401         char discard[10], *dp;
 402
 403         p = token_buffer;
 404         *p++ = '\'';
 405         literalchar (&p, &code, '\'');
 406
 407         c = getc (finput);
 408         if (c != '\'')
 409           {
 410             complain (_("use \"...\" for multi-character literal tokens"));
 411             while (1)
 412               {
 413                 dp = discard;
 414                 if (!literalchar (&dp, &discode, '\''))
 415                   break;
 416               }
 417           }
 418         *p++ = '\'';
 419         *p = 0;
 420         symval = getsym (token_buffer);
 421         symval->class = token_sym;
 422         if (!symval->user_token_number)
 423           symval->user_token_number = code;
 424         return IDENTIFIER;
 425       }
 426
 427     case '\"':
 428       /* parse the literal string token and treat as an identifier */
 429
 430       translations = -1;
 431       {
 432         int code;               /* ignored here */
 433         p = token_buffer;
 434         *p++ = '\"';
 435         while (literalchar (&p, &code, '\"'))   /* read up to and including " */
 436           {
 437             if (p >= token_buffer + maxtoken - 4)
 438               p = grow_token_buffer (p);
 439           }
 440         *p = 0;
 441
 442         symval = getsym (token_buffer);
 443         symval->class = token_sym;
 444
 445         return IDENTIFIER;
 446       }
 447
 448     case ',':
 449       return COMMA;
 450
 451     case ':':
 452       return COLON;
 453
 454     case ';':
 455       return SEMICOLON;
 456
 457     case '|':
 458       return BAR;
 459
 460     case '{':
 461       return LEFT_CURLY;
 462
 463     case '=':
 464       do
 465         {
 466           c = getc (finput);
 467           if (c == '\n')
 468             lineno++;
 469         }
 470       while (c == ' ' || c == '\n' || c == '\t');
 471
 472       if (c == '{')
 473         {
 474           strcpy (token_buffer, "={");
 475           return LEFT_CURLY;
 476         }
 477       else
 478         {
 479           ungetc (c, finput);
 480           return ILLEGAL;
 481         }
 482
 483     case '<':
 484       p = token_buffer;
 485       c = getc (finput);
 486       while (c != '>')
 487         {
 488           if (c == EOF)
 489             fatal (_("unterminated type name at end of file"));
 490           if (c == '\n')
 491             {
 492               complain (_("unterminated type name"));
 493               ungetc (c, finput);
 494               break;
 495             }
 496
 497           if (p == token_buffer + maxtoken)
 498             p = grow_token_buffer (p);
 499
 500           *p++ = c;
 501           c = getc (finput);
 502         }
 503       *p = 0;
 504       return TYPENAME;
 505
 506
 507     case '%':
 508       return parse_percent_token ();
 509
 510     default:
 511       return ILLEGAL;
 512     }
 513 }
 514
 515 /* Table driving parse_percent_token: NAME is the directive without its
 516    `%'.  A non-null SET_FLAG points to an int flag that is set to 1 (the
 517    lexer then returns NOOP); otherwise RETVAL is returned as the token.  */
 518 struct percent_table_struct
 519 {
 520   const char *name;             /* directive name, without the leading `%' */
 521   void *set_flag;               /* if non-null, object the directive stores into */
 522   int retval;                   /* token code returned when set_flag is null */
 523 }
 524 percent_table[] =
 525 {
 526   { "token", NULL, TOKEN },
 527   { "term", NULL, TOKEN },
 528   { "nterm", NULL, NTERM },
 529   { "type", NULL, TYPE },
 530   { "guard", NULL, GUARD },
 531   { "union", NULL, UNION },
 532   { "expect", NULL, EXPECT },
 533   { "thong", NULL, THONG },
 534   { "start", NULL, START },
 535   { "left", NULL, LEFT },
 536   { "right", NULL, RIGHT },
 537   { "nonassoc", NULL, NONASSOC },
 538   { "binary", NULL, NONASSOC },
 539   { "semantic_parser", NULL, SEMANTIC_PARSER },
 540   { "pure_parser", NULL, PURE_PARSER },
 541   { "prec", NULL, PREC },
 542   { "locations", &locations_flag, NOOP},        /* -l */
 543   { "no_lines", &no_lines_flag, NOOP},  /* -l */
 544   { "raw", &raw_flag, NOOP },   /* -r */
 545   { "token_table", &token_table_flag, NOOP},    /* -k */
 546 #if 0
 547     /* These can be utilized after main is reorganized so
 548        open_files() is deferred 'til after read_declarations().
 549        But %{ and %union both put information into files
 550        that have to be opened before read_declarations().
 551      */
 552   { "yacc", &yacc_flag, NOOP},                          /* -y */
 553   { "fixed_output_files", &yacc_flag, NOOP},            /* -y */
 554   { "defines", &defines_flag, NOOP},                    /* -d */
 555   { "no_parser", &no_parser_flag, NOOP},                /* -n */
 556   { "output_file", &spec_outfile, SETOPT},              /* -o */
 557   { "file_prefix", &spec_file_prefix, SETOPT},          /* -b */
 558   { "name_prefix", &spec_name_prefix, SETOPT},          /* -p */
 559     /* These would be acceptable, but they do not affect processing */
 560   { "verbose", &verbose_flag, NOOP},                    /* -v */
 561   { "debug", &debug_flag, NOOP},                        /* -t */
 562 /*    {"help", <print usage stmt>, NOOP}, *//* -h */
 563 /*    {"version", <print version number> ,  NOOP}, *//* -V */
 564 #endif
 565   { NULL, NULL, ILLEGAL}        /* sentinel: unknown directives yield ILLEGAL */
 566 };
 567
 568 /* Parse a token which starts with %.
 569    Assumes the % has already been read and discarded.  */
 570
 571 int
 572 parse_percent_token (void)
 573 {
 574   int c;
 575   char *p;
 576   struct percent_table_struct *tx;
 577
 578   p = token_buffer;
 579   c = getc (finput);
 580   *p++ = '%';
 581   *p++ = c;                     /* for error msg */
 582   *p = 0;
 583
 584   switch (c)
 585     {
 586     case '%':
 587       return TWO_PERCENTS;
 588
 589     case '{':
 590       return PERCENT_LEFT_CURLY;
 591
 592     case '<':
 593       return LEFT;
 594
 595     case '>':
 596       return RIGHT;
 597
 598     case '2':
 599       return NONASSOC;
 600
 601     case '0':
 602       return TOKEN;
 603
 604     case '=':
 605       return PREC;
 606     }
 607   if (!isalpha (c))
 608     return ILLEGAL;
 609
 610   p = token_buffer;
 611   *p++ = '%';
 612   while (isalpha (c) || c == '_' || c == '-')
 613     {
 614       if (p == token_buffer + maxtoken)
 615         p = grow_token_buffer (p);
 616
 617       if (c == '-')
 618         c = '_';
 619       *p++ = c;
 620       c = getc (finput);
 621     }
 622
 623   ungetc (c, finput);
 624
 625   *p = 0;
 626
 627   /* table lookup % directive */
 628   for (tx = percent_table; tx->name; tx++)
 629     if (strcmp (token_buffer + 1, tx->name) == 0)
 630       break;
 631   if (tx->retval == SETOPT)
 632     {
 633       *((char **) (tx->set_flag)) = optarg;
 634       return NOOP;
 635     }
 636   if (tx->set_flag)
 637     {
 638       *((int *) (tx->set_flag)) = 1;
 639       return NOOP;
 640     }
 641   return tx->retval;
 642 }