src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 #include "system.h"
  22 #include "getargs.h"
  23 #include "files.h"
  24 #include "getopt.h"             /* for optarg */
  25 #include "symtab.h"
  26 #include "lex.h"
  27 #include "xalloc.h"
  28 #include "complain.h"
  29 #include "gram.h"
  30
  31 /* functions from main.c */
  32 extern char *printable_version PARAMS ((int));
  33
/* Buffer for storing the current token.  It is allocated by init_lex
   and enlarged on demand by grow_token_buffer, so its address may
   change between calls.  */
char *token_buffer;

/* Allocated size of token_buffer, not including space for terminator.  */
int maxtoken;

/* Set by lex: the symbol-table entry (from getsym) of the IDENTIFIER
   token it just returned.  */
bucket *symval;
/* Set by lex: the decimal value of the NUMBER token it just returned.  */
int numval;

/* UNLEXED is -1 when no token is pending; unlex stores a token code
   in it and lex hands that token back on its next call.  */
static int unlexed;             /* these two describe a token to be reread */
static bucket *unlexed_symval;  /* by the next call to lex */
  45
  46
  47 void
  48 init_lex (void)
  49 {
  50   maxtoken = 100;
  51   token_buffer = XCALLOC (char, maxtoken + 1);
  52   unlexed = -1;
  53 }
  54
  55
  56 char *
  57 grow_token_buffer (char *p)
  58 {
  59   int offset = p - token_buffer;
  60   maxtoken *= 2;
  61   token_buffer = XREALLOC (token_buffer, char, maxtoken + 1);
  62   return token_buffer + offset;
  63 }
  64
  65
  66 int
  67 skip_white_space (void)
  68 {
  69   int c;
  70   int inside;
  71
  72   c = getc (finput);
  73
  74   for (;;)
  75     {
  76       int cplus_comment;
  77
  78       switch (c)
  79         {
  80         case '/':
  81           c = getc (finput);
  82           if (c != '*' && c != '/')
  83             {
  84               complain (_("unexpected `/' found and ignored"));
  85               break;
  86             }
  87           cplus_comment = (c == '/');
  88
  89           c = getc (finput);
  90
  91           inside = 1;
  92           while (inside)
  93             {
  94               if (!cplus_comment && c == '*')
  95                 {
  96                   while (c == '*')
  97                     c = getc (finput);
  98
  99                   if (c == '/')
 100                     {
 101                       inside = 0;
 102                       c = getc (finput);
 103                     }
 104                 }
 105               else if (c == '\n')
 106                 {
 107                   lineno++;
 108                   if (cplus_comment)
 109                     inside = 0;
 110                   c = getc (finput);
 111                 }
 112               else if (c == EOF)
 113                 fatal (_("unterminated comment"));
 114               else
 115                 c = getc (finput);
 116             }
 117
 118           break;
 119
 120         case '\n':
 121           lineno++;
 122
 123         case ' ':
 124         case '\t':
 125         case '\f':
 126           c = getc (finput);
 127           break;
 128
 129         default:
 130           return c;
 131         }
 132     }
 133 }
 134
 135 /* do a getc, but give error message if EOF encountered */
 136 static int
 137 xgetc (FILE *f)
 138 {
 139   int c = getc (f);
 140   if (c == EOF)
 141     fatal (_("unexpected end of file"));
 142   return c;
 143 }
 144
 145
 146 /*------------------------------------------------------------------.
 147 | Read one literal character from finput.  Process \ escapes.       |
 148 | Append the normalized string version of the char to *PP.  Assign  |
 149 | the character code to *PCODE. Return 1 unless the character is an |
 150 | unescaped `term' or \n report error for \n                        |
 151 `------------------------------------------------------------------*/
 152
 153 static int
 154 literalchar (char **pp, int *pcode, char term)
 155 {
 156   int c;
 157   char *p;
 158   int code;
 159   int wasquote = 0;
 160
 161   c = xgetc (finput);
 162   if (c == '\n')
 163     {
 164       complain (_("unescaped newline in constant"));
 165       ungetc (c, finput);
 166       code = '?';
 167       wasquote = 1;
 168     }
 169   else if (c != '\\')
 170     {
 171       code = c;
 172       if (c == term)
 173         wasquote = 1;
 174     }
 175   else
 176     {
 177       c = xgetc (finput);
 178       if (c == 't')
 179         code = '\t';
 180       else if (c == 'n')
 181         code = '\n';
 182       else if (c == 'a')
 183         code = '\007';
 184       else if (c == 'r')
 185         code = '\r';
 186       else if (c == 'f')
 187         code = '\f';
 188       else if (c == 'b')
 189         code = '\b';
 190       else if (c == 'v')
 191         code = '\013';
 192       else if (c == '\\')
 193         code = '\\';
 194       else if (c == '\'')
 195         code = '\'';
 196       else if (c == '\"')
 197         code = '\"';
 198       else if (c <= '7' && c >= '0')
 199         {
 200           code = 0;
 201           while (c <= '7' && c >= '0')
 202             {
 203               code = (code * 8) + (c - '0');
 204               if (code >= 256 || code < 0)
 205                 {
 206                   complain (_("octal value outside range 0...255: `\\%o'"),
 207                             code);
 208                   code &= 0xFF;
 209                   break;
 210                 }
 211               c = xgetc (finput);
 212             }
 213           ungetc (c, finput);
 214         }
 215       else if (c == 'x')
 216         {
 217           c = xgetc (finput);
 218           code = 0;
 219           while (1)
 220             {
 221               if (c >= '0' && c <= '9')
 222                 code *= 16, code += c - '0';
 223               else if (c >= 'a' && c <= 'f')
 224                 code *= 16, code += c - 'a' + 10;
 225               else if (c >= 'A' && c <= 'F')
 226                 code *= 16, code += c - 'A' + 10;
 227               else
 228                 break;
 229               if (code >= 256 || code < 0)
 230                 {
 231                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 232                   code &= 0xFF;
 233                   break;
 234                 }
 235               c = xgetc (finput);
 236             }
 237           ungetc (c, finput);
 238         }
 239       else
 240         {
 241           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 242                     printable_version (c));
 243           code = '?';
 244         }
 245     }                           /* has \ */
 246
 247   /* now fill token_buffer with the canonical name for this character
 248      as a literal token.  Do not use what the user typed,
 249      so that `\012' and `\n' can be interchangeable.  */
 250
 251   p = *pp;
 252   if (code == term && wasquote)
 253     *p++ = code;
 254   else if (code == '\\')
 255     {
 256       *p++ = '\\';
 257       *p++ = '\\';
 258     }
 259   else if (code == '\'')
 260     {
 261       *p++ = '\\';
 262       *p++ = '\'';
 263     }
 264   else if (code == '\"')
 265     {
 266       *p++ = '\\';
 267       *p++ = '\"';
 268     }
 269   else if (code >= 040 && code < 0177)
 270     *p++ = code;
 271   else if (code == '\t')
 272     {
 273       *p++ = '\\';
 274       *p++ = 't';
 275     }
 276   else if (code == '\n')
 277     {
 278       *p++ = '\\';
 279       *p++ = 'n';
 280     }
 281   else if (code == '\r')
 282     {
 283       *p++ = '\\';
 284       *p++ = 'r';
 285     }
 286   else if (code == '\v')
 287     {
 288       *p++ = '\\';
 289       *p++ = 'v';
 290     }
 291   else if (code == '\b')
 292     {
 293       *p++ = '\\';
 294       *p++ = 'b';
 295     }
 296   else if (code == '\f')
 297     {
 298       *p++ = '\\';
 299       *p++ = 'f';
 300     }
 301   else
 302     {
 303       *p++ = '\\';
 304       *p++ = code / 0100 + '0';
 305       *p++ = ((code / 010) & 07) + '0';
 306       *p++ = (code & 07) + '0';
 307     }
 308   *pp = p;
 309   *pcode = code;
 310   return !wasquote;
 311 }
 312
 313
 314 void
 315 unlex (int token)
 316 {
 317   unlexed = token;
 318   unlexed_symval = symval;
 319 }
 320
 321
 322 int
 323 lex (void)
 324 {
 325   int c;
 326   char *p;
 327
 328   if (unlexed >= 0)
 329     {
 330       symval = unlexed_symval;
 331       c = unlexed;
 332       unlexed = -1;
 333       return c;
 334     }
 335
 336   c = skip_white_space ();
 337   *token_buffer = c;            /* for error messages (token buffer always valid) */
 338   token_buffer[1] = 0;
 339
 340   switch (c)
 341     {
 342     case EOF:
 343       strcpy (token_buffer, "EOF");
 344       return ENDFILE;
 345
 346     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 347     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 348     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 349     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 350     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 351     case 'Z':
 352     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 353     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 354     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 355     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 356     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 357     case 'z':
 358     case '.':    case '_':
 359
 360       p = token_buffer;
 361       while (isalnum (c) || c == '_' || c == '.')
 362         {
 363           if (p == token_buffer + maxtoken)
 364             p = grow_token_buffer (p);
 365
 366           *p++ = c;
 367           c = getc (finput);
 368         }
 369
 370       *p = 0;
 371       ungetc (c, finput);
 372       symval = getsym (token_buffer);
 373       return IDENTIFIER;
 374
 375     case '0':    case '1':    case '2':    case '3':    case '4':
 376     case '5':    case '6':    case '7':    case '8':    case '9':
 377       {
 378         numval = 0;
 379
 380         p = token_buffer;
 381         while (isdigit (c))
 382           {
 383             if (p == token_buffer + maxtoken)
 384               p = grow_token_buffer (p);
 385
 386             *p++ = c;
 387             numval = numval * 10 + c - '0';
 388             c = getc (finput);
 389           }
 390         *p = 0;
 391         ungetc (c, finput);
 392         return NUMBER;
 393       }
 394
 395     case '\'':
 396       /* parse the literal token and compute character code in  code  */
 397
 398       translations = -1;
 399       {
 400         int code, discode;
 401         char discard[10], *dp;
 402
 403         p = token_buffer;
 404         *p++ = '\'';
 405         literalchar (&p, &code, '\'');
 406
 407         c = getc (finput);
 408         if (c != '\'')
 409           {
 410             complain (_("use \"...\" for multi-character literal tokens"));
 411             while (1)
 412               {
 413                 dp = discard;
 414                 if (!literalchar (&dp, &discode, '\''))
 415                   break;
 416               }
 417           }
 418         *p++ = '\'';
 419         *p = 0;
 420         symval = getsym (token_buffer);
 421         symval->class = STOKEN;
 422         if (!symval->user_token_number)
 423           symval->user_token_number = code;
 424         return IDENTIFIER;
 425       }
 426
 427     case '\"':
 428       /* parse the literal string token and treat as an identifier */
 429
 430       translations = -1;
 431       {
 432         int code;               /* ignored here */
 433         p = token_buffer;
 434         *p++ = '\"';
 435         while (literalchar (&p, &code, '\"'))   /* read up to and including " */
 436           {
 437             if (p >= token_buffer + maxtoken - 4)
 438               p = grow_token_buffer (p);
 439           }
 440         *p = 0;
 441
 442         symval = getsym (token_buffer);
 443         symval->class = STOKEN;
 444
 445         return IDENTIFIER;
 446       }
 447
 448     case ',':
 449       return COMMA;
 450
 451     case ':':
 452       return COLON;
 453
 454     case ';':
 455       return SEMICOLON;
 456
 457     case '|':
 458       return BAR;
 459
 460     case '{':
 461       return LEFT_CURLY;
 462
 463     case '=':
 464       do
 465         {
 466           c = getc (finput);
 467           if (c == '\n')
 468             lineno++;
 469         }
 470       while (c == ' ' || c == '\n' || c == '\t');
 471
 472       if (c == '{')
 473         {
 474           strcpy (token_buffer, "={");
 475           return LEFT_CURLY;
 476         }
 477       else
 478         {
 479           ungetc (c, finput);
 480           return ILLEGAL;
 481         }
 482
 483     case '<':
 484       p = token_buffer;
 485       c = getc (finput);
 486       while (c != '>')
 487         {
 488           if (c == EOF)
 489             fatal (_("unterminated type name at end of file"));
 490           if (c == '\n')
 491             {
 492               complain (_("unterminated type name"));
 493               ungetc (c, finput);
 494               break;
 495             }
 496
 497           if (p == token_buffer + maxtoken)
 498             p = grow_token_buffer (p);
 499
 500           *p++ = c;
 501           c = getc (finput);
 502         }
 503       *p = 0;
 504       return TYPENAME;
 505
 506
 507     case '%':
 508       return parse_percent_token ();
 509
 510     default:
 511       return ILLEGAL;
 512     }
 513 }
 514
/* The following table dictates the action taken for the various %
   directives.  A setflag value causes the named flag to be set.  A
   retval action returns the code.

   parse_percent_token compares each NAME with the directive text
   after the `%', in which it has already turned every `-' into `_'.
   The entry with a null NAME ends the table; its RETVAL is what an
   unknown directive yields.  */
struct percent_table_struct
{
  /* Directive name, without the leading `%'.  */
  const char *name;
  /* If non-null, the variable to set when the directive is seen:
     parse_percent_token stores 1 through it as an `int *', or stores
     optarg through it as a `char **' when RETVAL is SETOPT.  */
  void *setflag;
  /* Token code returned for the directive when SETFLAG is null.  */
  int retval;
}
percent_table[] =
{
  { "token", NULL, TOKEN },
  { "term", NULL, TOKEN },
  { "nterm", NULL, NTERM },
  { "type", NULL, TYPE },
  { "guard", NULL, GUARD },
  { "union", NULL, UNION },
  { "expect", NULL, EXPECT },
  { "thong", NULL, THONG },
  { "start", NULL, START },
  { "left", NULL, LEFT },
  { "right", NULL, RIGHT },
  { "nonassoc", NULL, NONASSOC },
  { "binary", NULL, NONASSOC },
  { "semantic_parser", NULL, SEMANTIC_PARSER },
  { "pure_parser", NULL, PURE_PARSER },
  { "prec", NULL, PREC },
  { "no_lines", &nolinesflag, NOOP},    /* -l */
  { "raw", &rawtoknumflag, NOOP },      /* -r */
  { "token_table", &toknumflag, NOOP},  /* -k */
#if 0
    /* These can be utilized after main is reorganized so
       open_files() is deferred 'til after read_declarations().
       But %{ and %union both put information into files
       that have to be opened before read_declarations().
     */
  { "yacc", &yaccflag, NOOP},                   /* -y */
  { "fixed_output_files", &yaccflag, NOOP},     /* -y */
  { "defines", &definesflag, NOOP},                     /* -d */
  { "no_parser", &noparserflag, NOOP},                  /* -n */
  { "output_file", &spec_outfile, SETOPT},              /* -o */
  { "file_prefix", &spec_file_prefix, SETOPT},          /* -b */
  { "name_prefix", &spec_name_prefix, SETOPT},          /* -p */
    /* These would be acceptable, but they do not affect processing */
  { "verbose", &verboseflag, NOOP},                     /* -v */
  { "debug", &debugflag, NOOP},                         /* -t */
/*    {"help", <print usage stmt>, NOOP}, *//* -h */
/*    {"version", <print version number> ,  NOOP}, *//* -V */
#endif
  { NULL, NULL, ILLEGAL}
};
 566
 567 /* Parse a token which starts with %.
 568    Assumes the % has already been read and discarded.  */
 569
 570 int
 571 parse_percent_token (void)
 572 {
 573   int c;
 574   char *p;
 575   struct percent_table_struct *tx;
 576
 577   p = token_buffer;
 578   c = getc (finput);
 579   *p++ = '%';
 580   *p++ = c;                     /* for error msg */
 581   *p = 0;
 582
 583   switch (c)
 584     {
 585     case '%':
 586       return TWO_PERCENTS;
 587
 588     case '{':
 589       return PERCENT_LEFT_CURLY;
 590
 591     case '<':
 592       return LEFT;
 593
 594     case '>':
 595       return RIGHT;
 596
 597     case '2':
 598       return NONASSOC;
 599
 600     case '0':
 601       return TOKEN;
 602
 603     case '=':
 604       return PREC;
 605     }
 606   if (!isalpha (c))
 607     return ILLEGAL;
 608
 609   p = token_buffer;
 610   *p++ = '%';
 611   while (isalpha (c) || c == '_' || c == '-')
 612     {
 613       if (p == token_buffer + maxtoken)
 614         p = grow_token_buffer (p);
 615
 616       if (c == '-')
 617         c = '_';
 618       *p++ = c;
 619       c = getc (finput);
 620     }
 621
 622   ungetc (c, finput);
 623
 624   *p = 0;
 625
 626   /* table lookup % directive */
 627   for (tx = percent_table; tx->name; tx++)
 628     if (strcmp (token_buffer + 1, tx->name) == 0)
 629       break;
 630   if (tx->retval == SETOPT)
 631     {
 632       *((char **) (tx->setflag)) = optarg;
 633       return NOOP;
 634     }
 635   if (tx->setflag)
 636     {
 637       *((int *) (tx->setflag)) = 1;
 638       return NOOP;
 639     }
 640   return tx->retval;
 641 }