src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
#include "system.h"

#include <limits.h>             /* for INT_MAX */

#include "getargs.h"
#include "files.h"
#include "getopt.h"             /* for optarg */
#include "symtab.h"
#include "lex.h"
#include "alloc.h"
#include "complain.h"
  29
  30 /*spec_outfile is declared in files.h, for -o */
  31
  32 extern int translations;
  33
  34 /* functions from main.c */
  35 extern char *printable_version PARAMS ((int));
  36
  37 /* Buffer for storing the current token.  */
  38 char *token_buffer;
  39
  40 /* Allocated size of token_buffer, not including space for terminator.  */
  41 int maxtoken;
  42
  43 bucket *symval;
  44 int numval;
  45
  46 static int unlexed;             /* these two describe a token to be reread */
  47 static bucket *unlexed_symval;  /* by the next call to lex */
  48
  49
  50 void
  51 init_lex (void)
  52 {
  53   maxtoken = 100;
  54   token_buffer = NEW2 (maxtoken + 1, char);
  55   unlexed = -1;
  56 }
  57
  58
  59 char *
  60 grow_token_buffer (char *p)
  61 {
  62   int offset = p - token_buffer;
  63   maxtoken *= 2;
  64   token_buffer = (char *) xrealloc (token_buffer, maxtoken + 1);
  65   return token_buffer + offset;
  66 }
  67
  68
  69 int
  70 skip_white_space (void)
  71 {
  72   int c;
  73   int inside;
  74
  75   c = getc (finput);
  76
  77   for (;;)
  78     {
  79       int cplus_comment;
  80
  81       switch (c)
  82         {
  83         case '/':
  84           c = getc (finput);
  85           if (c != '*' && c != '/')
  86             {
  87               complain (_("unexpected `/' found and ignored"));
  88               break;
  89             }
  90           cplus_comment = (c == '/');
  91
  92           c = getc (finput);
  93
  94           inside = 1;
  95           while (inside)
  96             {
  97               if (!cplus_comment && c == '*')
  98                 {
  99                   while (c == '*')
 100                     c = getc (finput);
 101
 102                   if (c == '/')
 103                     {
 104                       inside = 0;
 105                       c = getc (finput);
 106                     }
 107                 }
 108               else if (c == '\n')
 109                 {
 110                   lineno++;
 111                   if (cplus_comment)
 112                     inside = 0;
 113                   c = getc (finput);
 114                 }
 115               else if (c == EOF)
 116                 fatal (_("unterminated comment"));
 117               else
 118                 c = getc (finput);
 119             }
 120
 121           break;
 122
 123         case '\n':
 124           lineno++;
 125
 126         case ' ':
 127         case '\t':
 128         case '\f':
 129           c = getc (finput);
 130           break;
 131
 132         default:
 133           return c;
 134         }
 135     }
 136 }
 137
 138 /* do a getc, but give error message if EOF encountered */
 139 static int
 140 xgetc (FILE *f)
 141 {
 142   int c = getc (f);
 143   if (c == EOF)
 144     fatal (_("unexpected end of file"));
 145   return c;
 146 }
 147
 148
 149 /*------------------------------------------------------------------.
 150 | Read one literal character from finput.  Process \ escapes.       |
 151 | Append the normalized string version of the char to *PP.  Assign  |
 152 | the character code to *PCODE. Return 1 unless the character is an |
 153 | unescaped `term' or \n report error for \n                        |
 154 `------------------------------------------------------------------*/
 155
 156 static int
 157 literalchar (char **pp, int *pcode, char term)
 158 {
 159   int c;
 160   char *p;
 161   int code;
 162   int wasquote = 0;
 163
 164   c = xgetc (finput);
 165   if (c == '\n')
 166     {
 167       complain (_("unescaped newline in constant"));
 168       ungetc (c, finput);
 169       code = '?';
 170       wasquote = 1;
 171     }
 172   else if (c != '\\')
 173     {
 174       code = c;
 175       if (c == term)
 176         wasquote = 1;
 177     }
 178   else
 179     {
 180       c = xgetc (finput);
 181       if (c == 't')
 182         code = '\t';
 183       else if (c == 'n')
 184         code = '\n';
 185       else if (c == 'a')
 186         code = '\007';
 187       else if (c == 'r')
 188         code = '\r';
 189       else if (c == 'f')
 190         code = '\f';
 191       else if (c == 'b')
 192         code = '\b';
 193       else if (c == 'v')
 194         code = '\013';
 195       else if (c == '\\')
 196         code = '\\';
 197       else if (c == '\'')
 198         code = '\'';
 199       else if (c == '\"')
 200         code = '\"';
 201       else if (c <= '7' && c >= '0')
 202         {
 203           code = 0;
 204           while (c <= '7' && c >= '0')
 205             {
 206               code = (code * 8) + (c - '0');
 207               if (code >= 256 || code < 0)
 208                 {
 209                   complain (_("octal value outside range 0...255: `\\%o'"),
 210                             code);
 211                   code &= 0xFF;
 212                   break;
 213                 }
 214               c = xgetc (finput);
 215             }
 216           ungetc (c, finput);
 217         }
 218       else if (c == 'x')
 219         {
 220           c = xgetc (finput);
 221           code = 0;
 222           while (1)
 223             {
 224               if (c >= '0' && c <= '9')
 225                 code *= 16, code += c - '0';
 226               else if (c >= 'a' && c <= 'f')
 227                 code *= 16, code += c - 'a' + 10;
 228               else if (c >= 'A' && c <= 'F')
 229                 code *= 16, code += c - 'A' + 10;
 230               else
 231                 break;
 232               if (code >= 256 || code < 0)
 233                 {
 234                   complain (_("hexadecimal value above 255: `\\x%x'"), code);
 235                   code &= 0xFF;
 236                   break;
 237                 }
 238               c = xgetc (finput);
 239             }
 240           ungetc (c, finput);
 241         }
 242       else
 243         {
 244           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 245                     printable_version (c));
 246           code = '?';
 247         }
 248     }                           /* has \ */
 249
 250   /* now fill token_buffer with the canonical name for this character
 251      as a literal token.  Do not use what the user typed,
 252      so that `\012' and `\n' can be interchangeable.  */
 253
 254   p = *pp;
 255   if (code == term && wasquote)
 256     *p++ = code;
 257   else if (code == '\\')
 258     {
 259       *p++ = '\\';
 260       *p++ = '\\';
 261     }
 262   else if (code == '\'')
 263     {
 264       *p++ = '\\';
 265       *p++ = '\'';
 266     }
 267   else if (code == '\"')
 268     {
 269       *p++ = '\\';
 270       *p++ = '\"';
 271     }
 272   else if (code >= 040 && code < 0177)
 273     *p++ = code;
 274   else if (code == '\t')
 275     {
 276       *p++ = '\\';
 277       *p++ = 't';
 278     }
 279   else if (code == '\n')
 280     {
 281       *p++ = '\\';
 282       *p++ = 'n';
 283     }
 284   else if (code == '\r')
 285     {
 286       *p++ = '\\';
 287       *p++ = 'r';
 288     }
 289   else if (code == '\v')
 290     {
 291       *p++ = '\\';
 292       *p++ = 'v';
 293     }
 294   else if (code == '\b')
 295     {
 296       *p++ = '\\';
 297       *p++ = 'b';
 298     }
 299   else if (code == '\f')
 300     {
 301       *p++ = '\\';
 302       *p++ = 'f';
 303     }
 304   else
 305     {
 306       *p++ = '\\';
 307       *p++ = code / 0100 + '0';
 308       *p++ = ((code / 010) & 07) + '0';
 309       *p++ = (code & 07) + '0';
 310     }
 311   *pp = p;
 312   *pcode = code;
 313   return !wasquote;
 314 }
 315
 316
 317 void
 318 unlex (int token)
 319 {
 320   unlexed = token;
 321   unlexed_symval = symval;
 322 }
 323
 324
 325 int
 326 lex (void)
 327 {
 328   int c;
 329   char *p;
 330
 331   if (unlexed >= 0)
 332     {
 333       symval = unlexed_symval;
 334       c = unlexed;
 335       unlexed = -1;
 336       return c;
 337     }
 338
 339   c = skip_white_space ();
 340   *token_buffer = c;            /* for error messages (token buffer always valid) */
 341   token_buffer[1] = 0;
 342
 343   switch (c)
 344     {
 345     case EOF:
 346       strcpy (token_buffer, "EOF");
 347       return ENDFILE;
 348
 349     case 'A':    case 'B':    case 'C':    case 'D':    case 'E':
 350     case 'F':    case 'G':    case 'H':    case 'I':    case 'J':
 351     case 'K':    case 'L':    case 'M':    case 'N':    case 'O':
 352     case 'P':    case 'Q':    case 'R':    case 'S':    case 'T':
 353     case 'U':    case 'V':    case 'W':    case 'X':    case 'Y':
 354     case 'Z':
 355     case 'a':    case 'b':    case 'c':    case 'd':    case 'e':
 356     case 'f':    case 'g':    case 'h':    case 'i':    case 'j':
 357     case 'k':    case 'l':    case 'm':    case 'n':    case 'o':
 358     case 'p':    case 'q':    case 'r':    case 's':    case 't':
 359     case 'u':    case 'v':    case 'w':    case 'x':    case 'y':
 360     case 'z':
 361     case '.':    case '_':
 362
 363       p = token_buffer;
 364       while (isalnum (c) || c == '_' || c == '.')
 365         {
 366           if (p == token_buffer + maxtoken)
 367             p = grow_token_buffer (p);
 368
 369           *p++ = c;
 370           c = getc (finput);
 371         }
 372
 373       *p = 0;
 374       ungetc (c, finput);
 375       symval = getsym (token_buffer);
 376       return IDENTIFIER;
 377
 378     case '0':    case '1':    case '2':    case '3':    case '4':
 379     case '5':    case '6':    case '7':    case '8':    case '9':
 380       {
 381         numval = 0;
 382
 383         p = token_buffer;
 384         while (isdigit (c))
 385           {
 386             if (p == token_buffer + maxtoken)
 387               p = grow_token_buffer (p);
 388
 389             *p++ = c;
 390             numval = numval * 10 + c - '0';
 391             c = getc (finput);
 392           }
 393         *p = 0;
 394         ungetc (c, finput);
 395         return NUMBER;
 396       }
 397
 398     case '\'':
 399       /* parse the literal token and compute character code in  code  */
 400
 401       translations = -1;
 402       {
 403         int code, discode;
 404         char discard[10], *dp;
 405
 406         p = token_buffer;
 407         *p++ = '\'';
 408         literalchar (&p, &code, '\'');
 409
 410         c = getc (finput);
 411         if (c != '\'')
 412           {
 413             complain (_("use \"...\" for multi-character literal tokens"));
 414             while (1)
 415               {
 416                 dp = discard;
 417                 if (!literalchar (&dp, &discode, '\''))
 418                   break;
 419               }
 420           }
 421         *p++ = '\'';
 422         *p = 0;
 423         symval = getsym (token_buffer);
 424         symval->class = STOKEN;
 425         if (!symval->user_token_number)
 426           symval->user_token_number = code;
 427         return IDENTIFIER;
 428       }
 429
 430     case '\"':
 431       /* parse the literal string token and treat as an identifier */
 432
 433       translations = -1;
 434       {
 435         int code;               /* ignored here */
 436         p = token_buffer;
 437         *p++ = '\"';
 438         while (literalchar (&p, &code, '\"'))   /* read up to and including " */
 439           {
 440             if (p >= token_buffer + maxtoken - 4)
 441               p = grow_token_buffer (p);
 442           }
 443         *p = 0;
 444
 445         symval = getsym (token_buffer);
 446         symval->class = STOKEN;
 447
 448         return IDENTIFIER;
 449       }
 450
 451     case ',':
 452       return COMMA;
 453
 454     case ':':
 455       return COLON;
 456
 457     case ';':
 458       return SEMICOLON;
 459
 460     case '|':
 461       return BAR;
 462
 463     case '{':
 464       return LEFT_CURLY;
 465
 466     case '=':
 467       do
 468         {
 469           c = getc (finput);
 470           if (c == '\n')
 471             lineno++;
 472         }
 473       while (c == ' ' || c == '\n' || c == '\t');
 474
 475       if (c == '{')
 476         {
 477           strcpy (token_buffer, "={");
 478           return LEFT_CURLY;
 479         }
 480       else
 481         {
 482           ungetc (c, finput);
 483           return ILLEGAL;
 484         }
 485
 486     case '<':
 487       p = token_buffer;
 488       c = getc (finput);
 489       while (c != '>')
 490         {
 491           if (c == EOF)
 492             fatal (_("unterminated type name at end of file"));
 493           if (c == '\n')
 494             {
 495               complain (_("unterminated type name"));
 496               ungetc (c, finput);
 497               break;
 498             }
 499
 500           if (p == token_buffer + maxtoken)
 501             p = grow_token_buffer (p);
 502
 503           *p++ = c;
 504           c = getc (finput);
 505         }
 506       *p = 0;
 507       return TYPENAME;
 508
 509
 510     case '%':
 511       return parse_percent_token ();
 512
 513     default:
 514       return ILLEGAL;
 515     }
 516 }
 517
 518 /* the following table dictates the action taken for the various %
 519    directives.  A setflag value causes the named flag to be set.  A
 520    retval action returns the code.  */
 521 struct percent_table_struct
 522 {
 523   const char *name;
 524   void *setflag;
 525   int retval;
 526 }
 527 percent_table[] =
 528 {
 529   { "token", NULL, TOKEN },
 530   { "term", NULL, TOKEN },
 531   { "nterm", NULL, NTERM },
 532   { "type", NULL, TYPE },
 533   { "guard", NULL, GUARD },
 534   { "union", NULL, UNION },
 535   { "expect", NULL, EXPECT },
 536   { "thong", NULL, THONG },
 537   { "start", NULL, START },
 538   { "left", NULL, LEFT },
 539   { "right", NULL, RIGHT },
 540   { "nonassoc", NULL, NONASSOC },
 541   { "binary", NULL, NONASSOC },
 542   { "semantic_parser", NULL, SEMANTIC_PARSER },
 543   { "pure_parser", NULL, PURE_PARSER },
 544   { "prec", NULL, PREC },
 545   { "no_lines", &nolinesflag, NOOP},    /* -l */
 546   { "raw", &rawtoknumflag, NOOP },      /* -r */
 547   { "token_table", &toknumflag, NOOP},  /* -k */
 548 #if 0
 549     /* These can be utilized after main is reoganized so
 550        open_files() is deferred 'til after read_declarations().
 551        But %{ and %union both put information into files
 552        that have to be opened before read_declarations().
 553      */
 554   { "yacc", &fixed_outfiles, NOOP},                     /* -y */
 555   { "fixed_output_files", &fixed_outfiles, NOOP},       /* -y */
 556   { "defines", &definesflag, NOOP},                     /* -d */
 557   { "no_parser", &noparserflag, NOOP},                  /* -n */
 558   { "output_file", &spec_outfile, SETOPT},              /* -o */
 559   { "file_prefix", &spec_file_prefix, SETOPT},          /* -b */
 560   { "name_prefix", &spec_name_prefix, SETOPT},          /* -p */
 561     /* These would be acceptable, but they do not affect processing */
 562   { "verbose", &verboseflag, NOOP},                     /* -v */
 563   { "debug", &debugflag, NOOP},                         /* -t */
 564 /*    {"help", <print usage stmt>, NOOP}, *//* -h */
 565 /*    {"version", <print version number> ,  NOOP}, *//* -V */
 566 #endif
 567   { NULL, NULL, ILLEGAL}
 568 };
 569
 570 /* Parse a token which starts with %.
 571    Assumes the % has already been read and discarded.  */
 572
 573 int
 574 parse_percent_token (void)
 575 {
 576   int c;
 577   char *p;
 578   struct percent_table_struct *tx;
 579
 580   p = token_buffer;
 581   c = getc (finput);
 582   *p++ = '%';
 583   *p++ = c;                     /* for error msg */
 584   *p = 0;
 585
 586   switch (c)
 587     {
 588     case '%':
 589       return TWO_PERCENTS;
 590
 591     case '{':
 592       return PERCENT_LEFT_CURLY;
 593
 594     case '<':
 595       return LEFT;
 596
 597     case '>':
 598       return RIGHT;
 599
 600     case '2':
 601       return NONASSOC;
 602
 603     case '0':
 604       return TOKEN;
 605
 606     case '=':
 607       return PREC;
 608     }
 609   if (!isalpha (c))
 610     return ILLEGAL;
 611
 612   p = token_buffer;
 613   *p++ = '%';
 614   while (isalpha (c) || c == '_' || c == '-')
 615     {
 616       if (p == token_buffer + maxtoken)
 617         p = grow_token_buffer (p);
 618
 619       if (c == '-')
 620         c = '_';
 621       *p++ = c;
 622       c = getc (finput);
 623     }
 624
 625   ungetc (c, finput);
 626
 627   *p = 0;
 628
 629   /* table lookup % directive */
 630   for (tx = percent_table; tx->name; tx++)
 631     if (strcmp (token_buffer + 1, tx->name) == 0)
 632       break;
 633   if (tx->retval == SETOPT)
 634     {
 635       *((char **) (tx->setflag)) = optarg;
 636       return NOOP;
 637     }
 638   if (tx->setflag)
 639     {
 640       *((int *) (tx->setflag)) = 1;
 641       return NOOP;
 642     }
 643   return tx->retval;
 644 }