src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21
  22 /*
  23    lex is the entry point.  It is called from reader.c.
  24    It returns one of the token-type codes defined in lex.h.
  25    When an identifier is seen, the code IDENTIFIER is returned
  26    and the name is looked up in the symbol table using symtab.c;
  27    symval is set to a pointer to the entry found.  */
  28
#include "system.h"
#include <limits.h>             /* for INT_MAX */
#include "getargs.h"
#include "files.h"
#include "getopt.h"             /* for optarg */
#include "symtab.h"
#include "lex.h"
#include "alloc.h"
#include "complain.h"
  37
  38 /*spec_outfile is declared in files.h, for -o */
  39
  40 extern int translations;
  41
  42 extern void init_lex PARAMS((void));
  43 extern char *grow_token_buffer PARAMS((char *));
  44 extern int skip_white_space PARAMS((void));
  45 extern void unlex PARAMS((int));
  46 extern int lex PARAMS((void));
  47 extern int parse_percent_token PARAMS((void));
  48
  49 static int safegetc PARAMS((FILE *));
  50 static int literalchar PARAMS((char **, int *, char));
  51
  52 /* functions from main.c */
  53 extern char *printable_version PARAMS((int));
  54
  55 /* Buffer for storing the current token.  */
  56 char *token_buffer;
  57
  58 /* Allocated size of token_buffer, not including space for terminator.  */
  59 int maxtoken;
  60
  61 bucket *symval;
  62 int numval;
  63
  64 static int unlexed;             /* these two describe a token to be reread */
  65 static bucket *unlexed_symval;  /* by the next call to lex */
  66
  67
  68 void
  69 init_lex (void)
  70 {
  71   maxtoken = 100;
  72   token_buffer = NEW2 (maxtoken + 1, char);
  73   unlexed = -1;
  74 }
  75
  76
  77 char *
  78 grow_token_buffer (char *p)
  79 {
  80   int offset = p - token_buffer;
  81   maxtoken *= 2;
  82   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  83   return token_buffer + offset;
  84 }
  85
  86
  87 int
  88 skip_white_space (void)
  89 {
  90   register int c;
  91   register int inside;
  92
  93   c = getc(finput);
  94
  95   for (;;)
  96     {
  97       int cplus_comment;
  98
  99       switch (c)
 100         {
 101         case '/':
 102           c = getc(finput);
 103           if (c != '*' && c != '/')
 104             {
 105               complain (_("unexpected `/' found and ignored"));
 106               break;
 107             }
 108           cplus_comment = (c == '/');
 109
 110           c = getc(finput);
 111
 112           inside = 1;
 113           while (inside)
 114             {
 115               if (!cplus_comment && c == '*')
 116                 {
 117                   while (c == '*')
 118                     c = getc(finput);
 119
 120                   if (c == '/')
 121                     {
 122                       inside = 0;
 123                       c = getc(finput);
 124                     }
 125                 }
 126               else if (c == '\n')
 127                 {
 128                   lineno++;
 129                   if (cplus_comment)
 130                     inside = 0;
 131                   c = getc(finput);
 132                 }
 133               else if (c == EOF)
 134                 fatal (_("unterminated comment"));
 135               else
 136                 c = getc(finput);
 137             }
 138
 139           break;
 140
 141         case '\n':
 142           lineno++;
 143
 144         case ' ':
 145         case '\t':
 146         case '\f':
 147           c = getc(finput);
 148           break;
 149
 150         default:
 151           return c;
 152         }
 153     }
 154 }
 155
 156 /* do a getc, but give error message if EOF encountered */
 157 static int
 158 safegetc (FILE *f)
 159 {
 160   register int c = getc(f);
 161   if (c == EOF)
 162     fatal (_("unexpected end of file"));
 163   return c;
 164 }
 165
 166 /* read one literal character from finput.  process \ escapes.
 167    append the normalized string version of the char to *pp.
 168    assign the character code to *pcode
 169    return 1 unless the character is an unescaped `term' or \n
 170         report error for \n
 171 */
 172 static int
 173 literalchar (char **pp, int *pcode, char term)
 174 {
 175   register int c;
 176   register char *p;
 177   register int code;
 178   int wasquote = 0;
 179
 180   c = safegetc(finput);
 181   if (c == '\n')
 182     {
 183       complain (_("unescaped newline in constant"));
 184       ungetc(c, finput);
 185       code = '?';
 186       wasquote = 1;
 187     }
 188   else if (c != '\\')
 189     {
 190       code = c;
 191       if (c == term)
 192         wasquote = 1;
 193     }
 194   else
 195     {
 196       c = safegetc(finput);
 197       if (c == 't')  code = '\t';
 198       else if (c == 'n')  code = '\n';
 199       else if (c == 'a')  code = '\007';
 200       else if (c == 'r')  code = '\r';
 201       else if (c == 'f')  code = '\f';
 202       else if (c == 'b')  code = '\b';
 203       else if (c == 'v')  code = '\013';
 204       else if (c == '\\')  code = '\\';
 205       else if (c == '\'')  code = '\'';
 206       else if (c == '\"')  code = '\"';
 207       else if (c <= '7' && c >= '0')
 208         {
 209           code = 0;
 210           while (c <= '7' && c >= '0')
 211             {
 212               code = (code * 8) + (c - '0');
 213               if (code >= 256 || code < 0)
 214                 {
 215                   complain (_("octal value outside range 0...255: `\\%o'"),
 216                             code);
 217                   code &= 0xFF;
 218                   break;
 219                 }
 220               c = safegetc(finput);
 221             }
 222           ungetc(c, finput);
 223         }
 224       else if (c == 'x')
 225         {
 226           c = safegetc(finput);
 227           code = 0;
 228           while (1)
 229             {
 230               if (c >= '0' && c <= '9')
 231                 code *= 16,  code += c - '0';
 232               else if (c >= 'a' && c <= 'f')
 233                 code *= 16,  code += c - 'a' + 10;
 234               else if (c >= 'A' && c <= 'F')
 235                 code *= 16,  code += c - 'A' + 10;
 236               else
 237                 break;
 238               if (code >= 256 || code<0)
 239                 {
 240                   complain (_("hexadecimal value above 255: `\\x%x'"),
 241                             code);
 242                   code &= 0xFF;
 243                   break;
 244                 }
 245               c = safegetc(finput);
 246             }
 247           ungetc(c, finput);
 248         }
 249       else
 250         {
 251           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 252                     printable_version(c));
 253           code = '?';
 254         }
 255     } /* has \ */
 256
 257   /* now fill token_buffer with the canonical name for this character
 258      as a literal token.  Do not use what the user typed,
 259      so that `\012' and `\n' can be interchangeable.  */
 260
 261   p = *pp;
 262   if (code == term && wasquote)
 263     *p++ = code;
 264   else if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 265   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 266   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 267   else if (code >= 040 && code < 0177)
 268     *p++ = code;
 269   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 270   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 271   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 272   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 273   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 274   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 275   else
 276     {
 277       *p++ = '\\';
 278       *p++ = code / 0100 + '0';
 279       *p++ = ((code / 010) & 07) + '0';
 280       *p++ = (code & 07) + '0';
 281     }
 282   *pp = p;
 283   *pcode = code;
 284   return  ! wasquote;
 285 }
 286
 287
 288 void
 289 unlex (int token)
 290 {
 291   unlexed = token;
 292   unlexed_symval = symval;
 293 }
 294
 295
 296 int
 297 lex (void)
 298 {
 299   register int c;
 300   char *p;
 301
 302   if (unlexed >= 0)
 303     {
 304       symval = unlexed_symval;
 305       c = unlexed;
 306       unlexed = -1;
 307       return c;
 308     }
 309
 310   c = skip_white_space();
 311   *token_buffer = c;    /* for error messages (token buffer always valid) */
 312   token_buffer[1] = 0;
 313
 314   switch (c)
 315     {
 316     case EOF:
 317       strcpy(token_buffer, "EOF");
 318       return ENDFILE;
 319
 320     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 321     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 322     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 323     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 324     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 325     case 'Z':
 326     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 327     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 328     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 329     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 330     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 331     case 'z':
 332     case '.':  case '_':
 333       p = token_buffer;
 334       while (isalnum(c) || c == '_' || c == '.')
 335         {
 336           if (p == token_buffer + maxtoken)
 337             p = grow_token_buffer(p);
 338
 339           *p++ = c;
 340           c = getc(finput);
 341         }
 342
 343       *p = 0;
 344       ungetc(c, finput);
 345       symval = getsym(token_buffer);
 346       return IDENTIFIER;
 347
 348     case '0':  case '1':  case '2':  case '3':  case '4':
 349     case '5':  case '6':  case '7':  case '8':  case '9':
 350       {
 351         numval = 0;
 352
 353         p = token_buffer;
 354         while (isdigit(c))
 355           {
 356             if (p == token_buffer + maxtoken)
 357               p = grow_token_buffer(p);
 358
 359             *p++ = c;
 360             numval = numval*10 + c - '0';
 361             c = getc(finput);
 362           }
 363         *p = 0;
 364         ungetc(c, finput);
 365         return NUMBER;
 366       }
 367
 368     case '\'':
 369
 370       /* parse the literal token and compute character code in  code  */
 371
 372       translations = -1;
 373       {
 374         int code, discode;
 375         char discard[10], *dp;
 376
 377         p = token_buffer;
 378         *p++ = '\'';
 379         literalchar(&p, &code, '\'');
 380
 381         c = getc(finput);
 382         if (c != '\'')
 383           {
 384             complain (_("use \"...\" for multi-character literal tokens"));
 385             while (1)
 386               {
 387                 dp = discard;
 388                 if (! literalchar(&dp, &discode, '\''))
 389                   break;
 390               }
 391           }
 392         *p++ = '\'';
 393         *p = 0;
 394         symval = getsym(token_buffer);
 395         symval->class = STOKEN;
 396         if (! symval->user_token_number)
 397           symval->user_token_number = code;
 398         return IDENTIFIER;
 399       }
 400
 401     case '\"':
 402
 403       /* parse the literal string token and treat as an identifier */
 404
 405       translations = -1;
 406       {
 407         int code;       /* ignored here */
 408         p = token_buffer;
 409         *p++ = '\"';
 410         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 411           {
 412             if (p >= token_buffer + maxtoken - 4)
 413               p = grow_token_buffer(p);
 414           }
 415         *p = 0;
 416
 417         symval = getsym(token_buffer);
 418         symval->class = STOKEN;
 419
 420         return IDENTIFIER;
 421       }
 422
 423     case ',':
 424       return COMMA;
 425
 426     case ':':
 427       return COLON;
 428
 429     case ';':
 430       return SEMICOLON;
 431
 432     case '|':
 433       return BAR;
 434
 435     case '{':
 436       return LEFT_CURLY;
 437
 438     case '=':
 439       do
 440         {
 441           c = getc(finput);
 442           if (c == '\n') lineno++;
 443         }
 444       while(c==' ' || c=='\n' || c=='\t');
 445
 446       if (c == '{')
 447         {
 448           strcpy(token_buffer, "={");
 449           return LEFT_CURLY;
 450         }
 451       else
 452         {
 453           ungetc(c, finput);
 454           return ILLEGAL;
 455         }
 456
 457     case '<':
 458       p = token_buffer;
 459       c = getc(finput);
 460       while (c != '>')
 461         {
 462           if (c == EOF)
 463             fatal (_("unterminated type name at end of file"));
 464           if (c == '\n')
 465             {
 466               complain (_("unterminated type name"));
 467               ungetc(c, finput);
 468               break;
 469             }
 470
 471           if (p == token_buffer + maxtoken)
 472             p = grow_token_buffer(p);
 473
 474           *p++ = c;
 475           c = getc(finput);
 476         }
 477       *p = 0;
 478       return TYPENAME;
 479
 480
 481     case '%':
 482       return parse_percent_token();
 483
 484     default:
 485       return ILLEGAL;
 486     }
 487 }
 488
 489 /* the following table dictates the action taken for the various
 490         % directives.  A setflag value causes the named flag to be
 491         set.  A retval action returns the code.
 492 */
 493 struct percent_table_struct {
 494         const char *name;
 495         void *setflag;
 496         int retval;
 497 } percent_table[] =
 498 {
 499   {"token", NULL, TOKEN},
 500   {"term", NULL, TOKEN},
 501   {"nterm", NULL, NTERM},
 502   {"type", NULL, TYPE},
 503   {"guard", NULL, GUARD},
 504   {"union", NULL, UNION},
 505   {"expect", NULL, EXPECT},
 506   {"thong", NULL, THONG},
 507   {"start", NULL, START},
 508   {"left", NULL, LEFT},
 509   {"right", NULL, RIGHT},
 510   {"nonassoc", NULL, NONASSOC},
 511   {"binary", NULL, NONASSOC},
 512   {"semantic_parser", NULL, SEMANTIC_PARSER},
 513   {"pure_parser", NULL, PURE_PARSER},
 514   {"prec", NULL, PREC},
 515
 516   {"no_lines", &nolinesflag, NOOP}, /* -l */
 517   {"raw", &rawtoknumflag, NOOP}, /* -r */
 518   {"token_table", &toknumflag, NOOP}, /* -k */
 519
 520 #if 0
 521   /* These can be utilized after main is reoganized so
 522      open_files() is deferred 'til after read_declarations().
 523      But %{ and %union both put information into files
 524      that have to be opened before read_declarations().
 525      */
 526   {"yacc", &fixed_outfiles, NOOP}, /* -y */
 527   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
 528   {"defines", &definesflag, NOOP}, /* -d */
 529   {"no_parser", &noparserflag, NOOP}, /* -n */
 530   {"output_file", &spec_outfile, SETOPT}, /* -o */
 531   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
 532   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
 533
 534   /* These would be acceptable, but they do not affect processing */
 535   {"verbose", &verboseflag, NOOP}, /* -v */
 536   {"debug", &debugflag, NOOP},  /* -t */
 537   /*    {"help", <print usage stmt>, NOOP},*/   /* -h */
 538   /*    {"version", <print version number> ,  NOOP},*/  /* -V */
 539 #endif
 540
 541   {NULL, NULL, ILLEGAL}
 542 };
 543
 544 /* Parse a token which starts with %.
 545    Assumes the % has already been read and discarded.  */
 546
 547 int
 548 parse_percent_token (void)
 549 {
 550   register int c;
 551   register char *p;
 552   register struct percent_table_struct *tx;
 553
 554   p = token_buffer;
 555   c = getc(finput);
 556   *p++ = '%';
 557   *p++ = c;     /* for error msg */
 558   *p = 0;
 559
 560   switch (c)
 561     {
 562     case '%':
 563       return TWO_PERCENTS;
 564
 565     case '{':
 566       return PERCENT_LEFT_CURLY;
 567
 568     case '<':
 569       return LEFT;
 570
 571     case '>':
 572       return RIGHT;
 573
 574     case '2':
 575       return NONASSOC;
 576
 577     case '0':
 578       return TOKEN;
 579
 580     case '=':
 581       return PREC;
 582     }
 583   if (!isalpha(c))
 584     return ILLEGAL;
 585
 586   p = token_buffer;
 587   *p++ = '%';
 588   while (isalpha(c) || c == '_' || c == '-')
 589     {
 590       if (p == token_buffer + maxtoken)
 591         p = grow_token_buffer(p);
 592
 593       if (c == '-') c = '_';
 594       *p++ = c;
 595       c = getc(finput);
 596     }
 597
 598   ungetc(c, finput);
 599
 600   *p = 0;
 601
 602   /* table lookup % directive */
 603   for (tx = percent_table; tx->name; tx++)
 604     if (strcmp(token_buffer+1, tx->name) == 0)
 605       break;
 606   if (tx->retval == SETOPT)
 607     {
 608       *((char **)(tx->setflag)) = optarg;
 609       return NOOP;
 610     }
 611   if (tx->setflag)
 612     {
 613       *((int *)(tx->setflag)) = 1;
 614       return NOOP;
 615     }
 616   return tx->retval;
 617 }