src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc.
   3
   4 This file is part of Bison, the GNU Compiler Compiler.
   5
   6 Bison is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 Bison is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Bison; see the file COPYING.  If not, write to
  18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19 Boston, MA 02111-1307, USA.  */
  20
  21
  22 /*
  23    lex is the entry point.  It is called from reader.c.
  24    It returns one of the token-type codes defined in lex.h.
  25    When an identifier is seen, the code IDENTIFIER is returned
  26    and the name is looked up in the symbol table using symtab.c;
  27    symval is set to a pointer to the entry found.  */
  28
#include <limits.h>
#include <stdio.h>
#include "system.h"
#include "files.h"
#include "getopt.h"             /* for optarg */
#include "symtab.h"
#include "lex.h"
#include "alloc.h"
  36
  37 /* flags set by % directives */
  38 extern int definesflag;         /* for -d */
  39 extern int toknumflag;          /* for -k */
  40 extern int noparserflag;        /* for -n */
  41 extern int fixed_outfiles;      /* for -y */
  42 extern int nolinesflag;         /* for -l */
  43 extern int rawtoknumflag;       /* for -r */
  44 extern int verboseflag; /* for -v */
  45 extern int debugflag;           /* for -t */
  46 extern char *spec_name_prefix;  /* for -p */
  47 extern char *spec_file_prefix;  /* for -b */
  48 /*spec_outfile is declared in files.h, for -o */
  49
  50 extern int lineno;
  51 extern int translations;
  52
  53 void init_lex PARAMS((void));
  54 char *grow_token_buffer PARAMS((char *));
  55 int skip_white_space PARAMS((void));
  56 int safegetc PARAMS((FILE *));
  57 int literalchar PARAMS((char **, int *, char));
  58 void unlex PARAMS((int));
  59 int lex PARAMS((void));
  60 int parse_percent_token PARAMS((void));
  61
  62 /* functions from main.c */
  63 extern char *printable_version PARAMS((int));
  64 extern void fatal PARAMS((char *));
  65 extern void warn PARAMS((char *));
  66 extern void warni PARAMS((char *, int));
  67 extern void warns PARAMS((char *, char *));
  68
  69 /* Buffer for storing the current token.  */
  70 char *token_buffer;
  71
  72 /* Allocated size of token_buffer, not including space for terminator.  */
  73 int maxtoken;
  74
  75 bucket *symval;
  76 int numval;
  77
  78 static int unlexed;             /* these two describe a token to be reread */
  79 static bucket *unlexed_symval;  /* by the next call to lex */
  80
  81
  82 void
  83 init_lex (void)
  84 {
  85   maxtoken = 100;
  86   token_buffer = NEW2 (maxtoken + 1, char);
  87   unlexed = -1;
  88 }
  89
  90
  91 char *
  92 grow_token_buffer (char *p)
  93 {
  94   int offset = p - token_buffer;
  95   maxtoken *= 2;
  96   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  97   return token_buffer + offset;
  98 }
  99
 100
 101 int
 102 skip_white_space (void)
 103 {
 104   register int c;
 105   register int inside;
 106
 107   c = getc(finput);
 108
 109   for (;;)
 110     {
 111       int cplus_comment;
 112
 113       switch (c)
 114         {
 115         case '/':
 116           c = getc(finput);
 117           if (c != '*' && c != '/')
 118             {
 119               warn(_("unexpected `/' found and ignored"));
 120               break;
 121             }
 122           cplus_comment = (c == '/');
 123
 124           c = getc(finput);
 125
 126           inside = 1;
 127           while (inside)
 128             {
 129               if (!cplus_comment && c == '*')
 130                 {
 131                   while (c == '*')
 132                     c = getc(finput);
 133
 134                   if (c == '/')
 135                     {
 136                       inside = 0;
 137                       c = getc(finput);
 138                     }
 139                 }
 140               else if (c == '\n')
 141                 {
 142                   lineno++;
 143                   if (cplus_comment)
 144                     inside = 0;
 145                   c = getc(finput);
 146                 }
 147               else if (c == EOF)
 148                 fatal(_("unterminated comment"));
 149               else
 150                 c = getc(finput);
 151             }
 152
 153           break;
 154
 155         case '\n':
 156           lineno++;
 157
 158         case ' ':
 159         case '\t':
 160         case '\f':
 161           c = getc(finput);
 162           break;
 163
 164         default:
 165           return (c);
 166         }
 167     }
 168 }
 169
 170 /* do a getc, but give error message if EOF encountered */
 171 int
 172 safegetc (FILE *f)
 173 {
 174   register int c = getc(f);
 175   if (c == EOF)
 176     fatal(_("Unexpected end of file"));
 177   return c;
 178 }
 179
 180 /* read one literal character from finput.  process \ escapes.
 181    append the normalized string version of the char to *pp.
 182    assign the character code to *pcode
 183    return 1 unless the character is an unescaped `term' or \n
 184         report error for \n
 185 */
 186 int
 187 literalchar (char **pp, int *pcode, char term)
 188 {
 189   register int c;
 190   register char *p;
 191   register int code;
 192   int wasquote = 0;
 193
 194   c = safegetc(finput);
 195   if (c == '\n')
 196     {
 197       warn(_("unescaped newline in constant"));
 198       ungetc(c, finput);
 199       code = '?';
 200       wasquote = 1;
 201     }
 202   else if (c != '\\')
 203     {
 204       code = c;
 205       if (c == term)
 206         wasquote = 1;
 207     }
 208   else
 209     {
 210       c = safegetc(finput);
 211       if (c == 't')  code = '\t';
 212       else if (c == 'n')  code = '\n';
 213       else if (c == 'a')  code = '\007';
 214       else if (c == 'r')  code = '\r';
 215       else if (c == 'f')  code = '\f';
 216       else if (c == 'b')  code = '\b';
 217       else if (c == 'v')  code = '\013';
 218       else if (c == '\\')  code = '\\';
 219       else if (c == '\'')  code = '\'';
 220       else if (c == '\"')  code = '\"';
 221       else if (c <= '7' && c >= '0')
 222         {
 223           code = 0;
 224           while (c <= '7' && c >= '0')
 225             {
 226               code = (code * 8) + (c - '0');
 227               if (code >= 256 || code < 0)
 228                 {
 229                   warni(_("octal value outside range 0...255: `\\%o'"), code);
 230                   code &= 0xFF;
 231                   break;
 232                 }
 233               c = safegetc(finput);
 234             }
 235           ungetc(c, finput);
 236         }
 237       else if (c == 'x')
 238         {
 239           c = safegetc(finput);
 240           code = 0;
 241           while (1)
 242             {
 243               if (c >= '0' && c <= '9')
 244                 code *= 16,  code += c - '0';
 245               else if (c >= 'a' && c <= 'f')
 246                 code *= 16,  code += c - 'a' + 10;
 247               else if (c >= 'A' && c <= 'F')
 248                 code *= 16,  code += c - 'A' + 10;
 249               else
 250                 break;
 251               if (code >= 256 || code<0)
 252                 {
 253                   warni(_("hexadecimal value above 255: `\\x%x'"), code);
 254                   code &= 0xFF;
 255                   break;
 256                 }
 257               c = safegetc(finput);
 258             }
 259           ungetc(c, finput);
 260         }
 261       else
 262         {
 263           warns (_("unknown escape sequence: `\\' followed by `%s'"),
 264                  printable_version(c));
 265           code = '?';
 266         }
 267     } /* has \ */
 268
 269   /* now fill token_buffer with the canonical name for this character
 270      as a literal token.  Do not use what the user typed,
 271      so that `\012' and `\n' can be interchangeable.  */
 272
 273   p = *pp;
 274   if (code == term && wasquote)
 275     *p++ = code;
 276   else if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 277   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 278   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 279   else if (code >= 040 && code < 0177)
 280     *p++ = code;
 281   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 282   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 283   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 284   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 285   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 286   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 287   else
 288     {
 289       *p++ = '\\';
 290       *p++ = code / 0100 + '0';
 291       *p++ = ((code / 010) & 07) + '0';
 292       *p++ = (code & 07) + '0';
 293     }
 294   *pp = p;
 295   *pcode = code;
 296   return  ! wasquote;
 297 }
 298
 299
 300 void
 301 unlex (int token)
 302 {
 303   unlexed = token;
 304   unlexed_symval = symval;
 305 }
 306
 307
 308 int
 309 lex (void)
 310 {
 311   register int c;
 312   char *p;
 313
 314   if (unlexed >= 0)
 315     {
 316       symval = unlexed_symval;
 317       c = unlexed;
 318       unlexed = -1;
 319       return (c);
 320     }
 321
 322   c = skip_white_space();
 323   *token_buffer = c;    /* for error messages (token buffer always valid) */
 324   token_buffer[1] = 0;
 325
 326   switch (c)
 327     {
 328     case EOF:
 329       strcpy(token_buffer, "EOF");
 330       return (ENDFILE);
 331
 332     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 333     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 334     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 335     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 336     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 337     case 'Z':
 338     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 339     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 340     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 341     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 342     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 343     case 'z':
 344     case '.':  case '_':
 345       p = token_buffer;
 346       while (isalnum(c) || c == '_' || c == '.')
 347         {
 348           if (p == token_buffer + maxtoken)
 349             p = grow_token_buffer(p);
 350
 351           *p++ = c;
 352           c = getc(finput);
 353         }
 354
 355       *p = 0;
 356       ungetc(c, finput);
 357       symval = getsym(token_buffer);
 358       return (IDENTIFIER);
 359
 360     case '0':  case '1':  case '2':  case '3':  case '4':
 361     case '5':  case '6':  case '7':  case '8':  case '9':
 362       {
 363         numval = 0;
 364
 365         p = token_buffer;
 366         while (isdigit(c))
 367           {
 368             if (p == token_buffer + maxtoken)
 369               p = grow_token_buffer(p);
 370
 371             *p++ = c;
 372             numval = numval*10 + c - '0';
 373             c = getc(finput);
 374           }
 375         *p = 0;
 376         ungetc(c, finput);
 377         return (NUMBER);
 378       }
 379
 380     case '\'':
 381
 382       /* parse the literal token and compute character code in  code  */
 383
 384       translations = -1;
 385       {
 386         int code, discode;
 387         char discard[10], *dp;
 388
 389         p = token_buffer;
 390         *p++ = '\'';
 391         literalchar(&p, &code, '\'');
 392
 393         c = getc(finput);
 394         if (c != '\'')
 395           {
 396             warn(_("use \"...\" for multi-character literal tokens"));
 397             while (1)
 398               {
 399                 dp = discard;
 400                 if (! literalchar(&dp, &discode, '\''))
 401                   break;
 402               }
 403           }
 404         *p++ = '\'';
 405         *p = 0;
 406         symval = getsym(token_buffer);
 407         symval->class = STOKEN;
 408         if (! symval->user_token_number)
 409           symval->user_token_number = code;
 410         return (IDENTIFIER);
 411       }
 412
 413     case '\"':
 414
 415       /* parse the literal string token and treat as an identifier */
 416
 417       translations = -1;
 418       {
 419         int code;       /* ignored here */
 420         p = token_buffer;
 421         *p++ = '\"';
 422         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 423           {
 424             if (p >= token_buffer + maxtoken - 4)
 425               p = grow_token_buffer(p);
 426           }
 427         *p = 0;
 428
 429         symval = getsym(token_buffer);
 430         symval->class = STOKEN;
 431
 432         return (IDENTIFIER);
 433       }
 434
 435     case ',':
 436       return (COMMA);
 437
 438     case ':':
 439       return (COLON);
 440
 441     case ';':
 442       return (SEMICOLON);
 443
 444     case '|':
 445       return (BAR);
 446
 447     case '{':
 448       return (LEFT_CURLY);
 449
 450     case '=':
 451       do
 452         {
 453           c = getc(finput);
 454           if (c == '\n') lineno++;
 455         }
 456       while(c==' ' || c=='\n' || c=='\t');
 457
 458       if (c == '{')
 459         {
 460           strcpy(token_buffer, "={");
 461           return(LEFT_CURLY);
 462         }
 463       else
 464         {
 465           ungetc(c, finput);
 466           return(ILLEGAL);
 467         }
 468
 469     case '<':
 470       p = token_buffer;
 471       c = getc(finput);
 472       while (c != '>')
 473         {
 474           if (c == EOF)
 475             fatal(_("unterminated type name at end of file"));
 476           if (c == '\n')
 477             {
 478               warn(_("unterminated type name"));
 479               ungetc(c, finput);
 480               break;
 481             }
 482
 483           if (p == token_buffer + maxtoken)
 484             p = grow_token_buffer(p);
 485
 486           *p++ = c;
 487           c = getc(finput);
 488         }
 489       *p = 0;
 490       return (TYPENAME);
 491
 492
 493     case '%':
 494       return (parse_percent_token());
 495
 496     default:
 497       return (ILLEGAL);
 498     }
 499 }
 500
 501 /* the following table dictates the action taken for the various
 502         % directives.  A setflag value causes the named flag to be
 503         set.  A retval action returns the code.
 504 */
 505 struct percent_table_struct {
 506         char *name;
 507         void *setflag;
 508         int retval;
 509 } percent_table[] =
 510 {
 511   {"token", NULL, TOKEN},
 512   {"term", NULL, TOKEN},
 513   {"nterm", NULL, NTERM},
 514   {"type", NULL, TYPE},
 515   {"guard", NULL, GUARD},
 516   {"union", NULL, UNION},
 517   {"expect", NULL, EXPECT},
 518   {"thong", NULL, THONG},
 519   {"start", NULL, START},
 520   {"left", NULL, LEFT},
 521   {"right", NULL, RIGHT},
 522   {"nonassoc", NULL, NONASSOC},
 523   {"binary", NULL, NONASSOC},
 524   {"semantic_parser", NULL, SEMANTIC_PARSER},
 525   {"pure_parser", NULL, PURE_PARSER},
 526   {"prec", NULL, PREC},
 527
 528   {"no_lines", &nolinesflag, NOOP}, /* -l */
 529   {"raw", &rawtoknumflag, NOOP}, /* -r */
 530   {"token_table", &toknumflag, NOOP}, /* -k */
 531
 532 #if 0
 533   /* These can be utilized after main is reoganized so
 534      open_files() is deferred 'til after read_declarations().
 535      But %{ and %union both put information into files
 536      that have to be opened before read_declarations().
 537      */
 538   {"yacc", &fixed_outfiles, NOOP}, /* -y */
 539   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
 540   {"defines", &definesflag, NOOP}, /* -d */
 541   {"no_parser", &noparserflag, NOOP}, /* -n */
 542   {"output_file", &spec_outfile, SETOPT}, /* -o */
 543   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
 544   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
 545
 546   /* These would be acceptable, but they do not affect processing */
 547   {"verbose", &verboseflag, NOOP}, /* -v */
 548   {"debug", &debugflag, NOOP},  /* -t */
 549   /*    {"help", <print usage stmt>, NOOP},*/   /* -h */
 550   /*    {"version", <print version number> ,  NOOP},*/  /* -V */
 551 #endif
 552
 553   {NULL, NULL, ILLEGAL}
 554 };
 555
 556 /* Parse a token which starts with %.
 557    Assumes the % has already been read and discarded.  */
 558
 559 int
 560 parse_percent_token (void)
 561 {
 562   register int c;
 563   register char *p;
 564   register struct percent_table_struct *tx;
 565
 566   p = token_buffer;
 567   c = getc(finput);
 568   *p++ = '%';
 569   *p++ = c;     /* for error msg */
 570   *p = 0;
 571
 572   switch (c)
 573     {
 574     case '%':
 575       return (TWO_PERCENTS);
 576
 577     case '{':
 578       return (PERCENT_LEFT_CURLY);
 579
 580     case '<':
 581       return (LEFT);
 582
 583     case '>':
 584       return (RIGHT);
 585
 586     case '2':
 587       return (NONASSOC);
 588
 589     case '0':
 590       return (TOKEN);
 591
 592     case '=':
 593       return (PREC);
 594     }
 595   if (!isalpha(c))
 596     return (ILLEGAL);
 597
 598   p = token_buffer;
 599   *p++ = '%';
 600   while (isalpha(c) || c == '_' || c == '-')
 601     {
 602       if (p == token_buffer + maxtoken)
 603         p = grow_token_buffer(p);
 604
 605       if (c == '-') c = '_';
 606       *p++ = c;
 607       c = getc(finput);
 608     }
 609
 610   ungetc(c, finput);
 611
 612   *p = 0;
 613
 614   /* table lookup % directive */
 615   for (tx = percent_table; tx->name; tx++)
 616     if (strcmp(token_buffer+1, tx->name) == 0)
 617       break;
 618   if (tx->retval == SETOPT)
 619     {
 620       *((char **)(tx->setflag)) = optarg;
 621       return NOOP;
 622     }
 623   if (tx->setflag)
 624     {
 625       *((int *)(tx->setflag)) = 1;
 626       return NOOP;
 627     }
 628   return tx->retval;
 629 }