src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc.
   3
   4 This file is part of Bison, the GNU Compiler Compiler.
   5
   6 Bison is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 Bison is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Bison; see the file COPYING.  If not, write to
  18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20
  21 /*
  22    lex is the entry point.  It is called from reader.c.
  23    It returns one of the token-type codes defined in lex.h.
  24    When an identifier is seen, the code IDENTIFIER is returned
  25    and the name is looked up in the symbol table using symtab.c;
  26    symval is set to a pointer to the entry found.  */
  27
  28 #include <stdio.h>
  29 #include <ctype.h>
  30 #include "system.h"
  31 #include "files.h"
  32 #include "getopt.h"             /* for optarg */
  33 #include "symtab.h"
  34 #include "lex.h"
  35 #include "new.h"
  36
  37 /* flags set by % directives */
  38 extern int definesflag;         /* for -d */
  39 extern int toknumflag;          /* for -k */
  40 extern int noparserflag;        /* for -n */
  41 extern int fixed_outfiles;      /* for -y */
  42 extern int nolinesflag;         /* for -l */
  43 extern int rawtoknumflag;       /* for -r */
  44 extern int verboseflag; /* for -v */
  45 extern int debugflag;           /* for -t */
  46 extern char *spec_name_prefix;  /* for -p */
  47 extern char *spec_file_prefix;  /* for -b */
  48 /*spec_outfile is declared in files.h, for -o */
  49
  50 extern int lineno;
  51 extern int translations;
  52
  53 int parse_percent_token();
  54
  55 /* functions from main.c */
  56 extern char *printable_version();
  57 extern void fatal();
  58 extern void warni();
  59 extern void warn();
  60
  61 /* Buffer for storing the current token.  */
  62 char *token_buffer;
  63
  64 /* Allocated size of token_buffer, not including space for terminator.  */
  65 static int maxtoken;
  66
  67 bucket *symval;
  68 int numval;
  69
  70 static int unlexed;             /* these two describe a token to be reread */
  71 static bucket *unlexed_symval;  /* by the next call to lex */
  72
  73
  74 void
  75 init_lex()
  76 {
  77   maxtoken = 100;
  78   token_buffer = NEW2 (maxtoken + 1, char);
  79   unlexed = -1;
  80 }
  81
  82
  83 static char *
  84 grow_token_buffer (p)
  85      char *p;
  86 {
  87   int offset = p - token_buffer;
  88   maxtoken *= 2;
  89   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  90   return token_buffer + offset;
  91 }
  92
  93
  94 int
  95 skip_white_space()
  96 {
  97   register int c;
  98   register int inside;
  99
 100   c = getc(finput);
 101
 102   for (;;)
 103     {
 104       int cplus_comment;
 105
 106       switch (c)
 107         {
 108         case '/':
 109           c = getc(finput);
 110           if (c != '*' && c != '/')
 111             {
 112               warn("unexpected `/' found and ignored");
 113               break;
 114             }
 115           cplus_comment = (c == '/');
 116
 117           c = getc(finput);
 118
 119           inside = 1;
 120           while (inside)
 121             {
 122               if (!cplus_comment && c == '*')
 123                 {
 124                   while (c == '*')
 125                     c = getc(finput);
 126
 127                   if (c == '/')
 128                     {
 129                       inside = 0;
 130                       c = getc(finput);
 131                     }
 132                 }
 133               else if (c == '\n')
 134                 {
 135                   lineno++;
 136                   if (cplus_comment)
 137                     inside = 0;
 138                   c = getc(finput);
 139                 }
 140               else if (c == EOF)
 141                 fatal("unterminated comment");
 142               else
 143                 c = getc(finput);
 144             }
 145
 146           break;
 147
 148         case '\n':
 149           lineno++;
 150
 151         case ' ':
 152         case '\t':
 153         case '\f':
 154           c = getc(finput);
 155           break;
 156
 157         default:
 158           return (c);
 159         }
 160     }
 161 }
 162
 163 /* do a getc, but give error message if EOF encountered */
 164 int
 165 safegetc(f)
 166   FILE *f;
 167 {
 168   register int c = getc(f);
 169   if (c == EOF)
 170     fatal("Unexpected end of file");
 171   return c;
 172 }
 173
 174 /* read one literal character from finput.  process \ escapes.
 175    append the normalized string version of the char to *pp.
 176    assign the character code to *pcode
 177    return 1 unless the character is an unescaped `term' or \n
 178         report error for \n
 179 */
 180 int
 181 literalchar(pp, pcode, term)
 182   char **pp;
 183   int *pcode;
 184   char term;
 185 {
 186   register int c;
 187   register char *p;
 188   register int code;
 189   int wasquote = 0;
 190
 191   c = safegetc(finput);
 192   if (c == '\n')
 193     {
 194       warn("unescaped newline in constant");
 195       ungetc(c, finput);
 196       code = '?';
 197       wasquote = 1;
 198     }
 199   else if (c != '\\')
 200     {
 201       code = c;
 202       if (c == term)
 203         wasquote = 1;
 204     }
 205   else
 206     {
 207       c = safegetc(finput);
 208       if (c == 't')  code = '\t';
 209       else if (c == 'n')  code = '\n';
 210       else if (c == 'a')  code = '\007';
 211       else if (c == 'r')  code = '\r';
 212       else if (c == 'f')  code = '\f';
 213       else if (c == 'b')  code = '\b';
 214       else if (c == 'v')  code = 013;
 215       else if (c == '\\')  code = '\\';
 216       else if (c == '\'')  code = '\'';
 217       else if (c == '\"')  code = '\"';
 218       else if (c <= '7' && c >= '0')
 219         {
 220           code = 0;
 221           while (c <= '7' && c >= '0')
 222             {
 223               code = (code * 8) + (c - '0');
 224               if (code >= 256 || code < 0)
 225                 {
 226                   warni("octal value outside range 0...255: `\\%o'", code);
 227                   code &= 0xFF;
 228                   break;
 229                 }
 230               c = safegetc(finput);
 231             }
 232           ungetc(c, finput);
 233         }
 234       else if (c == 'x')
 235         {
 236           c = safegetc(finput);
 237           code = 0;
 238           while (1)
 239             {
 240               if (c >= '0' && c <= '9')
 241                 code *= 16,  code += c - '0';
 242               else if (c >= 'a' && c <= 'f')
 243                 code *= 16,  code += c - 'a' + 10;
 244               else if (c >= 'A' && c <= 'F')
 245                 code *= 16,  code += c - 'A' + 10;
 246               else
 247                 break;
 248               if (code >= 256 || code<0)
 249                 {
 250                   warni("hexadecimal value above 255: `\\x%x'", code);
 251                   code &= 0xFF;
 252                   break;
 253                 }
 254               c = safegetc(finput);
 255             }
 256           ungetc(c, finput);
 257         }
 258       else
 259         {
 260           warni ("unknown escape sequence: `\\' followed by `%s'",
 261                  printable_version(c));
 262           code = '?';
 263         }
 264     } /* has \ */
 265
 266   /* now fill token_buffer with the canonical name for this character
 267      as a literal token.  Do not use what the user typed,
 268      so that `\012' and `\n' can be interchangeable.  */
 269
 270   p = *pp;
 271   if (code >= 040 && code < 0177)
 272     *p++ = code;
 273   else if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 274   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 275   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 276   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 277   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 278   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 279   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 280   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 281   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 282   else
 283     {
 284       *p++ = '\\';
 285       *p++ = code / 0100 + '0';
 286       *p++ = ((code / 010) & 07) + '0';
 287       *p++ = (code & 07) + '0';
 288     }
 289   *pp = p;
 290   *pcode = code;
 291   return  ! wasquote;
 292 }
 293
 294
 295 void
 296 unlex(token)
 297      int token;
 298 {
 299   unlexed = token;
 300   unlexed_symval = symval;
 301 }
 302
 303
 304 int
 305 lex()
 306 {
 307   register int c;
 308   char *p;
 309
 310   if (unlexed >= 0)
 311     {
 312       symval = unlexed_symval;
 313       c = unlexed;
 314       unlexed = -1;
 315       return (c);
 316     }
 317
 318   c = skip_white_space();
 319   *token_buffer = c;    /* for error messages (token buffer always valid) */
 320   token_buffer[1] = 0;
 321
 322   switch (c)
 323     {
 324     case EOF:
 325       strcpy(token_buffer, "EOF");
 326       return (ENDFILE);
 327
 328     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 329     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 330     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 331     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 332     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 333     case 'Z':
 334     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 335     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 336     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 337     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 338     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 339     case 'z':
 340     case '.':  case '_':
 341       p = token_buffer;
 342       while (isalnum(c) || c == '_' || c == '.')
 343         {
 344           if (p == token_buffer + maxtoken)
 345             p = grow_token_buffer(p);
 346
 347           *p++ = c;
 348           c = getc(finput);
 349         }
 350
 351       *p = 0;
 352       ungetc(c, finput);
 353       symval = getsym(token_buffer);
 354       return (IDENTIFIER);
 355
 356     case '0':  case '1':  case '2':  case '3':  case '4':
 357     case '5':  case '6':  case '7':  case '8':  case '9':
 358       {
 359         numval = 0;
 360
 361         p = token_buffer;
 362         while (isdigit(c))
 363           {
 364             if (p == token_buffer + maxtoken)
 365               p = grow_token_buffer(p);
 366
 367             *p++ = c;
 368             numval = numval*10 + c - '0';
 369             c = getc(finput);
 370           }
 371         *p = 0;
 372         ungetc(c, finput);
 373         return (NUMBER);
 374       }
 375
 376     case '\'':
 377
 378       /* parse the literal token and compute character code in  code  */
 379
 380       translations = -1;
 381       {
 382         int code, discode;
 383         char discard[10], *dp;
 384         p = token_buffer;
 385         *p++ = '\'';
 386         literalchar(&p, &code, '\'');
 387
 388         c = getc(finput);
 389         if (c != '\'')
 390           {
 391             warn("use \"...\" for multi-character literal tokens");
 392             dp = discard;
 393             while (literalchar(&dp, &discode, '\'')) {}
 394           }
 395         *p++ = '\'';
 396         *p = 0;
 397         symval = getsym(token_buffer);
 398         symval->class = STOKEN;
 399         if (! symval->user_token_number)
 400           symval->user_token_number = code;
 401         return (IDENTIFIER);
 402       }
 403
 404     case '\"':
 405
 406       /* parse the literal string token and treat as an identifier */
 407
 408       translations = -1;
 409       {
 410         int code;       /* ignored here */
 411         p = token_buffer;
 412         *p++ = '\"';
 413         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 414           {
 415             if (p >= token_buffer + maxtoken - 4)
 416               p = grow_token_buffer(p);
 417           }
 418         *p = 0;
 419
 420         symval = getsym(token_buffer);
 421         symval->class = STOKEN;
 422
 423         return (IDENTIFIER);
 424       }
 425
 426     case ',':
 427       return (COMMA);
 428
 429     case ':':
 430       return (COLON);
 431
 432     case ';':
 433       return (SEMICOLON);
 434
 435     case '|':
 436       return (BAR);
 437
 438     case '{':
 439       return (LEFT_CURLY);
 440
 441     case '=':
 442       do
 443         {
 444           c = getc(finput);
 445           if (c == '\n') lineno++;
 446         }
 447       while(c==' ' || c=='\n' || c=='\t');
 448
 449       if (c == '{')
 450         {
 451           strcpy(token_buffer, "={");
 452           return(LEFT_CURLY);
 453         }
 454       else
 455         {
 456           ungetc(c, finput);
 457           return(ILLEGAL);
 458         }
 459
 460     case '<':
 461       p = token_buffer;
 462       c = getc(finput);
 463       while (c != '>')
 464         {
 465           if (c == EOF)
 466             fatal("unterminated type name at end of file");
 467           if (c == '\n')
 468             {
 469               warn("unterminated type name");
 470               ungetc(c, finput);
 471               break;
 472             }
 473
 474           if (p == token_buffer + maxtoken)
 475             p = grow_token_buffer(p);
 476
 477           *p++ = c;
 478           c = getc(finput);
 479         }
 480       *p = 0;
 481       return (TYPENAME);
 482
 483
 484     case '%':
 485       return (parse_percent_token());
 486
 487     default:
 488       return (ILLEGAL);
 489     }
 490 }
 491
 492 /* the following table dictates the action taken for the various
 493         % directives.  A setflag value causes the named flag to be
 494         set.  A retval action returns the code.
 495 */
 496 struct percent_table_struct {
 497         char *name;
 498         void *setflag;
 499         int retval;
 500 } percent_table[] =
 501 {
 502   {"token", NULL, TOKEN},
 503   {"term", NULL, TOKEN},
 504   {"nterm", NULL, NTERM},
 505   {"type", NULL, TYPE},
 506   {"guard", NULL, GUARD},
 507   {"union", NULL, UNION},
 508   {"expect", NULL, EXPECT},
 509   {"thong", NULL, THONG},
 510   {"start", NULL, START},
 511   {"left", NULL, LEFT},
 512   {"right", NULL, RIGHT},
 513   {"nonassoc", NULL, NONASSOC},
 514   {"binary", NULL, NONASSOC},
 515   {"semantic_parser", NULL, SEMANTIC_PARSER},
 516   {"pure_parser", NULL, PURE_PARSER},
 517   {"prec", NULL, PREC},
 518
 519   {"no_lines", &nolinesflag, NOOP}, /* -l */
 520   {"raw", &rawtoknumflag, NOOP}, /* -r */
 521   {"token_table", &toknumflag, NOOP}, /* -k */
 522
 523 #if 0
 524   /* These can be utilized after main is reoganized so
 525      open_files() is deferred 'til after read_declarations().
 526      But %{ and %union both put information into files
 527      that have to be opened before read_declarations().
 528      */
 529   {"yacc", &fixed_outfiles, NOOP}, /* -y */
 530   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
 531   {"defines", &definesflag, NOOP}, /* -d */
 532   {"no_parser", &noparserflag, NOOP}, /* -n */
 533   {"output_file", &spec_outfile, SETOPT}, /* -o */
 534   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
 535   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
 536
 537   /* These would be acceptable, but they do not affect processing */
 538   {"verbose", &verboseflag, NOOP}, /* -v */
 539   {"debug", &debugflag, NOOP},  /* -t */
 540   /*    {"help", <print usage stmt>, NOOP},     /* -h */
 541   /*    {"version", <print version number> ,  NOOP},    /* -V */
 542 #endif
 543
 544   {NULL, NULL, ILLEGAL}
 545 };
 546
 547 /* Parse a token which starts with %.
 548    Assumes the % has already been read and discarded.  */
 549
 550 int
 551 parse_percent_token ()
 552 {
 553   register int c;
 554   register char *p;
 555   register struct percent_table_struct *tx;
 556
 557   p = token_buffer;
 558   c = getc(finput);
 559   *p++ = '%';
 560   *p++ = c;     /* for error msg */
 561   *p = 0;
 562
 563   switch (c)
 564     {
 565     case '%':
 566       return (TWO_PERCENTS);
 567
 568     case '{':
 569       return (PERCENT_LEFT_CURLY);
 570
 571     case '<':
 572       return (LEFT);
 573
 574     case '>':
 575       return (RIGHT);
 576
 577     case '2':
 578       return (NONASSOC);
 579
 580     case '0':
 581       return (TOKEN);
 582
 583     case '=':
 584       return (PREC);
 585     }
 586   if (!isalpha(c))
 587     return (ILLEGAL);
 588
 589   p = token_buffer;
 590   *p++ = '%';
 591   while (isalpha(c) || c == '_' || c == '-')
 592     {
 593       if (p == token_buffer + maxtoken)
 594         p = grow_token_buffer(p);
 595
 596       if (c == '-') c = '_';
 597       *p++ = c;
 598       c = getc(finput);
 599     }
 600
 601   ungetc(c, finput);
 602
 603   *p = 0;
 604
 605   /* table lookup % directive */
 606   for (tx = percent_table; tx->name; tx++)
 607     if (strcmp(token_buffer+1, tx->name) == 0)
 608       break;
 609   if (tx->retval == SETOPT)
 610     {
 611       *((char **)(tx->setflag)) = optarg;
 612       return NOOP;
 613     }
 614   if (tx->setflag)
 615     {
 616       *((int *)(tx->setflag)) = 1;
 617       return NOOP;
 618     }
 619   return tx->retval;
 620 }