src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc.
   3
   4 This file is part of Bison, the GNU Compiler Compiler.
   5
   6 Bison is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 Bison is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Bison; see the file COPYING.  If not, write to
  18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20
  21 /*
  22    lex is the entry point.  It is called from reader.c.
  23    It returns one of the token-type codes defined in lex.h.
  24    When an identifier is seen, the code IDENTIFIER is returned
  25    and the name is looked up in the symbol table using symtab.c;
  26    symval is set to a pointer to the entry found.  */
  27
  28 #include <stdio.h>
  29 #include <ctype.h>
  30 #include "system.h"
  31 #include "files.h"
  32 #include "getopt.h"             /* for optarg */
  33 #include "symtab.h"
  34 #include "lex.h"
  35 #include "new.h"
  36
  37 /* flags set by % directives */
  38 extern int definesflag;         /* for -d */
  39 extern int toknumflag;          /* for -k */
  40 extern int noparserflag;        /* for -n */
  41 extern int fixed_outfiles;      /* for -y */
  42 extern int nolinesflag;         /* for -l */
  43 extern int rawtoknumflag;       /* for -r */
  44 extern int verboseflag; /* for -v */
  45 extern int debugflag;           /* for -t */
  46 extern char *spec_name_prefix;  /* for -p */
  47 extern char *spec_file_prefix;  /* for -b */
  48 /*spec_outfile is declared in files.h, for -o */
  49
  50 extern int lineno;
  51 extern int translations;
  52
  53 int parse_percent_token();
  54
  55 /* functions from main.c */
  56 extern char *printable_version();
  57 extern void fatal();
  58 extern void warni();
  59 extern void warn();
  60
  61 /* Buffer for storing the current token.  */
  62 char *token_buffer;
  63
  64 /* Allocated size of token_buffer, not including space for terminator.  */
  65 static int maxtoken;
  66
  67 bucket *symval;
  68 int numval;
  69
  70 static int unlexed;             /* these two describe a token to be reread */
  71 static bucket *unlexed_symval;  /* by the next call to lex */
  72
  73
  74 void
  75 init_lex()
  76 {
  77   maxtoken = 100;
  78   token_buffer = NEW2 (maxtoken + 1, char);
  79   unlexed = -1;
  80 }
  81
  82
  83 static char *
  84 grow_token_buffer (p)
  85      char *p;
  86 {
  87   int offset = p - token_buffer;
  88   maxtoken *= 2;
  89   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  90   return token_buffer + offset;
  91 }
  92
  93
  94 int
  95 skip_white_space()
  96 {
  97   register int c;
  98   register int inside;
  99
 100   c = getc(finput);
 101
 102   for (;;)
 103     {
 104       int cplus_comment;
 105
 106       switch (c)
 107         {
 108         case '/':
 109           c = getc(finput);
 110           if (c != '*' && c != '/')
 111             {
 112               warn(_("unexpected `/' found and ignored"));
 113               break;
 114             }
 115           cplus_comment = (c == '/');
 116
 117           c = getc(finput);
 118
 119           inside = 1;
 120           while (inside)
 121             {
 122               if (!cplus_comment && c == '*')
 123                 {
 124                   while (c == '*')
 125                     c = getc(finput);
 126
 127                   if (c == '/')
 128                     {
 129                       inside = 0;
 130                       c = getc(finput);
 131                     }
 132                 }
 133               else if (c == '\n')
 134                 {
 135                   lineno++;
 136                   if (cplus_comment)
 137                     inside = 0;
 138                   c = getc(finput);
 139                 }
 140               else if (c == EOF)
 141                 fatal(_("unterminated comment"));
 142               else
 143                 c = getc(finput);
 144             }
 145
 146           break;
 147
 148         case '\n':
 149           lineno++;
 150
 151         case ' ':
 152         case '\t':
 153         case '\f':
 154           c = getc(finput);
 155           break;
 156
 157         default:
 158           return (c);
 159         }
 160     }
 161 }
 162
 163 /* do a getc, but give error message if EOF encountered */
 164 int
 165 safegetc(f)
 166   FILE *f;
 167 {
 168   register int c = getc(f);
 169   if (c == EOF)
 170     fatal(_("Unexpected end of file"));
 171   return c;
 172 }
 173
 174 /* read one literal character from finput.  process \ escapes.
 175    append the normalized string version of the char to *pp.
 176    assign the character code to *pcode
 177    return 1 unless the character is an unescaped `term' or \n
 178         report error for \n
 179 */
 180 int
 181 literalchar(pp, pcode, term)
 182   char **pp;
 183   int *pcode;
 184   char term;
 185 {
 186   register int c;
 187   register char *p;
 188   register int code;
 189   int wasquote = 0;
 190
 191   c = safegetc(finput);
 192   if (c == '\n')
 193     {
 194       warn(_("unescaped newline in constant"));
 195       ungetc(c, finput);
 196       code = '?';
 197       wasquote = 1;
 198     }
 199   else if (c != '\\')
 200     {
 201       code = c;
 202       if (c == term)
 203         wasquote = 1;
 204     }
 205   else
 206     {
 207       c = safegetc(finput);
 208       if (c == 't')  code = '\t';
 209       else if (c == 'n')  code = '\n';
 210       else if (c == 'a')  code = '\007';
 211       else if (c == 'r')  code = '\r';
 212       else if (c == 'f')  code = '\f';
 213       else if (c == 'b')  code = '\b';
 214       else if (c == 'v')  code = 013;
 215       else if (c == '\\')  code = '\\';
 216       else if (c == '\'')  code = '\'';
 217       else if (c == '\"')  code = '\"';
 218       else if (c <= '7' && c >= '0')
 219         {
 220           code = 0;
 221           while (c <= '7' && c >= '0')
 222             {
 223               code = (code * 8) + (c - '0');
 224               if (code >= 256 || code < 0)
 225                 {
 226                   warni(_("octal value outside range 0...255: `\\%o'"), code);
 227                   code &= 0xFF;
 228                   break;
 229                 }
 230               c = safegetc(finput);
 231             }
 232           ungetc(c, finput);
 233         }
 234       else if (c == 'x')
 235         {
 236           c = safegetc(finput);
 237           code = 0;
 238           while (1)
 239             {
 240               if (c >= '0' && c <= '9')
 241                 code *= 16,  code += c - '0';
 242               else if (c >= 'a' && c <= 'f')
 243                 code *= 16,  code += c - 'a' + 10;
 244               else if (c >= 'A' && c <= 'F')
 245                 code *= 16,  code += c - 'A' + 10;
 246               else
 247                 break;
 248               if (code >= 256 || code<0)
 249                 {
 250                   warni(_("hexadecimal value above 255: `\\x%x'"), code);
 251                   code &= 0xFF;
 252                   break;
 253                 }
 254               c = safegetc(finput);
 255             }
 256           ungetc(c, finput);
 257         }
 258       else
 259         {
 260           warni (_("unknown escape sequence: `\\' followed by `%s'"),
 261                  printable_version(c));
 262           code = '?';
 263         }
 264     } /* has \ */
 265
 266   /* now fill token_buffer with the canonical name for this character
 267      as a literal token.  Do not use what the user typed,
 268      so that `\012' and `\n' can be interchangeable.  */
 269
 270   p = *pp;
 271   if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 272   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 273   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 274   else if (code >= 040 && code < 0177)
 275     *p++ = code;
 276   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 277   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 278   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 279   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 280   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 281   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 282   else
 283     {
 284       *p++ = '\\';
 285       *p++ = code / 0100 + '0';
 286       *p++ = ((code / 010) & 07) + '0';
 287       *p++ = (code & 07) + '0';
 288     }
 289   *pp = p;
 290   *pcode = code;
 291   return  ! wasquote;
 292 }
 293
 294
 295 void
 296 unlex(token)
 297      int token;
 298 {
 299   unlexed = token;
 300   unlexed_symval = symval;
 301 }
 302
 303
 304 int
 305 lex()
 306 {
 307   register int c;
 308   char *p;
 309
 310   if (unlexed >= 0)
 311     {
 312       symval = unlexed_symval;
 313       c = unlexed;
 314       unlexed = -1;
 315       return (c);
 316     }
 317
 318   c = skip_white_space();
 319   *token_buffer = c;    /* for error messages (token buffer always valid) */
 320   token_buffer[1] = 0;
 321
 322   switch (c)
 323     {
 324     case EOF:
 325       strcpy(token_buffer, "EOF");
 326       return (ENDFILE);
 327
 328     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 329     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 330     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 331     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 332     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 333     case 'Z':
 334     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 335     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 336     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 337     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 338     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 339     case 'z':
 340     case '.':  case '_':
 341       p = token_buffer;
 342       while (isalnum(c) || c == '_' || c == '.')
 343         {
 344           if (p == token_buffer + maxtoken)
 345             p = grow_token_buffer(p);
 346
 347           *p++ = c;
 348           c = getc(finput);
 349         }
 350
 351       *p = 0;
 352       ungetc(c, finput);
 353       symval = getsym(token_buffer);
 354       return (IDENTIFIER);
 355
 356     case '0':  case '1':  case '2':  case '3':  case '4':
 357     case '5':  case '6':  case '7':  case '8':  case '9':
 358       {
 359         numval = 0;
 360
 361         p = token_buffer;
 362         while (isdigit(c))
 363           {
 364             if (p == token_buffer + maxtoken)
 365               p = grow_token_buffer(p);
 366
 367             *p++ = c;
 368             numval = numval*10 + c - '0';
 369             c = getc(finput);
 370           }
 371         *p = 0;
 372         ungetc(c, finput);
 373         return (NUMBER);
 374       }
 375
 376     case '\'':
 377
 378       /* parse the literal token and compute character code in  code  */
 379
 380       translations = -1;
 381       {
 382         int code, discode;
 383         char discard[10], *dp;
 384
 385         p = token_buffer;
 386         *p++ = '\'';
 387         literalchar(&p, &code, '\'');
 388
 389         c = getc(finput);
 390         if (c != '\'')
 391           {
 392             warn(_("use \"...\" for multi-character literal tokens"));
 393             while (1)
 394               {
 395                 dp = discard;
 396                 if (! literalchar(&dp, &discode, '\''))
 397                   break;
 398               }
 399           }
 400         *p++ = '\'';
 401         *p = 0;
 402         symval = getsym(token_buffer);
 403         symval->class = STOKEN;
 404         if (! symval->user_token_number)
 405           symval->user_token_number = code;
 406         return (IDENTIFIER);
 407       }
 408
 409     case '\"':
 410
 411       /* parse the literal string token and treat as an identifier */
 412
 413       translations = -1;
 414       {
 415         int code;       /* ignored here */
 416         p = token_buffer;
 417         *p++ = '\"';
 418         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 419           {
 420             if (p >= token_buffer + maxtoken - 4)
 421               p = grow_token_buffer(p);
 422           }
 423         *p = 0;
 424
 425         symval = getsym(token_buffer);
 426         symval->class = STOKEN;
 427
 428         return (IDENTIFIER);
 429       }
 430
 431     case ',':
 432       return (COMMA);
 433
 434     case ':':
 435       return (COLON);
 436
 437     case ';':
 438       return (SEMICOLON);
 439
 440     case '|':
 441       return (BAR);
 442
 443     case '{':
 444       return (LEFT_CURLY);
 445
 446     case '=':
 447       do
 448         {
 449           c = getc(finput);
 450           if (c == '\n') lineno++;
 451         }
 452       while(c==' ' || c=='\n' || c=='\t');
 453
 454       if (c == '{')
 455         {
 456           strcpy(token_buffer, "={");
 457           return(LEFT_CURLY);
 458         }
 459       else
 460         {
 461           ungetc(c, finput);
 462           return(ILLEGAL);
 463         }
 464
 465     case '<':
 466       p = token_buffer;
 467       c = getc(finput);
 468       while (c != '>')
 469         {
 470           if (c == EOF)
 471             fatal(_("unterminated type name at end of file"));
 472           if (c == '\n')
 473             {
 474               warn(_("unterminated type name"));
 475               ungetc(c, finput);
 476               break;
 477             }
 478
 479           if (p == token_buffer + maxtoken)
 480             p = grow_token_buffer(p);
 481
 482           *p++ = c;
 483           c = getc(finput);
 484         }
 485       *p = 0;
 486       return (TYPENAME);
 487
 488
 489     case '%':
 490       return (parse_percent_token());
 491
 492     default:
 493       return (ILLEGAL);
 494     }
 495 }
 496
 497 /* the following table dictates the action taken for the various
 498         % directives.  A setflag value causes the named flag to be
 499         set.  A retval action returns the code.
 500 */
 501 struct percent_table_struct {
 502         char *name;
 503         void *setflag;
 504         int retval;
 505 } percent_table[] =
 506 {
 507   {"token", NULL, TOKEN},
 508   {"term", NULL, TOKEN},
 509   {"nterm", NULL, NTERM},
 510   {"type", NULL, TYPE},
 511   {"guard", NULL, GUARD},
 512   {"union", NULL, UNION},
 513   {"expect", NULL, EXPECT},
 514   {"thong", NULL, THONG},
 515   {"start", NULL, START},
 516   {"left", NULL, LEFT},
 517   {"right", NULL, RIGHT},
 518   {"nonassoc", NULL, NONASSOC},
 519   {"binary", NULL, NONASSOC},
 520   {"semantic_parser", NULL, SEMANTIC_PARSER},
 521   {"pure_parser", NULL, PURE_PARSER},
 522   {"prec", NULL, PREC},
 523
 524   {"no_lines", &nolinesflag, NOOP}, /* -l */
 525   {"raw", &rawtoknumflag, NOOP}, /* -r */
 526   {"token_table", &toknumflag, NOOP}, /* -k */
 527
 528 #if 0
 529   /* These can be utilized after main is reoganized so
 530      open_files() is deferred 'til after read_declarations().
 531      But %{ and %union both put information into files
 532      that have to be opened before read_declarations().
 533      */
 534   {"yacc", &fixed_outfiles, NOOP}, /* -y */
 535   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
 536   {"defines", &definesflag, NOOP}, /* -d */
 537   {"no_parser", &noparserflag, NOOP}, /* -n */
 538   {"output_file", &spec_outfile, SETOPT}, /* -o */
 539   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
 540   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
 541
 542   /* These would be acceptable, but they do not affect processing */
 543   {"verbose", &verboseflag, NOOP}, /* -v */
 544   {"debug", &debugflag, NOOP},  /* -t */
 545   /*    {"help", <print usage stmt>, NOOP},     /* -h */
 546   /*    {"version", <print version number> ,  NOOP},    /* -V */
 547 #endif
 548
 549   {NULL, NULL, ILLEGAL}
 550 };
 551
 552 /* Parse a token which starts with %.
 553    Assumes the % has already been read and discarded.  */
 554
 555 int
 556 parse_percent_token ()
 557 {
 558   register int c;
 559   register char *p;
 560   register struct percent_table_struct *tx;
 561
 562   p = token_buffer;
 563   c = getc(finput);
 564   *p++ = '%';
 565   *p++ = c;     /* for error msg */
 566   *p = 0;
 567
 568   switch (c)
 569     {
 570     case '%':
 571       return (TWO_PERCENTS);
 572
 573     case '{':
 574       return (PERCENT_LEFT_CURLY);
 575
 576     case '<':
 577       return (LEFT);
 578
 579     case '>':
 580       return (RIGHT);
 581
 582     case '2':
 583       return (NONASSOC);
 584
 585     case '0':
 586       return (TOKEN);
 587
 588     case '=':
 589       return (PREC);
 590     }
 591   if (!isalpha(c))
 592     return (ILLEGAL);
 593
 594   p = token_buffer;
 595   *p++ = '%';
 596   while (isalpha(c) || c == '_' || c == '-')
 597     {
 598       if (p == token_buffer + maxtoken)
 599         p = grow_token_buffer(p);
 600
 601       if (c == '-') c = '_';
 602       *p++ = c;
 603       c = getc(finput);
 604     }
 605
 606   ungetc(c, finput);
 607
 608   *p = 0;
 609
 610   /* table lookup % directive */
 611   for (tx = percent_table; tx->name; tx++)
 612     if (strcmp(token_buffer+1, tx->name) == 0)
 613       break;
 614   if (tx->retval == SETOPT)
 615     {
 616       *((char **)(tx->setflag)) = optarg;
 617       return NOOP;
 618     }
 619   if (tx->setflag)
 620     {
 621       *((int *)(tx->setflag)) = 1;
 622       return NOOP;
 623     }
 624   return tx->retval;
 625 }