src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc.
   3
   4 This file is part of Bison, the GNU Compiler Compiler.
   5
   6 Bison is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 Bison is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Bison; see the file COPYING.  If not, write to
  18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19 Boston, MA 02111-1307, USA.  */
  20
  21
  22 /*
  23    lex is the entry point.  It is called from reader.c.
  24    It returns one of the token-type codes defined in lex.h.
  25    When an identifier is seen, the code IDENTIFIER is returned
  26    and the name is looked up in the symbol table using symtab.c;
  27    symval is set to a pointer to the entry found.  */
  28
  29 #include <stdio.h>
  30 #include "system.h"
  31 #include "files.h"
  32 #include "getopt.h"             /* for optarg */
  33 #include "symtab.h"
  34 #include "lex.h"
  35 #include "alloc.h"
  36
  37 /* flags set by % directives */
  38 extern int definesflag;         /* for -d */
  39 extern int toknumflag;          /* for -k */
  40 extern int noparserflag;        /* for -n */
  41 extern int fixed_outfiles;      /* for -y */
  42 extern int nolinesflag;         /* for -l */
  43 extern int rawtoknumflag;       /* for -r */
  44 extern int verboseflag; /* for -v */
  45 extern int debugflag;           /* for -t */
  46 extern char *spec_name_prefix;  /* for -p */
  47 extern char *spec_file_prefix;  /* for -b */
  48 /*spec_outfile is declared in files.h, for -o */
  49
  50 extern int lineno;
  51 extern int translations;
  52
  53 void init_lex PARAMS((void));
  54 char *grow_token_buffer PARAMS((char *));
  55 int skip_white_space PARAMS((void));
  56 int safegetc PARAMS((FILE *));
  57 int literalchar PARAMS((char **, int *, char));
  58 void unlex PARAMS((int));
  59 int lex PARAMS((void));
  60 int parse_percent_token PARAMS((void));
  61
  62 /* functions from main.c */
  63 extern char *printable_version PARAMS((int));
  64 extern void fatal PARAMS((char *));
  65 extern void warn PARAMS((char *));
  66 extern void warni PARAMS((char *, int));
  67 extern void warns PARAMS((char *, char *));
  68
  69 /* Buffer for storing the current token.  */
  70 char *token_buffer;
  71
  72 /* Allocated size of token_buffer, not including space for terminator.  */
  73 int maxtoken;
  74
  75 bucket *symval;                 /* set by lex to the symbol-table entry of an IDENTIFIER */
  76 int numval;                     /* set by lex to the decimal value of a NUMBER */
  77
  78 static int unlexed;             /* these two describe a token to be reread */
  79 static bucket *unlexed_symval;  /* by the next call to lex */
  80
  81
  82 void
  83 init_lex (void)
  84 {
  85   maxtoken = 100;
  86   token_buffer = NEW2 (maxtoken + 1, char);
  87   unlexed = -1;
  88 }
  89
  90
  91 char *
  92 grow_token_buffer (char *p)
  93 {
  94   int offset = p - token_buffer;
  95   maxtoken *= 2;
  96   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  97   return token_buffer + offset;
  98 }
  99
 100
 101 int
 102 skip_white_space (void)
 103 {
 104   register int c;
 105   register int inside;
 106
 107   c = getc(finput);
 108
 109   for (;;)
 110     {
 111       int cplus_comment;
 112
 113       switch (c)
 114         {
 115         case '/':
 116           c = getc(finput);
 117           if (c != '*' && c != '/')
 118             {
 119               warn(_("unexpected `/' found and ignored"));
 120               break;
 121             }
 122           cplus_comment = (c == '/');
 123
 124           c = getc(finput);
 125
 126           inside = 1;
 127           while (inside)
 128             {
 129               if (!cplus_comment && c == '*')
 130                 {
 131                   while (c == '*')
 132                     c = getc(finput);
 133
 134                   if (c == '/')
 135                     {
 136                       inside = 0;
 137                       c = getc(finput);
 138                     }
 139                 }
 140               else if (c == '\n')
 141                 {
 142                   lineno++;
 143                   if (cplus_comment)
 144                     inside = 0;
 145                   c = getc(finput);
 146                 }
 147               else if (c == EOF)
 148                 fatal(_("unterminated comment"));
 149               else
 150                 c = getc(finput);
 151             }
 152
 153           break;
 154
 155         case '\n':
 156           lineno++;
 157
 158         case ' ':
 159         case '\t':
 160         case '\f':
 161           c = getc(finput);
 162           break;
 163
 164         default:
 165           return (c);
 166         }
 167     }
 168 }
 169
 170 /* do a getc, but give error message if EOF encountered */
 171 int
 172 safegetc (FILE *f)
 173 {
 174   register int c = getc(f);
 175   if (c == EOF)
 176     fatal(_("Unexpected end of file"));
 177   return c;
 178 }
 179
 180 /* read one literal character from finput.  process \ escapes.
 181    append the normalized string version of the char to *pp.
 182    assign the character code to *pcode
 183    return 1 unless the character is an unescaped `term' or \n
 184         report error for \n
 185 */
 186 int
 187 literalchar (char **pp, int *pcode, char term)
 188 {
 189   register int c;
 190   register char *p;
 191   register int code;
 192   int wasquote = 0;
 193
 194   c = safegetc(finput);
 195   if (c == '\n')
 196     {
 197       warn(_("unescaped newline in constant"));
 198       ungetc(c, finput);
 199       code = '?';
 200       wasquote = 1;
 201     }
 202   else if (c != '\\')
 203     {
 204       code = c;
 205       if (c == term)
 206         wasquote = 1;
 207     }
 208   else
 209     {
 210       c = safegetc(finput);
 211       if (c == 't')  code = '\t';
 212       else if (c == 'n')  code = '\n';
 213       else if (c == 'a')  code = '\007';
 214       else if (c == 'r')  code = '\r';
 215       else if (c == 'f')  code = '\f';
 216       else if (c == 'b')  code = '\b';
 217       else if (c == 'v')  code = '\013';
 218       else if (c == '\\')  code = '\\';
 219       else if (c == '\'')  code = '\'';
 220       else if (c == '\"')  code = '\"';
 221       else if (c <= '7' && c >= '0')
 222         {
 223           code = 0;
 224           while (c <= '7' && c >= '0')
 225             {
 226               code = (code * 8) + (c - '0');
 227               if (code >= 256 || code < 0)
 228                 {
 229                   warni(_("octal value outside range 0...255: `\\%o'"), code);
 230                   code &= 0xFF;
 231                   break;
 232                 }
 233               c = safegetc(finput);
 234             }
 235           ungetc(c, finput);
 236         }
 237       else if (c == 'x')
 238         {
 239           c = safegetc(finput);
 240           code = 0;
 241           while (1)
 242             {
 243               if (c >= '0' && c <= '9')
 244                 code *= 16,  code += c - '0';
 245               else if (c >= 'a' && c <= 'f')
 246                 code *= 16,  code += c - 'a' + 10;
 247               else if (c >= 'A' && c <= 'F')
 248                 code *= 16,  code += c - 'A' + 10;
 249               else
 250                 break;
 251               if (code >= 256 || code<0)
 252                 {
 253                   warni(_("hexadecimal value above 255: `\\x%x'"), code);
 254                   code &= 0xFF;
 255                   break;
 256                 }
 257               c = safegetc(finput);
 258             }
 259           ungetc(c, finput);
 260         }
 261       else
 262         {
 263           warns (_("unknown escape sequence: `\\' followed by `%s'"),
 264                  printable_version(c));
 265           code = '?';
 266         }
 267     } /* has \ */
 268
 269   /* now fill token_buffer with the canonical name for this character
 270      as a literal token.  Do not use what the user typed,
 271      so that `\012' and `\n' can be interchangeable.  */
 272
 273   p = *pp;
 274   if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 275   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 276   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 277   else if (code >= 040 && code < 0177)
 278     *p++ = code;
 279   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 280   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 281   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 282   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 283   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 284   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 285   else
 286     {
 287       *p++ = '\\';
 288       *p++ = code / 0100 + '0';
 289       *p++ = ((code / 010) & 07) + '0';
 290       *p++ = (code & 07) + '0';
 291     }
 292   *pp = p;
 293   *pcode = code;
 294   return  ! wasquote;
 295 }
 296
 297
 298 void
 299 unlex (int token)
 300 {
 301   unlexed = token;
 302   unlexed_symval = symval;
 303 }
 304
 305
 306 int
 307 lex (void)
 308 {
 309   register int c;
 310   char *p;
 311
 312   if (unlexed >= 0)
 313     {
 314       symval = unlexed_symval;
 315       c = unlexed;
 316       unlexed = -1;
 317       return (c);
 318     }
 319
 320   c = skip_white_space();
 321   *token_buffer = c;    /* for error messages (token buffer always valid) */
 322   token_buffer[1] = 0;
 323
 324   switch (c)
 325     {
 326     case EOF:
 327       strcpy(token_buffer, "EOF");
 328       return (ENDFILE);
 329
 330     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 331     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 332     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 333     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 334     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 335     case 'Z':
 336     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 337     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 338     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 339     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 340     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 341     case 'z':
 342     case '.':  case '_':
 343       p = token_buffer;
 344       while (isalnum(c) || c == '_' || c == '.')
 345         {
 346           if (p == token_buffer + maxtoken)
 347             p = grow_token_buffer(p);
 348
 349           *p++ = c;
 350           c = getc(finput);
 351         }
 352
 353       *p = 0;
 354       ungetc(c, finput);
 355       symval = getsym(token_buffer);
 356       return (IDENTIFIER);
 357
 358     case '0':  case '1':  case '2':  case '3':  case '4':
 359     case '5':  case '6':  case '7':  case '8':  case '9':
 360       {
 361         numval = 0;
 362
 363         p = token_buffer;
 364         while (isdigit(c))
 365           {
 366             if (p == token_buffer + maxtoken)
 367               p = grow_token_buffer(p);
 368
 369             *p++ = c;
 370             numval = numval*10 + c - '0';
 371             c = getc(finput);
 372           }
 373         *p = 0;
 374         ungetc(c, finput);
 375         return (NUMBER);
 376       }
 377
 378     case '\'':
 379
 380       /* parse the literal token and compute character code in  code  */
 381
 382       translations = -1;
 383       {
 384         int code, discode;
 385         char discard[10], *dp;
 386
 387         p = token_buffer;
 388         *p++ = '\'';
 389         literalchar(&p, &code, '\'');
 390
 391         c = getc(finput);
 392         if (c != '\'')
 393           {
 394             warn(_("use \"...\" for multi-character literal tokens"));
 395             while (1)
 396               {
 397                 dp = discard;
 398                 if (! literalchar(&dp, &discode, '\''))
 399                   break;
 400               }
 401           }
 402         *p++ = '\'';
 403         *p = 0;
 404         symval = getsym(token_buffer);
 405         symval->class = STOKEN;
 406         if (! symval->user_token_number)
 407           symval->user_token_number = code;
 408         return (IDENTIFIER);
 409       }
 410
 411     case '\"':
 412
 413       /* parse the literal string token and treat as an identifier */
 414
 415       translations = -1;
 416       {
 417         int code;       /* ignored here */
 418         p = token_buffer;
 419         *p++ = '\"';
 420         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 421           {
 422             if (p >= token_buffer + maxtoken - 4)
 423               p = grow_token_buffer(p);
 424           }
 425         *p = 0;
 426
 427         symval = getsym(token_buffer);
 428         symval->class = STOKEN;
 429
 430         return (IDENTIFIER);
 431       }
 432
 433     case ',':
 434       return (COMMA);
 435
 436     case ':':
 437       return (COLON);
 438
 439     case ';':
 440       return (SEMICOLON);
 441
 442     case '|':
 443       return (BAR);
 444
 445     case '{':
 446       return (LEFT_CURLY);
 447
 448     case '=':
 449       do
 450         {
 451           c = getc(finput);
 452           if (c == '\n') lineno++;
 453         }
 454       while(c==' ' || c=='\n' || c=='\t');
 455
 456       if (c == '{')
 457         {
 458           strcpy(token_buffer, "={");
 459           return(LEFT_CURLY);
 460         }
 461       else
 462         {
 463           ungetc(c, finput);
 464           return(ILLEGAL);
 465         }
 466
 467     case '<':
 468       p = token_buffer;
 469       c = getc(finput);
 470       while (c != '>')
 471         {
 472           if (c == EOF)
 473             fatal(_("unterminated type name at end of file"));
 474           if (c == '\n')
 475             {
 476               warn(_("unterminated type name"));
 477               ungetc(c, finput);
 478               break;
 479             }
 480
 481           if (p == token_buffer + maxtoken)
 482             p = grow_token_buffer(p);
 483
 484           *p++ = c;
 485           c = getc(finput);
 486         }
 487       *p = 0;
 488       return (TYPENAME);
 489
 490
 491     case '%':
 492       return (parse_percent_token());
 493
 494     default:
 495       return (ILLEGAL);
 496     }
 497 }
 498
 499 /* the following table dictates the action taken for the various
 500         % directives.  A setflag value causes the named flag to be
 501         set.  A retval action returns the code.
 502 */
 503 struct percent_table_struct {
 504         char *name;
 505         void *setflag;
 506         int retval;
 507 } percent_table[] =
 508 {
 509   {"token", NULL, TOKEN},
 510   {"term", NULL, TOKEN},
 511   {"nterm", NULL, NTERM},
 512   {"type", NULL, TYPE},
 513   {"guard", NULL, GUARD},
 514   {"union", NULL, UNION},
 515   {"expect", NULL, EXPECT},
 516   {"thong", NULL, THONG},
 517   {"start", NULL, START},
 518   {"left", NULL, LEFT},
 519   {"right", NULL, RIGHT},
 520   {"nonassoc", NULL, NONASSOC},
 521   {"binary", NULL, NONASSOC},
 522   {"semantic_parser", NULL, SEMANTIC_PARSER},
 523   {"pure_parser", NULL, PURE_PARSER},
 524   {"prec", NULL, PREC},
 525
 526   {"no_lines", &nolinesflag, NOOP}, /* -l */
 527   {"raw", &rawtoknumflag, NOOP}, /* -r */
 528   {"token_table", &toknumflag, NOOP}, /* -k */
 529
 530 #if 0
 531   /* These can be utilized after main is reoganized so
 532      open_files() is deferred 'til after read_declarations().
 533      But %{ and %union both put information into files
 534      that have to be opened before read_declarations().
 535      */
 536   {"yacc", &fixed_outfiles, NOOP}, /* -y */
 537   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
 538   {"defines", &definesflag, NOOP}, /* -d */
 539   {"no_parser", &noparserflag, NOOP}, /* -n */
 540   {"output_file", &spec_outfile, SETOPT}, /* -o */
 541   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
 542   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
 543
 544   /* These would be acceptable, but they do not affect processing */
 545   {"verbose", &verboseflag, NOOP}, /* -v */
 546   {"debug", &debugflag, NOOP},  /* -t */
 547   /*    {"help", <print usage stmt>, NOOP},*/   /* -h */
 548   /*    {"version", <print version number> ,  NOOP},*/  /* -V */
 549 #endif
 550
 551   {NULL, NULL, ILLEGAL}
 552 };
 553
 554 /* Parse a token which starts with %.
 555    Assumes the % has already been read and discarded.  */
 556
 557 int
 558 parse_percent_token (void)
 559 {
 560   register int c;
 561   register char *p;
 562   register struct percent_table_struct *tx;
 563
 564   p = token_buffer;
 565   c = getc(finput);
 566   *p++ = '%';
 567   *p++ = c;     /* for error msg */
 568   *p = 0;
 569
 570   switch (c)
 571     {
 572     case '%':
 573       return (TWO_PERCENTS);
 574
 575     case '{':
 576       return (PERCENT_LEFT_CURLY);
 577
 578     case '<':
 579       return (LEFT);
 580
 581     case '>':
 582       return (RIGHT);
 583
 584     case '2':
 585       return (NONASSOC);
 586
 587     case '0':
 588       return (TOKEN);
 589
 590     case '=':
 591       return (PREC);
 592     }
 593   if (!isalpha(c))
 594     return (ILLEGAL);
 595
 596   p = token_buffer;
 597   *p++ = '%';
 598   while (isalpha(c) || c == '_' || c == '-')
 599     {
 600       if (p == token_buffer + maxtoken)
 601         p = grow_token_buffer(p);
 602
 603       if (c == '-') c = '_';
 604       *p++ = c;
 605       c = getc(finput);
 606     }
 607
 608   ungetc(c, finput);
 609
 610   *p = 0;
 611
 612   /* table lookup % directive */
 613   for (tx = percent_table; tx->name; tx++)
 614     if (strcmp(token_buffer+1, tx->name) == 0)
 615       break;
 616   if (tx->retval == SETOPT)
 617     {
 618       *((char **)(tx->setflag)) = optarg;
 619       return NOOP;
 620     }
 621   if (tx->setflag)
 622     {
 623       *((int *)(tx->setflag)) = 1;
 624       return NOOP;
 625     }
 626   return tx->retval;
 627 }