src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21
  22 /*
  23    lex is the entry point.  It is called from reader.c.
  24    It returns one of the token-type codes defined in lex.h.
  25    When an identifier is seen, the code IDENTIFIER is returned
  26    and the name is looked up in the symbol table using symtab.c;
  27    symval is set to a pointer to the entry found.  */
  28
  29 #include <stdio.h>
  30 #include "system.h"
  31 #include "files.h"
  32 #include "getopt.h"             /* for optarg */
  33 #include "symtab.h"
  34 #include "lex.h"
  35 #include "alloc.h"
  36 #include "complain.h"
  37
  38 /* flags set by % directives */
  39 extern int definesflag;         /* for -d */
  40 extern int toknumflag;          /* for -k */
  41 extern int noparserflag;        /* for -n */
  42 extern int fixed_outfiles;      /* for -y */
  43 extern int nolinesflag;         /* for -l */
  44 extern int rawtoknumflag;       /* for -r */
  45 extern int verboseflag; /* for -v */
  46 extern int debugflag;           /* for -t */
  47 extern char *spec_name_prefix;  /* for -p */
  48 extern char *spec_file_prefix;  /* for -b */
  49 /*spec_outfile is declared in files.h, for -o */
  50
  51 extern int translations;
  52
  53 extern void init_lex PARAMS((void));
  54 extern char *grow_token_buffer PARAMS((char *));
  55 extern int skip_white_space PARAMS((void));
  56 extern void unlex PARAMS((int));
  57 extern int lex PARAMS((void));
  58 extern int parse_percent_token PARAMS((void));
  59
  60 static int safegetc PARAMS((FILE *));
  61 static int literalchar PARAMS((char **, int *, char));
  62
  63 /* functions from main.c */
  64 extern char *printable_version PARAMS((int));
  65
/* Buffer for storing the current token.  */
char *token_buffer;

/* Allocated size of token_buffer, not including space for terminator.  */
int maxtoken;

/* Symbol table entry of the token most recently returned by lex as
   IDENTIFIER (set from getsym).  */
bucket *symval;

/* Decimal value of the token most recently returned by lex as NUMBER.  */
int numval;

/* A token pushed back by unlex, to be returned again by the next call
   to lex.  A negative value of unlexed means that nothing is pending.  */
static int unlexed;             /* token code to be reread */
static bucket *unlexed_symval;  /* value of symval saved along with it */
  77
  78
  79 void
  80 init_lex (void)
  81 {
  82   maxtoken = 100;
  83   token_buffer = NEW2 (maxtoken + 1, char);
  84   unlexed = -1;
  85 }
  86
  87
  88 char *
  89 grow_token_buffer (char *p)
  90 {
  91   int offset = p - token_buffer;
  92   maxtoken *= 2;
  93   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  94   return token_buffer + offset;
  95 }
  96
  97
  98 int
  99 skip_white_space (void)
 100 {
 101   register int c;
 102   register int inside;
 103
 104   c = getc(finput);
 105
 106   for (;;)
 107     {
 108       int cplus_comment;
 109
 110       switch (c)
 111         {
 112         case '/':
 113           c = getc(finput);
 114           if (c != '*' && c != '/')
 115             {
 116               complain (_("unexpected `/' found and ignored"));
 117               break;
 118             }
 119           cplus_comment = (c == '/');
 120
 121           c = getc(finput);
 122
 123           inside = 1;
 124           while (inside)
 125             {
 126               if (!cplus_comment && c == '*')
 127                 {
 128                   while (c == '*')
 129                     c = getc(finput);
 130
 131                   if (c == '/')
 132                     {
 133                       inside = 0;
 134                       c = getc(finput);
 135                     }
 136                 }
 137               else if (c == '\n')
 138                 {
 139                   lineno++;
 140                   if (cplus_comment)
 141                     inside = 0;
 142                   c = getc(finput);
 143                 }
 144               else if (c == EOF)
 145                 fatal (_("unterminated comment"));
 146               else
 147                 c = getc(finput);
 148             }
 149
 150           break;
 151
 152         case '\n':
 153           lineno++;
 154
 155         case ' ':
 156         case '\t':
 157         case '\f':
 158           c = getc(finput);
 159           break;
 160
 161         default:
 162           return c;
 163         }
 164     }
 165 }
 166
 167 /* do a getc, but give error message if EOF encountered */
 168 static int
 169 safegetc (FILE *f)
 170 {
 171   register int c = getc(f);
 172   if (c == EOF)
 173     fatal (_("unexpected end of file"));
 174   return c;
 175 }
 176
 177 /* read one literal character from finput.  process \ escapes.
 178    append the normalized string version of the char to *pp.
 179    assign the character code to *pcode
 180    return 1 unless the character is an unescaped `term' or \n
 181         report error for \n
 182 */
 183 static int
 184 literalchar (char **pp, int *pcode, char term)
 185 {
 186   register int c;
 187   register char *p;
 188   register int code;
 189   int wasquote = 0;
 190
 191   c = safegetc(finput);
 192   if (c == '\n')
 193     {
 194       complain (_("unescaped newline in constant"));
 195       ungetc(c, finput);
 196       code = '?';
 197       wasquote = 1;
 198     }
 199   else if (c != '\\')
 200     {
 201       code = c;
 202       if (c == term)
 203         wasquote = 1;
 204     }
 205   else
 206     {
 207       c = safegetc(finput);
 208       if (c == 't')  code = '\t';
 209       else if (c == 'n')  code = '\n';
 210       else if (c == 'a')  code = '\007';
 211       else if (c == 'r')  code = '\r';
 212       else if (c == 'f')  code = '\f';
 213       else if (c == 'b')  code = '\b';
 214       else if (c == 'v')  code = '\013';
 215       else if (c == '\\')  code = '\\';
 216       else if (c == '\'')  code = '\'';
 217       else if (c == '\"')  code = '\"';
 218       else if (c <= '7' && c >= '0')
 219         {
 220           code = 0;
 221           while (c <= '7' && c >= '0')
 222             {
 223               code = (code * 8) + (c - '0');
 224               if (code >= 256 || code < 0)
 225                 {
 226                   complain (_("octal value outside range 0...255: `\\%o'"),
 227                             code);
 228                   code &= 0xFF;
 229                   break;
 230                 }
 231               c = safegetc(finput);
 232             }
 233           ungetc(c, finput);
 234         }
 235       else if (c == 'x')
 236         {
 237           c = safegetc(finput);
 238           code = 0;
 239           while (1)
 240             {
 241               if (c >= '0' && c <= '9')
 242                 code *= 16,  code += c - '0';
 243               else if (c >= 'a' && c <= 'f')
 244                 code *= 16,  code += c - 'a' + 10;
 245               else if (c >= 'A' && c <= 'F')
 246                 code *= 16,  code += c - 'A' + 10;
 247               else
 248                 break;
 249               if (code >= 256 || code<0)
 250                 {
 251                   complain (_("hexadecimal value above 255: `\\x%x'"),
 252                             code);
 253                   code &= 0xFF;
 254                   break;
 255                 }
 256               c = safegetc(finput);
 257             }
 258           ungetc(c, finput);
 259         }
 260       else
 261         {
 262           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 263                     printable_version(c));
 264           code = '?';
 265         }
 266     } /* has \ */
 267
 268   /* now fill token_buffer with the canonical name for this character
 269      as a literal token.  Do not use what the user typed,
 270      so that `\012' and `\n' can be interchangeable.  */
 271
 272   p = *pp;
 273   if (code == term && wasquote)
 274     *p++ = code;
 275   else if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 276   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 277   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 278   else if (code >= 040 && code < 0177)
 279     *p++ = code;
 280   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 281   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 282   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 283   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 284   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 285   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 286   else
 287     {
 288       *p++ = '\\';
 289       *p++ = code / 0100 + '0';
 290       *p++ = ((code / 010) & 07) + '0';
 291       *p++ = (code & 07) + '0';
 292     }
 293   *pp = p;
 294   *pcode = code;
 295   return  ! wasquote;
 296 }
 297
 298
 299 void
 300 unlex (int token)
 301 {
 302   unlexed = token;
 303   unlexed_symval = symval;
 304 }
 305
 306
 307 int
 308 lex (void)
 309 {
 310   register int c;
 311   char *p;
 312
 313   if (unlexed >= 0)
 314     {
 315       symval = unlexed_symval;
 316       c = unlexed;
 317       unlexed = -1;
 318       return c;
 319     }
 320
 321   c = skip_white_space();
 322   *token_buffer = c;    /* for error messages (token buffer always valid) */
 323   token_buffer[1] = 0;
 324
 325   switch (c)
 326     {
 327     case EOF:
 328       strcpy(token_buffer, "EOF");
 329       return ENDFILE;
 330
 331     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 332     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 333     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 334     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 335     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 336     case 'Z':
 337     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 338     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 339     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 340     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 341     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 342     case 'z':
 343     case '.':  case '_':
 344       p = token_buffer;
 345       while (isalnum(c) || c == '_' || c == '.')
 346         {
 347           if (p == token_buffer + maxtoken)
 348             p = grow_token_buffer(p);
 349
 350           *p++ = c;
 351           c = getc(finput);
 352         }
 353
 354       *p = 0;
 355       ungetc(c, finput);
 356       symval = getsym(token_buffer);
 357       return IDENTIFIER;
 358
 359     case '0':  case '1':  case '2':  case '3':  case '4':
 360     case '5':  case '6':  case '7':  case '8':  case '9':
 361       {
 362         numval = 0;
 363
 364         p = token_buffer;
 365         while (isdigit(c))
 366           {
 367             if (p == token_buffer + maxtoken)
 368               p = grow_token_buffer(p);
 369
 370             *p++ = c;
 371             numval = numval*10 + c - '0';
 372             c = getc(finput);
 373           }
 374         *p = 0;
 375         ungetc(c, finput);
 376         return NUMBER;
 377       }
 378
 379     case '\'':
 380
 381       /* parse the literal token and compute character code in  code  */
 382
 383       translations = -1;
 384       {
 385         int code, discode;
 386         char discard[10], *dp;
 387
 388         p = token_buffer;
 389         *p++ = '\'';
 390         literalchar(&p, &code, '\'');
 391
 392         c = getc(finput);
 393         if (c != '\'')
 394           {
 395             complain (_("use \"...\" for multi-character literal tokens"));
 396             while (1)
 397               {
 398                 dp = discard;
 399                 if (! literalchar(&dp, &discode, '\''))
 400                   break;
 401               }
 402           }
 403         *p++ = '\'';
 404         *p = 0;
 405         symval = getsym(token_buffer);
 406         symval->class = STOKEN;
 407         if (! symval->user_token_number)
 408           symval->user_token_number = code;
 409         return IDENTIFIER;
 410       }
 411
 412     case '\"':
 413
 414       /* parse the literal string token and treat as an identifier */
 415
 416       translations = -1;
 417       {
 418         int code;       /* ignored here */
 419         p = token_buffer;
 420         *p++ = '\"';
 421         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 422           {
 423             if (p >= token_buffer + maxtoken - 4)
 424               p = grow_token_buffer(p);
 425           }
 426         *p = 0;
 427
 428         symval = getsym(token_buffer);
 429         symval->class = STOKEN;
 430
 431         return IDENTIFIER;
 432       }
 433
 434     case ',':
 435       return COMMA;
 436
 437     case ':':
 438       return COLON;
 439
 440     case ';':
 441       return SEMICOLON;
 442
 443     case '|':
 444       return BAR;
 445
 446     case '{':
 447       return LEFT_CURLY;
 448
 449     case '=':
 450       do
 451         {
 452           c = getc(finput);
 453           if (c == '\n') lineno++;
 454         }
 455       while(c==' ' || c=='\n' || c=='\t');
 456
 457       if (c == '{')
 458         {
 459           strcpy(token_buffer, "={");
 460           return LEFT_CURLY;
 461         }
 462       else
 463         {
 464           ungetc(c, finput);
 465           return ILLEGAL;
 466         }
 467
 468     case '<':
 469       p = token_buffer;
 470       c = getc(finput);
 471       while (c != '>')
 472         {
 473           if (c == EOF)
 474             fatal (_("unterminated type name at end of file"));
 475           if (c == '\n')
 476             {
 477               complain (_("unterminated type name"));
 478               ungetc(c, finput);
 479               break;
 480             }
 481
 482           if (p == token_buffer + maxtoken)
 483             p = grow_token_buffer(p);
 484
 485           *p++ = c;
 486           c = getc(finput);
 487         }
 488       *p = 0;
 489       return TYPENAME;
 490
 491
 492     case '%':
 493       return parse_percent_token();
 494
 495     default:
 496       return ILLEGAL;
 497     }
 498 }
 499
 500 /* the following table dictates the action taken for the various
 501         % directives.  A setflag value causes the named flag to be
 502         set.  A retval action returns the code.
 503 */
 504 struct percent_table_struct {
 505         const char *name;
 506         void *setflag;
 507         int retval;
 508 } percent_table[] =
 509 {
 510   {"token", NULL, TOKEN},
 511   {"term", NULL, TOKEN},
 512   {"nterm", NULL, NTERM},
 513   {"type", NULL, TYPE},
 514   {"guard", NULL, GUARD},
 515   {"union", NULL, UNION},
 516   {"expect", NULL, EXPECT},
 517   {"thong", NULL, THONG},
 518   {"start", NULL, START},
 519   {"left", NULL, LEFT},
 520   {"right", NULL, RIGHT},
 521   {"nonassoc", NULL, NONASSOC},
 522   {"binary", NULL, NONASSOC},
 523   {"semantic_parser", NULL, SEMANTIC_PARSER},
 524   {"pure_parser", NULL, PURE_PARSER},
 525   {"prec", NULL, PREC},
 526
 527   {"no_lines", &nolinesflag, NOOP}, /* -l */
 528   {"raw", &rawtoknumflag, NOOP}, /* -r */
 529   {"token_table", &toknumflag, NOOP}, /* -k */
 530
 531 #if 0
 532   /* These can be utilized after main is reoganized so
 533      open_files() is deferred 'til after read_declarations().
 534      But %{ and %union both put information into files
 535      that have to be opened before read_declarations().
 536      */
 537   {"yacc", &fixed_outfiles, NOOP}, /* -y */
 538   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
 539   {"defines", &definesflag, NOOP}, /* -d */
 540   {"no_parser", &noparserflag, NOOP}, /* -n */
 541   {"output_file", &spec_outfile, SETOPT}, /* -o */
 542   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
 543   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
 544
 545   /* These would be acceptable, but they do not affect processing */
 546   {"verbose", &verboseflag, NOOP}, /* -v */
 547   {"debug", &debugflag, NOOP},  /* -t */
 548   /*    {"help", <print usage stmt>, NOOP},*/   /* -h */
 549   /*    {"version", <print version number> ,  NOOP},*/  /* -V */
 550 #endif
 551
 552   {NULL, NULL, ILLEGAL}
 553 };
 554
 555 /* Parse a token which starts with %.
 556    Assumes the % has already been read and discarded.  */
 557
 558 int
 559 parse_percent_token (void)
 560 {
 561   register int c;
 562   register char *p;
 563   register struct percent_table_struct *tx;
 564
 565   p = token_buffer;
 566   c = getc(finput);
 567   *p++ = '%';
 568   *p++ = c;     /* for error msg */
 569   *p = 0;
 570
 571   switch (c)
 572     {
 573     case '%':
 574       return TWO_PERCENTS;
 575
 576     case '{':
 577       return PERCENT_LEFT_CURLY;
 578
 579     case '<':
 580       return LEFT;
 581
 582     case '>':
 583       return RIGHT;
 584
 585     case '2':
 586       return NONASSOC;
 587
 588     case '0':
 589       return TOKEN;
 590
 591     case '=':
 592       return PREC;
 593     }
 594   if (!isalpha(c))
 595     return ILLEGAL;
 596
 597   p = token_buffer;
 598   *p++ = '%';
 599   while (isalpha(c) || c == '_' || c == '-')
 600     {
 601       if (p == token_buffer + maxtoken)
 602         p = grow_token_buffer(p);
 603
 604       if (c == '-') c = '_';
 605       *p++ = c;
 606       c = getc(finput);
 607     }
 608
 609   ungetc(c, finput);
 610
 611   *p = 0;
 612
 613   /* table lookup % directive */
 614   for (tx = percent_table; tx->name; tx++)
 615     if (strcmp(token_buffer+1, tx->name) == 0)
 616       break;
 617   if (tx->retval == SETOPT)
 618     {
 619       *((char **)(tx->setflag)) = optarg;
 620       return NOOP;
 621     }
 622   if (tx->setflag)
 623     {
 624       *((int *)(tx->setflag)) = 1;
 625       return NOOP;
 626     }
 627   return tx->retval;
 628 }