src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc.
   3
   4    This file is part of Bison, the GNU Compiler Compiler.
   5
   6    Bison is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    Bison is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Bison; see the file COPYING.  If not, write to
  18    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21
  22 /*
  23    lex is the entry point.  It is called from reader.c.
  24    It returns one of the token-type codes defined in lex.h.
  25    When an identifier is seen, the code IDENTIFIER is returned
  26    and the name is looked up in the symbol table using symtab.c;
  27    symval is set to a pointer to the entry found.  */
  28
  29 #include <stdio.h>
  30 #include "system.h"
  31 #include "files.h"
  32 #include "getopt.h"             /* for optarg */
  33 #include "symtab.h"
  34 #include "lex.h"
  35 #include "alloc.h"
  36 #include "complain.h"
  37
  38 /* flags set by % directives */
  39 extern int definesflag;         /* for -d */
  40 extern int toknumflag;          /* for -k */
  41 extern int noparserflag;        /* for -n */
  42 extern int fixed_outfiles;      /* for -y */
  43 extern int nolinesflag;         /* for -l */
  44 extern int rawtoknumflag;       /* for -r */
  45 extern int verboseflag; /* for -v */
  46 extern int debugflag;           /* for -t */
  47 extern char *spec_name_prefix;  /* for -p */
  48 extern char *spec_file_prefix;  /* for -b */
  49 /*spec_outfile is declared in files.h, for -o */
  50
  51 extern int translations;
  52
  53 void init_lex PARAMS((void));
  54 char *grow_token_buffer PARAMS((char *));
  55 int skip_white_space PARAMS((void));
  56 int safegetc PARAMS((FILE *));
  57 int literalchar PARAMS((char **, int *, char));
  58 void unlex PARAMS((int));
  59 int lex PARAMS((void));
  60 int parse_percent_token PARAMS((void));
  61
  62 /* functions from main.c */
  63 extern char *printable_version PARAMS((int));
  64
  65 /* Buffer for storing the current token.  */
  66 char *token_buffer;
  67
  68 /* Allocated size of token_buffer, not including space for terminator.  */
  69 int maxtoken;
  70
  71 bucket *symval;
  72 int numval;
  73
  74 static int unlexed;             /* these two describe a token to be reread */
  75 static bucket *unlexed_symval;  /* by the next call to lex */
  76
  77
  78 void
  79 init_lex (void)
  80 {
  81   maxtoken = 100;
  82   token_buffer = NEW2 (maxtoken + 1, char);
  83   unlexed = -1;
  84 }
  85
  86
  87 char *
  88 grow_token_buffer (char *p)
  89 {
  90   int offset = p - token_buffer;
  91   maxtoken *= 2;
  92   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  93   return token_buffer + offset;
  94 }
  95
  96
  97 int
  98 skip_white_space (void)
  99 {
 100   register int c;
 101   register int inside;
 102
 103   c = getc(finput);
 104
 105   for (;;)
 106     {
 107       int cplus_comment;
 108
 109       switch (c)
 110         {
 111         case '/':
 112           c = getc(finput);
 113           if (c != '*' && c != '/')
 114             {
 115               complain (_("unexpected `/' found and ignored"));
 116               break;
 117             }
 118           cplus_comment = (c == '/');
 119
 120           c = getc(finput);
 121
 122           inside = 1;
 123           while (inside)
 124             {
 125               if (!cplus_comment && c == '*')
 126                 {
 127                   while (c == '*')
 128                     c = getc(finput);
 129
 130                   if (c == '/')
 131                     {
 132                       inside = 0;
 133                       c = getc(finput);
 134                     }
 135                 }
 136               else if (c == '\n')
 137                 {
 138                   lineno++;
 139                   if (cplus_comment)
 140                     inside = 0;
 141                   c = getc(finput);
 142                 }
 143               else if (c == EOF)
 144                 fatal (_("unterminated comment"));
 145               else
 146                 c = getc(finput);
 147             }
 148
 149           break;
 150
 151         case '\n':
 152           lineno++;
 153
 154         case ' ':
 155         case '\t':
 156         case '\f':
 157           c = getc(finput);
 158           break;
 159
 160         default:
 161           return c;
 162         }
 163     }
 164 }
 165
 166 /* do a getc, but give error message if EOF encountered */
 167 int
 168 safegetc (FILE *f)
 169 {
 170   register int c = getc(f);
 171   if (c == EOF)
 172     fatal (_("unexpected end of file"));
 173   return c;
 174 }
 175
 176 /* read one literal character from finput.  process \ escapes.
 177    append the normalized string version of the char to *pp.
 178    assign the character code to *pcode
 179    return 1 unless the character is an unescaped `term' or \n
 180         report error for \n
 181 */
 182 int
 183 literalchar (char **pp, int *pcode, char term)
 184 {
 185   register int c;
 186   register char *p;
 187   register int code;
 188   int wasquote = 0;
 189
 190   c = safegetc(finput);
 191   if (c == '\n')
 192     {
 193       complain (_("unescaped newline in constant"));
 194       ungetc(c, finput);
 195       code = '?';
 196       wasquote = 1;
 197     }
 198   else if (c != '\\')
 199     {
 200       code = c;
 201       if (c == term)
 202         wasquote = 1;
 203     }
 204   else
 205     {
 206       c = safegetc(finput);
 207       if (c == 't')  code = '\t';
 208       else if (c == 'n')  code = '\n';
 209       else if (c == 'a')  code = '\007';
 210       else if (c == 'r')  code = '\r';
 211       else if (c == 'f')  code = '\f';
 212       else if (c == 'b')  code = '\b';
 213       else if (c == 'v')  code = '\013';
 214       else if (c == '\\')  code = '\\';
 215       else if (c == '\'')  code = '\'';
 216       else if (c == '\"')  code = '\"';
 217       else if (c <= '7' && c >= '0')
 218         {
 219           code = 0;
 220           while (c <= '7' && c >= '0')
 221             {
 222               code = (code * 8) + (c - '0');
 223               if (code >= 256 || code < 0)
 224                 {
 225                   complain (_("octal value outside range 0...255: `\\%o'"),
 226                             code);
 227                   code &= 0xFF;
 228                   break;
 229                 }
 230               c = safegetc(finput);
 231             }
 232           ungetc(c, finput);
 233         }
 234       else if (c == 'x')
 235         {
 236           c = safegetc(finput);
 237           code = 0;
 238           while (1)
 239             {
 240               if (c >= '0' && c <= '9')
 241                 code *= 16,  code += c - '0';
 242               else if (c >= 'a' && c <= 'f')
 243                 code *= 16,  code += c - 'a' + 10;
 244               else if (c >= 'A' && c <= 'F')
 245                 code *= 16,  code += c - 'A' + 10;
 246               else
 247                 break;
 248               if (code >= 256 || code<0)
 249                 {
 250                   complain (_("hexadecimal value above 255: `\\x%x'"),
 251                             code);
 252                   code &= 0xFF;
 253                   break;
 254                 }
 255               c = safegetc(finput);
 256             }
 257           ungetc(c, finput);
 258         }
 259       else
 260         {
 261           complain (_("unknown escape sequence: `\\' followed by `%s'"),
 262                     printable_version(c));
 263           code = '?';
 264         }
 265     } /* has \ */
 266
 267   /* now fill token_buffer with the canonical name for this character
 268      as a literal token.  Do not use what the user typed,
 269      so that `\012' and `\n' can be interchangeable.  */
 270
 271   p = *pp;
 272   if (code == term && wasquote)
 273     *p++ = code;
 274   else if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 275   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 276   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 277   else if (code >= 040 && code < 0177)
 278     *p++ = code;
 279   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 280   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 281   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 282   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 283   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 284   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 285   else
 286     {
 287       *p++ = '\\';
 288       *p++ = code / 0100 + '0';
 289       *p++ = ((code / 010) & 07) + '0';
 290       *p++ = (code & 07) + '0';
 291     }
 292   *pp = p;
 293   *pcode = code;
 294   return  ! wasquote;
 295 }
 296
 297
 298 void
 299 unlex (int token)
 300 {
 301   unlexed = token;
 302   unlexed_symval = symval;
 303 }
 304
 305
 306 int
 307 lex (void)
 308 {
 309   register int c;
 310   char *p;
 311
 312   if (unlexed >= 0)
 313     {
 314       symval = unlexed_symval;
 315       c = unlexed;
 316       unlexed = -1;
 317       return c;
 318     }
 319
 320   c = skip_white_space();
 321   *token_buffer = c;    /* for error messages (token buffer always valid) */
 322   token_buffer[1] = 0;
 323
 324   switch (c)
 325     {
 326     case EOF:
 327       strcpy(token_buffer, "EOF");
 328       return ENDFILE;
 329
 330     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 331     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 332     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 333     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 334     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 335     case 'Z':
 336     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 337     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 338     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 339     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 340     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 341     case 'z':
 342     case '.':  case '_':
 343       p = token_buffer;
 344       while (isalnum(c) || c == '_' || c == '.')
 345         {
 346           if (p == token_buffer + maxtoken)
 347             p = grow_token_buffer(p);
 348
 349           *p++ = c;
 350           c = getc(finput);
 351         }
 352
 353       *p = 0;
 354       ungetc(c, finput);
 355       symval = getsym(token_buffer);
 356       return IDENTIFIER;
 357
 358     case '0':  case '1':  case '2':  case '3':  case '4':
 359     case '5':  case '6':  case '7':  case '8':  case '9':
 360       {
 361         numval = 0;
 362
 363         p = token_buffer;
 364         while (isdigit(c))
 365           {
 366             if (p == token_buffer + maxtoken)
 367               p = grow_token_buffer(p);
 368
 369             *p++ = c;
 370             numval = numval*10 + c - '0';
 371             c = getc(finput);
 372           }
 373         *p = 0;
 374         ungetc(c, finput);
 375         return NUMBER;
 376       }
 377
 378     case '\'':
 379
 380       /* parse the literal token and compute character code in  code  */
 381
 382       translations = -1;
 383       {
 384         int code, discode;
 385         char discard[10], *dp;
 386
 387         p = token_buffer;
 388         *p++ = '\'';
 389         literalchar(&p, &code, '\'');
 390
 391         c = getc(finput);
 392         if (c != '\'')
 393           {
 394             complain (_("use \"...\" for multi-character literal tokens"));
 395             while (1)
 396               {
 397                 dp = discard;
 398                 if (! literalchar(&dp, &discode, '\''))
 399                   break;
 400               }
 401           }
 402         *p++ = '\'';
 403         *p = 0;
 404         symval = getsym(token_buffer);
 405         symval->class = STOKEN;
 406         if (! symval->user_token_number)
 407           symval->user_token_number = code;
 408         return IDENTIFIER;
 409       }
 410
 411     case '\"':
 412
 413       /* parse the literal string token and treat as an identifier */
 414
 415       translations = -1;
 416       {
 417         int code;       /* ignored here */
 418         p = token_buffer;
 419         *p++ = '\"';
 420         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 421           {
 422             if (p >= token_buffer + maxtoken - 4)
 423               p = grow_token_buffer(p);
 424           }
 425         *p = 0;
 426
 427         symval = getsym(token_buffer);
 428         symval->class = STOKEN;
 429
 430         return IDENTIFIER;
 431       }
 432
 433     case ',':
 434       return COMMA;
 435
 436     case ':':
 437       return COLON;
 438
 439     case ';':
 440       return SEMICOLON;
 441
 442     case '|':
 443       return BAR;
 444
 445     case '{':
 446       return LEFT_CURLY;
 447
 448     case '=':
 449       do
 450         {
 451           c = getc(finput);
 452           if (c == '\n') lineno++;
 453         }
 454       while(c==' ' || c=='\n' || c=='\t');
 455
 456       if (c == '{')
 457         {
 458           strcpy(token_buffer, "={");
 459           return LEFT_CURLY;
 460         }
 461       else
 462         {
 463           ungetc(c, finput);
 464           return ILLEGAL;
 465         }
 466
 467     case '<':
 468       p = token_buffer;
 469       c = getc(finput);
 470       while (c != '>')
 471         {
 472           if (c == EOF)
 473             fatal (_("unterminated type name at end of file"));
 474           if (c == '\n')
 475             {
 476               complain (_("unterminated type name"));
 477               ungetc(c, finput);
 478               break;
 479             }
 480
 481           if (p == token_buffer + maxtoken)
 482             p = grow_token_buffer(p);
 483
 484           *p++ = c;
 485           c = getc(finput);
 486         }
 487       *p = 0;
 488       return TYPENAME;
 489
 490
 491     case '%':
 492       return parse_percent_token();
 493
 494     default:
 495       return ILLEGAL;
 496     }
 497 }
 498
 499 /* the following table dictates the action taken for the various
 500         % directives.  A setflag value causes the named flag to be
 501         set.  A retval action returns the code.
 502 */
 503 struct percent_table_struct {
 504         char *name;
 505         void *setflag;
 506         int retval;
 507 } percent_table[] =
 508 {
 509   {"token", NULL, TOKEN},
 510   {"term", NULL, TOKEN},
 511   {"nterm", NULL, NTERM},
 512   {"type", NULL, TYPE},
 513   {"guard", NULL, GUARD},
 514   {"union", NULL, UNION},
 515   {"expect", NULL, EXPECT},
 516   {"thong", NULL, THONG},
 517   {"start", NULL, START},
 518   {"left", NULL, LEFT},
 519   {"right", NULL, RIGHT},
 520   {"nonassoc", NULL, NONASSOC},
 521   {"binary", NULL, NONASSOC},
 522   {"semantic_parser", NULL, SEMANTIC_PARSER},
 523   {"pure_parser", NULL, PURE_PARSER},
 524   {"prec", NULL, PREC},
 525
 526   {"no_lines", &nolinesflag, NOOP}, /* -l */
 527   {"raw", &rawtoknumflag, NOOP}, /* -r */
 528   {"token_table", &toknumflag, NOOP}, /* -k */
 529
 530 #if 0
 531   /* These can be utilized after main is reoganized so
 532      open_files() is deferred 'til after read_declarations().
 533      But %{ and %union both put information into files
 534      that have to be opened before read_declarations().
 535      */
 536   {"yacc", &fixed_outfiles, NOOP}, /* -y */
 537   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
 538   {"defines", &definesflag, NOOP}, /* -d */
 539   {"no_parser", &noparserflag, NOOP}, /* -n */
 540   {"output_file", &spec_outfile, SETOPT}, /* -o */
 541   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
 542   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
 543
 544   /* These would be acceptable, but they do not affect processing */
 545   {"verbose", &verboseflag, NOOP}, /* -v */
 546   {"debug", &debugflag, NOOP},  /* -t */
 547   /*    {"help", <print usage stmt>, NOOP},*/   /* -h */
 548   /*    {"version", <print version number> ,  NOOP},*/  /* -V */
 549 #endif
 550
 551   {NULL, NULL, ILLEGAL}
 552 };
 553
 554 /* Parse a token which starts with %.
 555    Assumes the % has already been read and discarded.  */
 556
 557 int
 558 parse_percent_token (void)
 559 {
 560   register int c;
 561   register char *p;
 562   register struct percent_table_struct *tx;
 563
 564   p = token_buffer;
 565   c = getc(finput);
 566   *p++ = '%';
 567   *p++ = c;     /* for error msg */
 568   *p = 0;
 569
 570   switch (c)
 571     {
 572     case '%':
 573       return TWO_PERCENTS;
 574
 575     case '{':
 576       return PERCENT_LEFT_CURLY;
 577
 578     case '<':
 579       return LEFT;
 580
 581     case '>':
 582       return RIGHT;
 583
 584     case '2':
 585       return NONASSOC;
 586
 587     case '0':
 588       return TOKEN;
 589
 590     case '=':
 591       return PREC;
 592     }
 593   if (!isalpha(c))
 594     return ILLEGAL;
 595
 596   p = token_buffer;
 597   *p++ = '%';
 598   while (isalpha(c) || c == '_' || c == '-')
 599     {
 600       if (p == token_buffer + maxtoken)
 601         p = grow_token_buffer(p);
 602
 603       if (c == '-') c = '_';
 604       *p++ = c;
 605       c = getc(finput);
 606     }
 607
 608   ungetc(c, finput);
 609
 610   *p = 0;
 611
 612   /* table lookup % directive */
 613   for (tx = percent_table; tx->name; tx++)
 614     if (strcmp(token_buffer+1, tx->name) == 0)
 615       break;
 616   if (tx->retval == SETOPT)
 617     {
 618       *((char **)(tx->setflag)) = optarg;
 619       return NOOP;
 620     }
 621   if (tx->setflag)
 622     {
 623       *((int *)(tx->setflag)) = 1;
 624       return NOOP;
 625     }
 626   return tx->retval;
 627 }