src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989, 1992 Free Software Foundation, Inc.
   3
   4 This file is part of Bison, the GNU Compiler Compiler.
   5
   6 Bison is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 Bison is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Bison; see the file COPYING.  If not, write to
  18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20
  21 /*
  22    lex is the entry point.  It is called from reader.c.
  23    It returns one of the token-type codes defined in lex.h.
  24    When an identifier is seen, the code IDENTIFIER is returned
  25    and the name is looked up in the symbol table using symtab.c;
  26    symval is set to a pointer to the entry found.  */
  27
  28 #include <stdio.h>
  29 #include "system.h"
  30 #include "files.h"
  31 #include "getopt.h"             /* for optarg */
  32 #include "symtab.h"
  33 #include "lex.h"
  34 #include "alloc.h"
  35
   36 /* Option flags defined elsewhere; percent_table below refers to some of them by address.  */
   37 extern int definesflag;         /* for -d */
   38 extern int toknumflag;          /* for -k */
   39 extern int noparserflag;        /* for -n */
   40 extern int fixed_outfiles;      /* for -y */
   41 extern int nolinesflag;         /* for -l */
   42 extern int rawtoknumflag;       /* for -r */
   43 extern int verboseflag; /* for -v */
   44 extern int debugflag;           /* for -t */
   45 extern char *spec_name_prefix;  /* for -p */
   46 extern char *spec_file_prefix;  /* for -b */
   47 /* spec_outfile is declared in files.h, for -o */
   48
   49 extern int lineno;              /* incremented in this file for each newline consumed */
   50 extern int translations;        /* set to -1 by lex when a quoted literal token is read */
   51
   52 void init_lex PARAMS((void));
   53 char *grow_token_buffer PARAMS((char *));
   54 int skip_white_space PARAMS((void));
   55 int safegetc PARAMS((FILE *));
   56 int literalchar PARAMS((char **, int *, char));
   57 void unlex PARAMS((int));
   58 int lex PARAMS((void));
   59 int parse_percent_token PARAMS((void));
   60
   61 /* Diagnostic functions from main.c.  */
   62 extern char *printable_version PARAMS((int));
   63 extern void fatal PARAMS((char *));
   64 extern void warn PARAMS((char *));
   65 extern void warni PARAMS((char *, int));
   66 extern void warns PARAMS((char *, char *));
   67
   68 /* Buffer holding the NUL-terminated text of the current token.  */
   69 char *token_buffer;
   70
   71 /* Allocated size of token_buffer, not including space for terminator.  */
   72 int maxtoken;
   73
   74 bucket *symval;                 /* symbol-table entry set by lex for IDENTIFIER tokens */
   75 int numval;                     /* decimal value set by lex for NUMBER tokens */
   76
   77 static int unlexed;             /* token code pushed back by unlex, or -1 if none */
   78 static bucket *unlexed_symval;  /* symval saved by unlex, restored by the next call to lex */
  79
  80
  81 void
  82 init_lex (void)
  83 {
  84   maxtoken = 100;
  85   token_buffer = NEW2 (maxtoken + 1, char);
  86   unlexed = -1;
  87 }
  88
  89
  90 char *
  91 grow_token_buffer (char *p)
  92 {
  93   int offset = p - token_buffer;
  94   maxtoken *= 2;
  95   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  96   return token_buffer + offset;
  97 }
  98
  99
 100 int
 101 skip_white_space (void)
 102 {
 103   register int c;
 104   register int inside;
 105
 106   c = getc(finput);
 107
 108   for (;;)
 109     {
 110       int cplus_comment;
 111
 112       switch (c)
 113         {
 114         case '/':
 115           c = getc(finput);
 116           if (c != '*' && c != '/')
 117             {
 118               warn(_("unexpected `/' found and ignored"));
 119               break;
 120             }
 121           cplus_comment = (c == '/');
 122
 123           c = getc(finput);
 124
 125           inside = 1;
 126           while (inside)
 127             {
 128               if (!cplus_comment && c == '*')
 129                 {
 130                   while (c == '*')
 131                     c = getc(finput);
 132
 133                   if (c == '/')
 134                     {
 135                       inside = 0;
 136                       c = getc(finput);
 137                     }
 138                 }
 139               else if (c == '\n')
 140                 {
 141                   lineno++;
 142                   if (cplus_comment)
 143                     inside = 0;
 144                   c = getc(finput);
 145                 }
 146               else if (c == EOF)
 147                 fatal(_("unterminated comment"));
 148               else
 149                 c = getc(finput);
 150             }
 151
 152           break;
 153
 154         case '\n':
 155           lineno++;
 156
 157         case ' ':
 158         case '\t':
 159         case '\f':
 160           c = getc(finput);
 161           break;
 162
 163         default:
 164           return (c);
 165         }
 166     }
 167 }
 168
 169 /* do a getc, but give error message if EOF encountered */
 170 int
 171 safegetc (FILE *f)
 172 {
 173   register int c = getc(f);
 174   if (c == EOF)
 175     fatal(_("Unexpected end of file"));
 176   return c;
 177 }
 178
 179 /* read one literal character from finput.  process \ escapes.
 180    append the normalized string version of the char to *pp.
 181    assign the character code to *pcode
 182    return 1 unless the character is an unescaped `term' or \n
 183         report error for \n
 184 */
 185 int
 186 literalchar (char **pp, int *pcode, char term)
 187 {
 188   register int c;
 189   register char *p;
 190   register int code;
 191   int wasquote = 0;
 192
 193   c = safegetc(finput);
 194   if (c == '\n')
 195     {
 196       warn(_("unescaped newline in constant"));
 197       ungetc(c, finput);
 198       code = '?';
 199       wasquote = 1;
 200     }
 201   else if (c != '\\')
 202     {
 203       code = c;
 204       if (c == term)
 205         wasquote = 1;
 206     }
 207   else
 208     {
 209       c = safegetc(finput);
 210       if (c == 't')  code = '\t';
 211       else if (c == 'n')  code = '\n';
 212       else if (c == 'a')  code = '\007';
 213       else if (c == 'r')  code = '\r';
 214       else if (c == 'f')  code = '\f';
 215       else if (c == 'b')  code = '\b';
 216       else if (c == 'v')  code = '\013';
 217       else if (c == '\\')  code = '\\';
 218       else if (c == '\'')  code = '\'';
 219       else if (c == '\"')  code = '\"';
 220       else if (c <= '7' && c >= '0')
 221         {
 222           code = 0;
 223           while (c <= '7' && c >= '0')
 224             {
 225               code = (code * 8) + (c - '0');
 226               if (code >= 256 || code < 0)
 227                 {
 228                   warni(_("octal value outside range 0...255: `\\%o'"), code);
 229                   code &= 0xFF;
 230                   break;
 231                 }
 232               c = safegetc(finput);
 233             }
 234           ungetc(c, finput);
 235         }
 236       else if (c == 'x')
 237         {
 238           c = safegetc(finput);
 239           code = 0;
 240           while (1)
 241             {
 242               if (c >= '0' && c <= '9')
 243                 code *= 16,  code += c - '0';
 244               else if (c >= 'a' && c <= 'f')
 245                 code *= 16,  code += c - 'a' + 10;
 246               else if (c >= 'A' && c <= 'F')
 247                 code *= 16,  code += c - 'A' + 10;
 248               else
 249                 break;
 250               if (code >= 256 || code<0)
 251                 {
 252                   warni(_("hexadecimal value above 255: `\\x%x'"), code);
 253                   code &= 0xFF;
 254                   break;
 255                 }
 256               c = safegetc(finput);
 257             }
 258           ungetc(c, finput);
 259         }
 260       else
 261         {
 262           warns (_("unknown escape sequence: `\\' followed by `%s'"),
 263                  printable_version(c));
 264           code = '?';
 265         }
 266     } /* has \ */
 267
 268   /* now fill token_buffer with the canonical name for this character
 269      as a literal token.  Do not use what the user typed,
 270      so that `\012' and `\n' can be interchangeable.  */
 271
 272   p = *pp;
 273   if (code == '\\')  {*p++ = '\\'; *p++ = '\\';}
 274   else if (code == '\'')  {*p++ = '\\'; *p++ = '\'';}
 275   else if (code == '\"')  {*p++ = '\\'; *p++ = '\"';}
 276   else if (code >= 040 && code < 0177)
 277     *p++ = code;
 278   else if (code == '\t')  {*p++ = '\\'; *p++ = 't';}
 279   else if (code == '\n')  {*p++ = '\\'; *p++ = 'n';}
 280   else if (code == '\r')  {*p++ = '\\'; *p++ = 'r';}
 281   else if (code == '\v')  {*p++ = '\\'; *p++ = 'v';}
 282   else if (code == '\b')  {*p++ = '\\'; *p++ = 'b';}
 283   else if (code == '\f')  {*p++ = '\\'; *p++ = 'f';}
 284   else
 285     {
 286       *p++ = '\\';
 287       *p++ = code / 0100 + '0';
 288       *p++ = ((code / 010) & 07) + '0';
 289       *p++ = (code & 07) + '0';
 290     }
 291   *pp = p;
 292   *pcode = code;
 293   return  ! wasquote;
 294 }
 295
 296
 297 void
 298 unlex (int token)
 299 {
 300   unlexed = token;
 301   unlexed_symval = symval;
 302 }
 303
 304
 305 int
 306 lex (void)
 307 {
 308   register int c;
 309   char *p;
 310
 311   if (unlexed >= 0)
 312     {
 313       symval = unlexed_symval;
 314       c = unlexed;
 315       unlexed = -1;
 316       return (c);
 317     }
 318
 319   c = skip_white_space();
 320   *token_buffer = c;    /* for error messages (token buffer always valid) */
 321   token_buffer[1] = 0;
 322
 323   switch (c)
 324     {
 325     case EOF:
 326       strcpy(token_buffer, "EOF");
 327       return (ENDFILE);
 328
 329     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 330     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 331     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 332     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 333     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 334     case 'Z':
 335     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 336     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 337     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 338     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 339     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 340     case 'z':
 341     case '.':  case '_':
 342       p = token_buffer;
 343       while (isalnum(c) || c == '_' || c == '.')
 344         {
 345           if (p == token_buffer + maxtoken)
 346             p = grow_token_buffer(p);
 347
 348           *p++ = c;
 349           c = getc(finput);
 350         }
 351
 352       *p = 0;
 353       ungetc(c, finput);
 354       symval = getsym(token_buffer);
 355       return (IDENTIFIER);
 356
 357     case '0':  case '1':  case '2':  case '3':  case '4':
 358     case '5':  case '6':  case '7':  case '8':  case '9':
 359       {
 360         numval = 0;
 361
 362         p = token_buffer;
 363         while (isdigit(c))
 364           {
 365             if (p == token_buffer + maxtoken)
 366               p = grow_token_buffer(p);
 367
 368             *p++ = c;
 369             numval = numval*10 + c - '0';
 370             c = getc(finput);
 371           }
 372         *p = 0;
 373         ungetc(c, finput);
 374         return (NUMBER);
 375       }
 376
 377     case '\'':
 378
 379       /* parse the literal token and compute character code in  code  */
 380
 381       translations = -1;
 382       {
 383         int code, discode;
 384         char discard[10], *dp;
 385
 386         p = token_buffer;
 387         *p++ = '\'';
 388         literalchar(&p, &code, '\'');
 389
 390         c = getc(finput);
 391         if (c != '\'')
 392           {
 393             warn(_("use \"...\" for multi-character literal tokens"));
 394             while (1)
 395               {
 396                 dp = discard;
 397                 if (! literalchar(&dp, &discode, '\''))
 398                   break;
 399               }
 400           }
 401         *p++ = '\'';
 402         *p = 0;
 403         symval = getsym(token_buffer);
 404         symval->class = STOKEN;
 405         if (! symval->user_token_number)
 406           symval->user_token_number = code;
 407         return (IDENTIFIER);
 408       }
 409
 410     case '\"':
 411
 412       /* parse the literal string token and treat as an identifier */
 413
 414       translations = -1;
 415       {
 416         int code;       /* ignored here */
 417         p = token_buffer;
 418         *p++ = '\"';
 419         while (literalchar(&p, &code, '\"'))  /* read up to and including " */
 420           {
 421             if (p >= token_buffer + maxtoken - 4)
 422               p = grow_token_buffer(p);
 423           }
 424         *p = 0;
 425
 426         symval = getsym(token_buffer);
 427         symval->class = STOKEN;
 428
 429         return (IDENTIFIER);
 430       }
 431
 432     case ',':
 433       return (COMMA);
 434
 435     case ':':
 436       return (COLON);
 437
 438     case ';':
 439       return (SEMICOLON);
 440
 441     case '|':
 442       return (BAR);
 443
 444     case '{':
 445       return (LEFT_CURLY);
 446
 447     case '=':
 448       do
 449         {
 450           c = getc(finput);
 451           if (c == '\n') lineno++;
 452         }
 453       while(c==' ' || c=='\n' || c=='\t');
 454
 455       if (c == '{')
 456         {
 457           strcpy(token_buffer, "={");
 458           return(LEFT_CURLY);
 459         }
 460       else
 461         {
 462           ungetc(c, finput);
 463           return(ILLEGAL);
 464         }
 465
 466     case '<':
 467       p = token_buffer;
 468       c = getc(finput);
 469       while (c != '>')
 470         {
 471           if (c == EOF)
 472             fatal(_("unterminated type name at end of file"));
 473           if (c == '\n')
 474             {
 475               warn(_("unterminated type name"));
 476               ungetc(c, finput);
 477               break;
 478             }
 479
 480           if (p == token_buffer + maxtoken)
 481             p = grow_token_buffer(p);
 482
 483           *p++ = c;
 484           c = getc(finput);
 485         }
 486       *p = 0;
 487       return (TYPENAME);
 488
 489
 490     case '%':
 491       return (parse_percent_token());
 492
 493     default:
 494       return (ILLEGAL);
 495     }
 496 }
 497
  498 /* Table of the spelled-out % directives, searched by parse_percent_token.
  499    An entry with a non-NULL setflag makes the lookup store into the
  500    object it points to and return NOOP; otherwise retval is returned.
  501    The table ends with an entry whose name is NULL.  */
  502 struct percent_table_struct {
  503         char *name;           /* directive name, without the leading `%' */
  504         void *setflag;        /* NULL, or address of an int flag (of a char * for SETOPT) */
  505         int retval;           /* token code for the directive, or NOOP / SETOPT */
  506 } percent_table[] =
  507 {
  508   {"token", NULL, TOKEN},
  509   {"term", NULL, TOKEN},
  510   {"nterm", NULL, NTERM},
  511   {"type", NULL, TYPE},
  512   {"guard", NULL, GUARD},
  513   {"union", NULL, UNION},
  514   {"expect", NULL, EXPECT},
  515   {"thong", NULL, THONG},
  516   {"start", NULL, START},
  517   {"left", NULL, LEFT},
  518   {"right", NULL, RIGHT},
  519   {"nonassoc", NULL, NONASSOC},
  520   {"binary", NULL, NONASSOC},
  521   {"semantic_parser", NULL, SEMANTIC_PARSER},
  522   {"pure_parser", NULL, PURE_PARSER},
  523   {"prec", NULL, PREC},
  524
  525   {"no_lines", &nolinesflag, NOOP}, /* -l */
  526   {"raw", &rawtoknumflag, NOOP}, /* -r */
  527   {"token_table", &toknumflag, NOOP}, /* -k */
  528
  529 #if 0
  530   /* These can be utilized after main is reorganized so
  531      open_files() is deferred until after read_declarations().
  532      But %{ and %union both put information into files
  533      that have to be opened before read_declarations().
  534      */
  535   {"yacc", &fixed_outfiles, NOOP}, /* -y */
  536   {"fixed_output_files", &fixed_outfiles, NOOP}, /* -y */
  537   {"defines", &definesflag, NOOP}, /* -d */
  538   {"no_parser", &noparserflag, NOOP}, /* -n */
  539   {"output_file", &spec_outfile, SETOPT}, /* -o */
  540   {"file_prefix", &spec_file_prefix, SETOPT}, /* -b */
  541   {"name_prefix", &spec_name_prefix, SETOPT}, /* -p */
  542
  543   /* These would be acceptable, but they do not affect processing */
  544   {"verbose", &verboseflag, NOOP}, /* -v */
  545   {"debug", &debugflag, NOOP},  /* -t */
  546   /*    {"help", <print usage stmt>, NOOP},*/   /* -h */
  547   /*    {"version", <print version number> ,  NOOP},*/  /* -V */
  548 #endif
  549
  550   {NULL, NULL, ILLEGAL}   /* end marker; also the result for an unknown directive */
  551 };
 552
 553 /* Parse a token which starts with %.
 554    Assumes the % has already been read and discarded.  */
 555
 556 int
 557 parse_percent_token (void)
 558 {
 559   register int c;
 560   register char *p;
 561   register struct percent_table_struct *tx;
 562
 563   p = token_buffer;
 564   c = getc(finput);
 565   *p++ = '%';
 566   *p++ = c;     /* for error msg */
 567   *p = 0;
 568
 569   switch (c)
 570     {
 571     case '%':
 572       return (TWO_PERCENTS);
 573
 574     case '{':
 575       return (PERCENT_LEFT_CURLY);
 576
 577     case '<':
 578       return (LEFT);
 579
 580     case '>':
 581       return (RIGHT);
 582
 583     case '2':
 584       return (NONASSOC);
 585
 586     case '0':
 587       return (TOKEN);
 588
 589     case '=':
 590       return (PREC);
 591     }
 592   if (!isalpha(c))
 593     return (ILLEGAL);
 594
 595   p = token_buffer;
 596   *p++ = '%';
 597   while (isalpha(c) || c == '_' || c == '-')
 598     {
 599       if (p == token_buffer + maxtoken)
 600         p = grow_token_buffer(p);
 601
 602       if (c == '-') c = '_';
 603       *p++ = c;
 604       c = getc(finput);
 605     }
 606
 607   ungetc(c, finput);
 608
 609   *p = 0;
 610
 611   /* table lookup % directive */
 612   for (tx = percent_table; tx->name; tx++)
 613     if (strcmp(token_buffer+1, tx->name) == 0)
 614       break;
 615   if (tx->retval == SETOPT)
 616     {
 617       *((char **)(tx->setflag)) = optarg;
 618       return NOOP;
 619     }
 620   if (tx->setflag)
 621     {
 622       *((int *)(tx->setflag)) = 1;
 623       return NOOP;
 624     }
 625   return tx->retval;
 626 }