src/lex.c

   1 /* Token-reader for Bison's input parser,
   2    Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc.
   3
   4 This file is part of Bison, the GNU Compiler Compiler.
   5
   6 Bison is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 Bison is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Bison; see the file COPYING.  If not, write to
  18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20
  21 /*
  22    lex() is the entry point.  It is called from reader.c.
  23    It returns one of the token-type codes defined in lex.h.
  24    When an identifier is seen, the code IDENTIFIER is returned
  25    and the name is looked up in the symbol table using symtab.c;
  26    symval is set to a pointer to the entry found.  */
  27
  28 #include <stdio.h>
  29 #include <ctype.h>
  30 #include "system.h"
  31 #include "files.h"
  32 #include "symtab.h"
  33 #include "lex.h"
  34 #include "new.h"
  35
  36
/* Line counter defined elsewhere; this file increments it for every
   newline it reads.  */
extern int lineno;
/* Defined elsewhere.  lex() stores -1 here when it reads a character
   literal token.  NOTE(review): what -1 means to the consumer is not
   visible in this file -- confirm against the definition.  */
extern int translations;

/* Defined later in this file; declared here because lex() calls it.  */
int parse_percent_token();

/* Error reporters defined elsewhere.  In this file fatals() is called
   with a printf-style format plus one argument, and fatal() with a
   plain message.  */
extern void fatals();
extern void fatal();

/* Buffer for storing the current token.  */
char *token_buffer;

/* Allocated size of token_buffer, not including space for terminator.  */
static int maxtoken;

/* Symbol-table entry found by lex() for the identifier or character
   literal it most recently read.  */
bucket *symval;
/* Value of the decimal number most recently read by lex().  */
int numval;

static int unlexed;             /* these two describe a token to be reread */
static bucket *unlexed_symval;  /* by the next call to lex */
  56
  57
  58 void
  59 init_lex()
  60 {
  61   maxtoken = 100;
  62   token_buffer = NEW2 (maxtoken + 1, char);
  63   unlexed = -1;
  64 }
  65
  66
  67 static char *
  68 grow_token_buffer (p)
  69      char *p;
  70 {
  71   int offset = p - token_buffer;
  72   maxtoken *= 2;
  73   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
  74   return token_buffer + offset;
  75 }
  76
  77
  78 int
  79 skip_white_space()
  80 {
  81   register int c;
  82   register int inside;
  83
  84   c = getc(finput);
  85
  86   for (;;)
  87     {
  88       int cplus_comment;
  89
  90       switch (c)
  91         {
  92         case '/':
  93           c = getc(finput);
  94           if (c != '*' && c != '/')
  95             fatals("unexpected `/%c' found",c);
  96           cplus_comment = (c == '/');
  97
  98           c = getc(finput);
  99
 100           inside = 1;
 101           while (inside)
 102             {
 103               if (!cplus_comment && c == '*')
 104                 {
 105                   while (c == '*')
 106                     c = getc(finput);
 107
 108                   if (c == '/')
 109                     {
 110                       inside = 0;
 111                       c = getc(finput);
 112                     }
 113                 }
 114               else if (c == '\n')
 115                 {
 116                   lineno++;
 117                   if (cplus_comment)
 118                     inside = 0;
 119                   c = getc(finput);
 120                 }
 121               else if (c == EOF)
 122                 fatal("unterminated comment");
 123               else
 124                 c = getc(finput);
 125             }
 126
 127           break;
 128
 129         case '\n':
 130           lineno++;
 131
 132         case ' ':
 133         case '\t':
 134         case '\f':
 135           c = getc(finput);
 136           break;
 137
 138         default:
 139           return (c);
 140         }
 141     }
 142 }
 143
 144
 145 void
 146 unlex(token)
 147 int token;
 148 {
 149   unlexed = token;
 150   unlexed_symval = symval;
 151 }
 152
 153
 154
 155 int
 156 lex()
 157 {
 158   register int c;
 159   register char *p;
 160
 161   if (unlexed >= 0)
 162     {
 163       symval = unlexed_symval;
 164       c = unlexed;
 165       unlexed = -1;
 166       return (c);
 167     }
 168
 169   c = skip_white_space();
 170
 171   switch (c)
 172     {
 173     case EOF:
 174       return (ENDFILE);
 175
 176     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
 177     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
 178     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
 179     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
 180     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
 181     case 'Z':
 182     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
 183     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
 184     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
 185     case 'p':  case 'q':  case 'r':  case 's':  case 't':
 186     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
 187     case 'z':
 188     case '.':  case '_':
 189       p = token_buffer;
 190       while (isalnum(c) || c == '_' || c == '.')
 191         {
 192           if (p == token_buffer + maxtoken)
 193             p = grow_token_buffer(p);
 194
 195           *p++ = c;
 196           c = getc(finput);
 197         }
 198
 199       *p = 0;
 200       ungetc(c, finput);
 201       symval = getsym(token_buffer);
 202       return (IDENTIFIER);
 203
 204     case '0':  case '1':  case '2':  case '3':  case '4':
 205     case '5':  case '6':  case '7':  case '8':  case '9':
 206       {
 207         numval = 0;
 208
 209         while (isdigit(c))
 210           {
 211             numval = numval*10 + c - '0';
 212             c = getc(finput);
 213           }
 214         ungetc(c, finput);
 215         return (NUMBER);
 216       }
 217
 218     case '\'':
 219       translations = -1;
 220
 221       /* parse the literal token and compute character code in  code  */
 222
 223       c = getc(finput);
 224       {
 225         register int code = 0;
 226
 227         if (c == '\\')
 228           {
 229             c = getc(finput);
 230
 231             if (c <= '7' && c >= '0')
 232               {
 233                 while (c <= '7' && c >= '0')
 234                   {
 235                     code = (code * 8) + (c - '0');
 236                     c = getc(finput);
 237                     if (code >= 256 || code < 0)
 238                       fatals("malformatted literal token `\\%03o'", code);
 239                   }
 240               }
 241             else
 242               {
 243                 if (c == 't')
 244                   code = '\t';
 245                 else if (c == 'n')
 246                   code = '\n';
 247                 else if (c == 'a')
 248                   code = '\007';
 249                 else if (c == 'r')
 250                   code = '\r';
 251                 else if (c == 'f')
 252                   code = '\f';
 253                 else if (c == 'b')
 254                   code = '\b';
 255                 else if (c == 'v')
 256                   code = 013;
 257                 else if (c == 'x')
 258                   {
 259                     c = getc(finput);
 260                     while ((c <= '9' && c >= '0')
 261                            || (c >= 'a' && c <= 'z')
 262                            || (c >= 'A' && c <= 'Z'))
 263                       {
 264                         code *= 16;
 265                         if (c <= '9' && c >= '0')
 266                           code += c - '0';
 267                         else if (c >= 'a' && c <= 'z')
 268                           code += c - 'a' + 10;
 269                         else if (c >= 'A' && c <= 'Z')
 270                           code += c - 'A' + 10;
 271                         if (code >= 256 || code<0)/* JF this said if(c>=128) */
 272                           fatals("malformatted literal token `\\x%x'",code);
 273                         c = getc(finput);
 274                       }
 275                     ungetc(c, finput);
 276                   }
 277                 else if (c == '\\')
 278                   code = '\\';
 279                 else if (c == '\'')
 280                   code = '\'';
 281                 else if (c == '\"')     /* JF this is a good idea */
 282                   code = '\"';
 283                 else
 284                   {
 285                     if (c >= 040 && c <= 0177)
 286                       fatals ("unknown escape sequence `\\%c'", c);
 287                     else
 288                       fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c);
 289                   }
 290
 291                 c = getc(finput);
 292               }
 293           }
 294         else
 295           {
 296             code = c;
 297             c = getc(finput);
 298           }
 299         if (c != '\'')
 300           fatal("multicharacter literal tokens not supported");
 301
 302         /* now fill token_buffer with the canonical name for this character
 303            as a literal token.  Do not use what the user typed,
 304            so that '\012' and '\n' can be interchangeable.  */
 305
 306         p = token_buffer;
 307         *p++ = '\'';
 308         if (code == '\\')
 309           {
 310             *p++ = '\\';
 311             *p++ = '\\';
 312           }
 313         else if (code == '\'')
 314           {
 315             *p++ = '\\';
 316             *p++ = '\'';
 317           }
 318         else if (code >= 040 && code != 0177)
 319           *p++ = code;
 320         else if (code == '\t')
 321           {
 322             *p++ = '\\';
 323             *p++ = 't';
 324           }
 325         else if (code == '\n')
 326           {
 327             *p++ = '\\';
 328             *p++ = 'n';
 329           }
 330         else if (code == '\r')
 331           {
 332             *p++ = '\\';
 333             *p++ = 'r';
 334           }
 335         else if (code == '\v')
 336           {
 337             *p++ = '\\';
 338             *p++ = 'v';
 339           }
 340         else if (code == '\b')
 341           {
 342             *p++ = '\\';
 343             *p++ = 'b';
 344           }
 345         else if (code == '\f')
 346           {
 347             *p++ = '\\';
 348             *p++ = 'f';
 349           }
 350         else
 351           {
 352             *p++ = code / 0100 + '0';
 353             *p++ = ((code / 010) & 07) + '0';
 354             *p++ = (code & 07) + '0';
 355           }
 356         *p++ = '\'';
 357         *p = 0;
 358         symval = getsym(token_buffer);
 359         symval->class = STOKEN;
 360         if (! symval->user_token_number)
 361           symval->user_token_number = code;
 362         return (IDENTIFIER);
 363       }
 364
 365     case ',':
 366       return (COMMA);
 367
 368     case ':':
 369       return (COLON);
 370
 371     case ';':
 372       return (SEMICOLON);
 373
 374     case '|':
 375       return (BAR);
 376
 377     case '{':
 378       return (LEFT_CURLY);
 379
 380     case '=':
 381       do
 382         {
 383           c = getc(finput);
 384           if (c == '\n') lineno++;
 385         }
 386       while(c==' ' || c=='\n' || c=='\t');
 387
 388       if (c == '{')
 389         return(LEFT_CURLY);
 390       else
 391         {
 392           ungetc(c, finput);
 393           return(ILLEGAL);
 394         }
 395
 396     case '<':
 397       p = token_buffer;
 398       c = getc(finput);
 399       while (c != '>')
 400         {
 401           if (c == '\n' || c == EOF)
 402             fatal("unterminated type name");
 403
 404           if (p == token_buffer + maxtoken)
 405             p = grow_token_buffer(p);
 406
 407           *p++ = c;
 408           c = getc(finput);
 409         }
 410       *p = 0;
 411       return (TYPENAME);
 412
 413
 414     case '%':
 415       return (parse_percent_token());
 416
 417     default:
 418       return (ILLEGAL);
 419     }
 420 }
 421
 422
 423 /* parse a token which starts with %.  Assumes the % has already been read and discarded.  */
 424
 425 int
 426 parse_percent_token ()
 427 {
 428   register int c;
 429   register char *p;
 430
 431   p = token_buffer;
 432   c = getc(finput);
 433
 434   switch (c)
 435     {
 436     case '%':
 437       return (TWO_PERCENTS);
 438
 439     case '{':
 440       return (PERCENT_LEFT_CURLY);
 441
 442     case '<':
 443       return (LEFT);
 444
 445     case '>':
 446       return (RIGHT);
 447
 448     case '2':
 449       return (NONASSOC);
 450
 451     case '0':
 452       return (TOKEN);
 453
 454     case '=':
 455       return (PREC);
 456     }
 457   if (!isalpha(c))
 458     return (ILLEGAL);
 459
 460   while (isalpha(c) || c == '_')
 461     {
 462       if (p == token_buffer + maxtoken)
 463         p = grow_token_buffer(p);
 464
 465       *p++ = c;
 466       c = getc(finput);
 467     }
 468
 469   ungetc(c, finput);
 470
 471   *p = 0;
 472
 473   if (strcmp(token_buffer, "token") == 0
 474       ||
 475       strcmp(token_buffer, "term") == 0)
 476     return (TOKEN);
 477   else if (strcmp(token_buffer, "nterm") == 0)
 478     return (NTERM);
 479   else if (strcmp(token_buffer, "type") == 0)
 480     return (TYPE);
 481   else if (strcmp(token_buffer, "guard") == 0)
 482     return (GUARD);
 483   else if (strcmp(token_buffer, "union") == 0)
 484     return (UNION);
 485   else if (strcmp(token_buffer, "expect") == 0)
 486     return (EXPECT);
 487   else if (strcmp(token_buffer, "start") == 0)
 488     return (START);
 489   else if (strcmp(token_buffer, "left") == 0)
 490     return (LEFT);
 491   else if (strcmp(token_buffer, "right") == 0)
 492     return (RIGHT);
 493   else if (strcmp(token_buffer, "nonassoc") == 0
 494            ||
 495            strcmp(token_buffer, "binary") == 0)
 496     return (NONASSOC);
 497   else if (strcmp(token_buffer, "semantic_parser") == 0)
 498     return (SEMANTIC_PARSER);
 499   else if (strcmp(token_buffer, "pure_parser") == 0)
 500     return (PURE_PARSER);
 501   else if (strcmp(token_buffer, "prec") == 0)
 502     return (PREC);
 503   else return (ILLEGAL);
 504 }