X-Git-Url: https://git.saurik.com/bison.git/blobdiff_plain/40675e7cc90b44d3c82c424946ea55083eb78121..ea6cfe9ebd960e2cb4573c2c6f02b02931ac1f86:/src/lex.c diff --git a/src/lex.c b/src/lex.c index 20d89a1e..489fdf9c 100644 --- a/src/lex.c +++ b/src/lex.c @@ -1,87 +1,67 @@ /* Token-reader for Bison's input parser, - Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc. + Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc. -This file is part of Bison, the GNU Compiler Compiler. + This file is part of Bison, the GNU Compiler Compiler. -Bison is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. + Bison is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. -Bison is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. + Bison is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. -You should have received a copy of the GNU General Public License -along with Bison; see the file COPYING. If not, write to -the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + You should have received a copy of the GNU General Public License + along with Bison; see the file COPYING. If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ - -/* - lex() is the entry point. It is called from reader.c. - It returns one of the token-type codes defined in lex.h. - When an identifier is seen, the code IDENTIFIER is returned - and the name is looked up in the symbol table using symtab.c; - symval is set to a pointer to the entry found. */ - -#include -#include #include "system.h" +#include "getargs.h" #include "files.h" #include "symtab.h" +#include "options.h" #include "lex.h" -#include "new.h" - - -extern int lineno; -extern int translations; - -int parse_percent_token(); - -extern void fatals(); -extern void fatal(); +#include "complain.h" +#include "gram.h" +#include "quote.h" /* Buffer for storing the current token. */ -char *token_buffer; +static struct obstack token_obstack; +const char *token_buffer = NULL; -/* Allocated size of token_buffer, not including space for terminator. */ -static int maxtoken; - -bucket *symval; +bucket *symval = NULL; int numval; -static int unlexed; /* these two describe a token to be reread */ -static bucket *unlexed_symval; /* by the next call to lex */ - +/* A token to be reread, see unlex and lex. */ +static token_t unlexed = tok_undef; +static bucket *unlexed_symval = NULL; +static const char *unlexed_token_buffer = NULL; void -init_lex() +lex_init (void) { - maxtoken = 100; - token_buffer = NEW2 (maxtoken + 1, char); - unlexed = -1; + obstack_init (&token_obstack); + unlexed = tok_undef; } -static char * -grow_token_buffer (p) - char *p; +void +lex_free (void) { - int offset = p - token_buffer; - maxtoken *= 2; - token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1); - return token_buffer + offset; + obstack_free (&token_obstack, NULL); } int -skip_white_space() +skip_white_space (void) { - register int c; - register int inside; + int c; + int inside; - c = getc(finput); + c = getc (finput); for (;;) { @@ -90,12 +70,16 @@ skip_white_space() switch (c) { case '/': - c = getc(finput); + /* FIXME: Should probably be merged with copy_comment. */ + c = getc (finput); if (c != '*' && c != '/') - fatals("unexpected `/%c' found",c); + { + complain (_("unexpected `/' found and ignored")); + break; + } cplus_comment = (c == '/'); - c = getc(finput); + c = getc (finput); inside = 1; while (inside) @@ -103,12 +87,12 @@ skip_white_space() if (!cplus_comment && c == '*') { while (c == '*') - c = getc(finput); + c = getc (finput); if (c == '/') { inside = 0; - c = getc(finput); + c = getc (finput); } } else if (c == '\n') @@ -116,12 +100,12 @@ skip_white_space() lineno++; if (cplus_comment) inside = 0; - c = getc(finput); + c = getc (finput); } else if (c == EOF) - fatal("unterminated comment"); + fatal (_("unterminated comment")); else - c = getc(finput); + c = getc (finput); } break; @@ -132,373 +116,493 @@ skip_white_space() case ' ': case '\t': case '\f': - c = getc(finput); + c = getc (finput); break; default: - return (c); + return c; } } } +/*-----------------------------------------------------. +| Do a getc, but give error message if EOF encountered | +`-----------------------------------------------------*/ + +int +xgetc (FILE *f) +{ + int c = getc (f); + if (c == EOF) + fatal (_("unexpected end of file")); + return c; +} + + +/*---------------------------------------------------------------. +| Read one literal character from FINPUT, process \-escapes, and | +| return the character. | +`---------------------------------------------------------------*/ + +char +literalchar (void) +{ + int c; + int res; + + c = xgetc (finput); + if (c == '\n') + { + complain (_("unescaped newline in constant")); + ungetc (c, finput); + res = '?'; + } + else if (c != '\\') + { + res = c; + } + else + { + c = xgetc (finput); + if (c == 't') + res = '\t'; + else if (c == 'n') + res = '\n'; + else if (c == 'a') + res = '\007'; + else if (c == 'r') + res = '\r'; + else if (c == 'f') + res = '\f'; + else if (c == 'b') + res = '\b'; + else if (c == 'v') + res = '\013'; + else if (c == '\\') + res = '\\'; + else if (c == '\'') + res = '\''; + else if (c == '\"') + res = '\"'; + else if (c <= '7' && c >= '0') + { + res = 0; + while (c <= '7' && c >= '0') + { + res = (res * 8) + (c - '0'); + if (res >= 256 || res < 0) + { + complain (_("octal value outside range 0...255: `\\%o'"), + res); + res &= 0xFF; + break; + } + c = xgetc (finput); + } + ungetc (c, finput); + } + else if (c == 'x') + { + c = xgetc (finput); + res = 0; + while (1) + { + if (c >= '0' && c <= '9') + res *= 16, res += c - '0'; + else if (c >= 'a' && c <= 'f') + res *= 16, res += c - 'a' + 10; + else if (c >= 'A' && c <= 'F') + res *= 16, res += c - 'A' + 10; + else + break; + if (res >= 256 || res < 0) + { + complain (_("hexadecimal value above 255: `\\x%x'"), res); + res &= 0xFF; + break; + } + c = xgetc (finput); + } + ungetc (c, finput); + } + else + { + char badchar [] = "c"; + badchar[0] = c; + complain (_("unknown escape sequence: `\\' followed by `%s'"), + quote (badchar)); + res = '?'; + } + } /* has \ */ + + return res; +} + + void -unlex(token) -int token; +unlex (token_t token) { unlexed = token; + unlexed_token_buffer = token_buffer; unlexed_symval = symval; } +/*-----------------------------------------------------------------. +| We just read `<' from FIN. Store in TOKEN_BUFFER, the type name | +| specified between the `<...>'. | +`-----------------------------------------------------------------*/ + +void +read_type_name (FILE *fin) +{ + int c = getc (fin); + while (c != '>') + { + if (c == EOF) + fatal (_("unterminated type name at end of file")); + if (c == '\n') + { + complain (_("unterminated type name")); + ungetc (c, fin); + break; + } -int -lex() + obstack_1grow (&token_obstack, c); + c = getc (fin); + } + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); +} + + +token_t +lex (void) { - register int c; - register char *p; + int c; + + /* Just to make sure. */ + token_buffer = NULL; - if (unlexed >= 0) + if (unlexed != tok_undef) { + token_t res = unlexed; symval = unlexed_symval; - c = unlexed; - unlexed = -1; - return (c); + token_buffer = unlexed_token_buffer; + unlexed = tok_undef; + return res; } - c = skip_white_space(); + c = skip_white_space (); switch (c) { case EOF: - return (ENDFILE); - - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': case 'G': case 'H': case 'I': case 'J': - case 'K': case 'L': case 'M': case 'N': case 'O': - case 'P': case 'Q': case 'R': case 'S': case 'T': - case 'U': case 'V': case 'W': case 'X': case 'Y': + token_buffer = "EOF"; + return tok_eof; + + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': case 'g': case 'h': case 'i': case 'j': - case 'k': case 'l': case 'm': case 'n': case 'o': - case 'p': case 'q': case 'r': case 's': case 't': - case 'u': case 'v': case 'w': case 'x': case 'y': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': - case '.': case '_': - p = token_buffer; - while (isalnum(c) || c == '_' || c == '.') - { - if (p == token_buffer + maxtoken) - p = grow_token_buffer(p); + case '.': case '_': - *p++ = c; - c = getc(finput); + while (isalnum (c) || c == '_' || c == '.') + { + obstack_1grow (&token_obstack, c); + c = getc (finput); } - - *p = 0; - ungetc(c, finput); - symval = getsym(token_buffer); - return (IDENTIFIER); - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + ungetc (c, finput); + symval = getsym (token_buffer); + return tok_identifier; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { numval = 0; - while (isdigit(c)) + while (isdigit (c)) { - numval = numval*10 + c - '0'; - c = getc(finput); + obstack_1grow (&token_obstack, c); + numval = numval * 10 + c - '0'; + c = getc (finput); } - ungetc(c, finput); - return (NUMBER); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + ungetc (c, finput); + return tok_number; } case '\'': - translations = -1; - /* parse the literal token and compute character code in code */ - c = getc(finput); { - register int code = 0; + int code = literalchar (); - if (c == '\\') - { - c = getc(finput); - - if (c <= '7' && c >= '0') - { - while (c <= '7' && c >= '0') - { - code = (code * 8) + (c - '0'); - c = getc(finput); - if (code >= 256 || code < 0) - fatals("malformatted literal token `\\%03o'", code); - } - } - else - { - if (c == 't') - code = '\t'; - else if (c == 'n') - code = '\n'; - else if (c == 'a') - code = '\007'; - else if (c == 'r') - code = '\r'; - else if (c == 'f') - code = '\f'; - else if (c == 'b') - code = '\b'; - else if (c == 'v') - code = 013; - else if (c == 'x') - { - c = getc(finput); - while ((c <= '9' && c >= '0') - || (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z')) - { - code *= 16; - if (c <= '9' && c >= '0') - code += c - '0'; - else if (c >= 'a' && c <= 'z') - code += c - 'a' + 10; - else if (c >= 'A' && c <= 'Z') - code += c - 'A' + 10; - if (code >= 256 || code<0)/* JF this said if(c>=128) */ - fatals("malformatted literal token `\\x%x'",code); - c = getc(finput); - } - ungetc(c, finput); - } - else if (c == '\\') - code = '\\'; - else if (c == '\'') - code = '\''; - else if (c == '\"') /* JF this is a good idea */ - code = '\"'; - else - { - if (c >= 040 && c <= 0177) - fatals ("unknown escape sequence `\\%c'", c); - else - fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c); - } - - c = getc(finput); - } - } - else + obstack_1grow (&token_obstack, '\''); + obstack_1grow (&token_obstack, code); + + c = getc (finput); + if (c != '\'') { - code = c; - c = getc(finput); + complain (_("use \"...\" for multi-character literal tokens")); + while (literalchar () != '\'') + /* Skip. */; } - if (c != '\'') - fatal("multicharacter literal tokens not supported"); + obstack_1grow (&token_obstack, '\''); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + symval = getsym (token_buffer); + symval->class = token_sym; + if (symval->user_token_number == SUNDEF) + symval->user_token_number = code; + return tok_identifier; + } - /* now fill token_buffer with the canonical name for this character - as a literal token. Do not use what the user typed, - so that '\012' and '\n' can be interchangeable. */ + case '\"': + /* parse the literal string token and treat as an identifier */ - p = token_buffer; - *p++ = '\''; - if (code == '\\') - { - *p++ = '\\'; - *p++ = '\\'; - } - else if (code == '\'') - { - *p++ = '\\'; - *p++ = '\''; - } - else if (code >= 040 && code != 0177) - *p++ = code; - else if (code == '\t') - { - *p++ = '\\'; - *p++ = 't'; - } - else if (code == '\n') - { - *p++ = '\\'; - *p++ = 'n'; - } - else if (code == '\r') - { - *p++ = '\\'; - *p++ = 'r'; - } - else if (code == '\v') - { - *p++ = '\\'; - *p++ = 'v'; - } - else if (code == '\b') - { - *p++ = '\\'; - *p++ = 'b'; - } - else if (code == '\f') - { - *p++ = '\\'; - *p++ = 'f'; - } - else + { + int code; /* ignored here */ + + obstack_1grow (&token_obstack, '\"'); + /* Read up to and including ". */ + do { - *p++ = code / 0100 + '0'; - *p++ = ((code / 010) & 07) + '0'; - *p++ = (code & 07) + '0'; + code = literalchar (); + obstack_1grow (&token_obstack, code); } - *p++ = '\''; - *p = 0; - symval = getsym(token_buffer); - symval->class = STOKEN; - if (! symval->user_token_number) - symval->user_token_number = code; - return (IDENTIFIER); + while (code != '\"'); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + + symval = getsym (token_buffer); + symval->class = token_sym; + + return tok_identifier; } case ',': - return (COMMA); + token_buffer = ","; + return tok_comma; case ':': - return (COLON); + token_buffer = ":"; + return tok_colon; case ';': - return (SEMICOLON); + token_buffer = ";"; + return tok_semicolon; case '|': - return (BAR); + token_buffer = "|"; + return tok_bar; case '{': - return (LEFT_CURLY); + token_buffer = "{"; + return tok_left_curly; case '=': + obstack_1grow (&token_obstack, c); do { - c = getc(finput); - if (c == '\n') lineno++; + c = getc (finput); + obstack_1grow (&token_obstack, c); + if (c == '\n') + lineno++; } - while(c==' ' || c=='\n' || c=='\t'); + while (c == ' ' || c == '\n' || c == '\t'); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); if (c == '{') - return(LEFT_CURLY); + { + return tok_left_curly; + } else { - ungetc(c, finput); - return(ILLEGAL); + ungetc (c, finput); + return tok_illegal; } case '<': - p = token_buffer; - c = getc(finput); - while (c != '>') - { - if (c == '\n' || c == EOF) - fatal("unterminated type name"); - - if (p == token_buffer + maxtoken) - p = grow_token_buffer(p); - - *p++ = c; - c = getc(finput); - } - *p = 0; - return (TYPENAME); - + read_type_name (finput); + return tok_typename; case '%': - return (parse_percent_token()); + return parse_percent_token (); default: - return (ILLEGAL); + obstack_1grow (&token_obstack, c); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + return tok_illegal; } } +/* This function is a strcmp, which doesn't differentiate `-' and `_' + chars. */ -/* parse a token which starts with %. Assumes the % has already been read and discarded. */ +static int +option_strcmp (const char *left, const char *right) +{ + const unsigned char *l, *r; + int c; + + assert (left); + assert (right); + l = (const unsigned char *)left; + r = (const unsigned char *)right; + while (((c = *l - *r++) == 0 && *l != '\0') + || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-'))) + l++; + return c; +} -int -parse_percent_token () +/* Parse a token which starts with %. + Assumes the % has already been read and discarded. */ + +token_t +parse_percent_token (void) { - register int c; - register char *p; + const struct option_table_struct *tx = NULL; + const char *arg = NULL; + /* Where the ARG was found in token_buffer. */ + size_t arg_offset = 0; - p = token_buffer; - c = getc(finput); + int c = getc (finput); + obstack_1grow (&token_obstack, '%'); + obstack_1grow (&token_obstack, c); switch (c) { case '%': - return (TWO_PERCENTS); + token_buffer = obstack_finish (&token_obstack); + return tok_two_percents; case '{': - return (PERCENT_LEFT_CURLY); + token_buffer = obstack_finish (&token_obstack); + return tok_percent_left_curly; + /* The following guys are here for backward compatibility with + very ancient Yacc versions. The paper of Johnson mentions + them (as ancient :). */ case '<': - return (LEFT); + token_buffer = obstack_finish (&token_obstack); + return tok_left; case '>': - return (RIGHT); + token_buffer = obstack_finish (&token_obstack); + return tok_right; case '2': - return (NONASSOC); + token_buffer = obstack_finish (&token_obstack); + return tok_nonassoc; case '0': - return (TOKEN); + token_buffer = obstack_finish (&token_obstack); + return tok_token; case '=': - return (PREC); + token_buffer = obstack_finish (&token_obstack); + return tok_prec; } - if (!isalpha(c)) - return (ILLEGAL); - while (isalpha(c) || c == '_') + if (!isalpha (c)) { - if (p == token_buffer + maxtoken) - p = grow_token_buffer(p); + token_buffer = obstack_finish (&token_obstack); + return tok_illegal; + } - *p++ = c; - c = getc(finput); + while (c = getc (finput), isalpha (c) || c == '_' || c == '-') + { + if (c == '_') + c = '-'; + obstack_1grow (&token_obstack, c); + } + + /* %DIRECTIVE="ARG". Separate into + TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'. + This is a bit hackish, but once we move to a Bison parser, + things will be cleaned up. */ + if (c == '=') + { + /* End of the directive. We skip the `='. */ + obstack_1grow (&token_obstack, '\0'); + /* Fetch the ARG if present. */ + c = getc (finput); + if (c == '"') + { + int code; + arg_offset = obstack_object_size (&token_obstack); + /* Read up to and including `"'. Do not append the closing + `"' in the output: it's not part of the ARG. */ + while ((code = literalchar ()) != '"') + obstack_1grow (&token_obstack, code); + } + /* else: should be an error. */ } + else + ungetc (c, finput); + + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + if (arg_offset) + arg = token_buffer + arg_offset; + + /* table lookup % directive */ + for (tx = option_table; tx->name; tx++) + if ((tx->access == opt_percent || tx->access == opt_both) + && option_strcmp (token_buffer + 1, tx->name) == 0) + break; - ungetc(c, finput); - - *p = 0; - - if (strcmp(token_buffer, "token") == 0 - || - strcmp(token_buffer, "term") == 0) - return (TOKEN); - else if (strcmp(token_buffer, "nterm") == 0) - return (NTERM); - else if (strcmp(token_buffer, "type") == 0) - return (TYPE); - else if (strcmp(token_buffer, "guard") == 0) - return (GUARD); - else if (strcmp(token_buffer, "union") == 0) - return (UNION); - else if (strcmp(token_buffer, "expect") == 0) - return (EXPECT); - else if (strcmp(token_buffer, "start") == 0) - return (START); - else if (strcmp(token_buffer, "left") == 0) - return (LEFT); - else if (strcmp(token_buffer, "right") == 0) - return (RIGHT); - else if (strcmp(token_buffer, "nonassoc") == 0 - || - strcmp(token_buffer, "binary") == 0) - return (NONASSOC); - else if (strcmp(token_buffer, "semantic_parser") == 0) - return (SEMANTIC_PARSER); - else if (strcmp(token_buffer, "pure_parser") == 0) - return (PURE_PARSER); - else if (strcmp(token_buffer, "prec") == 0) - return (PREC); - else return (ILLEGAL); + if (arg && tx->ret_val != tok_stropt) + fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg)); + + + switch (tx->ret_val) + { + case tok_stropt: + assert (tx->set_flag); + if (arg) + { + /* Keep only the first assignment: command line options have + already been processed, and we want them to have + precedence. Side effect: if this %-option is used + several times, only the first is honored. Bah. */ + if (!*((char **) (tx->set_flag))) + *((char **) (tx->set_flag)) = xstrdup (arg); + } + else + fatal (_("`%s' requires an argument"), token_buffer); + return tok_noop; + break; + + case tok_intopt: + assert (tx->set_flag); + *((int *) (tx->set_flag)) = 1; + return tok_noop; + break; + + case tok_obsolete: + fatal (_("`%s' is no longer supported"), token_buffer); + return tok_noop; + break; + + default: + return tx->ret_val; + break; + } + abort (); }