X-Git-Url: https://git.saurik.com/bison.git/blobdiff_plain/1916f98ef86a1b73986f3f0ab709cccc4afb8f9e..1239777d4f4f96a23e5cd177121f16f0d9e71c68:/src/lex.c diff --git a/src/lex.c b/src/lex.c index cc27ccbb..deb6eb7a 100644 --- a/src/lex.c +++ b/src/lex.c @@ -1,5 +1,5 @@ /* Token-reader for Bison's input parser, - Copyright (C) 1984, 1986, 1989, 1992, 2000 Free Software Foundation, Inc. + Copyright 1984, 1986, 1989, 1992, 2000, 2001 Free Software Foundation, Inc. This file is part of Bison, the GNU Compiler Compiler. @@ -21,45 +21,37 @@ #include "system.h" #include "getargs.h" #include "files.h" -#include "getopt.h" /* for optarg */ #include "symtab.h" +#include "options.h" #include "lex.h" -#include "xalloc.h" #include "complain.h" #include "gram.h" - -/* functions from main.c */ -extern char *printable_version PARAMS ((int)); +#include "quote.h" /* Buffer for storing the current token. */ -char *token_buffer; - -/* Allocated size of token_buffer, not including space for terminator. */ -int maxtoken; +static struct obstack token_obstack; +const char *token_buffer = NULL; -bucket *symval; +bucket *symval = NULL; int numval; -static int unlexed; /* these two describe a token to be reread */ -static bucket *unlexed_symval; /* by the next call to lex */ - +/* A token to be reread, see unlex and lex. */ +static token_t unlexed = tok_undef; +static bucket *unlexed_symval = NULL; +static const char *unlexed_token_buffer = NULL; void -init_lex (void) +lex_init (void) { - maxtoken = 100; - token_buffer = XCALLOC (char, maxtoken + 1); - unlexed = -1; + obstack_init (&token_obstack); + unlexed = tok_undef; } -char * -grow_token_buffer (char *p) +void +lex_free (void) { - int offset = p - token_buffer; - maxtoken *= 2; - token_buffer = XREALLOC (token_buffer, char, maxtoken + 1); - return token_buffer + offset; + obstack_free (&token_obstack, NULL); } @@ -78,6 +70,7 @@ skip_white_space (void) switch (c) { case '/': + /* FIXME: Should probably be merged with copy_comment. */ c = getc (finput); if (c != '*' && c != '/') { @@ -132,8 +125,12 @@ skip_white_space (void) } } -/* do a getc, but give error message if EOF encountered */ -static int + +/*-----------------------------------------------------. +| Do a getc, but give error message if EOF encountered | +`-----------------------------------------------------*/ + +int xgetc (FILE *f) { int c = getc (f); @@ -143,69 +140,62 @@ xgetc (FILE *f) } -/*------------------------------------------------------------------. -| Read one literal character from finput. Process \ escapes. | -| Append the normalized string version of the char to *PP. Assign | -| the character code to *PCODE. Return 1 unless the character is an | -| unescaped `term' or \n report error for \n | -`------------------------------------------------------------------*/ +/*---------------------------------------------------------------. +| Read one literal character from FINPUT, process \-escapes, and | +| return the character. | +`---------------------------------------------------------------*/ -static int -literalchar (char **pp, int *pcode, char term) +char +literalchar (void) { int c; - char *p; - int code; - int wasquote = 0; + int res; c = xgetc (finput); if (c == '\n') { complain (_("unescaped newline in constant")); ungetc (c, finput); - code = '?'; - wasquote = 1; + res = '?'; } else if (c != '\\') { - code = c; - if (c == term) - wasquote = 1; + res = c; } else { c = xgetc (finput); if (c == 't') - code = '\t'; + res = '\t'; else if (c == 'n') - code = '\n'; + res = '\n'; else if (c == 'a') - code = '\007'; + res = '\007'; else if (c == 'r') - code = '\r'; + res = '\r'; else if (c == 'f') - code = '\f'; + res = '\f'; else if (c == 'b') - code = '\b'; + res = '\b'; else if (c == 'v') - code = '\013'; + res = '\013'; else if (c == '\\') - code = '\\'; + res = '\\'; else if (c == '\'') - code = '\''; + res = '\''; else if (c == '\"') - code = '\"'; + res = '\"'; else if (c <= '7' && c >= '0') { - code = 0; + res = 0; while (c <= '7' && c >= '0') { - code = (code * 8) + (c - '0'); - if (code >= 256 || code < 0) + res = (res * 8) + (c - '0'); + if (res >= 256 || res < 0) { complain (_("octal value outside range 0...255: `\\%o'"), - code); - code &= 0xFF; + res); + res &= 0xFF; break; } c = xgetc (finput); @@ -215,21 +205,21 @@ literalchar (char **pp, int *pcode, char term) else if (c == 'x') { c = xgetc (finput); - code = 0; + res = 0; while (1) { if (c >= '0' && c <= '9') - code *= 16, code += c - '0'; + res *= 16, res += c - '0'; else if (c >= 'a' && c <= 'f') - code *= 16, code += c - 'a' + 10; + res *= 16, res += c - 'a' + 10; else if (c >= 'A' && c <= 'F') - code *= 16, code += c - 'A' + 10; + res *= 16, res += c - 'A' + 10; else break; - if (code >= 256 || code < 0) + if (res >= 256 || res < 0) { - complain (_("hexadecimal value above 255: `\\x%x'"), code); - code &= 0xFF; + complain (_("hexadecimal value above 255: `\\x%x'"), res); + res &= 0xFF; break; } c = xgetc (finput); @@ -238,110 +228,79 @@ literalchar (char **pp, int *pcode, char term) } else { + char badchar [] = "c"; + badchar[0] = c; complain (_("unknown escape sequence: `\\' followed by `%s'"), - printable_version (c)); - code = '?'; + quote (badchar)); + res = '?'; } } /* has \ */ - /* now fill token_buffer with the canonical name for this character - as a literal token. Do not use what the user typed, - so that `\012' and `\n' can be interchangeable. */ - - p = *pp; - if (code == term && wasquote) - *p++ = code; - else if (code == '\\') - { - *p++ = '\\'; - *p++ = '\\'; - } - else if (code == '\'') - { - *p++ = '\\'; - *p++ = '\''; - } - else if (code == '\"') - { - *p++ = '\\'; - *p++ = '\"'; - } - else if (code >= 040 && code < 0177) - *p++ = code; - else if (code == '\t') - { - *p++ = '\\'; - *p++ = 't'; - } - else if (code == '\n') - { - *p++ = '\\'; - *p++ = 'n'; - } - else if (code == '\r') - { - *p++ = '\\'; - *p++ = 'r'; - } - else if (code == '\v') - { - *p++ = '\\'; - *p++ = 'v'; - } - else if (code == '\b') - { - *p++ = '\\'; - *p++ = 'b'; - } - else if (code == '\f') - { - *p++ = '\\'; - *p++ = 'f'; - } - else - { - *p++ = '\\'; - *p++ = code / 0100 + '0'; - *p++ = ((code / 010) & 07) + '0'; - *p++ = (code & 07) + '0'; - } - *pp = p; - *pcode = code; - return !wasquote; + return res; } void -unlex (int token) +unlex (token_t token) { unlexed = token; + unlexed_token_buffer = token_buffer; unlexed_symval = symval; } +/*-----------------------------------------------------------------. +| We just read `<' from FIN. Store in TOKEN_BUFFER, the type name | +| specified between the `<...>'. | +`-----------------------------------------------------------------*/ -int +void +read_type_name (FILE *fin) +{ + int c = getc (fin); + + while (c != '>') + { + if (c == EOF) + fatal (_("unterminated type name at end of file")); + if (c == '\n') + { + complain (_("unterminated type name")); + ungetc (c, fin); + break; + } + + obstack_1grow (&token_obstack, c); + c = getc (fin); + } + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); +} + + +token_t lex (void) { int c; - char *p; - if (unlexed >= 0) + /* Just to make sure. */ + token_buffer = NULL; + + if (unlexed != tok_undef) { + token_t res = unlexed; symval = unlexed_symval; - c = unlexed; - unlexed = -1; - return c; + token_buffer = unlexed_token_buffer; + unlexed = tok_undef; + return res; } c = skip_white_space (); - *token_buffer = c; /* for error messages (token buffer always valid) */ - token_buffer[1] = 0; switch (c) { case EOF: - strcpy (token_buffer, "EOF"); - return ENDFILE; + token_buffer = "EOF"; + return tok_eof; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': @@ -357,285 +316,282 @@ lex (void) case 'z': case '.': case '_': - p = token_buffer; while (isalnum (c) || c == '_' || c == '.') { - if (p == token_buffer + maxtoken) - p = grow_token_buffer (p); - - *p++ = c; + obstack_1grow (&token_obstack, c); c = getc (finput); } - - *p = 0; + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); ungetc (c, finput); symval = getsym (token_buffer); - return IDENTIFIER; + return tok_identifier; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { numval = 0; - p = token_buffer; while (isdigit (c)) { - if (p == token_buffer + maxtoken) - p = grow_token_buffer (p); - - *p++ = c; + obstack_1grow (&token_obstack, c); numval = numval * 10 + c - '0'; c = getc (finput); } - *p = 0; + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); ungetc (c, finput); - return NUMBER; + return tok_number; } case '\'': /* parse the literal token and compute character code in code */ - translations = -1; { - int code, discode; - char discard[10], *dp; + int code = literalchar (); - p = token_buffer; - *p++ = '\''; - literalchar (&p, &code, '\''); + obstack_1grow (&token_obstack, '\''); + obstack_1grow (&token_obstack, code); c = getc (finput); if (c != '\'') { complain (_("use \"...\" for multi-character literal tokens")); - while (1) - { - dp = discard; - if (!literalchar (&dp, &discode, '\'')) - break; - } + while (literalchar () != '\'') + /* Skip. */; } - *p++ = '\''; - *p = 0; + obstack_1grow (&token_obstack, '\''); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); symval = getsym (token_buffer); - symval->class = STOKEN; - if (!symval->user_token_number) + symval->class = token_sym; + if (symval->user_token_number == SUNDEF) symval->user_token_number = code; - return IDENTIFIER; + return tok_identifier; } case '\"': /* parse the literal string token and treat as an identifier */ - translations = -1; { int code; /* ignored here */ - p = token_buffer; - *p++ = '\"'; - while (literalchar (&p, &code, '\"')) /* read up to and including " */ + + obstack_1grow (&token_obstack, '\"'); + /* Read up to and including ". */ + do { - if (p >= token_buffer + maxtoken - 4) - p = grow_token_buffer (p); + code = literalchar (); + obstack_1grow (&token_obstack, code); } - *p = 0; + while (code != '\"'); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); symval = getsym (token_buffer); - symval->class = STOKEN; + symval->class = token_sym; - return IDENTIFIER; + return tok_identifier; } case ',': - return COMMA; + token_buffer = ","; + return tok_comma; case ':': - return COLON; + token_buffer = ":"; + return tok_colon; case ';': - return SEMICOLON; + token_buffer = ";"; + return tok_semicolon; case '|': - return BAR; + token_buffer = "|"; + return tok_bar; case '{': - return LEFT_CURLY; + token_buffer = "{"; + return tok_left_curly; case '=': + obstack_1grow (&token_obstack, c); do { c = getc (finput); + obstack_1grow (&token_obstack, c); if (c == '\n') lineno++; } while (c == ' ' || c == '\n' || c == '\t'); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); if (c == '{') { - strcpy (token_buffer, "={"); - return LEFT_CURLY; + return tok_left_curly; } else { ungetc (c, finput); - return ILLEGAL; + return tok_illegal; } case '<': - p = token_buffer; - c = getc (finput); - while (c != '>') - { - if (c == EOF) - fatal (_("unterminated type name at end of file")); - if (c == '\n') - { - complain (_("unterminated type name")); - ungetc (c, finput); - break; - } - - if (p == token_buffer + maxtoken) - p = grow_token_buffer (p); - - *p++ = c; - c = getc (finput); - } - *p = 0; - return TYPENAME; - + read_type_name (finput); + return tok_typename; case '%': return parse_percent_token (); default: - return ILLEGAL; + obstack_1grow (&token_obstack, c); + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + return tok_illegal; } } -/* the following table dictates the action taken for the various % - directives. A setflag value causes the named flag to be set. A - retval action returns the code. */ -struct percent_table_struct +/* This function is a strcmp, which doesn't differentiate `-' and `_' + chars. */ + +static int +option_strcmp (const char *left, const char *right) { - const char *name; - void *setflag; - int retval; + const unsigned char *l, *r; + int c; + + assert (left); + assert (right); + l = (const unsigned char *)left; + r = (const unsigned char *)right; + while (((c = *l - *r++) == 0 && *l != '\0') + || ((*l == '-' || *l == '_') && (*r == '_' || *r == '-'))) + l++; + return c; } -percent_table[] = -{ - { "token", NULL, TOKEN }, - { "term", NULL, TOKEN }, - { "nterm", NULL, NTERM }, - { "type", NULL, TYPE }, - { "guard", NULL, GUARD }, - { "union", NULL, UNION }, - { "expect", NULL, EXPECT }, - { "thong", NULL, THONG }, - { "start", NULL, START }, - { "left", NULL, LEFT }, - { "right", NULL, RIGHT }, - { "nonassoc", NULL, NONASSOC }, - { "binary", NULL, NONASSOC }, - { "semantic_parser", NULL, SEMANTIC_PARSER }, - { "pure_parser", NULL, PURE_PARSER }, - { "prec", NULL, PREC }, - { "no_lines", &nolinesflag, NOOP}, /* -l */ - { "raw", &rawtoknumflag, NOOP }, /* -r */ - { "token_table", &toknumflag, NOOP}, /* -k */ -#if 0 - /* These can be utilized after main is reoganized so - open_files() is deferred 'til after read_declarations(). - But %{ and %union both put information into files - that have to be opened before read_declarations(). - */ - { "yacc", &yaccflag, NOOP}, /* -y */ - { "fixed_output_files", &yaccflag, NOOP}, /* -y */ - { "defines", &definesflag, NOOP}, /* -d */ - { "no_parser", &noparserflag, NOOP}, /* -n */ - { "output_file", &spec_outfile, SETOPT}, /* -o */ - { "file_prefix", &spec_file_prefix, SETOPT}, /* -b */ - { "name_prefix", &spec_name_prefix, SETOPT}, /* -p */ - /* These would be acceptable, but they do not affect processing */ - { "verbose", &verboseflag, NOOP}, /* -v */ - { "debug", &debugflag, NOOP}, /* -t */ -/* {"help", , NOOP}, *//* -h */ -/* {"version", , NOOP}, *//* -V */ -#endif - { NULL, NULL, ILLEGAL} -}; /* Parse a token which starts with %. Assumes the % has already been read and discarded. */ -int +token_t parse_percent_token (void) { - int c; - char *p; - struct percent_table_struct *tx; + const struct option_table_struct *tx = NULL; + const char *arg = NULL; + /* Where the ARG was found in token_buffer. */ + size_t arg_offset = 0; - p = token_buffer; - c = getc (finput); - *p++ = '%'; - *p++ = c; /* for error msg */ - *p = 0; + int c = getc (finput); switch (c) { case '%': - return TWO_PERCENTS; + return tok_two_percents; case '{': - return PERCENT_LEFT_CURLY; + return tok_percent_left_curly; + /* FIXME: Who the heck are those 5 guys!?! `%<' = `%left'!!! + Let's ask for there removal. */ case '<': - return LEFT; + return tok_left; case '>': - return RIGHT; + return tok_right; case '2': - return NONASSOC; + return tok_nonassoc; case '0': - return TOKEN; + return tok_token; case '=': - return PREC; + return tok_prec; } + if (!isalpha (c)) - return ILLEGAL; + return tok_illegal; - p = token_buffer; - *p++ = '%'; + obstack_1grow (&token_obstack, '%'); while (isalpha (c) || c == '_' || c == '-') { - if (p == token_buffer + maxtoken) - p = grow_token_buffer (p); - - if (c == '-') - c = '_'; - *p++ = c; + if (c == '_') + c = '-'; + obstack_1grow (&token_obstack, c); c = getc (finput); } - ungetc (c, finput); + /* %DIRECTIVE="ARG". Separate into + TOKEN_BUFFER = `%DIRECTIVE\0ARG\0'. + This is a bit hackish, but once we move to a Bison parser, + things will be cleaned up. */ + if (c == '=') + { + /* End of the directive. We skip the `='. */ + obstack_1grow (&token_obstack, '\0'); + /* Fetch the ARG if present. */ + c = getc (finput); + if (c == '"') + { + int code; + arg_offset = obstack_object_size (&token_obstack); + /* Read up to and including `"'. Do not append the closing + `"' in the output: it's not part of the ARG. */ + while ((code = literalchar ()) != '"') + obstack_1grow (&token_obstack, code); + } + /* else: should be an error. */ + } + else + ungetc (c, finput); - *p = 0; + obstack_1grow (&token_obstack, '\0'); + token_buffer = obstack_finish (&token_obstack); + if (arg_offset) + arg = token_buffer + arg_offset; /* table lookup % directive */ - for (tx = percent_table; tx->name; tx++) - if (strcmp (token_buffer + 1, tx->name) == 0) + for (tx = option_table; tx->name; tx++) + if ((tx->access == opt_percent || tx->access == opt_both) + && option_strcmp (token_buffer + 1, tx->name) == 0) break; - if (tx->retval == SETOPT) - { - *((char **) (tx->setflag)) = optarg; - return NOOP; - } - if (tx->setflag) + + if (arg && tx->ret_val != tok_stropt) + fatal (_("`%s' supports no argument: %s"), token_buffer, quote (arg)); + + + switch (tx->ret_val) { - *((int *) (tx->setflag)) = 1; - return NOOP; + case tok_stropt: + assert (tx->set_flag); + if (arg) + { + /* Keep only the first assignment: command line options have + already been processed, and we want them to have + precedence. Side effect: if this %-option is used + several times, only the first is honored. Bah. */ + if (!*((char **) (tx->set_flag))) + *((char **) (tx->set_flag)) = xstrdup (arg); + } + else + fatal (_("`%s' requires an argument"), token_buffer); + return tok_noop; + break; + + case tok_intopt: + assert (tx->set_flag); + *((int *) (tx->set_flag)) = 1; + return tok_noop; + break; + + case tok_obsolete: + fatal (_("`%s' is no longer supported"), token_buffer); + return tok_noop; + break; + + default: + return tx->ret_val; + break; } - return tx->retval; + abort (); }