--- /dev/null
+/*
+ tre-parse.c - Regexp parser
+
+ This software is released under a BSD-style license.
+ See the file LICENSE for details and copyright.
+
+*/
+
+/*
+ This parser is just a simple recursive descent parser for POSIX.2
+ regexps. The parser supports both the obsolete default syntax and
+ the "extended" syntax, and some nonstandard extensions.
+*/
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+#include <string.h>
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+
+#include "xmalloc.h"
+#include "tre-mem.h"
+#include "tre-ast.h"
+#include "tre-stack.h"
+#include "tre-parse.h"
+
+/* BSD compatibility:
+ Before looking up a collating symbol, check if the name matches in
+ the character names (cnames) array; if so, use the corresponding
+ character.
+
+ Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
+ the implementation choice that for ERE, a non-numeric character following
+ a left brace that would normally be a bound, causes the left brace to be
+ literal. */
+#define BSD_COMPATIBILITY
+#ifdef BSD_COMPATIBILITY
+#include "cname.h"
+#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+#endif /* BSD_COMPATIBILITY */
+
+/* Characters with special meanings in regexp syntax. */
+#define CHAR_PIPE L'|'
+#define CHAR_LPAREN L'('
+#define CHAR_RPAREN L')'
+#define CHAR_LBRACE L'{'
+#define CHAR_RBRACE L'}'
+#define CHAR_LBRACKET L'['
+#define CHAR_RBRACKET L']'
+#define CHAR_MINUS L'-'
+#define CHAR_STAR L'*'
+#define CHAR_QUESTIONMARK L'?'
+#define CHAR_PLUS L'+'
+#define CHAR_PERIOD L'.'
+#define CHAR_COLON L':'
+#define CHAR_EQUAL L'='
+#define CHAR_COMMA L','
+#define CHAR_CARET L'^'
+#define CHAR_DOLLAR L'$'
+#define CHAR_BACKSLASH L'\\'
+#define CHAR_HASH L'#'
+#define CHAR_TILDE L'~'
+
+
+/* Some macros for expanding \w, \s, etc. */
+static const struct tre_macro_struct {
+ const char c;
+ const char *expansion;
+} tre_macros[] =
+ { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
+ {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
+ {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
+ {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
+ { 0, NULL }
+ };
+
+
+/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
+ must have at least `len' items. Sets buf[0] to zero if the there
+ is no match in `tre_macros'. */
+static void
+tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
+ tre_char_t *buf, size_t buf_len)
+{
+ int i;
+
+ buf[0] = 0;
+ if (regex >= regex_end)
+ return;
+
+ for (i = 0; tre_macros[i].expansion; i++)
+ {
+ if (tre_macros[i].c == *regex)
+ {
+ unsigned int j;
+ DPRINT(("Expanding macro '%c' => '%s'\n",
+ tre_macros[i].c, tre_macros[i].expansion));
+ for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
+ buf[j] = tre_macros[i].expansion[j];
+ buf[j] = 0;
+ break;
+ }
+ }
+}
+
+static reg_errcode_t
+tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
+ tre_bracket_match_list_t **items)
+{
+ reg_errcode_t status = REG_OK;
+ tre_bracket_match_list_t *array = *items;
+ int i = array->num_bracket_matches;
+ /* Allocate more space if necessary. */
+ if (i >= *max_i)
+ {
+ tre_bracket_match_list_t *new_items;
+ DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
+ /* If the array is already 1024 items large, give up -- there's
+ probably an error in the regexp (e.g. not a '\0' terminated
+ string and missing ']') */
+ if (*max_i >= 1024)
+ return REG_ESPACE;
+ *max_i *= 2;
+ new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
+ if (new_items == NULL)
+ return REG_ESPACE;
+ *items = array = new_items;
+ }
+ array->bracket_matches[i].type = type;
+ array->bracket_matches[i].value = val;
+ array->num_bracket_matches++;
+ return status;
+}
+
+#ifndef TRE_USE_SYSTEM_WCTYPE
+
+/* isalnum() and the rest may be macros, so wrap them to functions. */
+int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
+int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
+
+#ifdef tre_isascii
+int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
+#else /* !tre_isascii */
+int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
+#endif /* !tre_isascii */
+
+#ifdef tre_isblank
+int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
+#else /* !tre_isblank */
+int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
+#endif /* !tre_isblank */
+
+int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
+int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
+int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
+int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
+int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
+int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
+int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
+int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
+int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
+
+struct {
+ char *name;
+ int (*func)(tre_cint_t);
+} tre_ctype_map[] = {
+ { "alnum", &tre_isalnum_func },
+ { "alpha", &tre_isalpha_func },
+#ifdef tre_isascii
+ { "ascii", &tre_isascii_func },
+#endif /* tre_isascii */
+#ifdef tre_isblank
+ { "blank", &tre_isblank_func },
+#endif /* tre_isblank */
+ { "cntrl", &tre_iscntrl_func },
+ { "digit", &tre_isdigit_func },
+ { "graph", &tre_isgraph_func },
+ { "lower", &tre_islower_func },
+ { "print", &tre_isprint_func },
+ { "punct", &tre_ispunct_func },
+ { "space", &tre_isspace_func },
+ { "upper", &tre_isupper_func },
+ { "xdigit", &tre_isxdigit_func },
+ { NULL, NULL}
+};
+
+tre_ctype_t tre_ctype(const char *name)
+{
+ int i;
+ for (i = 0; tre_ctype_map[i].name != NULL; i++)
+ {
+ if (strcmp(name, tre_ctype_map[i].name) == 0)
+ return tre_ctype_map[i].func;
+ }
+ return (tre_ctype_t)0;
+}
+#endif /* !TRE_USE_SYSTEM_WCTYPE */
+
+#define REST(re) (int)(ctx->re_end - (re)), (re)
+
+#define START_COLLATING_SYMBOLS 16
+#define MAX_COLLATING_SYMBOL_LEN 4
+
+typedef struct {
+ const tre_char_t *start;
+ int len;
+} tre_collating_symbol;
+
+#include <xlocale.h>
+
+int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
+
+#ifdef BSD_COMPATIBILITY
+static wchar_t
+tre_search_cnames(const wchar_t *name, size_t len)
+{
+ size_t low = 0;
+ size_t high = NCNAMES - 1;
+ size_t cur;
+ int cmp;
+
+ while(low <= high)
+ {
+ cur = (low + high) / 2;
+ cmp = wcsncmp(name, cnames[cur].name, len);
+ if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
+ if (cmp > 0) low = cur + 1;
+ else high = cur - 1;
+ }
+ return (wchar_t)-1;
+}
+#endif /* BSD_COMPATIBILITY */
+
+/* Scan the contents of a bracket expression, and create a
+ * tre_bracket_match_list_t encoding the bracket expression. If during
+ * the scan, multi-character collating symbols are detected, switch
+ * into a mode to collect those MCCSs into a tre_collating_symbol
+ * list and pass them back. tre_parse_bracket will use that to
+ * create a new string composed of a union of the bracket expression
+ * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
+ * call tre_parse (recursive) to parse that new string (which will
+ * call tre_parse_bracket and tre_parse_bracket_items again. */
+static reg_errcode_t
+tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
+ int *items_size, tre_collating_symbol **result)
+{
+ const tre_char_t *re = ctx->re;
+ const tre_char_t *re_end = ctx->re_end;
+ tre_collating_symbol *col_syms = NULL;
+ tre_collating_symbol *cp = NULL;
+ int n_col_syms = 0;
+ reg_errcode_t status;
+ int max_i = *items_size;
+ int other = 0; /* contains content other than multi-character collating
+ * symbols */
+ int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
+ tre_cint_t min, c;
+ int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
+ int collect_MCCS = 0;
+ const tre_char_t *start;
+
+ for ( ;re < re_end; re++)
+ {
+ switch (*re)
+ {
+ case CHAR_MINUS:
+ /* A first hyphen */
+ if (re == ctx->re)
+ {
+ DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
+ min = CHAR_MINUS;
+ other++;
+ range = 0;
+ break;
+ }
+ /* The hyphen is the end range */
+ if (range > 0)
+ {
+ DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
+ c = CHAR_MINUS;
+ goto process_end_range;
+ }
+ if (re + 1 >= re_end)
+ {
+ status = REG_EBRACK;
+ goto error;
+ }
+ /* The hyphen is at the end */
+ if (re[1] == CHAR_RBRACKET)
+ {
+ DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
+ c = CHAR_MINUS;
+ goto process_begin_range;
+ }
+ /* Two ranges are not allowed to share an endpoint, or begin
+ * range is illegal. */
+ if (range < 0)
+ {
+ status = REG_ERANGE;
+ goto error;
+ }
+ range = 1; /* Expect end range */
+ DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
+ break;
+
+ case CHAR_LBRACKET:
+ if (re + 1 >= re_end)
+ {
+ status = REG_EBRACK;
+ goto error;
+ }
+ switch (re[1])
+ {
+ case CHAR_PERIOD:
+ {
+ re += 2;
+ start = re;
+ for (;; re++)
+ {
+ if (re >= re_end)
+ {
+ status = REG_ECOLLATE;
+ goto error;
+ }
+ if (*re == CHAR_PERIOD)
+ {
+ if (re + 1 >= re_end)
+ {
+ status = REG_ECOLLATE;
+ goto error;
+ }
+ /* Found end */
+ if (re[1] == CHAR_RBRACKET)
+ {
+ DPRINT(("tre_parse_bracket: collating "
+ "symbol: '%.*" STRF "'\n",
+ REST(start - 2)));
+ /* Empty name */
+ if (re == start)
+ {
+ status = REG_ECOLLATE;
+ goto error;
+ }
+#ifdef BSD_COMPATIBILITY
+ /* Check if the name is in cnames; if so, use
+ the corresponding code */
+ c = tre_search_cnames(start, re - start);
+ if (c != (wchar_t)-1)
+ {
+ re++;
+ goto process_single_character;
+ }
+#endif /* BSD_COMPATIBILITY */
+ /* Verify this is a known sequence */
+ if (__collate_equiv_value(ctx->loc, start,
+ re - start) <= 0)
+ {
+ status = REG_ECOLLATE;
+ goto error;
+ }
+ /* Process single character collating symbols */
+ if (re - start == 1)
+ {
+ c = *start;
+ re++;
+ goto process_single_character;
+ }
+ /* Inverted MCCSs are undefined */
+ if (invert)
+ {
+ status = REG_ECOLLATE;
+ goto error;
+ }
+ /* Can't have MCCSs as an endpoint to a range */
+ if (range > 0)
+ {
+ status = REG_ERANGE;
+ goto error;
+ }
+ range = -1;
+ /* Switch into MCCS collection mode (if not
+ * already there */
+#if TRE_DEBUG
+ if (!collect_MCCS)
+ {
+ collect_MCCS = 1;
+ DPRINT(("tre_parse_bracket: Detected MCCS\n"));
+ }
+#else /* !TRE_DEBUG */
+ collect_MCCS = 1;
+#endif /* !TRE_DEBUG */
+ /* Allocate a memory block the first time */
+ if (!cp)
+ {
+ if ((col_syms = xmalloc(sizeof(*col_syms) *
+ (START_COLLATING_SYMBOLS + 2)))
+ == NULL)
+ return REG_ESPACE;
+ cp = col_syms + 1;
+ n_col_syms = START_COLLATING_SYMBOLS;
+ }
+ /* Enlarge the memory block is more is needed */
+ if ((cp - col_syms) - 1 >= n_col_syms)
+ {
+ int i = n_col_syms;
+ tre_collating_symbol *tmp =
+ xrealloc(col_syms, sizeof(*col_syms) *
+ ((n_col_syms *= 2) + 2));
+ if (tmp == NULL)
+ {
+ xfree(col_syms);
+ return REG_ESPACE;
+ }
+ DPRINT(("tre_list_collating_symbols: "
+ "Enlarging col_syms to %d\n",
+ n_col_syms));
+ col_syms = tmp;
+ cp = col_syms + i + 1;
+ }
+ cp->start = start;
+ cp->len = re - start;
+ cp++;
+ re++;
+ break;
+ }
+ }
+ }
+ break;
+ }
+
+ case CHAR_EQUAL:
+ case CHAR_COLON:
+ {
+ /* Process equivalence and character classes */
+ tre_char_t kind = re[1];
+
+ /* Can't have a class as an endpoint to a range */
+ if (range > 0)
+ {
+ status = REG_ERANGE;
+ goto error;
+ }
+ if (!collect_MCCS && range == 0)
+ {
+ status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
+ min, &max_i, items);
+ if (status != REG_OK)
+ goto error;
+ }
+ range = -1;
+ re += 2;
+ start = re;
+ for (;; re++)
+ {
+ if (re >= re_end)
+ {
+ status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
+ goto error;
+ }
+ if (*re == kind)
+ {
+ if (re + 1 >= re_end)
+ {
+ status = kind == CHAR_EQUAL ? REG_ECOLLATE :
+ REG_ECTYPE;
+ goto error;
+ }
+ /* Found end */
+ if (re[1] == CHAR_RBRACKET)
+ {
+ if (re == start)
+ {
+ /* Empty class name */
+ status = kind == CHAR_EQUAL ? REG_ECOLLATE :
+ REG_ECTYPE;
+ goto error;
+ }
+ /* Process equivalence class */
+ if (kind == CHAR_EQUAL)
+ {
+ int equiv;
+
+ DPRINT(("tre_parse_bracket: equivalence: '%.*"
+ STRF "'\n", REST(start - 2)));
+
+ /* While we find the collation value even for
+ multi-character collating elements , we
+ don't (yet) match any collation values
+ against multi-character sequences. We'd have
+ to enumerate those multi-character sequences
+ and like multi-character collating symbols,
+ create a union of those sequences with the
+ rest of the bracket expression. While
+ doable, a bracket expression matching
+ multiple characters, that doesn't explicitly
+ contain multi-character sequences, might
+ be unexpected, so we punt for now. */
+ if ((equiv = __collate_equiv_value(ctx->loc,
+ start, re - start)) <= 0)
+ {
+ /* The standard says that if no collating
+ element if found, we use the collating
+ symbol itself. But __collate_equiv_value
+ doesn't make a distinction between
+ an element that is in a equvalence
+ class with others, or is the only member,
+ so we already know there is no collating
+ symbol. (Note that in the case of a
+ collating element whose collation value
+ is unique, matching against the
+ collating element itself, or against
+ its collation value, is equivalent.) */
+#ifdef BSD_COMPATIBILITY
+ /* Check if the name is in cnames; if so,
+ use the corresponding code */
+ c = tre_search_cnames(start, re - start);
+ if (c != (wchar_t)-1)
+ {
+ re++;
+ goto process_single_character;
+ }
+#endif /* BSD_COMPATIBILITY */
+ status = REG_ECOLLATE;
+ goto error;
+ }
+ if (!collect_MCCS)
+ {
+ status = tre_new_item(ctx->mem,
+ TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
+ equiv, &max_i, items);
+ if (status != REG_OK)
+ goto error;
+ }
+ }
+ else
+ {
+ /* Process character class */
+ DPRINT(("tre_parse_bracket: class: '%.*" STRF
+ "'\n", REST(start - 2)));
+ if (!collect_MCCS)
+ {
+ char tmp_str[64];
+ tre_ctype_t class;
+ int len = MIN(re - start, 63);
+#ifdef TRE_WCHAR
+ {
+ tre_char_t tmp_wcs[64];
+ wcsncpy(tmp_wcs, start, (size_t)len);
+ tmp_wcs[len] = L'\0';
+#if defined HAVE_WCSRTOMBS
+ {
+ mbstate_t state;
+ const tre_char_t *src = tmp_wcs;
+ memset(&state, '\0', sizeof(state));
+ len = wcsrtombs_l(tmp_str, &src,
+ sizeof(tmp_str), &state,
+ ctx->loc);
+ }
+#elif defined HAVE_WCSTOMBS
+ len = wcstombs(tmp_str, tmp_wcs, 63);
+#endif /* defined HAVE_WCSTOMBS */
+ }
+#else /* !TRE_WCHAR */
+ strncpy(tmp_str, (const char*)start, len);
+#endif /* !TRE_WCHAR */
+ tmp_str[len] = '\0';
+ DPRINT((" class name: %s\n", tmp_str));
+ class = tre_ctype_l(tmp_str, ctx->loc);
+ if (!class)
+ {
+ status = REG_ECTYPE;
+ goto error;
+ }
+ status = tre_new_item(ctx->mem,
+ TRE_BRACKET_MATCH_TYPE_CLASS,
+ class, &max_i, items);
+ if (status != REG_OK)
+ goto error;
+ }
+ }
+ re++;
+ break;
+ }
+ }
+ }
+ other++;
+ break;
+ }
+
+ default:
+ DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
+ c = CHAR_LBRACKET;
+ goto process_single_character;
+ break;
+ }
+ break;
+
+ case CHAR_RBRACKET:
+ /* A first right bracket */
+ if (re == ctx->re)
+ {
+ DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
+ min = CHAR_RBRACKET;
+ range = 0;
+ other++;
+ break;
+ }
+ /* Done */
+ if (collect_MCCS)
+ {
+ DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
+ REST(re)));
+ if (col_syms)
+ {
+ /* Mark the character following the right bracket. Set len
+ * to whether there are other things besides the
+ * multi-character collating symbols */
+ col_syms->start = re + 1;
+ col_syms->len = other;
+ /* Mark the end of the list */
+ cp->start = NULL;
+ }
+ *result = col_syms;
+ return REG_OK;
+ }
+ /* range > 0 is not possible, since we did a lookahead after the
+ * hyphen */
+ if (range == 0)
+ {
+ status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
+ min, &max_i, items);
+ if (status != REG_OK)
+ goto error;
+ }
+ DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
+ *items_size = max_i;
+ ctx->re = re + 1;
+ return REG_OK;
+
+ default:
+ DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
+ c = *re;
+process_single_character:
+ /* Process single character */
+ if (range > 0)
+ {
+ int mine, maxe;
+
+process_end_range:
+ /* Get collation equivalence values */
+ mine = __collate_equiv_value(ctx->loc, &min, 1);
+ maxe = __collate_equiv_value(ctx->loc, &c, 1);
+ if (maxe < mine)
+ {
+ status = REG_ERANGE;
+ goto error;
+ }
+ if (!collect_MCCS)
+ {
+ status = tre_new_item(ctx->mem,
+ TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
+ mine, &max_i, items);
+ if (status != REG_OK)
+ goto error;
+ status = tre_new_item(ctx->mem,
+ TRE_BRACKET_MATCH_TYPE_RANGE_END,
+ maxe, &max_i, items);
+ if (status != REG_OK)
+ goto error;
+ }
+ range = -1;
+ }
+ else
+ {
+process_begin_range:
+ if (!collect_MCCS)
+ {
+ if (range == 0)
+ {
+ status = tre_new_item(ctx->mem,
+ TRE_BRACKET_MATCH_TYPE_CHAR,
+ min, &max_i, items);
+ if (status != REG_OK)
+ goto error;
+ }
+ min = c;
+ }
+ range = 0;
+ }
+ other++;
+ break;
+ }
+ }
+ status = REG_EBRACK;
+error:
+ DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n",
+ REST(re), status));
+ if (col_syms)
+ xfree(col_syms);
+ return status;
+}
+
+#ifdef TRE_DEBUG
+static const char *bracket_match_type_str[] = {
+ "unused",
+ "char",
+ "range begin",
+ "range end",
+ "class",
+ "equivalence value",
+};
+#endif /* TRE_DEBUG */
+
+static reg_errcode_t
+tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+{
+ tre_ast_node_t *node;
+ reg_errcode_t status = REG_OK;
+ tre_bracket_match_list_t *items;
+ int max_i = 32;
+ tre_collating_symbol *col_syms = NULL;
+
+ /* Handle special cases [[:<:]] and [[:>:]] */
+ if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
+ && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
+ && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
+ && ctx->re[5] == CHAR_RBRACKET)
+ {
+ *result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
+ -1);
+ DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
+ "[[:<:]]" : "[[:>:]]"));
+ ctx->re += 6;
+ return *result ? REG_OK : REG_ESPACE;
+ }
+
+ /* Start off with an array of `max_i' elements. */
+ items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
+ if (items == NULL)
+ return REG_ESPACE;
+
+ if (*ctx->re == CHAR_CARET)
+ {
+ DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
+ items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
+ ctx->re++;
+ }
+
+ status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
+
+ if (status != REG_OK)
+ goto parse_bracket_done;
+
+ /* If there are collating symbols, split off the multi-character ones
+ * into a union of the bracket expression (without the collating symbols)
+ * and the multiple-character sequences. We create an equivalent input
+ * string and run tre_parse() recursively */
+ if (col_syms)
+ {
+ tre_char_t *str, *sp;
+ tre_collating_symbol *cp;
+ tre_parse_ctx_t subctx;
+
+ /* Allocate a new string. We start with the size of the original
+ * bracket expression (minus 1) and add 2 (for a leading "[" and
+ * a trailing nil; don't need a "^", since it is illegal to have
+ * inverted MCCSs). Since a multi-character collating symbols
+ * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
+ * need to worry about the new string getting too long. */
+ xfree(items);
+ str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
+ if (str == NULL)
+ {
+ xfree(col_syms);
+ return REG_ESPACE;
+ }
+ sp = str;
+ if (col_syms->len > 0)
+ {
+ /* There are other items in the bracket expression besides the
+ * multi-character collating symbols, so create a new bracket
+ * expression with only those other itmes. */
+ const tre_char_t *re;
+ ptrdiff_t i;
+
+ *sp++ = '[';
+ re = ctx->re;
+ for (cp = col_syms + 1; cp->start; cp++)
+ {
+ /* The "- 2" is to account for the "[." */
+ if ((i = ((cp->start - re) - 2)) > 0)
+ {
+ memcpy(sp, re, sizeof(*sp) * i);
+ sp += i;
+ }
+ /* The "+ 2" is to account for the ".]" */
+ re = cp->start + cp->len + 2;
+ }
+ i = col_syms->start - re; /* Includes the trailing right bracket */
+ memcpy(sp, re, sizeof(*sp) * i);
+ sp += i;
+ *sp++ = '|';
+ }
+ for (cp = col_syms + 1; cp->start; cp++)
+ {
+ memcpy(sp, cp->start, sizeof(*sp) * cp->len);
+ sp += cp->len;
+ if (cp[1].start)
+ *sp++ = '|';
+ }
+ *sp = 0;
+ DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
+ str));
+
+ memcpy(&subctx, ctx, sizeof(subctx));
+ subctx.re = str;
+ subctx.len = sp - str;
+ subctx.nofirstsub = 1;
+ subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
+ status = tre_parse(&subctx);
+ xfree(str);
+ if (status != REG_OK)
+ {
+ xfree(col_syms);
+ return status;
+ }
+ ctx->re = col_syms->start;
+ ctx->position = subctx.position;
+ xfree(col_syms);
+ *result = subctx.result;
+ DPRINT(("tre_parse_bracket: Returning to original string\n"));
+ return REG_OK;
+ }
+
+ DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
+ node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
+ if (node == NULL)
+ {
+ status = REG_ESPACE;
+ goto parse_bracket_done;
+ }
+ else
+ {
+ tre_literal_t *l = node->obj;
+ l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
+ SIZEOF_BRACKET_MATCH_LIST(items));
+ if (l->u.bracket_match_list == NULL)
+ {
+ status = REG_ESPACE;
+ goto parse_bracket_done;
+ }
+ memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
+ }
+
+#ifdef TRE_DEBUG
+ {
+ int i;
+ tre_bracket_match_t *b;
+ DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
+ items->num_bracket_matches, items->flags));
+ for (i = 0, b = items->bracket_matches;
+ i < items->num_bracket_matches; i++, b++)
+ {
+ DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type],
+ b->value));
+ }
+ }
+#endif /* TRE_DEBUG */
+
+ parse_bracket_done:
+ xfree(items);
+ ctx->position++;
+ *result = node;
+ return status;
+}
+
+
+/* Parses a positive decimal integer. Returns -1 if the string does not
+ contain a valid number. */
+static int
+tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
+{
+ int num = -1;
+ const tre_char_t *r = *regex;
+ while (r < regex_end && *r >= L'0' && *r <= L'9')
+ {
+ if (num < 0)
+ num = 0;
+ num = num * 10 + *r - L'0';
+ r++;
+ }
+ *regex = r;
+ return num;
+}
+
+
+static reg_errcode_t
+tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+{
+ int min, max;
+#ifdef TRE_APPROX
+ int i;
+ int cost_ins, cost_del, cost_subst, cost_max;
+ int limit_ins, limit_del, limit_subst, limit_err;
+ const tre_char_t *start;
+#endif /* TRE_APPROX */
+ const tre_char_t *r = ctx->re;
+ int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
+#ifdef TRE_APPROX
+ int approx = 0;
+ int costs_set = 0;
+ int counts_set = 0;
+
+ cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
+ limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
+#endif /* TRE_APPROX */
+
+ /* Parse number (minimum repetition count). */
+ min = -1;
+ if (r >= ctx->re_end)
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+ return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
+#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+ return REG_EBRACE;
+#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+ if (*r >= L'0' && *r <= L'9') {
+ DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
+ min = tre_parse_int(&r, ctx->re_end);
+ }
+#ifndef TRE_APPROX
+ else
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+ /* For ERE, return REG_NOMATCH to signal that the lbrace should
+ be treated as a literal */
+ return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
+#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+ return REG_BADBR;
+#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+#endif /* !TRE_APPROX */
+
+ /* Parse comma and second number (maximum repetition count). */
+ max = min;
+ if (r < ctx->re_end && *r == CHAR_COMMA)
+ {
+ r++;
+ DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
+ max = tre_parse_int(&r, ctx->re_end);
+ }
+
+ /* Check that the repeat counts are sane. */
+ if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
+ return REG_BADBR;
+
+
+#ifdef TRE_APPROX
+ /*
+ '{'
+ optionally followed immediately by a number == minimum repcount
+ optionally followed by , then a number == maximum repcount
+ + then a number == maximum insertion count
+ - then a number == maximum deletion count
+ # then a number == maximum substitution count
+ ~ then a number == maximum number of errors
+ Any of +, -, # or ~ without followed by a number means that
+ the maximum count/number of errors is infinite.
+
+ An equation of the form
+ Xi + Yd + Zs < C
+ can be specified to set costs and the cost limit to a value
+ different from the default value:
+ - X is the cost of an insertion
+ - Y is the cost of a deletion
+ - Z is the cost of a substitution
+ - C is the maximum cost
+
+ If no count limit or cost is set for an operation, the operation
+ is not allowed at all.
+ */
+
+
+ do {
+ int done;
+ start = r;
+
+ /* Parse count limit settings */
+ done = 0;
+ if (!counts_set)
+ while (r + 1 < ctx->re_end && !done)
+ {
+ switch (*r)
+ {
+ case CHAR_PLUS: /* Insert limit */
+ DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
+ r++;
+ limit_ins = tre_parse_int(&r, ctx->re_end);
+ if (limit_ins < 0)
+ limit_ins = INT_MAX;
+ counts_set = 1;
+ break;
+ case CHAR_MINUS: /* Delete limit */
+ DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
+ r++;
+ limit_del = tre_parse_int(&r, ctx->re_end);
+ if (limit_del < 0)
+ limit_del = INT_MAX;
+ counts_set = 1;
+ break;
+ case CHAR_HASH: /* Substitute limit */
+ DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
+ r++;
+ limit_subst = tre_parse_int(&r, ctx->re_end);
+ if (limit_subst < 0)
+ limit_subst = INT_MAX;
+ counts_set = 1;
+ break;
+ case CHAR_TILDE: /* Maximum number of changes */
+ DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
+ r++;
+ limit_err = tre_parse_int(&r, ctx->re_end);
+ if (limit_err < 0)
+ limit_err = INT_MAX;
+ approx = 1;
+ break;
+ case CHAR_COMMA:
+ r++;
+ break;
+ case L' ':
+ r++;
+ break;
+ case L'}':
+ done = 1;
+ break;
+ default:
+ done = 1;
+ break;
+ }
+ }
+
+ /* Parse cost restriction equation. */
+ done = 0;
+ if (!costs_set)
+ while (r + 1 < ctx->re_end && !done)
+ {
+ switch (*r)
+ {
+ case CHAR_PLUS:
+ case L' ':
+ r++;
+ break;
+ case L'<':
+ DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
+ r++;
+ while (*r == L' ')
+ r++;
+ cost_max = tre_parse_int(&r, ctx->re_end);
+ if (cost_max < 0)
+ cost_max = INT_MAX;
+ else
+ cost_max--;
+ approx = 1;
+ break;
+ case CHAR_COMMA:
+ r++;
+ done = 1;
+ break;
+ default:
+ if (*r >= L'0' && *r <= L'9')
+ {
+#ifdef TRE_DEBUG
+ const tre_char_t *sr = r;
+#endif /* TRE_DEBUG */
+ int cost = tre_parse_int(&r, ctx->re_end);
+ /* XXX - make sure r is not past end. */
+ switch (*r)
+ {
+ case L'i': /* Insert cost */
+ DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
+ REST(sr)));
+ r++;
+ cost_ins = cost;
+ costs_set = 1;
+ break;
+ case L'd': /* Delete cost */
+ DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
+ REST(sr)));
+ r++;
+ cost_del = cost;
+ costs_set = 1;
+ break;
+ case L's': /* Substitute cost */
+ DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
+ REST(sr)));
+ r++;
+ cost_subst = cost;
+ costs_set = 1;
+ break;
+ default:
+ return REG_BADBR;
+ }
+ }
+ else
+ {
+ done = 1;
+ break;
+ }
+ }
+ }
+ } while (start != r);
+#endif /* TRE_APPROX */
+
+ /*{*//* Missing }. */
+ if (r >= ctx->re_end)
+ return REG_EBRACE;
+
+ /* Empty contents of {}. */
+ if (r == ctx->re)
+ return REG_BADBR;
+
+ /* Parse the ending '}' or '\}'.*/
+ if (ctx->cflags & REG_EXTENDED)
+ {
+ if (r >= ctx->re_end || *r != CHAR_RBRACE)
+ return REG_BADBR;
+ r++;
+ /* Parse trailing '?' marking minimal repetition. */
+ if (r < ctx->re_end)
+ {
+ if (*r == CHAR_QUESTIONMARK)
+ {
+ /* Process the question mark only in enhanced mode.
+ Otherwise, the question mark is an error in ERE
+ or a literal in BRE */
+ if (ctx->cflags & REG_ENHANCED)
+ {
+ minimal = !(ctx->cflags & REG_UNGREEDY);
+ r++;
+ }
+ else return REG_BADRPT;
+ }
+ else if (*r == CHAR_STAR || *r == CHAR_PLUS)
+ {
+ /* These are reserved for future extensions. */
+ return REG_BADRPT;
+ }
+ }
+ }
+ else
+ {
+ if (r + 1 >= ctx->re_end
+ || *r != CHAR_BACKSLASH
+ || *(r + 1) != CHAR_RBRACE)
+ return REG_BADBR;
+ r += 2;
+ if (r < ctx->re_end && *r == CHAR_STAR)
+ {
+ /* This is reserved for future extensions. */
+ return REG_BADRPT;
+ }
+ }
+
+ if (minimal)
+ ctx->num_reorder_tags++;
+
+ if (!result) goto parse_bound_exit;
+ /* Create the AST node(s). */
+ /* Originally, if min == 0 && max == 0, we immediately replace the whole
+ iteration with EMPTY. This unfortunately drops any submatches, and
+ messes up setting the pmatch values (we can get tags of -1, and
+ tag values in the billions). So we leave it and process this case as
+ usual, and wait until tre_expand_ast() to replace with EMPTY */
+#ifdef TRE_APPROX
+ if (min < 0 && max < 0)
+ /* Only approximate parameters set, no repetitions. */
+ min = max = 1;
+#endif /* TRE_APPROX */
+
+ *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
+ if (!*result)
+ return REG_ESPACE;
+
+#ifdef TRE_APPROX
+ /* If approximate matching parameters are set, add them to the
+ iteration node. */
+ if (approx || costs_set || counts_set)
+ {
+ int *params;
+ tre_iteration_t *iter = (*result)->obj;
+
+ if (costs_set || counts_set)
+ {
+ if (limit_ins == TRE_PARAM_UNSET)
+ {
+ if (cost_ins == TRE_PARAM_UNSET)
+ limit_ins = 0;
+ else
+ limit_ins = INT_MAX;
+ }
+
+ if (limit_del == TRE_PARAM_UNSET)
+ {
+ if (cost_del == TRE_PARAM_UNSET)
+ limit_del = 0;
+ else
+ limit_del = INT_MAX;
+ }
+
+ if (limit_subst == TRE_PARAM_UNSET)
+ {
+ if (cost_subst == TRE_PARAM_UNSET)
+ limit_subst = 0;
+ else
+ limit_subst = INT_MAX;
+ }
+ }
+
+ if (cost_max == TRE_PARAM_UNSET)
+ cost_max = INT_MAX;
+ if (limit_err == TRE_PARAM_UNSET)
+ limit_err = INT_MAX;
+
+ ctx->have_approx = 1;
+ params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
+ if (!params)
+ return REG_ESPACE;
+ for (i = 0; i < TRE_PARAM_LAST; i++)
+ params[i] = TRE_PARAM_UNSET;
+ params[TRE_PARAM_COST_INS] = cost_ins;
+ params[TRE_PARAM_COST_DEL] = cost_del;
+ params[TRE_PARAM_COST_SUBST] = cost_subst;
+ params[TRE_PARAM_COST_MAX] = cost_max;
+ params[TRE_PARAM_MAX_INS] = limit_ins;
+ params[TRE_PARAM_MAX_DEL] = limit_del;
+ params[TRE_PARAM_MAX_SUBST] = limit_subst;
+ params[TRE_PARAM_MAX_ERR] = limit_err;
+ iter->params = params;
+ }
+#endif /* TRE_APPROX */
+
+parse_bound_exit:
+#ifdef TRE_APPROX
+ DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
+ "limits [%d,%d,%d, total %d]\n",
+ min, max, cost_ins, cost_del, cost_subst, cost_max,
+ limit_ins, limit_del, limit_subst, limit_err));
+#else /* !TRE_APPROX */
+ DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
+#endif /* !TRE_APPROX */
+
+
+ ctx->re = r;
+ return REG_OK;
+}
+
+/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
+ non-self-contained options, like (?i), this causes ((?i)fu)bar to be
+ treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
+ Because we now set up tags for even non-capturing parenthesized
+ subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
+ pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
+ have it restore cflags after the subexpression, we don't need to have
+ a separate PARSE_RESTORE_CFLAGS, and then after processing the
+ non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
+ This has the side-benefit of now matching the perl behavior: the RE
+ foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
+ of foo AND (?i) (bar OR zap). */
+typedef enum {
+ PARSE_RE = 0,
+ PARSE_ATOM,
+ PARSE_MARK_FOR_SUBMATCH,
+ PARSE_BRANCH,
+ PARSE_PIECE,
+ PARSE_CATENATION,
+ PARSE_POST_CATENATION,
+ PARSE_UNION,
+ PARSE_POST_UNION,
+ PARSE_POSTFIX,
+} tre_parse_re_stack_symbol_t;
+
+
+reg_errcode_t
+tre_parse(tre_parse_ctx_t *ctx)
+{
+ tre_ast_node_t *result = NULL;
+ tre_parse_re_stack_symbol_t symbol;
+ reg_errcode_t status = REG_OK;
+ tre_stack_t *stack = ctx->stack;
+ int bottom = tre_stack_num_objects(stack);
+ int depth = 0;
+ int temporary_cflags = 0;
+ int bre_branch_begin;
+#ifdef TRE_DEBUG
+ const tre_char_t *tmp_re;
+#endif
+
+ DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
+ ctx->len, ctx->re, ctx->len, ctx->cflags));
+
+ if (ctx->len <= 0) return REG_EMPTY;
+ if (!ctx->nofirstsub)
+ {
+ STACK_PUSH(stack, int, ctx->cflags);
+ STACK_PUSH(stack, int, ctx->submatch_id);
+ STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
+ ctx->submatch_id++;
+ }
+ STACK_PUSH(stack, int, 0); // bre_branch_begin
+ STACK_PUSH(stack, int, PARSE_RE);
+ ctx->re_start = ctx->re;
+ ctx->re_end = ctx->re + ctx->len;
+
+
+ /* The following is basically just a recursive descent parser. I use
+ an explicit stack instead of recursive functions mostly because of
+ two reasons: compatibility with systems which have an overflowable
+ call stack, and efficiency (both in lines of code and speed). */
+ while (tre_stack_num_objects(stack) > bottom)
+ {
+ symbol = tre_stack_pop_int(stack);
+ switch (symbol)
+ {
+ case PARSE_RE:
+ /* Parse a full regexp. A regexp is one or more branches,
+ separated by the union operator `|'. */
+ bre_branch_begin = tre_stack_pop_int(stack);
+ if (
+#ifdef REG_LITERAL
+ !(ctx->cflags & REG_LITERAL) &&
+#endif /* REG_LITERAL */
+ ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
+ STACK_PUSHX(stack, int, PARSE_UNION);
+ STACK_PUSHX(stack, int, bre_branch_begin);
+ STACK_PUSHX(stack, int, PARSE_BRANCH);
+ break;
+
+ case PARSE_BRANCH:
+ /* Parse a branch. A branch is one or more pieces, concatenated.
+ A piece is an atom possibly followed by a postfix operator. */
+ bre_branch_begin = tre_stack_pop_int(stack);
+ STACK_PUSHX(stack, int, PARSE_CATENATION);
+ STACK_PUSHX(stack, int, bre_branch_begin);
+ STACK_PUSHX(stack, int, PARSE_PIECE);
+ break;
+
+ case PARSE_PIECE:
+ /* Parse a piece. A piece is an atom possibly followed by one
+ or more postfix operators. */
+ bre_branch_begin = tre_stack_pop_int(stack);
+ STACK_PUSHX(stack, int, PARSE_POSTFIX);
+ STACK_PUSHX(stack, int, bre_branch_begin);
+ STACK_PUSHX(stack, int, PARSE_ATOM);
+ break;
+
+ case PARSE_CATENATION:
+ /* If the expression has not ended, parse another piece. */
+ {
+ tre_char_t c;
+ if (ctx->re >= ctx->re_end)
+ break;
+ c = *ctx->re;
+#ifdef REG_LITERAL
+ if (!(ctx->cflags & REG_LITERAL))
+ {
+#endif /* REG_LITERAL */
+ if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
+ ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
+ && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
+ *(ctx->re + 1) == CHAR_PIPE))
+ break;
+ if ((ctx->cflags & REG_EXTENDED
+ && c == CHAR_RPAREN && depth > 0)
+ || (!(ctx->cflags & REG_EXTENDED)
+ && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
+ && *(ctx->re + 1) == CHAR_RPAREN))
+ {
+ if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
+ return REG_EPAREN;
+ DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ depth--;
+ if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
+ ctx->re += 2;
+ break;
+ }
+#ifdef REG_LITERAL
+ }
+#endif /* REG_LITERAL */
+
+#ifdef REG_LEFT_ASSOC
+ if (ctx->cflags & REG_LEFT_ASSOC)
+ {
+ /* Left associative concatenation. */
+ STACK_PUSHX(stack, int, PARSE_CATENATION);
+ STACK_PUSHX(stack, voidptr, result);
+ STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
+ STACK_PUSHX(stack, int, 0); // bre_branch_begin
+ STACK_PUSHX(stack, int, PARSE_PIECE);
+ }
+ else
+#endif /* REG_LEFT_ASSOC */
+ {
+ /* Default case, right associative concatenation. */
+ STACK_PUSHX(stack, voidptr, result);
+ STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
+ STACK_PUSHX(stack, int, PARSE_CATENATION);
+ STACK_PUSHX(stack, int, 0); // bre_branch_begin
+ STACK_PUSHX(stack, int, PARSE_PIECE);
+ }
+ break;
+ }
+
+ case PARSE_POST_CATENATION:
+ {
+ tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
+ tre_ast_node_t *tmp_node;
+ tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
+ if (!tmp_node)
+ return REG_ESPACE;
+ result = tmp_node;
+ break;
+ }
+
+ case PARSE_UNION:
+ if (ctx->re >= ctx->re_end)
+ break;
+#ifdef REG_LITERAL
+ if (ctx->cflags & REG_LITERAL)
+ break;
+#endif /* REG_LITERAL */
+ if (!(ctx->cflags & REG_EXTENDED))
+ {
+ if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
+ break;
+ ctx->re++;
+ }
+ switch (*ctx->re)
+ {
+ case CHAR_PIPE:
+ DPRINT(("tre_parse: union: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ STACK_PUSHX(stack, int, PARSE_UNION);
+ STACK_PUSHX(stack, voidptr, (void *)ctx->re);
+ STACK_PUSHX(stack, voidptr, result);
+ STACK_PUSHX(stack, int, PARSE_POST_UNION);
+ /* We need to pass a boolean (eventually) to PARSE_ATOM to
+ indicate if this is the beginning of a BRE extended branch. */
+ STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
+ STACK_PUSHX(stack, int, PARSE_BRANCH);
+ ctx->re++;
+ break;
+
+ case CHAR_RPAREN:
+ ctx->re++;
+ break;
+
+ default:
+ if (!(ctx->cflags & REG_EXTENDED))
+ ctx->re--;
+ break;
+ }
+ break;
+
+ case PARSE_POST_UNION:
+ {
+ tre_ast_node_t *tmp_node;
+ tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
+ const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
+ /* error on empty expression at end of union */
+ if (pipechar == ctx->re - 1)
+ {
+ return REG_EMPTY;
+ }
+ tmp_node = tre_ast_new_union(ctx->mem, tree, result);
+ if (!tmp_node)
+ return REG_ESPACE;
+ result = tmp_node;
+ break;
+ }
+
+ case PARSE_POSTFIX:
+ /* Parse postfix operators. */
+ if (ctx->re >= ctx->re_end)
+ break;
+#ifdef REG_LITERAL
+ if (ctx->cflags & REG_LITERAL)
+ break;
+#endif /* REG_LITERAL */
+ int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
+ int rep_min = 0;
+ int rep_max = -1;
+#ifdef TRE_DEBUG
+ int lbrace_off;
+#endif
+ switch (*ctx->re)
+ {
+ case CHAR_PLUS:
+ case CHAR_QUESTIONMARK:
+ if (!(ctx->cflags & REG_EXTENDED))
+ break;
+ /*FALLTHROUGH*/
+ case CHAR_STAR:
+ {
+ tre_ast_node_t *tmp_node;
+#ifdef TRE_DEBUG
+ const char *tstr = "star";
+ tmp_re = ctx->re;
+#endif
+
+ handle_plus_or_question:
+ /* error on iteration of raw assertion (not in subexpression) */
+ if (result->type == LITERAL && result->submatch_id < 0 &&
+ IS_ASSERTION((tre_literal_t *)result->obj))
+ {
+ if (!(ctx->cflags & REG_EXTENDED)) break;
+ return REG_BADRPT;
+ }
+ if (*ctx->re == CHAR_PLUS)
+ {
+ rep_min = 1;
+#ifdef TRE_DEBUG
+ tstr = "plus";
+#endif
+ }
+ if (*ctx->re == CHAR_QUESTIONMARK)
+ {
+ rep_max = 1;
+#ifdef TRE_DEBUG
+ tstr = "questionmark";
+#endif
+ }
+
+ if (ctx->cflags & REG_EXTENDED)
+ {
+ if (ctx->re + 1 < ctx->re_end)
+ {
+ if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
+ {
+ /* Process the question mark only in enhanced mode.
+ Otherwise, the question mark is an error in ERE */
+ if (ctx->cflags & REG_ENHANCED)
+ {
+ minimal = !(ctx->cflags & REG_UNGREEDY);
+ ctx->re++;
+ }
+ else return REG_BADRPT;
+ }
+ else if (*(ctx->re + 1) == CHAR_STAR
+ || *(ctx->re + 1) == CHAR_PLUS)
+ {
+ /* These are reserved for future extensions. */
+ return REG_BADRPT;
+ }
+ }
+ }
+ else
+ {
+ if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
+ {
+ /* This is reserved for future extensions. */
+ return REG_BADRPT;
+ }
+ if (ctx->re + 2 < ctx->re_end)
+ {
+ if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK)
+ {
+ /* Process the question mark only in enhanced mode.
+ Otherwise, the question mark is a literal in BRE */
+ if (ctx->cflags & REG_ENHANCED)
+ {
+ minimal = !(ctx->cflags & REG_UNGREEDY);
+ ctx->re += 2;
+ }
+ }
+ else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
+ {
+ /* This is reserved for future extensions. */
+ return REG_BADRPT;
+ }
+ }
+ }
+
+ if (minimal)
+ ctx->num_reorder_tags++;
+
+ DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
+ minimal ? " minimal" : "greedy", tstr, REST(tmp_re)));
+ if (result == NULL)
+ {
+ if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
+ else goto parse_literal;
+ }
+ ctx->re++;
+ tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
+ minimal);
+ if (tmp_node == NULL)
+ return REG_ESPACE;
+ result = tmp_node;
+
+ /* Set the iterator with a submatch id in the invisible range
+ * (which will be overridden if a real submatch is needed) */
+ result->submatch_id = ctx->submatch_id_invisible++;
+
+#if 0
+ /* We don't allow multiple postfixes, but this might be needed
+ to support approximate matching */
+ STACK_PUSHX(stack, int, PARSE_POSTFIX);
+#endif
+ }
+ break;
+
+ case CHAR_BACKSLASH:
+ /* "\{" is special without REG_EXTENDED */
+ /* "\+" and "\?" are special with REG_ENHANCED for BRE */
+ if (!(ctx->cflags & REG_EXTENDED)
+ && ctx->re + 1 < ctx->re_end)
+ {
+ switch (*(ctx->re + 1))
+ {
+ case CHAR_LBRACE:
+ ctx->re++;
+#ifdef TRE_DEBUG
+ lbrace_off = 2;
+#endif
+ goto parse_brace;
+ case CHAR_PLUS:
+ case CHAR_QUESTIONMARK:
+ if (ctx->cflags & REG_ENHANCED)
+ {
+#ifdef TRE_DEBUG
+ tmp_re = ctx->re;
+#endif
+ ctx->re++;
+ goto handle_plus_or_question;
+ }
+ break;
+ }
+ break;
+ }
+ else
+ break;
+
+ case CHAR_LBRACE:
+ {
+ int raw_assertion;
+
+ /* "{" is literal without REG_EXTENDED */
+ if (!(ctx->cflags & REG_EXTENDED))
+ break;
+#ifdef TRE_DEBUG
+ lbrace_off = 1;
+#endif
+
+ parse_brace:
+ /* error on iteration of raw assertion (not in subexpression),
+ but wait until after parsing bounds */
+ raw_assertion = (result->type == LITERAL
+ && result->submatch_id < 0
+ && IS_ASSERTION((tre_literal_t *)result->obj));
+ ctx->re++;
+
+ status = tre_parse_bound(ctx, &result);
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+ /* For ERE, if status is REG_NOMATCH, this mean the lbrace
+ is to be treated as a literal. */
+ if (status == REG_NOMATCH)
+ {
+ ctx->re--;
+ break;
+ }
+#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+ DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
+ REST(ctx->re - lbrace_off)));
+ if (status != REG_OK)
+ return status;
+ if (raw_assertion) return REG_BADRPT;
+
+ /* Set the iterator with a submatch id in the invisible range
+ * (which will be overridden if a real submatch is needed) */
+ if (result->type == ITERATION)
+ result->submatch_id = ctx->submatch_id_invisible++;
+
+#if 0
+ /* We don't allow multiple postfixes, but this might be needed
+ to support approximate matching */
+ STACK_PUSHX(stack, int, PARSE_POSTFIX);
+#endif
+ break;
+ }
+ }
+ break;
+
+ case PARSE_ATOM:
+ {
+ /* Parse an atom. An atom is a regular expression enclosed in `()',
+ an empty set of `()', a bracket expression, `.', `^', `$',
+ a `\' followed by a character, or a single character. */
+
+ /* The stack contains a boolean value, whether PARSE_ATOM is
+ being called just after the start of a group (left paren)
+ in a BRE */
+ bre_branch_begin = tre_stack_pop_int(stack);
+
+ /* End of regexp? (empty string). */
+ if (ctx->re >= ctx->re_end)
+ goto parse_literal;
+
+#ifdef REG_LITERAL
+ if (ctx->cflags & REG_LITERAL)
+ goto parse_literal;
+#endif /* REG_LITERAL */
+
+ switch (*ctx->re)
+ {
+ case CHAR_LPAREN: /* parenthesized subexpression */
+
+ /* Handle "(?...)" extensions. They work in a way similar
+ to Perls corresponding extensions. */
+ if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
+ (REG_EXTENDED|REG_ENHANCED)
+ && *(ctx->re + 1) == CHAR_QUESTIONMARK)
+ {
+ int new_cflags = ctx->cflags;
+ int bit = 1;
+ int invisible_submatch = 0;
+ DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ ctx->re += 2;
+ while (/*CONSTCOND*/1)
+ {
+ if (*ctx->re == L'i')
+ {
+ DPRINT(("tre_parse: icase: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ if (bit)
+ new_cflags |= REG_ICASE;
+ else
+ new_cflags &= ~REG_ICASE;
+ ctx->re++;
+ }
+ else if (*ctx->re == L'n')
+ {
+ DPRINT(("tre_parse: newline: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ if (bit)
+ new_cflags |= REG_NEWLINE;
+ else
+ new_cflags &= ~REG_NEWLINE;
+ ctx->re++;
+ }
+#ifdef REG_LEFT_ASSOC
+ else if (*ctx->re == L'l')
+ {
+ DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ if (bit)
+ new_cflags |= REG_LEFT_ASSOC;
+ else
+ new_cflags &= ~REG_LEFT_ASSOC;
+ ctx->re++;
+ }
+#endif /* REG_LEFT_ASSOC */
+#ifdef REG_UNGREEDY
+ else if (*ctx->re == L'U')
+ {
+ DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ if (bit)
+ new_cflags |= REG_UNGREEDY;
+ else
+ new_cflags &= ~REG_UNGREEDY;
+ ctx->re++;
+ }
+#endif /* REG_UNGREEDY */
+ else if (*ctx->re == CHAR_MINUS)
+ {
+ DPRINT(("tre_parse: turn off: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ ctx->re++;
+ bit = 0;
+ }
+ else if (*ctx->re == CHAR_COLON)
+ {
+ DPRINT(("tre_parse: no group: '%.*" STRF
+ "', (invisible submatch %d)\n",
+ REST(ctx->re), ctx->submatch_id_invisible));
+ ctx->re++;
+ depth++;
+ invisible_submatch = 1;
+ break;
+ }
+ else if (*ctx->re == CHAR_HASH)
+ {
+ DPRINT(("tre_parse: comment: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ /* A comment can contain any character except a
+ right parenthesis */
+ while (*ctx->re != CHAR_RPAREN
+ && ctx->re < ctx->re_end)
+ ctx->re++;
+ if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
+ {
+ ctx->re++;
+ break;
+ }
+ else
+ return REG_BADPAT;
+ }
+ else if (*ctx->re == CHAR_RPAREN)
+ {
+ ctx->re++;
+ break;
+ }
+ else
+ return REG_BADRPT;
+ }
+
+ /* Turn on the cflags changes for the rest of the
+ enclosing group. */
+ if (invisible_submatch)
+ {
+ STACK_PUSHX(stack, int, ctx->cflags);
+ STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
+ STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
+ ctx->submatch_id_invisible++;
+ STACK_PUSHX(stack, int, 0); // bre_branch_begin
+ STACK_PUSHX(stack, int, PARSE_RE);
+ }
+ else {
+ STACK_PUSHX(stack, int, 0); // bre_branch_begin
+ STACK_PUSHX(stack, int, PARSE_ATOM);
+ }
+ ctx->cflags = new_cflags;
+ break;
+ }
+
+ if (ctx->cflags & REG_EXTENDED)
+ {
+ parse_bre_lparen:
+ DPRINT(("tre_parse: group begin: '%.*" STRF
+ "', submatch %d\n", REST(ctx->re),
+ ctx->submatch_id));
+ ctx->re++;
+ /* First parse a whole RE, then mark the resulting tree
+ for submatching. */
+ STACK_PUSHX(stack, int, ctx->cflags);
+ STACK_PUSHX(stack, int, ctx->submatch_id);
+ STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
+ /* We need to pass a boolean (eventually) to PARSE_ATOM to
+ indicate if this is the beginning of a BRE group. */
+ STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
+ STACK_PUSHX(stack, int, PARSE_RE);
+ ctx->submatch_id++;
+ depth++;
+ }
+ else
+ goto parse_literal;
+ break;
+
+ case CHAR_RPAREN: /* end of current subexpression */
+ if (ctx->cflags & REG_EXTENDED && depth > 0)
+ {
+ parse_bre_rparen_empty:
+ if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
+ return REG_EPAREN;
+ DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ /* We were expecting an atom, but instead the current
+ subexpression was closed. POSIX leaves the meaning of
+ this to be implementation-defined. We interpret this as
+ an empty expression (which matches an empty string). */
+ result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (result == NULL)
+ return REG_ESPACE;
+ if (!(ctx->cflags & REG_EXTENDED))
+ ctx->re--;
+ }
+ else
+ goto parse_literal;
+ break;
+
+ case CHAR_LBRACKET: /* bracket expression */
+ DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ ctx->re++;
+ status = tre_parse_bracket(ctx, &result);
+ if (status != REG_OK)
+ return status;
+ break;
+
+ case CHAR_BACKSLASH:
+ /* Deal with "\(", "\)" or "\{" for BREs */
+ if (!(ctx->cflags & REG_EXTENDED)
+ && ctx->re + 1 < ctx->re_end)
+ {
+ if (*(ctx->re + 1) == CHAR_LPAREN)
+ {
+ ctx->re++;
+ goto parse_bre_lparen;
+ }
+ else if (*(ctx->re + 1) == CHAR_RPAREN)
+ {
+ ctx->re++;
+ goto parse_bre_rparen_empty;
+ }
+ if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
+ }
+
+ if (ctx->re + 1 >= ctx->re_end)
+ /* Trailing backslash. */
+ return REG_EESCAPE;
+
+ if (!(ctx->cflags & REG_ENHANCED))
+ {
+ DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
+ ctx->re++;
+ goto unenhanced_backslash;
+ }
+
+ /* If a macro is used, parse the expanded macro recursively. */
+ {
+ tre_char_t buf[64];
+ tre_expand_macro(ctx->re + 1, ctx->re_end,
+ buf, elementsof(buf));
+ if (buf[0] != 0)
+ {
+ tre_parse_ctx_t subctx;
+ memcpy(&subctx, ctx, sizeof(subctx));
+ subctx.re = buf;
+ subctx.len = tre_strlen(buf);
+ subctx.nofirstsub = 1;
+ status = tre_parse(&subctx);
+ if (status != REG_OK)
+ return status;
+ ctx->re += 2;
+ ctx->position = subctx.position;
+ result = subctx.result;
+ break;
+ }
+ }
+
+#ifdef REG_LITERAL
+ if (*(ctx->re + 1) == L'Q')
+ {
+ DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ ctx->cflags |= REG_LITERAL;
+ temporary_cflags |= REG_LITERAL;
+ ctx->re += 2;
+ STACK_PUSHX(stack, int, 0);
+ STACK_PUSHX(stack, int, PARSE_ATOM);
+ break;
+ }
+#endif /* REG_LITERAL */
+
+ DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
+ ctx->re++;
+ switch (*ctx->re)
+ {
+ case L'b':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_WB, -1);
+ ctx->re++;
+ break;
+ case L'B':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_WB_NEG, -1);
+ ctx->re++;
+ break;
+ case L'<':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_BOW, -1);
+ ctx->re++;
+ break;
+ case L'>':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_EOW, -1);
+ ctx->re++;
+ break;
+ case L'x':
+ ctx->re++;
+ if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
+ {
+ /* 8 bit hex char. */
+ char tmp[3] = {0, 0, 0};
+ long val;
+ DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
+ REST(ctx->re - 2)));
+
+ if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
+ ctx->re < ctx->re_end)
+ {
+ tmp[0] = (char)ctx->re[0];
+ ctx->re++;
+ }
+ if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
+ ctx->re < ctx->re_end)
+ {
+ tmp[1] = (char)ctx->re[0];
+ ctx->re++;
+ }
+ val = strtol(tmp, NULL, 16);
+ result = tre_ast_new_literal(ctx->mem, (int)val,
+ (int)val, ctx->position);
+ ctx->position++;
+ break;
+ }
+ else if (ctx->re < ctx->re_end)
+ {
+ /* Wide char. */
+ char tmp[32];
+ long val;
+ int i = 0;
+ ctx->re++;
+ while (ctx->re_end - ctx->re >= 0)
+ {
+ if (ctx->re[0] == CHAR_RBRACE)
+ break;
+ if (tre_isxdigit_l(ctx->re[0], ctx->loc))
+ {
+ tmp[i] = (char)ctx->re[0];
+ i++;
+ ctx->re++;
+ continue;
+ }
+ return REG_EBRACE;
+ }
+ ctx->re++;
+ tmp[i] = 0;
+ val = strtol(tmp, NULL, 16);
+ result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
+ ctx->position);
+ ctx->position++;
+ break;
+ }
+ /*FALLTHROUGH*/
+
+ default:
+ unenhanced_backslash:
+ if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
+ REG_EXTENDED &&
+ tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
+ {
+ /* Back reference (only in BRE or enhanced). */
+ int val = *ctx->re - L'0';
+ DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
+ REST(ctx->re - 1)));
+ result = tre_ast_new_literal(ctx->mem, BACKREF, val,
+ ctx->position);
+ if (result == NULL)
+ return REG_ESPACE;
+
+ /* Set the backref with a submatch id in the invisible
+ * range (which will be overridden if a real submatch
+ * is needed) */
+ result->submatch_id = ctx->submatch_id_invisible++;
+
+ ctx->position++;
+ ctx->num_reorder_tags++;
+ ctx->max_backref = MAX(val, ctx->max_backref);
+ ctx->re++;
+ }
+ else
+ {
+ /* Escaped character. */
+ DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
+ REST(ctx->re - 1)));
+ result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
+ ctx->position);
+ ctx->position++;
+ ctx->re++;
+ }
+ break;
+ }
+ if (result == NULL)
+ return REG_ESPACE;
+ break;
+
+ case CHAR_PERIOD: /* the any-symbol */
+ DPRINT(("tre_parse: any: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ if (ctx->cflags & REG_NEWLINE)
+ {
+ tre_ast_node_t *tmp1;
+ tre_ast_node_t *tmp2;
+ tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
+ ctx->position);
+ if (!tmp1)
+ return REG_ESPACE;
+ tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
+ ctx->position + 1);
+ if (!tmp2)
+ return REG_ESPACE;
+ result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+ if (!result)
+ return REG_ESPACE;
+ ctx->position += 2;
+ }
+ else
+ {
+ result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
+ ctx->position);
+ if (!result)
+ return REG_ESPACE;
+ ctx->position++;
+ }
+ ctx->re++;
+ break;
+
+ case CHAR_CARET: /* beginning of line assertion */
+ /* '^' has a special meaning everywhere in EREs, at the
+ beginning of the RE and after \( is BREs. It is also
+ special in enhanced BREs at the beginning of each branches
+ of a union */
+ if (ctx->cflags & REG_EXTENDED
+ || bre_branch_begin
+ || ctx->re == ctx->re_start)
+ {
+ DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_BOL, -1);
+ if (result == NULL)
+ return REG_ESPACE;
+ ctx->re++;
+ }
+ else
+ goto parse_literal;
+ break;
+
+ case CHAR_DOLLAR: /* end of line assertion. */
+ /* '$' is special everywhere in EREs, and in the end of the
+ string and before \) is BREs. */
+ if (ctx->cflags & REG_EXTENDED
+ || (ctx->re + 2 < ctx->re_end
+ && *(ctx->re + 1) == CHAR_BACKSLASH
+ && *(ctx->re + 2) == CHAR_RPAREN)
+ || ctx->re + 1 == ctx->re_end)
+ {
+ DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_EOL, -1);
+ if (result == NULL)
+ return REG_ESPACE;
+ ctx->re++;
+ }
+ else
+ goto parse_literal;
+ break;
+
+ default:
+ parse_literal:
+
+ if (temporary_cflags && ctx->re + 1 < ctx->re_end
+ && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
+ {
+ DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ ctx->cflags &= ~temporary_cflags;
+ temporary_cflags = 0;
+ ctx->re += 2;
+ if (ctx->re < ctx->re_end)
+ {
+ STACK_PUSHX(stack, int, 0);
+ STACK_PUSHX(stack, int, PARSE_ATOM);
+ }
+ else
+ {
+ result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (!result) return REG_ESPACE;
+ }
+ break;
+ }
+
+
+ /* We are expecting an atom. If the subexpression (or the whole
+ regexp ends here, we interpret it as an empty expression
+ (which matches an empty string), which is an error.
+ Iterations of an empty expression is also an error. */
+#ifdef REG_LITERAL
+ if (!(ctx->cflags & REG_LITERAL))
+ {
+#endif /* REG_LITERAL */
+ /* error on end of string */
+ if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
+ : REG_EMPTY;
+ /* error on unions and iterations of empty expressions */
+ if (ctx->cflags & REG_EXTENDED)
+ {
+ if (ctx->re < ctx->re_end)
+ {
+ if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
+ if (*ctx->re == CHAR_LBRACE)
+ {
+ ctx->re++;
+ empty_parse_bound:
+ /* We need to parse the bound first and return
+ any error, before returning REG_BADRPT */
+ status = tre_parse_bound(ctx, NULL);
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+ /* For ERE, if REG_NOMATCH is returned, we
+ treat the lbrace as a literal. */
+ if (status == REG_NOMATCH)
+ {
+ ctx->re--;
+ /* Drop down to literal-handling code */
+ }
+ else
+ {
+#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+ if (status != REG_OK)
+ return status;
+ return REG_BADRPT;
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+ }
+#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+ }
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+ else
+#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+ if (*ctx->re == CHAR_STAR
+ || *ctx->re == CHAR_PLUS
+ || *ctx->re == CHAR_QUESTIONMARK)
+ {
+ return REG_BADRPT;
+ }
+ }
+ }
+ else if (ctx->re + 1 < ctx->re_end
+ && *ctx->re == CHAR_BACKSLASH
+ && *(ctx->re + 1) == CHAR_LBRACE)
+ {
+ ctx->re += 2;
+ goto empty_parse_bound;
+ }
+#ifdef REG_LITERAL
+ }
+#endif /* REG_LITERAL */
+
+ DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
+ REST(ctx->re)));
+ /* Note that we can't use an tre_isalpha() test here, since there
+ may be characters which are alphabetic but neither upper or
+ lower case. */
+ if (ctx->cflags & REG_ICASE
+ && (tre_isupper_l(*ctx->re, ctx->loc) ||
+ tre_islower_l(*ctx->re, ctx->loc)))
+ {
+ tre_ast_node_t *tmp1;
+ tre_ast_node_t *tmp2;
+
+ /* XXX - Can there be more than one opposite-case
+ counterpoints for some character in some locale? Or
+ more than two characters which all should be regarded
+ the same character if case is ignored? If yes, there
+ does not seem to be a portable way to detect it. I guess
+ that at least for multi-character collating elements there
+ could be several opposite-case counterpoints, but they
+ cannot be supported portably anyway. */
+ tmp1 = tre_ast_new_literal(ctx->mem,
+ tre_toupper_l(*ctx->re, ctx->loc),
+ tre_toupper_l(*ctx->re, ctx->loc),
+ ctx->position);
+ if (!tmp1)
+ return REG_ESPACE;
+ tmp2 = tre_ast_new_literal(ctx->mem,
+ tre_tolower_l(*ctx->re, ctx->loc),
+ tre_tolower_l(*ctx->re, ctx->loc),
+ ctx->position);
+ if (!tmp2)
+ return REG_ESPACE;
+ result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+ if (!result)
+ return REG_ESPACE;
+ }
+ else
+ {
+ result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
+ ctx->position);
+ if (!result)
+ return REG_ESPACE;
+ }
+ ctx->position++;
+ ctx->re++;
+ break;
+ }
+ break;
+ }
+
+ case PARSE_MARK_FOR_SUBMATCH:
+ {
+ int submatch_id = tre_stack_pop_int(stack);
+
+ ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
+ if (result->submatch_id >= 0 &&
+ result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
+ {
+ tre_ast_node_t *n, *tmp_node;
+ if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
+ break;
+ n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (n == NULL)
+ return REG_ESPACE;
+ tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
+ if (tmp_node == NULL)
+ return REG_ESPACE;
+ tmp_node->num_submatches = result->num_submatches;
+ result = tmp_node;
+ }
+ result->submatch_id = submatch_id;
+ if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
+ result->num_submatches++;
+ break;
+ }
+
+ default:
+ assert(0);
+ break;
+ }
+ }
+
+ /* Check for missing closing parentheses. */
+ if (depth > 0)
+ return REG_EPAREN;
+
+ ctx->result = result;
+
+ return REG_OK;
+}
+
+/* EOF */