2 tre-parse.c - Regexp parser
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
18 #endif /* HAVE_CONFIG_H */
27 #include "tre-stack.h"
28 #include "tre-parse.h"
31 Before looking up a collating symbol, check if the name matches in
32 the character names (cnames) array; if so, use the corresponding
35 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
36 the implementation choice that for ERE, a non-numeric character following
37 a left brace that would normally be a bound, causes the left brace to be
39 #define BSD_COMPATIBILITY
40 #ifdef BSD_COMPATIBILITY
42 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
43 #endif /* BSD_COMPATIBILITY */
45 /* Characters with special meanings in regexp syntax. */
46 #define CHAR_PIPE L'|'
47 #define CHAR_LPAREN L'('
48 #define CHAR_RPAREN L')'
49 #define CHAR_LBRACE L'{'
50 #define CHAR_RBRACE L'}'
51 #define CHAR_LBRACKET L'['
52 #define CHAR_RBRACKET L']'
53 #define CHAR_MINUS L'-'
54 #define CHAR_STAR L'*'
55 #define CHAR_QUESTIONMARK L'?'
56 #define CHAR_PLUS L'+'
57 #define CHAR_PERIOD L'.'
58 #define CHAR_COLON L':'
59 #define CHAR_EQUAL L'='
60 #define CHAR_COMMA L','
61 #define CHAR_CARET L'^'
62 #define CHAR_DOLLAR L'$'
63 #define CHAR_BACKSLASH L'\\'
64 #define CHAR_HASH L'#'
65 #define CHAR_TILDE L'~'
68 /* Some macros for expanding \w, \s, etc. */
69 static const struct tre_macro_struct
{
71 const char *expansion
;
73 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
74 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
75 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
76 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
81 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
82 must have at least `len' items. Sets buf[0] to zero if the there
83 is no match in `tre_macros'. */
85 tre_expand_macro(const tre_char_t
*regex
, const tre_char_t
*regex_end
,
86 tre_char_t
*buf
, size_t buf_len
)
91 if (regex
>= regex_end
)
94 for (i
= 0; tre_macros
[i
].expansion
; i
++)
96 if (tre_macros
[i
].c
== *regex
)
99 DPRINT(("Expanding macro '%c' => '%s'\n",
100 tre_macros
[i
].c
, tre_macros
[i
].expansion
));
101 for (j
= 0; tre_macros
[i
].expansion
[j
] && j
< buf_len
; j
++)
102 buf
[j
] = tre_macros
[i
].expansion
[j
];
110 tre_new_item(tre_mem_t mem
, int type
, int val
, int *max_i
,
111 tre_bracket_match_list_t
**items
)
113 reg_errcode_t status
= REG_OK
;
114 tre_bracket_match_list_t
*array
= *items
;
115 int i
= array
->num_bracket_matches
;
116 /* Allocate more space if necessary. */
119 tre_bracket_match_list_t
*new_items
;
120 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i
));
121 /* If the array is already 1024 items large, give up -- there's
122 probably an error in the regexp (e.g. not a '\0' terminated
123 string and missing ']') */
127 new_items
= xrealloc(array
, SIZEOF_BRACKET_MATCH_LIST_N(*max_i
));
128 if (new_items
== NULL
)
130 *items
= array
= new_items
;
132 array
->bracket_matches
[i
].type
= type
;
133 array
->bracket_matches
[i
].value
= val
;
134 array
->num_bracket_matches
++;
138 #ifndef TRE_USE_SYSTEM_WCTYPE
140 /* isalnum() and the rest may be macros, so wrap them to functions. */
141 int tre_isalnum_func(tre_cint_t c
) { return tre_isalnum(c
); }
142 int tre_isalpha_func(tre_cint_t c
) { return tre_isalpha(c
); }
145 int tre_isascii_func(tre_cint_t c
) { return tre_isascii(c
); }
146 #else /* !tre_isascii */
147 int tre_isascii_func(tre_cint_t c
) { return !(c
>> 7); }
148 #endif /* !tre_isascii */
151 int tre_isblank_func(tre_cint_t c
) { return tre_isblank(c
); }
152 #else /* !tre_isblank */
153 int tre_isblank_func(tre_cint_t c
) { return ((c
== ' ') || (c
== '\t')); }
154 #endif /* !tre_isblank */
156 int tre_iscntrl_func(tre_cint_t c
) { return tre_iscntrl(c
); }
157 int tre_isdigit_func(tre_cint_t c
) { return tre_isdigit(c
); }
158 int tre_isgraph_func(tre_cint_t c
) { return tre_isgraph(c
); }
159 int tre_islower_func(tre_cint_t c
) { return tre_islower(c
); }
160 int tre_isprint_func(tre_cint_t c
) { return tre_isprint(c
); }
161 int tre_ispunct_func(tre_cint_t c
) { return tre_ispunct(c
); }
162 int tre_isspace_func(tre_cint_t c
) { return tre_isspace(c
); }
163 int tre_isupper_func(tre_cint_t c
) { return tre_isupper(c
); }
164 int tre_isxdigit_func(tre_cint_t c
) { return tre_isxdigit(c
); }
168 int (*func
)(tre_cint_t
);
169 } tre_ctype_map
[] = {
170 { "alnum", &tre_isalnum_func
},
171 { "alpha", &tre_isalpha_func
},
173 { "ascii", &tre_isascii_func
},
174 #endif /* tre_isascii */
176 { "blank", &tre_isblank_func
},
177 #endif /* tre_isblank */
178 { "cntrl", &tre_iscntrl_func
},
179 { "digit", &tre_isdigit_func
},
180 { "graph", &tre_isgraph_func
},
181 { "lower", &tre_islower_func
},
182 { "print", &tre_isprint_func
},
183 { "punct", &tre_ispunct_func
},
184 { "space", &tre_isspace_func
},
185 { "upper", &tre_isupper_func
},
186 { "xdigit", &tre_isxdigit_func
},
190 tre_ctype_t
tre_ctype(const char *name
)
193 for (i
= 0; tre_ctype_map
[i
].name
!= NULL
; i
++)
195 if (strcmp(name
, tre_ctype_map
[i
].name
) == 0)
196 return tre_ctype_map
[i
].func
;
198 return (tre_ctype_t
)0;
200 #endif /* !TRE_USE_SYSTEM_WCTYPE */
202 #define REST(re) (int)(ctx->re_end - (re)), (re)
204 #define START_COLLATING_SYMBOLS 16
205 #define MAX_COLLATING_SYMBOL_LEN 4
208 const tre_char_t
*start
;
210 } tre_collating_symbol
;
214 int __collate_equiv_value(locale_t loc
, const wchar_t *str
, size_t len
);
216 #ifdef BSD_COMPATIBILITY
218 tre_search_cnames(const wchar_t *name
, size_t len
)
221 size_t high
= NCNAMES
- 1;
227 cur
= (low
+ high
) / 2;
228 cmp
= wcsncmp(name
, cnames
[cur
].name
, len
);
229 if (cmp
== 0 && cnames
[cur
].name
[len
] == 0) return cnames
[cur
].code
;
230 if (cmp
> 0) low
= cur
+ 1;
235 #endif /* BSD_COMPATIBILITY */
237 /* Scan the contents of a bracket expression, and create a
238 * tre_bracket_match_list_t encoding the bracket expression. If during
239 * the scan, multi-character collating symbols are detected, switch
240 * into a mode to collect those MCCSs into a tre_collating_symbol
241 * list and pass them back. tre_parse_bracket will use that to
242 * create a new string composed of a union of the bracket expression
243 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
244 * call tre_parse (recursive) to parse that new string (which will
245 * call tre_parse_bracket and tre_parse_bracket_items again. */
247 tre_parse_bracket_items(tre_parse_ctx_t
*ctx
, tre_bracket_match_list_t
**items
,
248 int *items_size
, tre_collating_symbol
**result
)
250 const tre_char_t
*re
= ctx
->re
;
251 const tre_char_t
*re_end
= ctx
->re_end
;
252 tre_collating_symbol
*col_syms
= NULL
;
253 tre_collating_symbol
*cp
= NULL
;
255 reg_errcode_t status
;
256 int max_i
= *items_size
;
257 int other
= 0; /* contains content other than multi-character collating
259 int range
= -1; /* -1 unset, 0 begin range set, +1 end range expected */
261 int invert
= ((*items
)->flags
& TRE_BRACKET_MATCH_FLAG_NEGATE
);
262 int collect_MCCS
= 0;
263 const tre_char_t
*start
;
265 for ( ;re
< re_end
; re
++)
273 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
279 /* The hyphen is the end range */
282 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
284 goto process_end_range
;
286 if (re
+ 1 >= re_end
)
291 /* The hyphen is at the end */
292 if (re
[1] == CHAR_RBRACKET
)
294 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
296 goto process_begin_range
;
298 /* Two ranges are not allowed to share an endpoint, or begin
299 * range is illegal. */
305 range
= 1; /* Expect end range */
306 DPRINT(("tre_parse_bracket: range: '%.*" STRF
"'\n", REST(re
)));
310 if (re
+ 1 >= re_end
)
325 status
= REG_ECOLLATE
;
328 if (*re
== CHAR_PERIOD
)
330 if (re
+ 1 >= re_end
)
332 status
= REG_ECOLLATE
;
336 if (re
[1] == CHAR_RBRACKET
)
338 DPRINT(("tre_parse_bracket: collating "
339 "symbol: '%.*" STRF
"'\n",
344 status
= REG_ECOLLATE
;
347 #ifdef BSD_COMPATIBILITY
348 /* Check if the name is in cnames; if so, use
349 the corresponding code */
350 c
= tre_search_cnames(start
, re
- start
);
351 if (c
!= (wchar_t)-1)
354 goto process_single_character
;
356 #endif /* BSD_COMPATIBILITY */
357 /* Verify this is a known sequence */
358 if (__collate_equiv_value(ctx
->loc
, start
,
361 status
= REG_ECOLLATE
;
364 /* Process single character collating symbols */
369 goto process_single_character
;
371 /* Inverted MCCSs are undefined */
374 status
= REG_ECOLLATE
;
377 /* Can't have MCCSs as an endpoint to a range */
384 /* Switch into MCCS collection mode (if not
390 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
392 #else /* !TRE_DEBUG */
394 #endif /* !TRE_DEBUG */
395 /* Allocate a memory block the first time */
398 if ((col_syms
= xmalloc(sizeof(*col_syms
) *
399 (START_COLLATING_SYMBOLS
+ 2)))
403 n_col_syms
= START_COLLATING_SYMBOLS
;
405 /* Enlarge the memory block is more is needed */
406 if ((cp
- col_syms
) - 1 >= n_col_syms
)
409 tre_collating_symbol
*tmp
=
410 xrealloc(col_syms
, sizeof(*col_syms
) *
411 ((n_col_syms
*= 2) + 2));
417 DPRINT(("tre_list_collating_symbols: "
418 "Enlarging col_syms to %d\n",
421 cp
= col_syms
+ i
+ 1;
424 cp
->len
= re
- start
;
437 /* Process equivalence and character classes */
438 tre_char_t kind
= re
[1];
440 /* Can't have a class as an endpoint to a range */
446 if (!collect_MCCS
&& range
== 0)
448 status
= tre_new_item(ctx
->mem
, TRE_BRACKET_MATCH_TYPE_CHAR
,
450 if (status
!= REG_OK
)
460 status
= kind
== CHAR_EQUAL
? REG_ECOLLATE
: REG_ECTYPE
;
465 if (re
+ 1 >= re_end
)
467 status
= kind
== CHAR_EQUAL
? REG_ECOLLATE
:
472 if (re
[1] == CHAR_RBRACKET
)
476 /* Empty class name */
477 status
= kind
== CHAR_EQUAL
? REG_ECOLLATE
:
481 /* Process equivalence class */
482 if (kind
== CHAR_EQUAL
)
486 DPRINT(("tre_parse_bracket: equivalence: '%.*"
487 STRF
"'\n", REST(start
- 2)));
489 /* While we find the collation value even for
490 multi-character collating elements , we
491 don't (yet) match any collation values
492 against multi-character sequences. We'd have
493 to enumerate those multi-character sequences
494 and like multi-character collating symbols,
495 create a union of those sequences with the
496 rest of the bracket expression. While
497 doable, a bracket expression matching
498 multiple characters, that doesn't explicitly
499 contain multi-character sequences, might
500 be unexpected, so we punt for now. */
501 if ((equiv
= __collate_equiv_value(ctx
->loc
,
502 start
, re
- start
)) <= 0)
504 /* The standard says that if no collating
505 element if found, we use the collating
506 symbol itself. But __collate_equiv_value
507 doesn't make a distinction between
508 an element that is in a equvalence
509 class with others, or is the only member,
510 so we already know there is no collating
511 symbol. (Note that in the case of a
512 collating element whose collation value
513 is unique, matching against the
514 collating element itself, or against
515 its collation value, is equivalent.) */
516 #ifdef BSD_COMPATIBILITY
517 /* Check if the name is in cnames; if so,
518 use the corresponding code */
519 c
= tre_search_cnames(start
, re
- start
);
520 if (c
!= (wchar_t)-1)
523 goto process_single_character
;
525 #endif /* BSD_COMPATIBILITY */
526 status
= REG_ECOLLATE
;
531 status
= tre_new_item(ctx
->mem
,
532 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE
,
533 equiv
, &max_i
, items
);
534 if (status
!= REG_OK
)
540 /* Process character class */
541 DPRINT(("tre_parse_bracket: class: '%.*" STRF
542 "'\n", REST(start
- 2)));
547 int len
= MIN(re
- start
, 63);
550 tre_char_t tmp_wcs
[64];
551 wcsncpy(tmp_wcs
, start
, (size_t)len
);
552 tmp_wcs
[len
] = L
'\0';
553 #if defined HAVE_WCSRTOMBS
556 const tre_char_t
*src
= tmp_wcs
;
557 memset(&state
, '\0', sizeof(state
));
558 len
= wcsrtombs_l(tmp_str
, &src
,
559 sizeof(tmp_str
), &state
,
562 #elif defined HAVE_WCSTOMBS
563 len
= wcstombs(tmp_str
, tmp_wcs
, 63);
564 #endif /* defined HAVE_WCSTOMBS */
566 #else /* !TRE_WCHAR */
567 strncpy(tmp_str
, (const char*)start
, len
);
568 #endif /* !TRE_WCHAR */
570 DPRINT((" class name: %s\n", tmp_str
));
571 class = tre_ctype_l(tmp_str
, ctx
->loc
);
577 status
= tre_new_item(ctx
->mem
,
578 TRE_BRACKET_MATCH_TYPE_CLASS
,
579 class, &max_i
, items
);
580 if (status
!= REG_OK
)
594 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
596 goto process_single_character
;
602 /* A first right bracket */
605 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
614 DPRINT(("tre_parse_bracket: done: '%.*" STRF
"'\n",
618 /* Mark the character following the right bracket. Set len
619 * to whether there are other things besides the
620 * multi-character collating symbols */
621 col_syms
->start
= re
+ 1;
622 col_syms
->len
= other
;
623 /* Mark the end of the list */
629 /* range > 0 is not possible, since we did a lookahead after the
633 status
= tre_new_item(ctx
->mem
, TRE_BRACKET_MATCH_TYPE_CHAR
,
635 if (status
!= REG_OK
)
638 DPRINT(("tre_parse_bracket: done: '%.*" STRF
"'\n", REST(re
)));
644 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
646 process_single_character
:
647 /* Process single character */
653 /* Get collation equivalence values */
654 mine
= __collate_equiv_value(ctx
->loc
, &min
, 1);
655 maxe
= __collate_equiv_value(ctx
->loc
, &c
, 1);
663 status
= tre_new_item(ctx
->mem
,
664 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN
,
665 mine
, &max_i
, items
);
666 if (status
!= REG_OK
)
668 status
= tre_new_item(ctx
->mem
,
669 TRE_BRACKET_MATCH_TYPE_RANGE_END
,
670 maxe
, &max_i
, items
);
671 if (status
!= REG_OK
)
683 status
= tre_new_item(ctx
->mem
,
684 TRE_BRACKET_MATCH_TYPE_CHAR
,
686 if (status
!= REG_OK
)
699 DPRINT(("tre_parse_bracket: error: '%.*" STRF
"', status=%d\n",
707 static const char *bracket_match_type_str
[] = {
715 #endif /* TRE_DEBUG */
718 tre_parse_bracket(tre_parse_ctx_t
*ctx
, tre_ast_node_t
**result
)
720 tre_ast_node_t
*node
;
721 reg_errcode_t status
= REG_OK
;
722 tre_bracket_match_list_t
*items
;
724 tre_collating_symbol
*col_syms
= NULL
;
726 /* Handle special cases [[:<:]] and [[:>:]] */
727 if (ctx
->re_end
- ctx
->re
>= 6 && ctx
->re
[0] == CHAR_LBRACKET
728 && ctx
->re
[1] == CHAR_COLON
&& (ctx
->re
[2] == L
'<' || ctx
->re
[2] == L
'>')
729 && ctx
->re
[3] == CHAR_COLON
&& ctx
->re
[4] == CHAR_RBRACKET
730 && ctx
->re
[5] == CHAR_RBRACKET
)
732 *result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
733 (ctx
->re
[2] == L
'<') ? ASSERT_AT_BOW
: ASSERT_AT_EOW
,
735 DPRINT(("tre_parse_bracket: special case %s\n", (ctx
->re
[2] == L
'<') ?
736 "[[:<:]]" : "[[:>:]]"));
738 return *result
? REG_OK
: REG_ESPACE
;
741 /* Start off with an array of `max_i' elements. */
742 items
= xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i
));
746 if (*ctx
->re
== CHAR_CARET
)
748 DPRINT(("tre_parse_bracket: negate: '%.*" STRF
"'\n", REST(ctx
->re
)));
749 items
->flags
|= TRE_BRACKET_MATCH_FLAG_NEGATE
;
753 status
= tre_parse_bracket_items(ctx
, &items
, &max_i
, &col_syms
);
755 if (status
!= REG_OK
)
756 goto parse_bracket_done
;
758 /* If there are collating symbols, split off the multi-character ones
759 * into a union of the bracket expression (without the collating symbols)
760 * and the multiple-character sequences. We create an equivalent input
761 * string and run tre_parse() recursively */
764 tre_char_t
*str
, *sp
;
765 tre_collating_symbol
*cp
;
766 tre_parse_ctx_t subctx
;
768 /* Allocate a new string. We start with the size of the original
769 * bracket expression (minus 1) and add 2 (for a leading "[" and
770 * a trailing nil; don't need a "^", since it is illegal to have
771 * inverted MCCSs). Since a multi-character collating symbols
772 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
773 * need to worry about the new string getting too long. */
775 str
= xmalloc(sizeof(*str
) * ((col_syms
->start
- ctx
->re
) + 2));
782 if (col_syms
->len
> 0)
784 /* There are other items in the bracket expression besides the
785 * multi-character collating symbols, so create a new bracket
786 * expression with only those other itmes. */
787 const tre_char_t
*re
;
792 for (cp
= col_syms
+ 1; cp
->start
; cp
++)
794 /* The "- 2" is to account for the "[." */
795 if ((i
= ((cp
->start
- re
) - 2)) > 0)
797 memcpy(sp
, re
, sizeof(*sp
) * i
);
800 /* The "+ 2" is to account for the ".]" */
801 re
= cp
->start
+ cp
->len
+ 2;
803 i
= col_syms
->start
- re
; /* Includes the trailing right bracket */
804 memcpy(sp
, re
, sizeof(*sp
) * i
);
808 for (cp
= col_syms
+ 1; cp
->start
; cp
++)
810 memcpy(sp
, cp
->start
, sizeof(*sp
) * cp
->len
);
816 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
819 memcpy(&subctx
, ctx
, sizeof(subctx
));
821 subctx
.len
= sp
- str
;
822 subctx
.nofirstsub
= 1;
823 subctx
.cflags
|= REG_EXTENDED
; /* Force extended mode for parsing */
824 status
= tre_parse(&subctx
);
826 if (status
!= REG_OK
)
831 ctx
->re
= col_syms
->start
;
832 ctx
->position
= subctx
.position
;
834 *result
= subctx
.result
;
835 DPRINT(("tre_parse_bracket: Returning to original string\n"));
839 DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
840 node
= tre_ast_new_literal(ctx
->mem
, 0, TRE_CHAR_MAX
, ctx
->position
);
844 goto parse_bracket_done
;
848 tre_literal_t
*l
= node
->obj
;
849 l
->u
.bracket_match_list
= tre_mem_alloc(ctx
->mem
,
850 SIZEOF_BRACKET_MATCH_LIST(items
));
851 if (l
->u
.bracket_match_list
== NULL
)
854 goto parse_bracket_done
;
856 memcpy(l
->u
.bracket_match_list
, items
, SIZEOF_BRACKET_MATCH_LIST(items
));
862 tre_bracket_match_t
*b
;
863 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
864 items
->num_bracket_matches
, items
->flags
));
865 for (i
= 0, b
= items
->bracket_matches
;
866 i
< items
->num_bracket_matches
; i
++, b
++)
868 DPRINT((" %d: %s %d\n", i
, bracket_match_type_str
[b
->type
],
872 #endif /* TRE_DEBUG */
882 /* Parses a positive decimal integer. Returns -1 if the string does not
883 contain a valid number. */
885 tre_parse_int(const tre_char_t
**regex
, const tre_char_t
*regex_end
)
888 const tre_char_t
*r
= *regex
;
889 while (r
< regex_end
&& *r
>= L
'0' && *r
<= L
'9')
893 num
= num
* 10 + *r
- L
'0';
902 tre_parse_bound(tre_parse_ctx_t
*ctx
, tre_ast_node_t
**result
)
907 int cost_ins
, cost_del
, cost_subst
, cost_max
;
908 int limit_ins
, limit_del
, limit_subst
, limit_err
;
909 const tre_char_t
*start
;
910 #endif /* TRE_APPROX */
911 const tre_char_t
*r
= ctx
->re
;
912 int minimal
= (ctx
->cflags
& REG_UNGREEDY
) ? 1 : 0;
918 cost_ins
= cost_del
= cost_subst
= cost_max
= TRE_PARAM_UNSET
;
919 limit_ins
= limit_del
= limit_subst
= limit_err
= TRE_PARAM_UNSET
;
920 #endif /* TRE_APPROX */
922 /* Parse number (minimum repetition count). */
924 if (r
>= ctx
->re_end
)
925 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
926 return (ctx
->cflags
& REG_EXTENDED
) ? REG_NOMATCH
: REG_EBRACE
;
927 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
929 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
930 if (*r
>= L
'0' && *r
<= L
'9') {
931 DPRINT(("tre_parse: min count: '%.*" STRF
"'\n", REST(r
)));
932 min
= tre_parse_int(&r
, ctx
->re_end
);
936 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
937 /* For ERE, return REG_NOMATCH to signal that the lbrace should
938 be treated as a literal */
939 return (ctx
->cflags
& REG_EXTENDED
) ? REG_NOMATCH
: REG_BADBR
;
940 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
942 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
943 #endif /* !TRE_APPROX */
945 /* Parse comma and second number (maximum repetition count). */
947 if (r
< ctx
->re_end
&& *r
== CHAR_COMMA
)
950 DPRINT(("tre_parse: max count: '%.*" STRF
"'\n", REST(r
)));
951 max
= tre_parse_int(&r
, ctx
->re_end
);
954 /* Check that the repeat counts are sane. */
955 if ((max
>= 0 && min
> max
) || min
> RE_DUP_MAX
|| max
> RE_DUP_MAX
)
962 optionally followed immediately by a number == minimum repcount
963 optionally followed by , then a number == maximum repcount
964 + then a number == maximum insertion count
965 - then a number == maximum deletion count
966 # then a number == maximum substitution count
967 ~ then a number == maximum number of errors
968 Any of +, -, # or ~ without followed by a number means that
969 the maximum count/number of errors is infinite.
971 An equation of the form
973 can be specified to set costs and the cost limit to a value
974 different from the default value:
975 - X is the cost of an insertion
976 - Y is the cost of a deletion
977 - Z is the cost of a substitution
978 - C is the maximum cost
980 If no count limit or cost is set for an operation, the operation
981 is not allowed at all.
989 /* Parse count limit settings */
992 while (r
+ 1 < ctx
->re_end
&& !done
)
996 case CHAR_PLUS
: /* Insert limit */
997 DPRINT(("tre_parse: ins limit: '%.*" STRF
"'\n", REST(r
)));
999 limit_ins
= tre_parse_int(&r
, ctx
->re_end
);
1001 limit_ins
= INT_MAX
;
1004 case CHAR_MINUS
: /* Delete limit */
1005 DPRINT(("tre_parse: del limit: '%.*" STRF
"'\n", REST(r
)));
1007 limit_del
= tre_parse_int(&r
, ctx
->re_end
);
1009 limit_del
= INT_MAX
;
1012 case CHAR_HASH
: /* Substitute limit */
1013 DPRINT(("tre_parse: subst limit: '%.*" STRF
"'\n", REST(r
)));
1015 limit_subst
= tre_parse_int(&r
, ctx
->re_end
);
1016 if (limit_subst
< 0)
1017 limit_subst
= INT_MAX
;
1020 case CHAR_TILDE
: /* Maximum number of changes */
1021 DPRINT(("tre_parse: count limit: '%.*" STRF
"'\n", REST(r
)));
1023 limit_err
= tre_parse_int(&r
, ctx
->re_end
);
1025 limit_err
= INT_MAX
;
1043 /* Parse cost restriction equation. */
1046 while (r
+ 1 < ctx
->re_end
&& !done
)
1055 DPRINT(("tre_parse: max cost: '%.*" STRF
"'\n", REST(r
)));
1059 cost_max
= tre_parse_int(&r
, ctx
->re_end
);
1071 if (*r
>= L
'0' && *r
<= L
'9')
1074 const tre_char_t
*sr
= r
;
1075 #endif /* TRE_DEBUG */
1076 int cost
= tre_parse_int(&r
, ctx
->re_end
);
1077 /* XXX - make sure r is not past end. */
1080 case L
'i': /* Insert cost */
1081 DPRINT(("tre_parse: ins cost: '%.*" STRF
"'\n",
1087 case L
'd': /* Delete cost */
1088 DPRINT(("tre_parse: del cost: '%.*" STRF
"'\n",
1094 case L
's': /* Substitute cost */
1095 DPRINT(("tre_parse: subst cost: '%.*" STRF
"'\n",
1112 } while (start
!= r
);
1113 #endif /* TRE_APPROX */
1115 /*{*//* Missing }. */
1116 if (r
>= ctx
->re_end
)
1119 /* Empty contents of {}. */
1123 /* Parse the ending '}' or '\}'.*/
1124 if (ctx
->cflags
& REG_EXTENDED
)
1126 if (r
>= ctx
->re_end
|| *r
!= CHAR_RBRACE
)
1129 /* Parse trailing '?' marking minimal repetition. */
1130 if (r
< ctx
->re_end
)
1132 if (*r
== CHAR_QUESTIONMARK
)
1134 /* Process the question mark only in enhanced mode.
1135 Otherwise, the question mark is an error in ERE
1136 or a literal in BRE */
1137 if (ctx
->cflags
& REG_ENHANCED
)
1139 minimal
= !(ctx
->cflags
& REG_UNGREEDY
);
1142 else return REG_BADRPT
;
1144 else if (*r
== CHAR_STAR
|| *r
== CHAR_PLUS
)
1146 /* These are reserved for future extensions. */
1153 if (r
+ 1 >= ctx
->re_end
1154 || *r
!= CHAR_BACKSLASH
1155 || *(r
+ 1) != CHAR_RBRACE
)
1158 if (r
< ctx
->re_end
&& *r
== CHAR_STAR
)
1160 /* This is reserved for future extensions. */
1166 ctx
->num_reorder_tags
++;
1168 if (!result
) goto parse_bound_exit
;
1169 /* Create the AST node(s). */
1170 /* Originally, if min == 0 && max == 0, we immediately replace the whole
1171 iteration with EMPTY. This unfortunately drops any submatches, and
1172 messes up setting the pmatch values (we can get tags of -1, and
1173 tag values in the billions). So we leave it and process this case as
1174 usual, and wait until tre_expand_ast() to replace with EMPTY */
1176 if (min
< 0 && max
< 0)
1177 /* Only approximate parameters set, no repetitions. */
1179 #endif /* TRE_APPROX */
1181 *result
= tre_ast_new_iter(ctx
->mem
, *result
, min
, max
, minimal
);
1186 /* If approximate matching parameters are set, add them to the
1188 if (approx
|| costs_set
|| counts_set
)
1191 tre_iteration_t
*iter
= (*result
)->obj
;
1193 if (costs_set
|| counts_set
)
1195 if (limit_ins
== TRE_PARAM_UNSET
)
1197 if (cost_ins
== TRE_PARAM_UNSET
)
1200 limit_ins
= INT_MAX
;
1203 if (limit_del
== TRE_PARAM_UNSET
)
1205 if (cost_del
== TRE_PARAM_UNSET
)
1208 limit_del
= INT_MAX
;
1211 if (limit_subst
== TRE_PARAM_UNSET
)
1213 if (cost_subst
== TRE_PARAM_UNSET
)
1216 limit_subst
= INT_MAX
;
1220 if (cost_max
== TRE_PARAM_UNSET
)
1222 if (limit_err
== TRE_PARAM_UNSET
)
1223 limit_err
= INT_MAX
;
1225 ctx
->have_approx
= 1;
1226 params
= tre_mem_alloc(ctx
->mem
, sizeof(*params
) * TRE_PARAM_LAST
);
1229 for (i
= 0; i
< TRE_PARAM_LAST
; i
++)
1230 params
[i
] = TRE_PARAM_UNSET
;
1231 params
[TRE_PARAM_COST_INS
] = cost_ins
;
1232 params
[TRE_PARAM_COST_DEL
] = cost_del
;
1233 params
[TRE_PARAM_COST_SUBST
] = cost_subst
;
1234 params
[TRE_PARAM_COST_MAX
] = cost_max
;
1235 params
[TRE_PARAM_MAX_INS
] = limit_ins
;
1236 params
[TRE_PARAM_MAX_DEL
] = limit_del
;
1237 params
[TRE_PARAM_MAX_SUBST
] = limit_subst
;
1238 params
[TRE_PARAM_MAX_ERR
] = limit_err
;
1239 iter
->params
= params
;
1241 #endif /* TRE_APPROX */
1245 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1246 "limits [%d,%d,%d, total %d]\n",
1247 min
, max
, cost_ins
, cost_del
, cost_subst
, cost_max
,
1248 limit_ins
, limit_del
, limit_subst
, limit_err
));
1249 #else /* !TRE_APPROX */
1250 DPRINT(("tre_parse_bound: min %d, max %d\n", min
, max
));
1251 #endif /* !TRE_APPROX */
1258 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1259 non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1260 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1261 Because we now set up tags for even non-capturing parenthesized
1262 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
1263 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1264 have it restore cflags after the subexpression, we don't need to have
1265 a separate PARSE_RESTORE_CFLAGS, and then after processing the
1266 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1267 This has the side-benefit of now matching the perl behavior: the RE
1268 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1269 of foo AND (?i) (bar OR zap). */
1273 PARSE_MARK_FOR_SUBMATCH
,
1277 PARSE_POST_CATENATION
,
1281 } tre_parse_re_stack_symbol_t
;
1285 tre_parse(tre_parse_ctx_t
*ctx
)
1287 tre_ast_node_t
*result
= NULL
;
1288 tre_parse_re_stack_symbol_t symbol
;
1289 reg_errcode_t status
= REG_OK
;
1290 tre_stack_t
*stack
= ctx
->stack
;
1291 int bottom
= tre_stack_num_objects(stack
);
1293 int temporary_cflags
= 0;
1294 int bre_branch_begin
;
1296 const tre_char_t
*tmp_re
;
1299 DPRINT(("tre_parse: parsing '%.*" STRF
"', len = %d cflags = 0%o\n",
1300 ctx
->len
, ctx
->re
, ctx
->len
, ctx
->cflags
));
1302 if (ctx
->len
<= 0) return REG_EMPTY
;
1303 if (!ctx
->nofirstsub
)
1305 STACK_PUSH(stack
, int, ctx
->cflags
);
1306 STACK_PUSH(stack
, int, ctx
->submatch_id
);
1307 STACK_PUSH(stack
, int, PARSE_MARK_FOR_SUBMATCH
);
1310 STACK_PUSH(stack
, int, 0); // bre_branch_begin
1311 STACK_PUSH(stack
, int, PARSE_RE
);
1312 ctx
->re_start
= ctx
->re
;
1313 ctx
->re_end
= ctx
->re
+ ctx
->len
;
1316 /* The following is basically just a recursive descent parser. I use
1317 an explicit stack instead of recursive functions mostly because of
1318 two reasons: compatibility with systems which have an overflowable
1319 call stack, and efficiency (both in lines of code and speed). */
1320 while (tre_stack_num_objects(stack
) > bottom
)
1322 symbol
= tre_stack_pop_int(stack
);
1326 /* Parse a full regexp. A regexp is one or more branches,
1327 separated by the union operator `|'. */
1328 bre_branch_begin
= tre_stack_pop_int(stack
);
1331 !(ctx
->cflags
& REG_LITERAL
) &&
1332 #endif /* REG_LITERAL */
1333 ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
))
1334 STACK_PUSHX(stack
, int, PARSE_UNION
);
1335 STACK_PUSHX(stack
, int, bre_branch_begin
);
1336 STACK_PUSHX(stack
, int, PARSE_BRANCH
);
1340 /* Parse a branch. A branch is one or more pieces, concatenated.
1341 A piece is an atom possibly followed by a postfix operator. */
1342 bre_branch_begin
= tre_stack_pop_int(stack
);
1343 STACK_PUSHX(stack
, int, PARSE_CATENATION
);
1344 STACK_PUSHX(stack
, int, bre_branch_begin
);
1345 STACK_PUSHX(stack
, int, PARSE_PIECE
);
1349 /* Parse a piece. A piece is an atom possibly followed by one
1350 or more postfix operators. */
1351 bre_branch_begin
= tre_stack_pop_int(stack
);
1352 STACK_PUSHX(stack
, int, PARSE_POSTFIX
);
1353 STACK_PUSHX(stack
, int, bre_branch_begin
);
1354 STACK_PUSHX(stack
, int, PARSE_ATOM
);
1357 case PARSE_CATENATION
:
1358 /* If the expression has not ended, parse another piece. */
1361 if (ctx
->re
>= ctx
->re_end
)
1365 if (!(ctx
->cflags
& REG_LITERAL
))
1367 #endif /* REG_LITERAL */
1368 if ((ctx
->cflags
& REG_EXTENDED
&& c
== CHAR_PIPE
) ||
1369 ((ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)) == REG_ENHANCED
1370 && ctx
->re
+ 1 < ctx
->re_end
&& c
== CHAR_BACKSLASH
&&
1371 *(ctx
->re
+ 1) == CHAR_PIPE
))
1373 if ((ctx
->cflags
& REG_EXTENDED
1374 && c
== CHAR_RPAREN
&& depth
> 0)
1375 || (!(ctx
->cflags
& REG_EXTENDED
)
1376 && ctx
->re
+ 1 < ctx
->re_end
&& c
== CHAR_BACKSLASH
1377 && *(ctx
->re
+ 1) == CHAR_RPAREN
))
1379 if (!(ctx
->cflags
& REG_EXTENDED
) && depth
== 0)
1381 DPRINT(("tre_parse: group end: '%.*" STRF
"'\n",
1384 if (!(ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)))
1390 #endif /* REG_LITERAL */
1392 #ifdef REG_LEFT_ASSOC
1393 if (ctx
->cflags
& REG_LEFT_ASSOC
)
1395 /* Left associative concatenation. */
1396 STACK_PUSHX(stack
, int, PARSE_CATENATION
);
1397 STACK_PUSHX(stack
, voidptr
, result
);
1398 STACK_PUSHX(stack
, int, PARSE_POST_CATENATION
);
1399 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1400 STACK_PUSHX(stack
, int, PARSE_PIECE
);
1403 #endif /* REG_LEFT_ASSOC */
1405 /* Default case, right associative concatenation. */
1406 STACK_PUSHX(stack
, voidptr
, result
);
1407 STACK_PUSHX(stack
, int, PARSE_POST_CATENATION
);
1408 STACK_PUSHX(stack
, int, PARSE_CATENATION
);
1409 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1410 STACK_PUSHX(stack
, int, PARSE_PIECE
);
1415 case PARSE_POST_CATENATION
:
1417 tre_ast_node_t
*tree
= tre_stack_pop_voidptr(stack
);
1418 tre_ast_node_t
*tmp_node
;
1419 tmp_node
= tre_ast_new_catenation(ctx
->mem
, tree
, result
);
1427 if (ctx
->re
>= ctx
->re_end
)
1430 if (ctx
->cflags
& REG_LITERAL
)
1432 #endif /* REG_LITERAL */
1433 if (!(ctx
->cflags
& REG_EXTENDED
))
1435 if (*ctx
->re
!= CHAR_BACKSLASH
|| ctx
->re
+ 1 >= ctx
->re_end
)
1442 DPRINT(("tre_parse: union: '%.*" STRF
"'\n",
1444 STACK_PUSHX(stack
, int, PARSE_UNION
);
1445 STACK_PUSHX(stack
, voidptr
, (void *)ctx
->re
);
1446 STACK_PUSHX(stack
, voidptr
, result
);
1447 STACK_PUSHX(stack
, int, PARSE_POST_UNION
);
1448 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1449 indicate if this is the beginning of a BRE extended branch. */
1450 STACK_PUSHX(stack
, int, (ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)) == REG_ENHANCED
); // bre_branch_begin
1451 STACK_PUSHX(stack
, int, PARSE_BRANCH
);
1460 if (!(ctx
->cflags
& REG_EXTENDED
))
1466 case PARSE_POST_UNION
:
1468 tre_ast_node_t
*tmp_node
;
1469 tre_ast_node_t
*tree
= tre_stack_pop_voidptr(stack
);
1470 const tre_char_t
*pipechar
= tre_stack_pop_voidptr(stack
);
1471 /* error on empty expression at end of union */
1472 if (pipechar
== ctx
->re
- 1)
1476 tmp_node
= tre_ast_new_union(ctx
->mem
, tree
, result
);
1484 /* Parse postfix operators. */
1485 if (ctx
->re
>= ctx
->re_end
)
1488 if (ctx
->cflags
& REG_LITERAL
)
1490 #endif /* REG_LITERAL */
1491 int minimal
= (ctx
->cflags
& REG_UNGREEDY
) ? 1 : 0;
1500 case CHAR_QUESTIONMARK
:
1501 if (!(ctx
->cflags
& REG_EXTENDED
))
1506 tre_ast_node_t
*tmp_node
;
1508 const char *tstr
= "star";
1512 handle_plus_or_question
:
1513 /* error on iteration of raw assertion (not in subexpression) */
1514 if (result
->type
== LITERAL
&& result
->submatch_id
< 0 &&
1515 IS_ASSERTION((tre_literal_t
*)result
->obj
))
1517 if (!(ctx
->cflags
& REG_EXTENDED
)) break;
1520 if (*ctx
->re
== CHAR_PLUS
)
1527 if (*ctx
->re
== CHAR_QUESTIONMARK
)
1531 tstr
= "questionmark";
1535 if (ctx
->cflags
& REG_EXTENDED
)
1537 if (ctx
->re
+ 1 < ctx
->re_end
)
1539 if (*(ctx
->re
+ 1) == CHAR_QUESTIONMARK
)
1541 /* Process the question mark only in enhanced mode.
1542 Otherwise, the question mark is an error in ERE */
1543 if (ctx
->cflags
& REG_ENHANCED
)
1545 minimal
= !(ctx
->cflags
& REG_UNGREEDY
);
1548 else return REG_BADRPT
;
1550 else if (*(ctx
->re
+ 1) == CHAR_STAR
1551 || *(ctx
->re
+ 1) == CHAR_PLUS
)
1553 /* These are reserved for future extensions. */
1560 if (ctx
->re
+ 1 < ctx
->re_end
&& *(ctx
->re
+ 1) == CHAR_STAR
)
1562 /* This is reserved for future extensions. */
1565 if (ctx
->re
+ 2 < ctx
->re_end
)
1567 if (*(ctx
->re
+ 1) == CHAR_BACKSLASH
&& *(ctx
->re
+ 1) == CHAR_QUESTIONMARK
)
1569 /* Process the question mark only in enhanced mode.
1570 Otherwise, the question mark is a literal in BRE */
1571 if (ctx
->cflags
& REG_ENHANCED
)
1573 minimal
= !(ctx
->cflags
& REG_UNGREEDY
);
1577 else if (*(ctx
->re
+ 1) == CHAR_BACKSLASH
&& *(ctx
->re
+ 2) == CHAR_PLUS
)
1579 /* This is reserved for future extensions. */
1586 ctx
->num_reorder_tags
++;
1588 DPRINT(("tre_parse: %s %s: '%.*" STRF
"'\n",
1589 minimal
? " minimal" : "greedy", tstr
, REST(tmp_re
)));
1592 if (ctx
->cflags
& REG_EXTENDED
) return REG_BADRPT
;
1593 else goto parse_literal
;
1596 tmp_node
= tre_ast_new_iter(ctx
->mem
, result
, rep_min
, rep_max
,
1598 if (tmp_node
== NULL
)
1602 /* Set the iterator with a submatch id in the invisible range
1603 * (which will be overridden if a real submatch is needed) */
1604 result
->submatch_id
= ctx
->submatch_id_invisible
++;
1607 /* We don't allow multiple postfixes, but this might be needed
1608 to support approximate matching */
1609 STACK_PUSHX(stack
, int, PARSE_POSTFIX
);
1614 case CHAR_BACKSLASH
:
1615 /* "\{" is special without REG_EXTENDED */
1616 /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1617 if (!(ctx
->cflags
& REG_EXTENDED
)
1618 && ctx
->re
+ 1 < ctx
->re_end
)
1620 switch (*(ctx
->re
+ 1))
1629 case CHAR_QUESTIONMARK
:
1630 if (ctx
->cflags
& REG_ENHANCED
)
1636 goto handle_plus_or_question
;
1649 /* "{" is literal without REG_EXTENDED */
1650 if (!(ctx
->cflags
& REG_EXTENDED
))
1657 /* error on iteration of raw assertion (not in subexpression),
1658 but wait until after parsing bounds */
1659 raw_assertion
= (result
->type
== LITERAL
1660 && result
->submatch_id
< 0
1661 && IS_ASSERTION((tre_literal_t
*)result
->obj
));
1664 status
= tre_parse_bound(ctx
, &result
);
1665 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1666 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1667 is to be treated as a literal. */
1668 if (status
== REG_NOMATCH
)
1673 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1674 DPRINT(("tre_parse: bound: '%.*" STRF
"'\n",
1675 REST(ctx
->re
- lbrace_off
)));
1676 if (status
!= REG_OK
)
1678 if (raw_assertion
) return REG_BADRPT
;
1680 /* Set the iterator with a submatch id in the invisible range
1681 * (which will be overridden if a real submatch is needed) */
1682 if (result
->type
== ITERATION
)
1683 result
->submatch_id
= ctx
->submatch_id_invisible
++;
1686 /* We don't allow multiple postfixes, but this might be needed
1687 to support approximate matching */
1688 STACK_PUSHX(stack
, int, PARSE_POSTFIX
);
1697 /* Parse an atom. An atom is a regular expression enclosed in `()',
1698 an empty set of `()', a bracket expression, `.', `^', `$',
1699 a `\' followed by a character, or a single character. */
1701 /* The stack contains a boolean value, whether PARSE_ATOM is
1702 being called just after the start of a group (left paren)
1704 bre_branch_begin
= tre_stack_pop_int(stack
);
1706 /* End of regexp? (empty string). */
1707 if (ctx
->re
>= ctx
->re_end
)
1711 if (ctx
->cflags
& REG_LITERAL
)
1713 #endif /* REG_LITERAL */
1717 case CHAR_LPAREN
: /* parenthesized subexpression */
1719 /* Handle "(?...)" extensions. They work in a way similar
1720 to Perls corresponding extensions. */
1721 if ((ctx
->cflags
& (REG_EXTENDED
|REG_ENHANCED
)) ==
1722 (REG_EXTENDED
|REG_ENHANCED
)
1723 && *(ctx
->re
+ 1) == CHAR_QUESTIONMARK
)
1725 int new_cflags
= ctx
->cflags
;
1727 int invisible_submatch
= 0;
1728 DPRINT(("tre_parse: extension: '%.*" STRF
"'\n",
1731 while (/*CONSTCOND*/1)
1733 if (*ctx
->re
== L
'i')
1735 DPRINT(("tre_parse: icase: '%.*" STRF
"'\n",
1738 new_cflags
|= REG_ICASE
;
1740 new_cflags
&= ~REG_ICASE
;
1743 else if (*ctx
->re
== L
'n')
1745 DPRINT(("tre_parse: newline: '%.*" STRF
"'\n",
1748 new_cflags
|= REG_NEWLINE
;
1750 new_cflags
&= ~REG_NEWLINE
;
1753 #ifdef REG_LEFT_ASSOC
1754 else if (*ctx
->re
== L
'l')
1756 DPRINT(("tre_parse: left assoc: '%.*" STRF
"'\n",
1759 new_cflags
|= REG_LEFT_ASSOC
;
1761 new_cflags
&= ~REG_LEFT_ASSOC
;
1764 #endif /* REG_LEFT_ASSOC */
1766 else if (*ctx
->re
== L
'U')
1768 DPRINT(("tre_parse: ungreedy: '%.*" STRF
"'\n",
1771 new_cflags
|= REG_UNGREEDY
;
1773 new_cflags
&= ~REG_UNGREEDY
;
1776 #endif /* REG_UNGREEDY */
1777 else if (*ctx
->re
== CHAR_MINUS
)
1779 DPRINT(("tre_parse: turn off: '%.*" STRF
"'\n",
1784 else if (*ctx
->re
== CHAR_COLON
)
1786 DPRINT(("tre_parse: no group: '%.*" STRF
1787 "', (invisible submatch %d)\n",
1788 REST(ctx
->re
), ctx
->submatch_id_invisible
));
1791 invisible_submatch
= 1;
1794 else if (*ctx
->re
== CHAR_HASH
)
1796 DPRINT(("tre_parse: comment: '%.*" STRF
"'\n",
1798 /* A comment can contain any character except a
1799 right parenthesis */
1800 while (*ctx
->re
!= CHAR_RPAREN
1801 && ctx
->re
< ctx
->re_end
)
1803 if (*ctx
->re
== CHAR_RPAREN
&& ctx
->re
< ctx
->re_end
)
1811 else if (*ctx
->re
== CHAR_RPAREN
)
1820 /* Turn on the cflags changes for the rest of the
1822 if (invisible_submatch
)
1824 STACK_PUSHX(stack
, int, ctx
->cflags
);
1825 STACK_PUSHX(stack
, int, ctx
->submatch_id_invisible
);
1826 STACK_PUSHX(stack
, int, PARSE_MARK_FOR_SUBMATCH
);
1827 ctx
->submatch_id_invisible
++;
1828 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1829 STACK_PUSHX(stack
, int, PARSE_RE
);
1832 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1833 STACK_PUSHX(stack
, int, PARSE_ATOM
);
1835 ctx
->cflags
= new_cflags
;
1839 if (ctx
->cflags
& REG_EXTENDED
)
1842 DPRINT(("tre_parse: group begin: '%.*" STRF
1843 "', submatch %d\n", REST(ctx
->re
),
1846 /* First parse a whole RE, then mark the resulting tree
1848 STACK_PUSHX(stack
, int, ctx
->cflags
);
1849 STACK_PUSHX(stack
, int, ctx
->submatch_id
);
1850 STACK_PUSHX(stack
, int, PARSE_MARK_FOR_SUBMATCH
);
1851 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1852 indicate if this is the beginning of a BRE group. */
1853 STACK_PUSHX(stack
, int, !(ctx
->cflags
& REG_EXTENDED
));
1854 STACK_PUSHX(stack
, int, PARSE_RE
);
1862 case CHAR_RPAREN
: /* end of current subexpression */
1863 if (ctx
->cflags
& REG_EXTENDED
&& depth
> 0)
1865 parse_bre_rparen_empty
:
1866 if (!(ctx
->cflags
& REG_EXTENDED
) && depth
== 0)
1868 DPRINT(("tre_parse: empty: '%.*" STRF
"'\n",
1870 /* We were expecting an atom, but instead the current
1871 subexpression was closed. POSIX leaves the meaning of
1872 this to be implementation-defined. We interpret this as
1873 an empty expression (which matches an empty string). */
1874 result
= tre_ast_new_literal(ctx
->mem
, EMPTY
, -1, -1);
1877 if (!(ctx
->cflags
& REG_EXTENDED
))
1884 case CHAR_LBRACKET
: /* bracket expression */
1885 DPRINT(("tre_parse: bracket: '%.*" STRF
"'\n",
1888 status
= tre_parse_bracket(ctx
, &result
);
1889 if (status
!= REG_OK
)
1893 case CHAR_BACKSLASH
:
1894 /* Deal with "\(", "\)" or "\{" for BREs */
1895 if (!(ctx
->cflags
& REG_EXTENDED
)
1896 && ctx
->re
+ 1 < ctx
->re_end
)
1898 if (*(ctx
->re
+ 1) == CHAR_LPAREN
)
1901 goto parse_bre_lparen
;
1903 else if (*(ctx
->re
+ 1) == CHAR_RPAREN
)
1906 goto parse_bre_rparen_empty
;
1908 if (*(ctx
->re
+ 1) == CHAR_LBRACE
) goto parse_literal
;
1911 if (ctx
->re
+ 1 >= ctx
->re_end
)
1912 /* Trailing backslash. */
1915 if (!(ctx
->cflags
& REG_ENHANCED
))
1917 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF
"'\n", REST(ctx
->re
)));
1919 goto unenhanced_backslash
;
1922 /* If a macro is used, parse the expanded macro recursively. */
1925 tre_expand_macro(ctx
->re
+ 1, ctx
->re_end
,
1926 buf
, elementsof(buf
));
1929 tre_parse_ctx_t subctx
;
1930 memcpy(&subctx
, ctx
, sizeof(subctx
));
1932 subctx
.len
= tre_strlen(buf
);
1933 subctx
.nofirstsub
= 1;
1934 status
= tre_parse(&subctx
);
1935 if (status
!= REG_OK
)
1938 ctx
->position
= subctx
.position
;
1939 result
= subctx
.result
;
1945 if (*(ctx
->re
+ 1) == L
'Q')
1947 DPRINT(("tre_parse: tmp literal: '%.*" STRF
"'\n",
1949 ctx
->cflags
|= REG_LITERAL
;
1950 temporary_cflags
|= REG_LITERAL
;
1952 STACK_PUSHX(stack
, int, 0);
1953 STACK_PUSHX(stack
, int, PARSE_ATOM
);
1956 #endif /* REG_LITERAL */
1958 DPRINT(("tre_parse: bleep: '%.*" STRF
"'\n", REST(ctx
->re
)));
1963 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1968 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1969 ASSERT_AT_WB_NEG
, -1);
1973 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1978 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1984 if (ctx
->re
[0] != CHAR_LBRACE
&& ctx
->re
< ctx
->re_end
)
1986 /* 8 bit hex char. */
1987 char tmp
[3] = {0, 0, 0};
1989 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF
"'\n",
1990 REST(ctx
->re
- 2)));
1992 if (tre_isxdigit_l(ctx
->re
[0], ctx
->loc
) &&
1993 ctx
->re
< ctx
->re_end
)
1995 tmp
[0] = (char)ctx
->re
[0];
1998 if (tre_isxdigit_l(ctx
->re
[0], ctx
->loc
) &&
1999 ctx
->re
< ctx
->re_end
)
2001 tmp
[1] = (char)ctx
->re
[0];
2004 val
= strtol(tmp
, NULL
, 16);
2005 result
= tre_ast_new_literal(ctx
->mem
, (int)val
,
2006 (int)val
, ctx
->position
);
2010 else if (ctx
->re
< ctx
->re_end
)
2017 while (ctx
->re_end
- ctx
->re
>= 0)
2019 if (ctx
->re
[0] == CHAR_RBRACE
)
2021 if (tre_isxdigit_l(ctx
->re
[0], ctx
->loc
))
2023 tmp
[i
] = (char)ctx
->re
[0];
2032 val
= strtol(tmp
, NULL
, 16);
2033 result
= tre_ast_new_literal(ctx
->mem
, (int)val
, (int)val
,
2041 unenhanced_backslash
:
2042 if ((ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)) !=
2044 tre_isdigit_l(*ctx
->re
, ctx
->loc
) && *ctx
->re
!= L
'0')
2046 /* Back reference (only in BRE or enhanced). */
2047 int val
= *ctx
->re
- L
'0';
2048 DPRINT(("tre_parse: backref: '%.*" STRF
"'\n",
2049 REST(ctx
->re
- 1)));
2050 result
= tre_ast_new_literal(ctx
->mem
, BACKREF
, val
,
2055 /* Set the backref with a submatch id in the invisible
2056 * range (which will be overridden if a real submatch
2058 result
->submatch_id
= ctx
->submatch_id_invisible
++;
2061 ctx
->num_reorder_tags
++;
2062 ctx
->max_backref
= MAX(val
, ctx
->max_backref
);
2067 /* Escaped character. */
2068 DPRINT(("tre_parse: escaped: '%.*" STRF
"'\n",
2069 REST(ctx
->re
- 1)));
2070 result
= tre_ast_new_literal(ctx
->mem
, *ctx
->re
, *ctx
->re
,
2081 case CHAR_PERIOD
: /* the any-symbol */
2082 DPRINT(("tre_parse: any: '%.*" STRF
"'\n",
2084 if (ctx
->cflags
& REG_NEWLINE
)
2086 tre_ast_node_t
*tmp1
;
2087 tre_ast_node_t
*tmp2
;
2088 tmp1
= tre_ast_new_literal(ctx
->mem
, 0, L
'\n' - 1,
2092 tmp2
= tre_ast_new_literal(ctx
->mem
, L
'\n' + 1, TRE_CHAR_MAX
,
2096 result
= tre_ast_new_union(ctx
->mem
, tmp1
, tmp2
);
2103 result
= tre_ast_new_literal(ctx
->mem
, 0, TRE_CHAR_MAX
,
2112 case CHAR_CARET
: /* beginning of line assertion */
2113 /* '^' has a special meaning everywhere in EREs, at the
2114 beginning of the RE and after \( is BREs. It is also
2115 special in enhanced BREs at the beginning of each branches
2117 if (ctx
->cflags
& REG_EXTENDED
2119 || ctx
->re
== ctx
->re_start
)
2121 DPRINT(("tre_parse: BOL: '%.*" STRF
"'\n",
2123 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
2133 case CHAR_DOLLAR
: /* end of line assertion. */
2134 /* '$' is special everywhere in EREs, and in the end of the
2135 string and before \) is BREs. */
2136 if (ctx
->cflags
& REG_EXTENDED
2137 || (ctx
->re
+ 2 < ctx
->re_end
2138 && *(ctx
->re
+ 1) == CHAR_BACKSLASH
2139 && *(ctx
->re
+ 2) == CHAR_RPAREN
)
2140 || ctx
->re
+ 1 == ctx
->re_end
)
2142 DPRINT(("tre_parse: EOL: '%.*" STRF
"'\n",
2144 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
2157 if (temporary_cflags
&& ctx
->re
+ 1 < ctx
->re_end
2158 && *ctx
->re
== CHAR_BACKSLASH
&& *(ctx
->re
+ 1) == L
'E')
2160 DPRINT(("tre_parse: end tmps: '%.*" STRF
"'\n",
2162 ctx
->cflags
&= ~temporary_cflags
;
2163 temporary_cflags
= 0;
2165 if (ctx
->re
< ctx
->re_end
)
2167 STACK_PUSHX(stack
, int, 0);
2168 STACK_PUSHX(stack
, int, PARSE_ATOM
);
2172 result
= tre_ast_new_literal(ctx
->mem
, EMPTY
, -1, -1);
2173 if (!result
) return REG_ESPACE
;
2179 /* We are expecting an atom. If the subexpression (or the whole
2180 regexp ends here, we interpret it as an empty expression
2181 (which matches an empty string), which is an error.
2182 Iterations of an empty expression is also an error. */
2184 if (!(ctx
->cflags
& REG_LITERAL
))
2186 #endif /* REG_LITERAL */
2187 /* error on end of string */
2188 if (ctx
->re
>= ctx
->re_end
) return depth
> 0 ? REG_EPAREN
2190 /* error on unions and iterations of empty expressions */
2191 if (ctx
->cflags
& REG_EXTENDED
)
2193 if (ctx
->re
< ctx
->re_end
)
2195 if (*ctx
->re
== CHAR_PIPE
) return REG_EMPTY
;
2196 if (*ctx
->re
== CHAR_LBRACE
)
2200 /* We need to parse the bound first and return
2201 any error, before returning REG_BADRPT */
2202 status
= tre_parse_bound(ctx
, NULL
);
2203 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2204 /* For ERE, if REG_NOMATCH is returned, we
2205 treat the lbrace as a literal. */
2206 if (status
== REG_NOMATCH
)
2209 /* Drop down to literal-handling code */
2213 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2214 if (status
!= REG_OK
)
2217 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2219 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2221 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2223 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2224 if (*ctx
->re
== CHAR_STAR
2225 || *ctx
->re
== CHAR_PLUS
2226 || *ctx
->re
== CHAR_QUESTIONMARK
)
2232 else if (ctx
->re
+ 1 < ctx
->re_end
2233 && *ctx
->re
== CHAR_BACKSLASH
2234 && *(ctx
->re
+ 1) == CHAR_LBRACE
)
2237 goto empty_parse_bound
;
2241 #endif /* REG_LITERAL */
2243 DPRINT(("tre_parse: literal: '%.*" STRF
"'\n",
2245 /* Note that we can't use an tre_isalpha() test here, since there
2246 may be characters which are alphabetic but neither upper or
2248 if (ctx
->cflags
& REG_ICASE
2249 && (tre_isupper_l(*ctx
->re
, ctx
->loc
) ||
2250 tre_islower_l(*ctx
->re
, ctx
->loc
)))
2252 tre_ast_node_t
*tmp1
;
2253 tre_ast_node_t
*tmp2
;
2255 /* XXX - Can there be more than one opposite-case
2256 counterpoints for some character in some locale? Or
2257 more than two characters which all should be regarded
2258 the same character if case is ignored? If yes, there
2259 does not seem to be a portable way to detect it. I guess
2260 that at least for multi-character collating elements there
2261 could be several opposite-case counterpoints, but they
2262 cannot be supported portably anyway. */
2263 tmp1
= tre_ast_new_literal(ctx
->mem
,
2264 tre_toupper_l(*ctx
->re
, ctx
->loc
),
2265 tre_toupper_l(*ctx
->re
, ctx
->loc
),
2269 tmp2
= tre_ast_new_literal(ctx
->mem
,
2270 tre_tolower_l(*ctx
->re
, ctx
->loc
),
2271 tre_tolower_l(*ctx
->re
, ctx
->loc
),
2275 result
= tre_ast_new_union(ctx
->mem
, tmp1
, tmp2
);
2281 result
= tre_ast_new_literal(ctx
->mem
, *ctx
->re
, *ctx
->re
,
2293 case PARSE_MARK_FOR_SUBMATCH
:
2295 int submatch_id
= tre_stack_pop_int(stack
);
2297 ctx
->cflags
= tre_stack_pop_int(stack
); /* restore cflags */
2298 if (result
->submatch_id
>= 0 &&
2299 result
->submatch_id
< SUBMATCH_ID_INVISIBLE_START
)
2301 tre_ast_node_t
*n
, *tmp_node
;
2302 if (submatch_id
>= SUBMATCH_ID_INVISIBLE_START
)
2304 n
= tre_ast_new_literal(ctx
->mem
, EMPTY
, -1, -1);
2307 tmp_node
= tre_ast_new_catenation(ctx
->mem
, n
, result
);
2308 if (tmp_node
== NULL
)
2310 tmp_node
->num_submatches
= result
->num_submatches
;
2313 result
->submatch_id
= submatch_id
;
2314 if (submatch_id
< SUBMATCH_ID_INVISIBLE_START
)
2315 result
->num_submatches
++;
2325 /* Check for missing closing parentheses. */
2329 ctx
->result
= result
;