]> git.saurik.com Git - apple/libc.git/blame - regex/TRE/lib/tre-parse.c
Libc-1439.100.3.tar.gz
[apple/libc.git] / regex / TRE / lib / tre-parse.c
CommitLineData
ad3c9f2a
A
1/*
2 tre-parse.c - Regexp parser
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7*/
8
9/*
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
13*/
14
15
16#ifdef HAVE_CONFIG_H
17#include <config.h>
18#endif /* HAVE_CONFIG_H */
19#include <string.h>
20#include <assert.h>
21#include <limits.h>
22#include <stddef.h>
23
24#include "xmalloc.h"
25#include "tre-mem.h"
26#include "tre-ast.h"
27#include "tre-stack.h"
28#include "tre-parse.h"
29
30/* BSD compatibility:
31 Before looking up a collating symbol, check if the name matches in
32 the character names (cnames) array; if so, use the corresponding
33 character.
34
35 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
36 the implementation choice that for ERE, a non-numeric character following
37 a left brace that would normally be a bound, causes the left brace to be
38 literal. */
39#define BSD_COMPATIBILITY
40#ifdef BSD_COMPATIBILITY
41#include "cname.h"
42#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
43#endif /* BSD_COMPATIBILITY */
44
45/* Characters with special meanings in regexp syntax. */
46#define CHAR_PIPE L'|'
47#define CHAR_LPAREN L'('
48#define CHAR_RPAREN L')'
49#define CHAR_LBRACE L'{'
50#define CHAR_RBRACE L'}'
51#define CHAR_LBRACKET L'['
52#define CHAR_RBRACKET L']'
53#define CHAR_MINUS L'-'
54#define CHAR_STAR L'*'
55#define CHAR_QUESTIONMARK L'?'
56#define CHAR_PLUS L'+'
57#define CHAR_PERIOD L'.'
58#define CHAR_COLON L':'
59#define CHAR_EQUAL L'='
60#define CHAR_COMMA L','
61#define CHAR_CARET L'^'
62#define CHAR_DOLLAR L'$'
63#define CHAR_BACKSLASH L'\\'
64#define CHAR_HASH L'#'
65#define CHAR_TILDE L'~'
66
67
68/* Some macros for expanding \w, \s, etc. */
69static const struct tre_macro_struct {
70 const char c;
71 const char *expansion;
72} tre_macros[] =
73 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
74 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
75 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
76 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
77 { 0, NULL }
78 };
79
80
81/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
82 must have at least `len' items. Sets buf[0] to zero if the there
83 is no match in `tre_macros'. */
84static void
85tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
86 tre_char_t *buf, size_t buf_len)
87{
88 int i;
89
90 buf[0] = 0;
91 if (regex >= regex_end)
92 return;
93
94 for (i = 0; tre_macros[i].expansion; i++)
95 {
96 if (tre_macros[i].c == *regex)
97 {
98 unsigned int j;
99 DPRINT(("Expanding macro '%c' => '%s'\n",
100 tre_macros[i].c, tre_macros[i].expansion));
101 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
102 buf[j] = tre_macros[i].expansion[j];
103 buf[j] = 0;
104 break;
105 }
106 }
107}
108
109static reg_errcode_t
5f125488 110tre_new_item(tre_mem_t __unused mem, int type, int val, int *max_i,
ad3c9f2a
A
111 tre_bracket_match_list_t **items)
112{
113 reg_errcode_t status = REG_OK;
114 tre_bracket_match_list_t *array = *items;
115 int i = array->num_bracket_matches;
116 /* Allocate more space if necessary. */
117 if (i >= *max_i)
118 {
119 tre_bracket_match_list_t *new_items;
120 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
121 /* If the array is already 1024 items large, give up -- there's
122 probably an error in the regexp (e.g. not a '\0' terminated
123 string and missing ']') */
124 if (*max_i >= 1024)
125 return REG_ESPACE;
126 *max_i *= 2;
127 new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
128 if (new_items == NULL)
129 return REG_ESPACE;
130 *items = array = new_items;
131 }
132 array->bracket_matches[i].type = type;
133 array->bracket_matches[i].value = val;
134 array->num_bracket_matches++;
135 return status;
136}
137
138#ifndef TRE_USE_SYSTEM_WCTYPE
139
140/* isalnum() and the rest may be macros, so wrap them to functions. */
141int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
142int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
143
144#ifdef tre_isascii
145int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
146#else /* !tre_isascii */
147int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
148#endif /* !tre_isascii */
149
150#ifdef tre_isblank
151int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
152#else /* !tre_isblank */
153int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
154#endif /* !tre_isblank */
155
156int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
157int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
158int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
159int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
160int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
161int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
162int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
163int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
164int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
165
166struct {
167 char *name;
168 int (*func)(tre_cint_t);
169} tre_ctype_map[] = {
170 { "alnum", &tre_isalnum_func },
171 { "alpha", &tre_isalpha_func },
172#ifdef tre_isascii
173 { "ascii", &tre_isascii_func },
174#endif /* tre_isascii */
175#ifdef tre_isblank
176 { "blank", &tre_isblank_func },
177#endif /* tre_isblank */
178 { "cntrl", &tre_iscntrl_func },
179 { "digit", &tre_isdigit_func },
180 { "graph", &tre_isgraph_func },
181 { "lower", &tre_islower_func },
182 { "print", &tre_isprint_func },
183 { "punct", &tre_ispunct_func },
184 { "space", &tre_isspace_func },
185 { "upper", &tre_isupper_func },
186 { "xdigit", &tre_isxdigit_func },
187 { NULL, NULL}
188};
189
190tre_ctype_t tre_ctype(const char *name)
191{
192 int i;
193 for (i = 0; tre_ctype_map[i].name != NULL; i++)
194 {
195 if (strcmp(name, tre_ctype_map[i].name) == 0)
196 return tre_ctype_map[i].func;
197 }
198 return (tre_ctype_t)0;
199}
200#endif /* !TRE_USE_SYSTEM_WCTYPE */
201
202#define REST(re) (int)(ctx->re_end - (re)), (re)
203
204#define START_COLLATING_SYMBOLS 16
205#define MAX_COLLATING_SYMBOL_LEN 4
206
207typedef struct {
208 const tre_char_t *start;
209 int len;
210} tre_collating_symbol;
211
212#include <xlocale.h>
213
214int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
215
216#ifdef BSD_COMPATIBILITY
217static wchar_t
218tre_search_cnames(const wchar_t *name, size_t len)
219{
220 size_t low = 0;
221 size_t high = NCNAMES - 1;
222 size_t cur;
223 int cmp;
224
225 while(low <= high)
226 {
227 cur = (low + high) / 2;
228 cmp = wcsncmp(name, cnames[cur].name, len);
229 if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
230 if (cmp > 0) low = cur + 1;
231 else high = cur - 1;
232 }
233 return (wchar_t)-1;
234}
235#endif /* BSD_COMPATIBILITY */
236
237/* Scan the contents of a bracket expression, and create a
238 * tre_bracket_match_list_t encoding the bracket expression. If during
239 * the scan, multi-character collating symbols are detected, switch
240 * into a mode to collect those MCCSs into a tre_collating_symbol
241 * list and pass them back. tre_parse_bracket will use that to
242 * create a new string composed of a union of the bracket expression
243 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
244 * call tre_parse (recursive) to parse that new string (which will
245 * call tre_parse_bracket and tre_parse_bracket_items again. */
246static reg_errcode_t
247tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
248 int *items_size, tre_collating_symbol **result)
249{
250 const tre_char_t *re = ctx->re;
251 const tre_char_t *re_end = ctx->re_end;
252 tre_collating_symbol *col_syms = NULL;
253 tre_collating_symbol *cp = NULL;
254 int n_col_syms = 0;
255 reg_errcode_t status;
256 int max_i = *items_size;
257 int other = 0; /* contains content other than multi-character collating
258 * symbols */
259 int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
260 tre_cint_t min, c;
261 int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
262 int collect_MCCS = 0;
263 const tre_char_t *start;
264
265 for ( ;re < re_end; re++)
266 {
267 switch (*re)
268 {
269 case CHAR_MINUS:
270 /* A first hyphen */
271 if (re == ctx->re)
272 {
273 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
274 min = CHAR_MINUS;
275 other++;
276 range = 0;
277 break;
278 }
279 /* The hyphen is the end range */
280 if (range > 0)
281 {
282 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
283 c = CHAR_MINUS;
284 goto process_end_range;
285 }
286 if (re + 1 >= re_end)
287 {
288 status = REG_EBRACK;
289 goto error;
290 }
291 /* The hyphen is at the end */
292 if (re[1] == CHAR_RBRACKET)
293 {
294 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
295 c = CHAR_MINUS;
296 goto process_begin_range;
297 }
298 /* Two ranges are not allowed to share an endpoint, or begin
299 * range is illegal. */
300 if (range < 0)
301 {
302 status = REG_ERANGE;
303 goto error;
304 }
305 range = 1; /* Expect end range */
306 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
307 break;
308
309 case CHAR_LBRACKET:
310 if (re + 1 >= re_end)
311 {
312 status = REG_EBRACK;
313 goto error;
314 }
315 switch (re[1])
316 {
317 case CHAR_PERIOD:
318 {
319 re += 2;
320 start = re;
321 for (;; re++)
322 {
323 if (re >= re_end)
324 {
325 status = REG_ECOLLATE;
326 goto error;
327 }
328 if (*re == CHAR_PERIOD)
329 {
330 if (re + 1 >= re_end)
331 {
332 status = REG_ECOLLATE;
333 goto error;
334 }
335 /* Found end */
336 if (re[1] == CHAR_RBRACKET)
337 {
338 DPRINT(("tre_parse_bracket: collating "
339 "symbol: '%.*" STRF "'\n",
340 REST(start - 2)));
341 /* Empty name */
342 if (re == start)
343 {
344 status = REG_ECOLLATE;
345 goto error;
346 }
347#ifdef BSD_COMPATIBILITY
348 /* Check if the name is in cnames; if so, use
349 the corresponding code */
350 c = tre_search_cnames(start, re - start);
351 if (c != (wchar_t)-1)
352 {
353 re++;
354 goto process_single_character;
355 }
356#endif /* BSD_COMPATIBILITY */
357 /* Verify this is a known sequence */
358 if (__collate_equiv_value(ctx->loc, start,
359 re - start) <= 0)
360 {
361 status = REG_ECOLLATE;
362 goto error;
363 }
364 /* Process single character collating symbols */
365 if (re - start == 1)
366 {
367 c = *start;
368 re++;
369 goto process_single_character;
370 }
371 /* Inverted MCCSs are undefined */
372 if (invert)
373 {
374 status = REG_ECOLLATE;
375 goto error;
376 }
377 /* Can't have MCCSs as an endpoint to a range */
378 if (range > 0)
379 {
380 status = REG_ERANGE;
381 goto error;
382 }
383 range = -1;
384 /* Switch into MCCS collection mode (if not
385 * already there */
386#if TRE_DEBUG
387 if (!collect_MCCS)
388 {
389 collect_MCCS = 1;
390 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
391 }
392#else /* !TRE_DEBUG */
393 collect_MCCS = 1;
394#endif /* !TRE_DEBUG */
395 /* Allocate a memory block the first time */
396 if (!cp)
397 {
398 if ((col_syms = xmalloc(sizeof(*col_syms) *
399 (START_COLLATING_SYMBOLS + 2)))
400 == NULL)
401 return REG_ESPACE;
402 cp = col_syms + 1;
403 n_col_syms = START_COLLATING_SYMBOLS;
404 }
405 /* Enlarge the memory block is more is needed */
406 if ((cp - col_syms) - 1 >= n_col_syms)
407 {
408 int i = n_col_syms;
409 tre_collating_symbol *tmp =
410 xrealloc(col_syms, sizeof(*col_syms) *
411 ((n_col_syms *= 2) + 2));
412 if (tmp == NULL)
413 {
414 xfree(col_syms);
415 return REG_ESPACE;
416 }
417 DPRINT(("tre_list_collating_symbols: "
418 "Enlarging col_syms to %d\n",
419 n_col_syms));
420 col_syms = tmp;
421 cp = col_syms + i + 1;
422 }
423 cp->start = start;
424 cp->len = re - start;
425 cp++;
426 re++;
427 break;
428 }
429 }
430 }
431 break;
432 }
433
434 case CHAR_EQUAL:
435 case CHAR_COLON:
436 {
437 /* Process equivalence and character classes */
438 tre_char_t kind = re[1];
439
440 /* Can't have a class as an endpoint to a range */
441 if (range > 0)
442 {
443 status = REG_ERANGE;
444 goto error;
445 }
446 if (!collect_MCCS && range == 0)
447 {
448 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
449 min, &max_i, items);
450 if (status != REG_OK)
451 goto error;
452 }
453 range = -1;
454 re += 2;
455 start = re;
456 for (;; re++)
457 {
458 if (re >= re_end)
459 {
460 status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
461 goto error;
462 }
463 if (*re == kind)
464 {
465 if (re + 1 >= re_end)
466 {
467 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
468 REG_ECTYPE;
469 goto error;
470 }
471 /* Found end */
472 if (re[1] == CHAR_RBRACKET)
473 {
474 if (re == start)
475 {
476 /* Empty class name */
477 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
478 REG_ECTYPE;
479 goto error;
480 }
481 /* Process equivalence class */
482 if (kind == CHAR_EQUAL)
483 {
484 int equiv;
485
486 DPRINT(("tre_parse_bracket: equivalence: '%.*"
487 STRF "'\n", REST(start - 2)));
488
489 /* While we find the collation value even for
490 multi-character collating elements , we
491 don't (yet) match any collation values
492 against multi-character sequences. We'd have
493 to enumerate those multi-character sequences
494 and like multi-character collating symbols,
495 create a union of those sequences with the
496 rest of the bracket expression. While
497 doable, a bracket expression matching
498 multiple characters, that doesn't explicitly
499 contain multi-character sequences, might
500 be unexpected, so we punt for now. */
501 if ((equiv = __collate_equiv_value(ctx->loc,
502 start, re - start)) <= 0)
503 {
504 /* The standard says that if no collating
505 element if found, we use the collating
506 symbol itself. But __collate_equiv_value
507 doesn't make a distinction between
508 an element that is in a equvalence
509 class with others, or is the only member,
510 so we already know there is no collating
511 symbol. (Note that in the case of a
512 collating element whose collation value
513 is unique, matching against the
514 collating element itself, or against
515 its collation value, is equivalent.) */
516#ifdef BSD_COMPATIBILITY
517 /* Check if the name is in cnames; if so,
518 use the corresponding code */
519 c = tre_search_cnames(start, re - start);
520 if (c != (wchar_t)-1)
521 {
522 re++;
523 goto process_single_character;
524 }
525#endif /* BSD_COMPATIBILITY */
526 status = REG_ECOLLATE;
527 goto error;
528 }
529 if (!collect_MCCS)
530 {
531 status = tre_new_item(ctx->mem,
532 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
533 equiv, &max_i, items);
534 if (status != REG_OK)
535 goto error;
536 }
537 }
538 else
539 {
540 /* Process character class */
541 DPRINT(("tre_parse_bracket: class: '%.*" STRF
542 "'\n", REST(start - 2)));
543 if (!collect_MCCS)
544 {
545 char tmp_str[64];
546 tre_ctype_t class;
547 int len = MIN(re - start, 63);
548#ifdef TRE_WCHAR
549 {
550 tre_char_t tmp_wcs[64];
551 wcsncpy(tmp_wcs, start, (size_t)len);
552 tmp_wcs[len] = L'\0';
553#if defined HAVE_WCSRTOMBS
554 {
555 mbstate_t state;
556 const tre_char_t *src = tmp_wcs;
557 memset(&state, '\0', sizeof(state));
558 len = wcsrtombs_l(tmp_str, &src,
559 sizeof(tmp_str), &state,
560 ctx->loc);
561 }
562#elif defined HAVE_WCSTOMBS
563 len = wcstombs(tmp_str, tmp_wcs, 63);
564#endif /* defined HAVE_WCSTOMBS */
565 }
566#else /* !TRE_WCHAR */
567 strncpy(tmp_str, (const char*)start, len);
568#endif /* !TRE_WCHAR */
569 tmp_str[len] = '\0';
570 DPRINT((" class name: %s\n", tmp_str));
571 class = tre_ctype_l(tmp_str, ctx->loc);
572 if (!class)
573 {
574 status = REG_ECTYPE;
575 goto error;
576 }
577 status = tre_new_item(ctx->mem,
578 TRE_BRACKET_MATCH_TYPE_CLASS,
579 class, &max_i, items);
580 if (status != REG_OK)
581 goto error;
582 }
583 }
584 re++;
585 break;
586 }
587 }
588 }
589 other++;
590 break;
591 }
592
593 default:
594 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
595 c = CHAR_LBRACKET;
596 goto process_single_character;
597 break;
598 }
599 break;
600
601 case CHAR_RBRACKET:
602 /* A first right bracket */
603 if (re == ctx->re)
604 {
605 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
606 min = CHAR_RBRACKET;
607 range = 0;
608 other++;
609 break;
610 }
611 /* Done */
612 if (collect_MCCS)
613 {
614 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
615 REST(re)));
616 if (col_syms)
617 {
618 /* Mark the character following the right bracket. Set len
619 * to whether there are other things besides the
620 * multi-character collating symbols */
621 col_syms->start = re + 1;
622 col_syms->len = other;
623 /* Mark the end of the list */
624 cp->start = NULL;
625 }
626 *result = col_syms;
627 return REG_OK;
628 }
629 /* range > 0 is not possible, since we did a lookahead after the
630 * hyphen */
631 if (range == 0)
632 {
633 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
634 min, &max_i, items);
635 if (status != REG_OK)
636 goto error;
637 }
638 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
639 *items_size = max_i;
640 ctx->re = re + 1;
641 return REG_OK;
642
643 default:
644 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
645 c = *re;
646process_single_character:
647 /* Process single character */
648 if (range > 0)
649 {
650 int mine, maxe;
651
652process_end_range:
653 /* Get collation equivalence values */
654 mine = __collate_equiv_value(ctx->loc, &min, 1);
655 maxe = __collate_equiv_value(ctx->loc, &c, 1);
656 if (maxe < mine)
657 {
658 status = REG_ERANGE;
659 goto error;
660 }
661 if (!collect_MCCS)
662 {
663 status = tre_new_item(ctx->mem,
664 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
665 mine, &max_i, items);
666 if (status != REG_OK)
667 goto error;
668 status = tre_new_item(ctx->mem,
669 TRE_BRACKET_MATCH_TYPE_RANGE_END,
670 maxe, &max_i, items);
671 if (status != REG_OK)
672 goto error;
673 }
674 range = -1;
675 }
676 else
677 {
678process_begin_range:
679 if (!collect_MCCS)
680 {
681 if (range == 0)
682 {
683 status = tre_new_item(ctx->mem,
684 TRE_BRACKET_MATCH_TYPE_CHAR,
685 min, &max_i, items);
686 if (status != REG_OK)
687 goto error;
688 }
689 min = c;
690 }
691 range = 0;
692 }
693 other++;
694 break;
695 }
696 }
697 status = REG_EBRACK;
698error:
699 DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n",
700 REST(re), status));
701 if (col_syms)
702 xfree(col_syms);
703 return status;
704}
705
706#ifdef TRE_DEBUG
707static const char *bracket_match_type_str[] = {
708 "unused",
709 "char",
710 "range begin",
711 "range end",
712 "class",
713 "equivalence value",
714};
715#endif /* TRE_DEBUG */
716
717static reg_errcode_t
718tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
719{
2acb8998 720 tre_ast_node_t *node = NULL;
ad3c9f2a
A
721 reg_errcode_t status = REG_OK;
722 tre_bracket_match_list_t *items;
723 int max_i = 32;
724 tre_collating_symbol *col_syms = NULL;
725
726 /* Handle special cases [[:<:]] and [[:>:]] */
727 if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
728 && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
729 && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
730 && ctx->re[5] == CHAR_RBRACKET)
731 {
732 *result = tre_ast_new_literal(ctx->mem, ASSERTION,
733 (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
734 -1);
735 DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
736 "[[:<:]]" : "[[:>:]]"));
737 ctx->re += 6;
738 return *result ? REG_OK : REG_ESPACE;
739 }
740
741 /* Start off with an array of `max_i' elements. */
742 items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
743 if (items == NULL)
744 return REG_ESPACE;
745
746 if (*ctx->re == CHAR_CARET)
747 {
748 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
749 items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
750 ctx->re++;
751 }
752
753 status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
754
755 if (status != REG_OK)
756 goto parse_bracket_done;
757
758 /* If there are collating symbols, split off the multi-character ones
759 * into a union of the bracket expression (without the collating symbols)
760 * and the multiple-character sequences. We create an equivalent input
761 * string and run tre_parse() recursively */
762 if (col_syms)
763 {
764 tre_char_t *str, *sp;
765 tre_collating_symbol *cp;
766 tre_parse_ctx_t subctx;
767
768 /* Allocate a new string. We start with the size of the original
769 * bracket expression (minus 1) and add 2 (for a leading "[" and
770 * a trailing nil; don't need a "^", since it is illegal to have
771 * inverted MCCSs). Since a multi-character collating symbols
772 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
773 * need to worry about the new string getting too long. */
774 xfree(items);
775 str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
776 if (str == NULL)
777 {
778 xfree(col_syms);
779 return REG_ESPACE;
780 }
781 sp = str;
782 if (col_syms->len > 0)
783 {
784 /* There are other items in the bracket expression besides the
785 * multi-character collating symbols, so create a new bracket
786 * expression with only those other itmes. */
787 const tre_char_t *re;
788 ptrdiff_t i;
789
790 *sp++ = '[';
791 re = ctx->re;
792 for (cp = col_syms + 1; cp->start; cp++)
793 {
794 /* The "- 2" is to account for the "[." */
795 if ((i = ((cp->start - re) - 2)) > 0)
796 {
797 memcpy(sp, re, sizeof(*sp) * i);
798 sp += i;
799 }
800 /* The "+ 2" is to account for the ".]" */
801 re = cp->start + cp->len + 2;
802 }
803 i = col_syms->start - re; /* Includes the trailing right bracket */
804 memcpy(sp, re, sizeof(*sp) * i);
805 sp += i;
806 *sp++ = '|';
807 }
808 for (cp = col_syms + 1; cp->start; cp++)
809 {
810 memcpy(sp, cp->start, sizeof(*sp) * cp->len);
811 sp += cp->len;
812 if (cp[1].start)
813 *sp++ = '|';
814 }
815 *sp = 0;
816 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
817 str));
818
819 memcpy(&subctx, ctx, sizeof(subctx));
820 subctx.re = str;
821 subctx.len = sp - str;
822 subctx.nofirstsub = 1;
823 subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
824 status = tre_parse(&subctx);
825 xfree(str);
826 if (status != REG_OK)
827 {
828 xfree(col_syms);
829 return status;
830 }
831 ctx->re = col_syms->start;
832 ctx->position = subctx.position;
833 xfree(col_syms);
834 *result = subctx.result;
835 DPRINT(("tre_parse_bracket: Returning to original string\n"));
836 return REG_OK;
837 }
838
839 DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
840 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
841 if (node == NULL)
842 {
843 status = REG_ESPACE;
844 goto parse_bracket_done;
845 }
846 else
847 {
848 tre_literal_t *l = node->obj;
849 l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
850 SIZEOF_BRACKET_MATCH_LIST(items));
851 if (l->u.bracket_match_list == NULL)
852 {
853 status = REG_ESPACE;
854 goto parse_bracket_done;
855 }
856 memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
857 }
858
859#ifdef TRE_DEBUG
860 {
861 int i;
862 tre_bracket_match_t *b;
863 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
864 items->num_bracket_matches, items->flags));
865 for (i = 0, b = items->bracket_matches;
866 i < items->num_bracket_matches; i++, b++)
867 {
868 DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type],
869 b->value));
870 }
871 }
872#endif /* TRE_DEBUG */
873
874 parse_bracket_done:
875 xfree(items);
876 ctx->position++;
877 *result = node;
878 return status;
879}
880
881
882/* Parses a positive decimal integer. Returns -1 if the string does not
883 contain a valid number. */
884static int
885tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
886{
887 int num = -1;
888 const tre_char_t *r = *regex;
889 while (r < regex_end && *r >= L'0' && *r <= L'9')
890 {
891 if (num < 0)
892 num = 0;
893 num = num * 10 + *r - L'0';
894 r++;
895 }
896 *regex = r;
897 return num;
898}
899
900
901static reg_errcode_t
902tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
903{
904 int min, max;
905#ifdef TRE_APPROX
906 int i;
907 int cost_ins, cost_del, cost_subst, cost_max;
908 int limit_ins, limit_del, limit_subst, limit_err;
909 const tre_char_t *start;
910#endif /* TRE_APPROX */
911 const tre_char_t *r = ctx->re;
912 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
913#ifdef TRE_APPROX
914 int approx = 0;
915 int costs_set = 0;
916 int counts_set = 0;
917
918 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
919 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
920#endif /* TRE_APPROX */
921
922 /* Parse number (minimum repetition count). */
923 min = -1;
924 if (r >= ctx->re_end)
925#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
926 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
927#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
928 return REG_EBRACE;
929#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
930 if (*r >= L'0' && *r <= L'9') {
931 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
932 min = tre_parse_int(&r, ctx->re_end);
933 }
934#ifndef TRE_APPROX
935 else
936#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
937 /* For ERE, return REG_NOMATCH to signal that the lbrace should
938 be treated as a literal */
939 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
940#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
941 return REG_BADBR;
942#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
943#endif /* !TRE_APPROX */
944
945 /* Parse comma and second number (maximum repetition count). */
946 max = min;
947 if (r < ctx->re_end && *r == CHAR_COMMA)
948 {
949 r++;
950 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
951 max = tre_parse_int(&r, ctx->re_end);
952 }
953
954 /* Check that the repeat counts are sane. */
955 if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
956 return REG_BADBR;
957
958
959#ifdef TRE_APPROX
960 /*
961 '{'
962 optionally followed immediately by a number == minimum repcount
963 optionally followed by , then a number == maximum repcount
964 + then a number == maximum insertion count
965 - then a number == maximum deletion count
966 # then a number == maximum substitution count
967 ~ then a number == maximum number of errors
968 Any of +, -, # or ~ without followed by a number means that
969 the maximum count/number of errors is infinite.
970
971 An equation of the form
972 Xi + Yd + Zs < C
973 can be specified to set costs and the cost limit to a value
974 different from the default value:
975 - X is the cost of an insertion
976 - Y is the cost of a deletion
977 - Z is the cost of a substitution
978 - C is the maximum cost
979
980 If no count limit or cost is set for an operation, the operation
981 is not allowed at all.
982 */
983
984
985 do {
986 int done;
987 start = r;
988
989 /* Parse count limit settings */
990 done = 0;
991 if (!counts_set)
992 while (r + 1 < ctx->re_end && !done)
993 {
994 switch (*r)
995 {
996 case CHAR_PLUS: /* Insert limit */
997 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
998 r++;
999 limit_ins = tre_parse_int(&r, ctx->re_end);
1000 if (limit_ins < 0)
1001 limit_ins = INT_MAX;
1002 counts_set = 1;
1003 break;
1004 case CHAR_MINUS: /* Delete limit */
1005 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
1006 r++;
1007 limit_del = tre_parse_int(&r, ctx->re_end);
1008 if (limit_del < 0)
1009 limit_del = INT_MAX;
1010 counts_set = 1;
1011 break;
1012 case CHAR_HASH: /* Substitute limit */
1013 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1014 r++;
1015 limit_subst = tre_parse_int(&r, ctx->re_end);
1016 if (limit_subst < 0)
1017 limit_subst = INT_MAX;
1018 counts_set = 1;
1019 break;
1020 case CHAR_TILDE: /* Maximum number of changes */
1021 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1022 r++;
1023 limit_err = tre_parse_int(&r, ctx->re_end);
1024 if (limit_err < 0)
1025 limit_err = INT_MAX;
1026 approx = 1;
1027 break;
1028 case CHAR_COMMA:
1029 r++;
1030 break;
1031 case L' ':
1032 r++;
1033 break;
1034 case L'}':
1035 done = 1;
1036 break;
1037 default:
1038 done = 1;
1039 break;
1040 }
1041 }
1042
1043 /* Parse cost restriction equation. */
1044 done = 0;
1045 if (!costs_set)
1046 while (r + 1 < ctx->re_end && !done)
1047 {
1048 switch (*r)
1049 {
1050 case CHAR_PLUS:
1051 case L' ':
1052 r++;
1053 break;
1054 case L'<':
1055 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
1056 r++;
1057 while (*r == L' ')
1058 r++;
1059 cost_max = tre_parse_int(&r, ctx->re_end);
1060 if (cost_max < 0)
1061 cost_max = INT_MAX;
1062 else
1063 cost_max--;
1064 approx = 1;
1065 break;
1066 case CHAR_COMMA:
1067 r++;
1068 done = 1;
1069 break;
1070 default:
1071 if (*r >= L'0' && *r <= L'9')
1072 {
1073#ifdef TRE_DEBUG
1074 const tre_char_t *sr = r;
1075#endif /* TRE_DEBUG */
1076 int cost = tre_parse_int(&r, ctx->re_end);
1077 /* XXX - make sure r is not past end. */
1078 switch (*r)
1079 {
1080 case L'i': /* Insert cost */
1081 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
1082 REST(sr)));
1083 r++;
1084 cost_ins = cost;
1085 costs_set = 1;
1086 break;
1087 case L'd': /* Delete cost */
1088 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
1089 REST(sr)));
1090 r++;
1091 cost_del = cost;
1092 costs_set = 1;
1093 break;
1094 case L's': /* Substitute cost */
1095 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
1096 REST(sr)));
1097 r++;
1098 cost_subst = cost;
1099 costs_set = 1;
1100 break;
1101 default:
1102 return REG_BADBR;
1103 }
1104 }
1105 else
1106 {
1107 done = 1;
1108 break;
1109 }
1110 }
1111 }
1112 } while (start != r);
1113#endif /* TRE_APPROX */
1114
1115 /*{*//* Missing }. */
1116 if (r >= ctx->re_end)
1117 return REG_EBRACE;
1118
1119 /* Empty contents of {}. */
1120 if (r == ctx->re)
1121 return REG_BADBR;
1122
1123 /* Parse the ending '}' or '\}'.*/
1124 if (ctx->cflags & REG_EXTENDED)
1125 {
1126 if (r >= ctx->re_end || *r != CHAR_RBRACE)
1127 return REG_BADBR;
1128 r++;
1129 /* Parse trailing '?' marking minimal repetition. */
1130 if (r < ctx->re_end)
1131 {
1132 if (*r == CHAR_QUESTIONMARK)
1133 {
1134 /* Process the question mark only in enhanced mode.
1135 Otherwise, the question mark is an error in ERE
1136 or a literal in BRE */
1137 if (ctx->cflags & REG_ENHANCED)
1138 {
1139 minimal = !(ctx->cflags & REG_UNGREEDY);
1140 r++;
1141 }
1142 else return REG_BADRPT;
1143 }
1144 else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1145 {
1146 /* These are reserved for future extensions. */
1147 return REG_BADRPT;
1148 }
1149 }
1150 }
1151 else
1152 {
1153 if (r + 1 >= ctx->re_end
1154 || *r != CHAR_BACKSLASH
1155 || *(r + 1) != CHAR_RBRACE)
1156 return REG_BADBR;
1157 r += 2;
1158 if (r < ctx->re_end && *r == CHAR_STAR)
1159 {
1160 /* This is reserved for future extensions. */
1161 return REG_BADRPT;
1162 }
1163 }
1164
1165 if (minimal)
1166 ctx->num_reorder_tags++;
1167
1168 if (!result) goto parse_bound_exit;
1169 /* Create the AST node(s). */
1170 /* Originally, if min == 0 && max == 0, we immediately replace the whole
1171 iteration with EMPTY. This unfortunately drops any submatches, and
1172 messes up setting the pmatch values (we can get tags of -1, and
1173 tag values in the billions). So we leave it and process this case as
1174 usual, and wait until tre_expand_ast() to replace with EMPTY */
1175#ifdef TRE_APPROX
1176 if (min < 0 && max < 0)
1177 /* Only approximate parameters set, no repetitions. */
1178 min = max = 1;
1179#endif /* TRE_APPROX */
1180
1181 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
1182 if (!*result)
1183 return REG_ESPACE;
1184
1185#ifdef TRE_APPROX
1186 /* If approximate matching parameters are set, add them to the
1187 iteration node. */
1188 if (approx || costs_set || counts_set)
1189 {
1190 int *params;
1191 tre_iteration_t *iter = (*result)->obj;
1192
1193 if (costs_set || counts_set)
1194 {
1195 if (limit_ins == TRE_PARAM_UNSET)
1196 {
1197 if (cost_ins == TRE_PARAM_UNSET)
1198 limit_ins = 0;
1199 else
1200 limit_ins = INT_MAX;
1201 }
1202
1203 if (limit_del == TRE_PARAM_UNSET)
1204 {
1205 if (cost_del == TRE_PARAM_UNSET)
1206 limit_del = 0;
1207 else
1208 limit_del = INT_MAX;
1209 }
1210
1211 if (limit_subst == TRE_PARAM_UNSET)
1212 {
1213 if (cost_subst == TRE_PARAM_UNSET)
1214 limit_subst = 0;
1215 else
1216 limit_subst = INT_MAX;
1217 }
1218 }
1219
1220 if (cost_max == TRE_PARAM_UNSET)
1221 cost_max = INT_MAX;
1222 if (limit_err == TRE_PARAM_UNSET)
1223 limit_err = INT_MAX;
1224
1225 ctx->have_approx = 1;
1226 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
1227 if (!params)
1228 return REG_ESPACE;
1229 for (i = 0; i < TRE_PARAM_LAST; i++)
1230 params[i] = TRE_PARAM_UNSET;
1231 params[TRE_PARAM_COST_INS] = cost_ins;
1232 params[TRE_PARAM_COST_DEL] = cost_del;
1233 params[TRE_PARAM_COST_SUBST] = cost_subst;
1234 params[TRE_PARAM_COST_MAX] = cost_max;
1235 params[TRE_PARAM_MAX_INS] = limit_ins;
1236 params[TRE_PARAM_MAX_DEL] = limit_del;
1237 params[TRE_PARAM_MAX_SUBST] = limit_subst;
1238 params[TRE_PARAM_MAX_ERR] = limit_err;
1239 iter->params = params;
1240 }
1241#endif /* TRE_APPROX */
1242
1243parse_bound_exit:
1244#ifdef TRE_APPROX
1245 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1246 "limits [%d,%d,%d, total %d]\n",
1247 min, max, cost_ins, cost_del, cost_subst, cost_max,
1248 limit_ins, limit_del, limit_subst, limit_err));
1249#else /* !TRE_APPROX */
1250 DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1251#endif /* !TRE_APPROX */
1252
1253
1254 ctx->re = r;
1255 return REG_OK;
1256}
1257
1258/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1259 non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1260 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1261 Because we now set up tags for even non-capturing parenthesized
1262 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
1263 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1264 have it restore cflags after the subexpression, we don't need to have
1265 a separate PARSE_RESTORE_CFLAGS, and then after processing the
1266 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1267 This has the side-benefit of now matching the perl behavior: the RE
1268 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1269 of foo AND (?i) (bar OR zap). */
1270typedef enum {
1271 PARSE_RE = 0,
1272 PARSE_ATOM,
1273 PARSE_MARK_FOR_SUBMATCH,
1274 PARSE_BRANCH,
1275 PARSE_PIECE,
1276 PARSE_CATENATION,
1277 PARSE_POST_CATENATION,
1278 PARSE_UNION,
1279 PARSE_POST_UNION,
1280 PARSE_POSTFIX,
1281} tre_parse_re_stack_symbol_t;
1282
1283
1284reg_errcode_t
1285tre_parse(tre_parse_ctx_t *ctx)
1286{
1287 tre_ast_node_t *result = NULL;
1288 tre_parse_re_stack_symbol_t symbol;
1289 reg_errcode_t status = REG_OK;
1290 tre_stack_t *stack = ctx->stack;
1291 int bottom = tre_stack_num_objects(stack);
1292 int depth = 0;
1293 int temporary_cflags = 0;
1294 int bre_branch_begin;
1295#ifdef TRE_DEBUG
1296 const tre_char_t *tmp_re;
1297#endif
1298
1299 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1300 ctx->len, ctx->re, ctx->len, ctx->cflags));
1301
1302 if (ctx->len <= 0) return REG_EMPTY;
1303 if (!ctx->nofirstsub)
1304 {
1305 STACK_PUSH(stack, int, ctx->cflags);
1306 STACK_PUSH(stack, int, ctx->submatch_id);
1307 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1308 ctx->submatch_id++;
1309 }
1310 STACK_PUSH(stack, int, 0); // bre_branch_begin
1311 STACK_PUSH(stack, int, PARSE_RE);
1312 ctx->re_start = ctx->re;
1313 ctx->re_end = ctx->re + ctx->len;
1314
1315
1316 /* The following is basically just a recursive descent parser. I use
1317 an explicit stack instead of recursive functions mostly because of
1318 two reasons: compatibility with systems which have an overflowable
1319 call stack, and efficiency (both in lines of code and speed). */
1320 while (tre_stack_num_objects(stack) > bottom)
1321 {
1322 symbol = tre_stack_pop_int(stack);
1323 switch (symbol)
1324 {
1325 case PARSE_RE:
1326 /* Parse a full regexp. A regexp is one or more branches,
1327 separated by the union operator `|'. */
1328 bre_branch_begin = tre_stack_pop_int(stack);
1329 if (
1330#ifdef REG_LITERAL
1331 !(ctx->cflags & REG_LITERAL) &&
1332#endif /* REG_LITERAL */
1333 ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1334 STACK_PUSHX(stack, int, PARSE_UNION);
1335 STACK_PUSHX(stack, int, bre_branch_begin);
1336 STACK_PUSHX(stack, int, PARSE_BRANCH);
1337 break;
1338
1339 case PARSE_BRANCH:
1340 /* Parse a branch. A branch is one or more pieces, concatenated.
1341 A piece is an atom possibly followed by a postfix operator. */
1342 bre_branch_begin = tre_stack_pop_int(stack);
1343 STACK_PUSHX(stack, int, PARSE_CATENATION);
1344 STACK_PUSHX(stack, int, bre_branch_begin);
1345 STACK_PUSHX(stack, int, PARSE_PIECE);
1346 break;
1347
1348 case PARSE_PIECE:
1349 /* Parse a piece. A piece is an atom possibly followed by one
1350 or more postfix operators. */
1351 bre_branch_begin = tre_stack_pop_int(stack);
1352 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1353 STACK_PUSHX(stack, int, bre_branch_begin);
1354 STACK_PUSHX(stack, int, PARSE_ATOM);
1355 break;
1356
1357 case PARSE_CATENATION:
1358 /* If the expression has not ended, parse another piece. */
1359 {
1360 tre_char_t c;
1361 if (ctx->re >= ctx->re_end)
1362 break;
1363 c = *ctx->re;
1364#ifdef REG_LITERAL
1365 if (!(ctx->cflags & REG_LITERAL))
1366 {
1367#endif /* REG_LITERAL */
1368 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1369 ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1370 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1371 *(ctx->re + 1) == CHAR_PIPE))
1372 break;
1373 if ((ctx->cflags & REG_EXTENDED
1374 && c == CHAR_RPAREN && depth > 0)
1375 || (!(ctx->cflags & REG_EXTENDED)
1376 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1377 && *(ctx->re + 1) == CHAR_RPAREN))
1378 {
1379 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1380 return REG_EPAREN;
1381 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1382 REST(ctx->re)));
1383 depth--;
1384 if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1385 ctx->re += 2;
1386 break;
1387 }
1388#ifdef REG_LITERAL
1389 }
1390#endif /* REG_LITERAL */
1391
1392#ifdef REG_LEFT_ASSOC
1393 if (ctx->cflags & REG_LEFT_ASSOC)
1394 {
1395 /* Left associative concatenation. */
1396 STACK_PUSHX(stack, int, PARSE_CATENATION);
1397 STACK_PUSHX(stack, voidptr, result);
1398 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1399 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1400 STACK_PUSHX(stack, int, PARSE_PIECE);
1401 }
1402 else
1403#endif /* REG_LEFT_ASSOC */
1404 {
1405 /* Default case, right associative concatenation. */
1406 STACK_PUSHX(stack, voidptr, result);
1407 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1408 STACK_PUSHX(stack, int, PARSE_CATENATION);
1409 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1410 STACK_PUSHX(stack, int, PARSE_PIECE);
1411 }
1412 break;
1413 }
1414
1415 case PARSE_POST_CATENATION:
1416 {
1417 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1418 tre_ast_node_t *tmp_node;
1419 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1420 if (!tmp_node)
1421 return REG_ESPACE;
1422 result = tmp_node;
1423 break;
1424 }
1425
1426 case PARSE_UNION:
1427 if (ctx->re >= ctx->re_end)
1428 break;
1429#ifdef REG_LITERAL
1430 if (ctx->cflags & REG_LITERAL)
1431 break;
1432#endif /* REG_LITERAL */
1433 if (!(ctx->cflags & REG_EXTENDED))
1434 {
1435 if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1436 break;
1437 ctx->re++;
1438 }
1439 switch (*ctx->re)
1440 {
1441 case CHAR_PIPE:
1442 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1443 REST(ctx->re)));
1444 STACK_PUSHX(stack, int, PARSE_UNION);
1445 STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1446 STACK_PUSHX(stack, voidptr, result);
1447 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1448 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1449 indicate if this is the beginning of a BRE extended branch. */
1450 STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1451 STACK_PUSHX(stack, int, PARSE_BRANCH);
1452 ctx->re++;
1453 break;
1454
1455 case CHAR_RPAREN:
1456 ctx->re++;
1457 break;
1458
1459 default:
1460 if (!(ctx->cflags & REG_EXTENDED))
1461 ctx->re--;
1462 break;
1463 }
1464 break;
1465
1466 case PARSE_POST_UNION:
1467 {
1468 tre_ast_node_t *tmp_node;
1469 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1470 const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1471 /* error on empty expression at end of union */
1472 if (pipechar == ctx->re - 1)
1473 {
1474 return REG_EMPTY;
1475 }
1476 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1477 if (!tmp_node)
1478 return REG_ESPACE;
1479 result = tmp_node;
1480 break;
1481 }
1482
1483 case PARSE_POSTFIX:
1484 /* Parse postfix operators. */
1485 if (ctx->re >= ctx->re_end)
1486 break;
1487#ifdef REG_LITERAL
1488 if (ctx->cflags & REG_LITERAL)
1489 break;
1490#endif /* REG_LITERAL */
1491 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1492 int rep_min = 0;
1493 int rep_max = -1;
1494#ifdef TRE_DEBUG
1495 int lbrace_off;
1496#endif
1497 switch (*ctx->re)
1498 {
1499 case CHAR_PLUS:
1500 case CHAR_QUESTIONMARK:
1501 if (!(ctx->cflags & REG_EXTENDED))
1502 break;
1503 /*FALLTHROUGH*/
1504 case CHAR_STAR:
1505 {
1506 tre_ast_node_t *tmp_node;
1507#ifdef TRE_DEBUG
1508 const char *tstr = "star";
1509 tmp_re = ctx->re;
1510#endif
1511
1512 handle_plus_or_question:
1513 /* error on iteration of raw assertion (not in subexpression) */
1514 if (result->type == LITERAL && result->submatch_id < 0 &&
1515 IS_ASSERTION((tre_literal_t *)result->obj))
1516 {
1517 if (!(ctx->cflags & REG_EXTENDED)) break;
1518 return REG_BADRPT;
1519 }
1520 if (*ctx->re == CHAR_PLUS)
1521 {
1522 rep_min = 1;
1523#ifdef TRE_DEBUG
1524 tstr = "plus";
1525#endif
1526 }
1527 if (*ctx->re == CHAR_QUESTIONMARK)
1528 {
1529 rep_max = 1;
1530#ifdef TRE_DEBUG
1531 tstr = "questionmark";
1532#endif
1533 }
1534
1535 if (ctx->cflags & REG_EXTENDED)
1536 {
1537 if (ctx->re + 1 < ctx->re_end)
1538 {
1539 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1540 {
1541 /* Process the question mark only in enhanced mode.
1542 Otherwise, the question mark is an error in ERE */
1543 if (ctx->cflags & REG_ENHANCED)
1544 {
1545 minimal = !(ctx->cflags & REG_UNGREEDY);
1546 ctx->re++;
1547 }
1548 else return REG_BADRPT;
1549 }
1550 else if (*(ctx->re + 1) == CHAR_STAR
1551 || *(ctx->re + 1) == CHAR_PLUS)
1552 {
1553 /* These are reserved for future extensions. */
1554 return REG_BADRPT;
1555 }
1556 }
1557 }
1558 else
1559 {
1560 if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1561 {
1562 /* This is reserved for future extensions. */
1563 return REG_BADRPT;
1564 }
1565 if (ctx->re + 2 < ctx->re_end)
1566 {
1567 if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1568 {
1569 /* Process the question mark only in enhanced mode.
1570 Otherwise, the question mark is a literal in BRE */
1571 if (ctx->cflags & REG_ENHANCED)
1572 {
1573 minimal = !(ctx->cflags & REG_UNGREEDY);
1574 ctx->re += 2;
1575 }
1576 }
1577 else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1578 {
1579 /* This is reserved for future extensions. */
1580 return REG_BADRPT;
1581 }
1582 }
1583 }
1584
1585 if (minimal)
1586 ctx->num_reorder_tags++;
1587
1588 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1589 minimal ? " minimal" : "greedy", tstr, REST(tmp_re)));
1590 if (result == NULL)
1591 {
1592 if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1593 else goto parse_literal;
1594 }
1595 ctx->re++;
1596 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1597 minimal);
1598 if (tmp_node == NULL)
1599 return REG_ESPACE;
1600 result = tmp_node;
1601
1602 /* Set the iterator with a submatch id in the invisible range
1603 * (which will be overridden if a real submatch is needed) */
1604 result->submatch_id = ctx->submatch_id_invisible++;
1605
1606#if 0
1607 /* We don't allow multiple postfixes, but this might be needed
1608 to support approximate matching */
1609 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1610#endif
1611 }
1612 break;
1613
1614 case CHAR_BACKSLASH:
1615 /* "\{" is special without REG_EXTENDED */
1616 /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1617 if (!(ctx->cflags & REG_EXTENDED)
1618 && ctx->re + 1 < ctx->re_end)
1619 {
1620 switch (*(ctx->re + 1))
1621 {
1622 case CHAR_LBRACE:
1623 ctx->re++;
1624#ifdef TRE_DEBUG
1625 lbrace_off = 2;
1626#endif
1627 goto parse_brace;
1628 case CHAR_PLUS:
1629 case CHAR_QUESTIONMARK:
1630 if (ctx->cflags & REG_ENHANCED)
1631 {
1632#ifdef TRE_DEBUG
1633 tmp_re = ctx->re;
1634#endif
1635 ctx->re++;
1636 goto handle_plus_or_question;
1637 }
1638 break;
1639 }
1640 break;
1641 }
1642 else
1643 break;
1644
1645 case CHAR_LBRACE:
1646 {
1647 int raw_assertion;
1648
1649 /* "{" is literal without REG_EXTENDED */
1650 if (!(ctx->cflags & REG_EXTENDED))
1651 break;
1652#ifdef TRE_DEBUG
1653 lbrace_off = 1;
1654#endif
1655
1656 parse_brace:
1657 /* error on iteration of raw assertion (not in subexpression),
1658 but wait until after parsing bounds */
1659 raw_assertion = (result->type == LITERAL
1660 && result->submatch_id < 0
1661 && IS_ASSERTION((tre_literal_t *)result->obj));
1662 ctx->re++;
1663
1664 status = tre_parse_bound(ctx, &result);
1665#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1666 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1667 is to be treated as a literal. */
1668 if (status == REG_NOMATCH)
1669 {
1670 ctx->re--;
1671 break;
1672 }
1673#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1674 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1675 REST(ctx->re - lbrace_off)));
1676 if (status != REG_OK)
1677 return status;
1678 if (raw_assertion) return REG_BADRPT;
1679
1680 /* Set the iterator with a submatch id in the invisible range
1681 * (which will be overridden if a real submatch is needed) */
1682 if (result->type == ITERATION)
1683 result->submatch_id = ctx->submatch_id_invisible++;
1684
1685#if 0
1686 /* We don't allow multiple postfixes, but this might be needed
1687 to support approximate matching */
1688 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1689#endif
1690 break;
1691 }
1692 }
1693 break;
1694
1695 case PARSE_ATOM:
1696 {
1697 /* Parse an atom. An atom is a regular expression enclosed in `()',
1698 an empty set of `()', a bracket expression, `.', `^', `$',
1699 a `\' followed by a character, or a single character. */
1700
1701 /* The stack contains a boolean value, whether PARSE_ATOM is
1702 being called just after the start of a group (left paren)
1703 in a BRE */
1704 bre_branch_begin = tre_stack_pop_int(stack);
1705
1706 /* End of regexp? (empty string). */
1707 if (ctx->re >= ctx->re_end)
1708 goto parse_literal;
1709
1710#ifdef REG_LITERAL
1711 if (ctx->cflags & REG_LITERAL)
1712 goto parse_literal;
1713#endif /* REG_LITERAL */
1714
1715 switch (*ctx->re)
1716 {
1717 case CHAR_LPAREN: /* parenthesized subexpression */
1718
1719 /* Handle "(?...)" extensions. They work in a way similar
1720 to Perls corresponding extensions. */
1721 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1722 (REG_EXTENDED|REG_ENHANCED)
1723 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1724 {
1725 int new_cflags = ctx->cflags;
1726 int bit = 1;
1727 int invisible_submatch = 0;
1728 DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1729 REST(ctx->re)));
1730 ctx->re += 2;
1731 while (/*CONSTCOND*/1)
1732 {
1733 if (*ctx->re == L'i')
1734 {
1735 DPRINT(("tre_parse: icase: '%.*" STRF "'\n",
1736 REST(ctx->re)));
1737 if (bit)
1738 new_cflags |= REG_ICASE;
1739 else
1740 new_cflags &= ~REG_ICASE;
1741 ctx->re++;
1742 }
1743 else if (*ctx->re == L'n')
1744 {
1745 DPRINT(("tre_parse: newline: '%.*" STRF "'\n",
1746 REST(ctx->re)));
1747 if (bit)
1748 new_cflags |= REG_NEWLINE;
1749 else
1750 new_cflags &= ~REG_NEWLINE;
1751 ctx->re++;
1752 }
1753#ifdef REG_LEFT_ASSOC
1754 else if (*ctx->re == L'l')
1755 {
1756 DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1757 REST(ctx->re)));
1758 if (bit)
1759 new_cflags |= REG_LEFT_ASSOC;
1760 else
1761 new_cflags &= ~REG_LEFT_ASSOC;
1762 ctx->re++;
1763 }
1764#endif /* REG_LEFT_ASSOC */
1765#ifdef REG_UNGREEDY
1766 else if (*ctx->re == L'U')
1767 {
1768 DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n",
1769 REST(ctx->re)));
1770 if (bit)
1771 new_cflags |= REG_UNGREEDY;
1772 else
1773 new_cflags &= ~REG_UNGREEDY;
1774 ctx->re++;
1775 }
1776#endif /* REG_UNGREEDY */
1777 else if (*ctx->re == CHAR_MINUS)
1778 {
1779 DPRINT(("tre_parse: turn off: '%.*" STRF "'\n",
1780 REST(ctx->re)));
1781 ctx->re++;
1782 bit = 0;
1783 }
1784 else if (*ctx->re == CHAR_COLON)
1785 {
1786 DPRINT(("tre_parse: no group: '%.*" STRF
1787 "', (invisible submatch %d)\n",
1788 REST(ctx->re), ctx->submatch_id_invisible));
1789 ctx->re++;
1790 depth++;
1791 invisible_submatch = 1;
1792 break;
1793 }
1794 else if (*ctx->re == CHAR_HASH)
1795 {
1796 DPRINT(("tre_parse: comment: '%.*" STRF "'\n",
1797 REST(ctx->re)));
1798 /* A comment can contain any character except a
1799 right parenthesis */
1800 while (*ctx->re != CHAR_RPAREN
1801 && ctx->re < ctx->re_end)
1802 ctx->re++;
1803 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1804 {
1805 ctx->re++;
1806 break;
1807 }
1808 else
1809 return REG_BADPAT;
1810 }
1811 else if (*ctx->re == CHAR_RPAREN)
1812 {
1813 ctx->re++;
1814 break;
1815 }
1816 else
1817 return REG_BADRPT;
1818 }
1819
1820 /* Turn on the cflags changes for the rest of the
1821 enclosing group. */
1822 if (invisible_submatch)
1823 {
1824 STACK_PUSHX(stack, int, ctx->cflags);
1825 STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1826 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1827 ctx->submatch_id_invisible++;
1828 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1829 STACK_PUSHX(stack, int, PARSE_RE);
1830 }
1831 else {
1832 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1833 STACK_PUSHX(stack, int, PARSE_ATOM);
1834 }
1835 ctx->cflags = new_cflags;
1836 break;
1837 }
1838
1839 if (ctx->cflags & REG_EXTENDED)
1840 {
1841 parse_bre_lparen:
1842 DPRINT(("tre_parse: group begin: '%.*" STRF
1843 "', submatch %d\n", REST(ctx->re),
1844 ctx->submatch_id));
1845 ctx->re++;
1846 /* First parse a whole RE, then mark the resulting tree
1847 for submatching. */
1848 STACK_PUSHX(stack, int, ctx->cflags);
1849 STACK_PUSHX(stack, int, ctx->submatch_id);
1850 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1851 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1852 indicate if this is the beginning of a BRE group. */
1853 STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1854 STACK_PUSHX(stack, int, PARSE_RE);
1855 ctx->submatch_id++;
1856 depth++;
1857 }
1858 else
1859 goto parse_literal;
1860 break;
1861
1862 case CHAR_RPAREN: /* end of current subexpression */
1863 if (ctx->cflags & REG_EXTENDED && depth > 0)
1864 {
1865 parse_bre_rparen_empty:
1866 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1867 return REG_EPAREN;
1868 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1869 REST(ctx->re)));
1870 /* We were expecting an atom, but instead the current
1871 subexpression was closed. POSIX leaves the meaning of
1872 this to be implementation-defined. We interpret this as
1873 an empty expression (which matches an empty string). */
1874 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1875 if (result == NULL)
1876 return REG_ESPACE;
1877 if (!(ctx->cflags & REG_EXTENDED))
1878 ctx->re--;
1879 }
1880 else
1881 goto parse_literal;
1882 break;
1883
1884 case CHAR_LBRACKET: /* bracket expression */
1885 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1886 REST(ctx->re)));
1887 ctx->re++;
1888 status = tre_parse_bracket(ctx, &result);
1889 if (status != REG_OK)
1890 return status;
1891 break;
1892
1893 case CHAR_BACKSLASH:
1894 /* Deal with "\(", "\)" or "\{" for BREs */
1895 if (!(ctx->cflags & REG_EXTENDED)
1896 && ctx->re + 1 < ctx->re_end)
1897 {
1898 if (*(ctx->re + 1) == CHAR_LPAREN)
1899 {
1900 ctx->re++;
1901 goto parse_bre_lparen;
1902 }
1903 else if (*(ctx->re + 1) == CHAR_RPAREN)
1904 {
1905 ctx->re++;
1906 goto parse_bre_rparen_empty;
1907 }
1908 if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1909 }
1910
1911 if (ctx->re + 1 >= ctx->re_end)
1912 /* Trailing backslash. */
1913 return REG_EESCAPE;
1914
1915 if (!(ctx->cflags & REG_ENHANCED))
1916 {
1917 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1918 ctx->re++;
1919 goto unenhanced_backslash;
1920 }
1921
1922 /* If a macro is used, parse the expanded macro recursively. */
1923 {
1924 tre_char_t buf[64];
1925 tre_expand_macro(ctx->re + 1, ctx->re_end,
1926 buf, elementsof(buf));
1927 if (buf[0] != 0)
1928 {
1929 tre_parse_ctx_t subctx;
1930 memcpy(&subctx, ctx, sizeof(subctx));
1931 subctx.re = buf;
1932 subctx.len = tre_strlen(buf);
1933 subctx.nofirstsub = 1;
1934 status = tre_parse(&subctx);
1935 if (status != REG_OK)
1936 return status;
1937 ctx->re += 2;
1938 ctx->position = subctx.position;
1939 result = subctx.result;
1940 break;
1941 }
1942 }
1943
1944#ifdef REG_LITERAL
1945 if (*(ctx->re + 1) == L'Q')
1946 {
1947 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1948 REST(ctx->re)));
1949 ctx->cflags |= REG_LITERAL;
1950 temporary_cflags |= REG_LITERAL;
1951 ctx->re += 2;
1952 STACK_PUSHX(stack, int, 0);
1953 STACK_PUSHX(stack, int, PARSE_ATOM);
1954 break;
1955 }
1956#endif /* REG_LITERAL */
1957
1958 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1959 ctx->re++;
1960 switch (*ctx->re)
1961 {
1962 case L'b':
1963 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1964 ASSERT_AT_WB, -1);
1965 ctx->re++;
1966 break;
1967 case L'B':
1968 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1969 ASSERT_AT_WB_NEG, -1);
1970 ctx->re++;
1971 break;
1972 case L'<':
1973 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1974 ASSERT_AT_BOW, -1);
1975 ctx->re++;
1976 break;
1977 case L'>':
1978 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1979 ASSERT_AT_EOW, -1);
1980 ctx->re++;
1981 break;
1982 case L'x':
1983 ctx->re++;
1984 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1985 {
1986 /* 8 bit hex char. */
1987 char tmp[3] = {0, 0, 0};
1988 long val;
1989 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1990 REST(ctx->re - 2)));
1991
1992 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1993 ctx->re < ctx->re_end)
1994 {
1995 tmp[0] = (char)ctx->re[0];
1996 ctx->re++;
1997 }
1998 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1999 ctx->re < ctx->re_end)
2000 {
2001 tmp[1] = (char)ctx->re[0];
2002 ctx->re++;
2003 }
2004 val = strtol(tmp, NULL, 16);
2005 result = tre_ast_new_literal(ctx->mem, (int)val,
2006 (int)val, ctx->position);
2007 ctx->position++;
2008 break;
2009 }
2010 else if (ctx->re < ctx->re_end)
2011 {
2012 /* Wide char. */
2013 char tmp[32];
2014 long val;
2015 int i = 0;
2016 ctx->re++;
2017 while (ctx->re_end - ctx->re >= 0)
2018 {
2acb8998
A
2019 if (i == sizeof(tmp))
2020 return REG_EBRACE;
ad3c9f2a
A
2021 if (ctx->re[0] == CHAR_RBRACE)
2022 break;
2023 if (tre_isxdigit_l(ctx->re[0], ctx->loc))
2024 {
2025 tmp[i] = (char)ctx->re[0];
2026 i++;
2027 ctx->re++;
2028 continue;
2029 }
2030 return REG_EBRACE;
2031 }
2032 ctx->re++;
2033 tmp[i] = 0;
2034 val = strtol(tmp, NULL, 16);
2035 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
2036 ctx->position);
2037 ctx->position++;
2038 break;
2039 }
2040 /*FALLTHROUGH*/
2041
2042 default:
2043 unenhanced_backslash:
2044 if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2045 REG_EXTENDED &&
2046 tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
2047 {
2048 /* Back reference (only in BRE or enhanced). */
2049 int val = *ctx->re - L'0';
2050 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
2051 REST(ctx->re - 1)));
2052 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
2053 ctx->position);
2054 if (result == NULL)
2055 return REG_ESPACE;
2056
2057 /* Set the backref with a submatch id in the invisible
2058 * range (which will be overridden if a real submatch
2059 * is needed) */
2060 result->submatch_id = ctx->submatch_id_invisible++;
2061
2062 ctx->position++;
2063 ctx->num_reorder_tags++;
2064 ctx->max_backref = MAX(val, ctx->max_backref);
2065 ctx->re++;
2066 }
2067 else
2068 {
2069 /* Escaped character. */
2070 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
2071 REST(ctx->re - 1)));
2072 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2073 ctx->position);
2074 ctx->position++;
2075 ctx->re++;
2076 }
2077 break;
2078 }
2079 if (result == NULL)
2080 return REG_ESPACE;
2081 break;
2082
2083 case CHAR_PERIOD: /* the any-symbol */
2084 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
2085 REST(ctx->re)));
2086 if (ctx->cflags & REG_NEWLINE)
2087 {
2088 tre_ast_node_t *tmp1;
2089 tre_ast_node_t *tmp2;
2090 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
2091 ctx->position);
2092 if (!tmp1)
2093 return REG_ESPACE;
2094 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2095 ctx->position + 1);
2096 if (!tmp2)
2097 return REG_ESPACE;
2098 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2099 if (!result)
2100 return REG_ESPACE;
2101 ctx->position += 2;
2102 }
2103 else
2104 {
2105 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2106 ctx->position);
2107 if (!result)
2108 return REG_ESPACE;
2109 ctx->position++;
2110 }
2111 ctx->re++;
2112 break;
2113
2114 case CHAR_CARET: /* beginning of line assertion */
2115 /* '^' has a special meaning everywhere in EREs, at the
2116 beginning of the RE and after \( is BREs. It is also
2117 special in enhanced BREs at the beginning of each branches
2118 of a union */
2119 if (ctx->cflags & REG_EXTENDED
2120 || bre_branch_begin
2121 || ctx->re == ctx->re_start)
2122 {
2123 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
2124 REST(ctx->re)));
2125 result = tre_ast_new_literal(ctx->mem, ASSERTION,
2126 ASSERT_AT_BOL, -1);
2127 if (result == NULL)
2128 return REG_ESPACE;
2129 ctx->re++;
2130 }
2131 else
2132 goto parse_literal;
2133 break;
2134
2135 case CHAR_DOLLAR: /* end of line assertion. */
2136 /* '$' is special everywhere in EREs, and in the end of the
2137 string and before \) is BREs. */
2138 if (ctx->cflags & REG_EXTENDED
2139 || (ctx->re + 2 < ctx->re_end
2140 && *(ctx->re + 1) == CHAR_BACKSLASH
2141 && *(ctx->re + 2) == CHAR_RPAREN)
2142 || ctx->re + 1 == ctx->re_end)
2143 {
2144 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
2145 REST(ctx->re)));
2146 result = tre_ast_new_literal(ctx->mem, ASSERTION,
2147 ASSERT_AT_EOL, -1);
2148 if (result == NULL)
2149 return REG_ESPACE;
2150 ctx->re++;
2151 }
2152 else
2153 goto parse_literal;
2154 break;
2155
2156 default:
2157 parse_literal:
2158
2159 if (temporary_cflags && ctx->re + 1 < ctx->re_end
2160 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
2161 {
2162 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
2163 REST(ctx->re)));
2164 ctx->cflags &= ~temporary_cflags;
2165 temporary_cflags = 0;
2166 ctx->re += 2;
2167 if (ctx->re < ctx->re_end)
2168 {
2169 STACK_PUSHX(stack, int, 0);
2170 STACK_PUSHX(stack, int, PARSE_ATOM);
2171 }
2172 else
2173 {
2174 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2175 if (!result) return REG_ESPACE;
2176 }
2177 break;
2178 }
2179
2180
2181 /* We are expecting an atom. If the subexpression (or the whole
2182 regexp ends here, we interpret it as an empty expression
2183 (which matches an empty string), which is an error.
2184 Iterations of an empty expression is also an error. */
2185#ifdef REG_LITERAL
2186 if (!(ctx->cflags & REG_LITERAL))
2187 {
2188#endif /* REG_LITERAL */
2189 /* error on end of string */
2190 if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2191 : REG_EMPTY;
2192 /* error on unions and iterations of empty expressions */
2193 if (ctx->cflags & REG_EXTENDED)
2194 {
2195 if (ctx->re < ctx->re_end)
2196 {
2197 if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2198 if (*ctx->re == CHAR_LBRACE)
2199 {
2200 ctx->re++;
2201 empty_parse_bound:
2202 /* We need to parse the bound first and return
2203 any error, before returning REG_BADRPT */
2204 status = tre_parse_bound(ctx, NULL);
2205#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2206 /* For ERE, if REG_NOMATCH is returned, we
2207 treat the lbrace as a literal. */
2208 if (status == REG_NOMATCH)
2209 {
2210 ctx->re--;
2211 /* Drop down to literal-handling code */
2212 }
2213 else
2214 {
2215#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2216 if (status != REG_OK)
2217 return status;
2218 return REG_BADRPT;
2219#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2220 }
2221#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2222 }
2223#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2224 else
2225#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2226 if (*ctx->re == CHAR_STAR
2227 || *ctx->re == CHAR_PLUS
2228 || *ctx->re == CHAR_QUESTIONMARK)
2229 {
2230 return REG_BADRPT;
2231 }
2232 }
2233 }
2234 else if (ctx->re + 1 < ctx->re_end
2235 && *ctx->re == CHAR_BACKSLASH
2236 && *(ctx->re + 1) == CHAR_LBRACE)
2237 {
2238 ctx->re += 2;
2239 goto empty_parse_bound;
2240 }
2241#ifdef REG_LITERAL
2242 }
2243#endif /* REG_LITERAL */
2244
2245 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
2246 REST(ctx->re)));
2247 /* Note that we can't use an tre_isalpha() test here, since there
2248 may be characters which are alphabetic but neither upper or
2249 lower case. */
2250 if (ctx->cflags & REG_ICASE
2251 && (tre_isupper_l(*ctx->re, ctx->loc) ||
2252 tre_islower_l(*ctx->re, ctx->loc)))
2253 {
2254 tre_ast_node_t *tmp1;
2255 tre_ast_node_t *tmp2;
2256
2257 /* XXX - Can there be more than one opposite-case
2258 counterpoints for some character in some locale? Or
2259 more than two characters which all should be regarded
2260 the same character if case is ignored? If yes, there
2261 does not seem to be a portable way to detect it. I guess
2262 that at least for multi-character collating elements there
2263 could be several opposite-case counterpoints, but they
2264 cannot be supported portably anyway. */
2265 tmp1 = tre_ast_new_literal(ctx->mem,
2266 tre_toupper_l(*ctx->re, ctx->loc),
2267 tre_toupper_l(*ctx->re, ctx->loc),
2268 ctx->position);
2269 if (!tmp1)
2270 return REG_ESPACE;
2271 tmp2 = tre_ast_new_literal(ctx->mem,
2272 tre_tolower_l(*ctx->re, ctx->loc),
2273 tre_tolower_l(*ctx->re, ctx->loc),
2274 ctx->position);
2275 if (!tmp2)
2276 return REG_ESPACE;
2277 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2278 if (!result)
2279 return REG_ESPACE;
2280 }
2281 else
2282 {
2283 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2284 ctx->position);
2285 if (!result)
2286 return REG_ESPACE;
2287 }
2288 ctx->position++;
2289 ctx->re++;
2290 break;
2291 }
2292 break;
2293 }
2294
2295 case PARSE_MARK_FOR_SUBMATCH:
2296 {
2297 int submatch_id = tre_stack_pop_int(stack);
2298
2299 ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2300 if (result->submatch_id >= 0 &&
2301 result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
2302 {
2303 tre_ast_node_t *n, *tmp_node;
2304 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2305 break;
2306 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2307 if (n == NULL)
2308 return REG_ESPACE;
2309 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2310 if (tmp_node == NULL)
2311 return REG_ESPACE;
2312 tmp_node->num_submatches = result->num_submatches;
2313 result = tmp_node;
2314 }
2315 result->submatch_id = submatch_id;
2316 if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2317 result->num_submatches++;
2318 break;
2319 }
2320
2321 default:
2322 assert(0);
2323 break;
2324 }
2325 }
2326
2327 /* Check for missing closing parentheses. */
2328 if (depth > 0)
2329 return REG_EPAREN;
2330
2331 ctx->result = result;
2332
2333 return REG_OK;
2334}
2335
2336/* EOF */