git.saurik.com Git - apple/libc.git/blame - regex/TRE/lib/tre-parse.c

Commit	Line	Data
ad3c9f2a A	1	/*
	2	tre-parse.c - Regexp parser
	3
	4	This software is released under a BSD-style license.
	5	See the file LICENSE for details and copyright.
	6
	7	*/
	8
	9	/*
	10	This parser is just a simple recursive descent parser for POSIX.2
	11	regexps. The parser supports both the obsolete default syntax and
	12	the "extended" syntax, and some nonstandard extensions.
	13	*/
	14
	15
	16	#ifdef HAVE_CONFIG_H
	17	#include <config.h>
	18	#endif /* HAVE_CONFIG_H */
	19	#include <string.h>
	20	#include <assert.h>
	21	#include <limits.h>
	22	#include <stddef.h>
	23
	24	#include "xmalloc.h"
	25	#include "tre-mem.h"
	26	#include "tre-ast.h"
	27	#include "tre-stack.h"
	28	#include "tre-parse.h"
	29
	30	/* BSD compatibility:
	31	Before looking up a collating symbol, check if the name matches in
	32	the character names (cnames) array; if so, use the corresponding
	33	character.
	34
	35	Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
	36	the implementation choice that for ERE, a non-numeric character following
	37	a left brace that would normally be a bound, causes the left brace to be
	38	literal. */
	39	#define BSD_COMPATIBILITY
	40	#ifdef BSD_COMPATIBILITY
	41	#include "cname.h"
	42	#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
	43	#endif /* BSD_COMPATIBILITY */
	44
	45	/* Characters with special meanings in regexp syntax. */
	46	#define CHAR_PIPE L'\|'
	47	#define CHAR_LPAREN L'('
	48	#define CHAR_RPAREN L')'
	49	#define CHAR_LBRACE L'{'
	50	#define CHAR_RBRACE L'}'
	51	#define CHAR_LBRACKET L'['
	52	#define CHAR_RBRACKET L']'
	53	#define CHAR_MINUS L'-'
	54	#define CHAR_STAR L'*'
	55	#define CHAR_QUESTIONMARK L'?'
	56	#define CHAR_PLUS L'+'
	57	#define CHAR_PERIOD L'.'
	58	#define CHAR_COLON L':'
	59	#define CHAR_EQUAL L'='
	60	#define CHAR_COMMA L','
	61	#define CHAR_CARET L'^'
	62	#define CHAR_DOLLAR L'$'
	63	#define CHAR_BACKSLASH L'\\'
	64	#define CHAR_HASH L'#'
65	#define CHAR_TILDE L'~'
66
67
68	/* Some macros for expanding \w, \s, etc. */
69	static const struct tre_macro_struct {
70	const char c;
71	const char *expansion;
72	} tre_macros[] =
73	{ {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
74	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
75	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
76	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
77	{ 0, NULL }
78	};
79
80
81	/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
82	must have at least `len' items. Sets buf[0] to zero if the there
83	is no match in `tre_macros'. */
84	static void
85	tre_expand_macro(const tre_char_t regex, const tre_char_t regex_end,
86	tre_char_t *buf, size_t buf_len)
87	{
88	int i;
89
90	buf[0] = 0;
91	if (regex >= regex_end)
92	return;
93
94	for (i = 0; tre_macros[i].expansion; i++)
95	{
96	if (tre_macros[i].c == *regex)
97	{
98	unsigned int j;
99	DPRINT(("Expanding macro '%c' => '%s'\n",
100	tre_macros[i].c, tre_macros[i].expansion));
101	for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
102	buf[j] = tre_macros[i].expansion[j];
103	buf[j] = 0;
104	break;
105	}
106	}
107	}
108
109	static reg_errcode_t
5f125488	110	tre_new_item(tre_mem_t __unused mem, int type, int val, int *max_i,
ad3c9f2a A	111	tre_bracket_match_list_t **items)
	112	{
	113	reg_errcode_t status = REG_OK;
	114	tre_bracket_match_list_t array = items;
	115	int i = array->num_bracket_matches;
	116	/* Allocate more space if necessary. */
	117	if (i >= *max_i)
	118	{
	119	tre_bracket_match_list_t *new_items;
	120	DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
	121	/* If the array is already 1024 items large, give up -- there's
	122	probably an error in the regexp (e.g. not a '\0' terminated
	123	string and missing ']') */
	124	if (*max_i >= 1024)
	125	return REG_ESPACE;
	126	max_i = 2;
	127	new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
	128	if (new_items == NULL)
	129	return REG_ESPACE;
	130	*items = array = new_items;
	131	}
	132	array->bracket_matches[i].type = type;
	133	array->bracket_matches[i].value = val;
	134	array->num_bracket_matches++;
	135	return status;
	136	}
	137
	138	#ifndef TRE_USE_SYSTEM_WCTYPE
	139
	140	/* isalnum() and the rest may be macros, so wrap them to functions. */
	141	int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
	142	int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
	143
	144	#ifdef tre_isascii
	145	int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
	146	#else /* !tre_isascii */
	147	int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
	148	#endif /* !tre_isascii */
	149
	150	#ifdef tre_isblank
	151	int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
	152	#else /* !tre_isblank */
	153	int tre_isblank_func(tre_cint_t c) { return ((c == ' ') \|\| (c == '\t')); }
	154	#endif /* !tre_isblank */
	155
	156	int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
	157	int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
	158	int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
	159	int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
	160	int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
	161	int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
	162	int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
	163	int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
	164	int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
	165
	166	struct {
	167	char *name;
	168	int (*func)(tre_cint_t);
	169	} tre_ctype_map[] = {
	170	{ "alnum", &tre_isalnum_func },
	171	{ "alpha", &tre_isalpha_func },
	172	#ifdef tre_isascii
	173	{ "ascii", &tre_isascii_func },
	174	#endif /* tre_isascii */
175	#ifdef tre_isblank
176	{ "blank", &tre_isblank_func },
177	#endif /* tre_isblank */
178	{ "cntrl", &tre_iscntrl_func },
179	{ "digit", &tre_isdigit_func },
180	{ "graph", &tre_isgraph_func },
181	{ "lower", &tre_islower_func },
182	{ "print", &tre_isprint_func },
183	{ "punct", &tre_ispunct_func },
184	{ "space", &tre_isspace_func },
185	{ "upper", &tre_isupper_func },
186	{ "xdigit", &tre_isxdigit_func },
187	{ NULL, NULL}
188	};
189
190	tre_ctype_t tre_ctype(const char *name)
191	{
192	int i;
193	for (i = 0; tre_ctype_map[i].name != NULL; i++)
194	{
195	if (strcmp(name, tre_ctype_map[i].name) == 0)
196	return tre_ctype_map[i].func;
197	}
198	return (tre_ctype_t)0;
199	}
200	#endif /* !TRE_USE_SYSTEM_WCTYPE */
201
202	#define REST(re) (int)(ctx->re_end - (re)), (re)
203
204	#define START_COLLATING_SYMBOLS 16
205	#define MAX_COLLATING_SYMBOL_LEN 4
206
207	typedef struct {
208	const tre_char_t *start;
209	int len;
210	} tre_collating_symbol;
211
212	#include <xlocale.h>
213
214	int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
215
216	#ifdef BSD_COMPATIBILITY
217	static wchar_t
218	tre_search_cnames(const wchar_t *name, size_t len)
219	{
220	size_t low = 0;
221	size_t high = NCNAMES - 1;
222	size_t cur;
223	int cmp;
224
225	while(low <= high)
226	{
227	cur = (low + high) / 2;
228	cmp = wcsncmp(name, cnames[cur].name, len);
229	if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
230	if (cmp > 0) low = cur + 1;
231	else high = cur - 1;
232	}
233	return (wchar_t)-1;
234	}
235	#endif /* BSD_COMPATIBILITY */
236
237	/* Scan the contents of a bracket expression, and create a
238	* tre_bracket_match_list_t encoding the bracket expression. If during
239	* the scan, multi-character collating symbols are detected, switch
240	* into a mode to collect those MCCSs into a tre_collating_symbol
241	* list and pass them back. tre_parse_bracket will use that to
242	* create a new string composed of a union of the bracket expression
243	* without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]\|ch), and
244	* call tre_parse (recursive) to parse that new string (which will
245	* call tre_parse_bracket and tre_parse_bracket_items again. */
246	static reg_errcode_t
247	tre_parse_bracket_items(tre_parse_ctx_t ctx, tre_bracket_match_list_t *items,
248	int items_size, tre_collating_symbol *result)
249	{
250	const tre_char_t *re = ctx->re;
251	const tre_char_t *re_end = ctx->re_end;
252	tre_collating_symbol *col_syms = NULL;
253	tre_collating_symbol *cp = NULL;
254	int n_col_syms = 0;
255	reg_errcode_t status;
256	int max_i = *items_size;
257	int other = 0; /* contains content other than multi-character collating
258	* symbols */
259	int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
260	tre_cint_t min, c;
261	int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
262	int collect_MCCS = 0;
263	const tre_char_t *start;
264
265	for ( ;re < re_end; re++)
266	{
267	switch (*re)
268	{
269	case CHAR_MINUS:
270	/* A first hyphen */
271	if (re == ctx->re)
272	{
273	DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
274	min = CHAR_MINUS;
275	other++;
276	range = 0;
277	break;
278	}
279	/* The hyphen is the end range */
280	if (range > 0)
281	{
282	DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
283	c = CHAR_MINUS;
284	goto process_end_range;
285	}
286	if (re + 1 >= re_end)
287	{
288	status = REG_EBRACK;
289	goto error;
290	}
291	/* The hyphen is at the end */
292	if (re[1] == CHAR_RBRACKET)
293	{
294	DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
295	c = CHAR_MINUS;
296	goto process_begin_range;
297	}
298	/* Two ranges are not allowed to share an endpoint, or begin
299	* range is illegal. */
300	if (range < 0)
301	{
302	status = REG_ERANGE;
303	goto error;
304	}
305	range = 1; /* Expect end range */
306	DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
307	break;
308
309	case CHAR_LBRACKET:
310	if (re + 1 >= re_end)
311	{
312	status = REG_EBRACK;
313	goto error;
314	}
315	switch (re[1])
316	{
317	case CHAR_PERIOD:
318	{
319	re += 2;
320	start = re;
321	for (;; re++)
322	{
323	if (re >= re_end)
324	{
325	status = REG_ECOLLATE;
326	goto error;
327	}
328	if (*re == CHAR_PERIOD)
329	{
330	if (re + 1 >= re_end)
331	{
332	status = REG_ECOLLATE;
333	goto error;
334	}
335	/* Found end */
336	if (re[1] == CHAR_RBRACKET)
337	{
338	DPRINT(("tre_parse_bracket: collating "
339	"symbol: '%.*" STRF "'\n",
340	REST(start - 2)));
341	/* Empty name */
342	if (re == start)
343	{
344	status = REG_ECOLLATE;
345	goto error;
346	}
347	#ifdef BSD_COMPATIBILITY
348	/* Check if the name is in cnames; if so, use
349	the corresponding code */
350	c = tre_search_cnames(start, re - start);
351	if (c != (wchar_t)-1)
352	{
353	re++;
354	goto process_single_character;
355	}
356	#endif /* BSD_COMPATIBILITY */
357	/* Verify this is a known sequence */
358	if (__collate_equiv_value(ctx->loc, start,
359	re - start) <= 0)
360	{
361	status = REG_ECOLLATE;
362	goto error;
363	}
364	/* Process single character collating symbols */
365	if (re - start == 1)
366	{
367	c = *start;
368	re++;
369	goto process_single_character;
370	}
371	/* Inverted MCCSs are undefined */
372	if (invert)
373	{
374	status = REG_ECOLLATE;
375	goto error;
376	}
377	/* Can't have MCCSs as an endpoint to a range */
378	if (range > 0)
379	{
380	status = REG_ERANGE;
381	goto error;
382	}
383	range = -1;
384	/* Switch into MCCS collection mode (if not
385	* already there */
386	#if TRE_DEBUG
387	if (!collect_MCCS)
388	{
389	collect_MCCS = 1;
390	DPRINT(("tre_parse_bracket: Detected MCCS\n"));
391	}
392	#else /* !TRE_DEBUG */
393	collect_MCCS = 1;
394	#endif /* !TRE_DEBUG */
395	/* Allocate a memory block the first time */
396	if (!cp)
397	{
398	if ((col_syms = xmalloc(sizeof(col_syms)
399	(START_COLLATING_SYMBOLS + 2)))
400	== NULL)
401	return REG_ESPACE;
402	cp = col_syms + 1;
403	n_col_syms = START_COLLATING_SYMBOLS;
404	}
405	/* Enlarge the memory block is more is needed */
406	if ((cp - col_syms) - 1 >= n_col_syms)
407	{
408	int i = n_col_syms;
409	tre_collating_symbol *tmp =
410	xrealloc(col_syms, sizeof(col_syms)
411	((n_col_syms *= 2) + 2));
412	if (tmp == NULL)
413	{
414	xfree(col_syms);
415	return REG_ESPACE;
416	}
417	DPRINT(("tre_list_collating_symbols: "
418	"Enlarging col_syms to %d\n",
419	n_col_syms));
420	col_syms = tmp;
421	cp = col_syms + i + 1;
422	}
423	cp->start = start;
424	cp->len = re - start;
425	cp++;
426	re++;
427	break;
428	}
429	}
430	}
431	break;
432	}
433
434	case CHAR_EQUAL:
435	case CHAR_COLON:
436	{
437	/* Process equivalence and character classes */
438	tre_char_t kind = re[1];
439
440	/* Can't have a class as an endpoint to a range */
441	if (range > 0)
442	{
443	status = REG_ERANGE;
444	goto error;
445	}
446	if (!collect_MCCS && range == 0)
447	{
448	status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
449	min, &max_i, items);
450	if (status != REG_OK)
451	goto error;
452	}
453	range = -1;
454	re += 2;
455	start = re;
456	for (;; re++)
457	{
458	if (re >= re_end)
459	{
460	status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
461	goto error;
462	}
463	if (*re == kind)
464	{
465	if (re + 1 >= re_end)
466	{
467	status = kind == CHAR_EQUAL ? REG_ECOLLATE :
468	REG_ECTYPE;
469	goto error;
470	}
471	/* Found end */
472	if (re[1] == CHAR_RBRACKET)
473	{
474	if (re == start)
475	{
476	/* Empty class name */
477	status = kind == CHAR_EQUAL ? REG_ECOLLATE :
478	REG_ECTYPE;
479	goto error;
480	}
481	/* Process equivalence class */
482	if (kind == CHAR_EQUAL)
483	{
484	int equiv;
485
486	DPRINT(("tre_parse_bracket: equivalence: '%.*"
487	STRF "'\n", REST(start - 2)));
488
489	/* While we find the collation value even for
490	multi-character collating elements , we
491	don't (yet) match any collation values
492	against multi-character sequences. We'd have
493	to enumerate those multi-character sequences
494	and like multi-character collating symbols,
495	create a union of those sequences with the
496	rest of the bracket expression. While
497	doable, a bracket expression matching
498	multiple characters, that doesn't explicitly
499	contain multi-character sequences, might
500	be unexpected, so we punt for now. */
501	if ((equiv = __collate_equiv_value(ctx->loc,
502	start, re - start)) <= 0)
503	{
504	/* The standard says that if no collating
505	element if found, we use the collating
506	symbol itself. But __collate_equiv_value
507	doesn't make a distinction between
508	an element that is in a equvalence
509	class with others, or is the only member,
510	so we already know there is no collating
511	symbol. (Note that in the case of a
512	collating element whose collation value
513	is unique, matching against the
514	collating element itself, or against
515	its collation value, is equivalent.) */
516	#ifdef BSD_COMPATIBILITY
517	/* Check if the name is in cnames; if so,
518	use the corresponding code */
519	c = tre_search_cnames(start, re - start);
520	if (c != (wchar_t)-1)
521	{
522	re++;
523	goto process_single_character;
524	}
525	#endif /* BSD_COMPATIBILITY */
526	status = REG_ECOLLATE;
527	goto error;
528	}
529	if (!collect_MCCS)
530	{
531	status = tre_new_item(ctx->mem,
532	TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
533	equiv, &max_i, items);
534	if (status != REG_OK)
535	goto error;
536	}
537	}
538	else
539	{
540	/* Process character class */
541	DPRINT(("tre_parse_bracket: class: '%.*" STRF
542	"'\n", REST(start - 2)));
543	if (!collect_MCCS)
544	{
545	char tmp_str[64];
546	tre_ctype_t class;
547	int len = MIN(re - start, 63);
548	#ifdef TRE_WCHAR
549	{
550	tre_char_t tmp_wcs[64];
551	wcsncpy(tmp_wcs, start, (size_t)len);
552	tmp_wcs[len] = L'\0';
553	#if defined HAVE_WCSRTOMBS
554	{
555	mbstate_t state;
556	const tre_char_t *src = tmp_wcs;
557	memset(&state, '\0', sizeof(state));
558	len = wcsrtombs_l(tmp_str, &src,
559	sizeof(tmp_str), &state,
560	ctx->loc);
561	}
562	#elif defined HAVE_WCSTOMBS
563	len = wcstombs(tmp_str, tmp_wcs, 63);
564	#endif /* defined HAVE_WCSTOMBS */
565	}
566	#else /* !TRE_WCHAR */
567	strncpy(tmp_str, (const char*)start, len);
568	#endif /* !TRE_WCHAR */
569	tmp_str[len] = '\0';
570	DPRINT((" class name: %s\n", tmp_str));
571	class = tre_ctype_l(tmp_str, ctx->loc);
572	if (!class)
573	{
574	status = REG_ECTYPE;
575	goto error;
576	}
577	status = tre_new_item(ctx->mem,
578	TRE_BRACKET_MATCH_TYPE_CLASS,
579	class, &max_i, items);
580	if (status != REG_OK)
581	goto error;
582	}
583	}
584	re++;
585	break;
586	}
587	}
588	}
589	other++;
590	break;
591	}
592
593	default:
594	DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
595	c = CHAR_LBRACKET;
596	goto process_single_character;
597	break;
598	}
599	break;
600
601	case CHAR_RBRACKET:
602	/* A first right bracket */
603	if (re == ctx->re)
604	{
605	DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
606	min = CHAR_RBRACKET;
607	range = 0;
608	other++;
609	break;
610	}
611	/* Done */
612	if (collect_MCCS)
613	{
614	DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
615	REST(re)));
616	if (col_syms)
617	{
618	/* Mark the character following the right bracket. Set len
619	* to whether there are other things besides the
620	* multi-character collating symbols */
621	col_syms->start = re + 1;
622	col_syms->len = other;
623	/* Mark the end of the list */
624	cp->start = NULL;
625	}
626	*result = col_syms;
627	return REG_OK;
628	}
629	/* range > 0 is not possible, since we did a lookahead after the
630	* hyphen */
631	if (range == 0)
632	{
633	status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
634	min, &max_i, items);
635	if (status != REG_OK)
636	goto error;
637	}
638	DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
639	*items_size = max_i;
640	ctx->re = re + 1;
641	return REG_OK;
642
643	default:
644	DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
645	c = *re;
646	process_single_character:
647	/* Process single character */
648	if (range > 0)
649	{
650	int mine, maxe;
651
652	process_end_range:
653	/* Get collation equivalence values */
654	mine = __collate_equiv_value(ctx->loc, &min, 1);
655	maxe = __collate_equiv_value(ctx->loc, &c, 1);
656	if (maxe < mine)
657	{
658	status = REG_ERANGE;
659	goto error;
660	}
661	if (!collect_MCCS)
662	{
663	status = tre_new_item(ctx->mem,
664	TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
665	mine, &max_i, items);
666	if (status != REG_OK)
667	goto error;
668	status = tre_new_item(ctx->mem,
669	TRE_BRACKET_MATCH_TYPE_RANGE_END,
670	maxe, &max_i, items);
671	if (status != REG_OK)
672	goto error;
673	}
674	range = -1;
675	}
676	else
677	{
678	process_begin_range:
679	if (!collect_MCCS)
680	{
681	if (range == 0)
682	{
683	status = tre_new_item(ctx->mem,
684	TRE_BRACKET_MATCH_TYPE_CHAR,
685	min, &max_i, items);
686	if (status != REG_OK)
687	goto error;
688	}
689	min = c;
690	}
691	range = 0;
692	}
693	other++;
694	break;
695	}
696	}
697	status = REG_EBRACK;
698	error:
699	DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n",
700	REST(re), status));
701	if (col_syms)
702	xfree(col_syms);
703	return status;
704	}
705
706	#ifdef TRE_DEBUG
707	static const char *bracket_match_type_str[] = {
708	"unused",
709	"char",
710	"range begin",
711	"range end",
712	"class",
713	"equivalence value",
714	};
715	#endif /* TRE_DEBUG */
716
717	static reg_errcode_t
718	tre_parse_bracket(tre_parse_ctx_t ctx, tre_ast_node_t *result)
719	{
2acb8998	720	tre_ast_node_t *node = NULL;
ad3c9f2a A	721	reg_errcode_t status = REG_OK;
	722	tre_bracket_match_list_t *items;
	723	int max_i = 32;
	724	tre_collating_symbol *col_syms = NULL;
	725
	726	/* Handle special cases [[:<:]] and [[:>:]] */
	727	if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
	728	&& ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' \|\| ctx->re[2] == L'>')
	729	&& ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
	730	&& ctx->re[5] == CHAR_RBRACKET)
	731	{
	732	*result = tre_ast_new_literal(ctx->mem, ASSERTION,
	733	(ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
	734	-1);
	735	DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
	736	"[[:<:]]" : "[[:>:]]"));
	737	ctx->re += 6;
	738	return *result ? REG_OK : REG_ESPACE;
	739	}
	740
	741	/* Start off with an array of `max_i' elements. */
	742	items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
	743	if (items == NULL)
	744	return REG_ESPACE;
	745
	746	if (*ctx->re == CHAR_CARET)
	747	{
	748	DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
	749	items->flags \|= TRE_BRACKET_MATCH_FLAG_NEGATE;
	750	ctx->re++;
	751	}
	752
	753	status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
	754
	755	if (status != REG_OK)
	756	goto parse_bracket_done;
	757
	758	/* If there are collating symbols, split off the multi-character ones
	759	* into a union of the bracket expression (without the collating symbols)
	760	* and the multiple-character sequences. We create an equivalent input
	761	* string and run tre_parse() recursively */
	762	if (col_syms)
	763	{
	764	tre_char_t str, sp;
	765	tre_collating_symbol *cp;
	766	tre_parse_ctx_t subctx;
	767
	768	/* Allocate a new string. We start with the size of the original
	769	* bracket expression (minus 1) and add 2 (for a leading "[" and
	770	* a trailing nil; don't need a "^", since it is illegal to have
	771	* inverted MCCSs). Since a multi-character collating symbols
	772	* will be converted from "[.xx.]" to "\|xx" (n+4 to n+1), we don't
	773	* need to worry about the new string getting too long. */
	774	xfree(items);
	775	str = xmalloc(sizeof(str) ((col_syms->start - ctx->re) + 2));
	776	if (str == NULL)
	777	{
	778	xfree(col_syms);
	779	return REG_ESPACE;
	780	}
	781	sp = str;
	782	if (col_syms->len > 0)
	783	{
	784	/* There are other items in the bracket expression besides the
785	* multi-character collating symbols, so create a new bracket
786	* expression with only those other itmes. */
787	const tre_char_t *re;
788	ptrdiff_t i;
789
790	*sp++ = '[';
791	re = ctx->re;
792	for (cp = col_syms + 1; cp->start; cp++)
793	{
794	/* The "- 2" is to account for the "[." */
795	if ((i = ((cp->start - re) - 2)) > 0)
796	{
797	memcpy(sp, re, sizeof(sp) i);
798	sp += i;
799	}
800	/* The "+ 2" is to account for the ".]" */
801	re = cp->start + cp->len + 2;
802	}
803	i = col_syms->start - re; /* Includes the trailing right bracket */
804	memcpy(sp, re, sizeof(sp) i);
805	sp += i;
806	*sp++ = '\|';
807	}
808	for (cp = col_syms + 1; cp->start; cp++)
809	{
810	memcpy(sp, cp->start, sizeof(sp) cp->len);
811	sp += cp->len;
812	if (cp[1].start)
813	*sp++ = '\|';
814	}
815	*sp = 0;
816	DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
817	str));
818
819	memcpy(&subctx, ctx, sizeof(subctx));
820	subctx.re = str;
821	subctx.len = sp - str;
822	subctx.nofirstsub = 1;
823	subctx.cflags \|= REG_EXTENDED; /* Force extended mode for parsing */
824	status = tre_parse(&subctx);
825	xfree(str);
826	if (status != REG_OK)
827	{
828	xfree(col_syms);
829	return status;
830	}
831	ctx->re = col_syms->start;
832	ctx->position = subctx.position;
833	xfree(col_syms);
834	*result = subctx.result;
835	DPRINT(("tre_parse_bracket: Returning to original string\n"));
836	return REG_OK;
837	}
838
839	DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
840	node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
841	if (node == NULL)
842	{
843	status = REG_ESPACE;
844	goto parse_bracket_done;
845	}
846	else
847	{
848	tre_literal_t *l = node->obj;
849	l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
850	SIZEOF_BRACKET_MATCH_LIST(items));
851	if (l->u.bracket_match_list == NULL)
852	{
853	status = REG_ESPACE;
854	goto parse_bracket_done;
855	}
856	memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
857	}
858
859	#ifdef TRE_DEBUG
860	{
861	int i;
862	tre_bracket_match_t *b;
863	DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
864	items->num_bracket_matches, items->flags));
865	for (i = 0, b = items->bracket_matches;
866	i < items->num_bracket_matches; i++, b++)
867	{
868	DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type],
869	b->value));
870	}
871	}
872	#endif /* TRE_DEBUG */
873
874	parse_bracket_done:
875	xfree(items);
876	ctx->position++;
877	*result = node;
878	return status;
879	}
880
881
882	/* Parses a positive decimal integer. Returns -1 if the string does not
883	contain a valid number. */
884	static int
885	tre_parse_int(const tre_char_t *regex, const tre_char_t regex_end)
886	{
887	int num = -1;
888	const tre_char_t r = regex;
889	while (r < regex_end && r >= L'0' && r <= L'9')
890	{
891	if (num < 0)
892	num = 0;
893	num = num * 10 + *r - L'0';
894	r++;
895	}
896	*regex = r;
897	return num;
898	}
899
900
901	static reg_errcode_t
902	tre_parse_bound(tre_parse_ctx_t ctx, tre_ast_node_t *result)
903	{
904	int min, max;
905	#ifdef TRE_APPROX
906	int i;
907	int cost_ins, cost_del, cost_subst, cost_max;
908	int limit_ins, limit_del, limit_subst, limit_err;
909	const tre_char_t *start;
910	#endif /* TRE_APPROX */
911	const tre_char_t *r = ctx->re;
912	int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
913	#ifdef TRE_APPROX
914	int approx = 0;
915	int costs_set = 0;
916	int counts_set = 0;
917
918	cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
919	limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
920	#endif /* TRE_APPROX */
921
922	/* Parse number (minimum repetition count). */
923	min = -1;
924	if (r >= ctx->re_end)
925	#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
926	return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
927	#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
928	return REG_EBRACE;
929	#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
930	if (r >= L'0' && r <= L'9') {
931	DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
932	min = tre_parse_int(&r, ctx->re_end);
933	}
934	#ifndef TRE_APPROX
935	else
936	#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
937	/* For ERE, return REG_NOMATCH to signal that the lbrace should
938	be treated as a literal */
939	return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
940	#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
941	return REG_BADBR;
942	#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
943	#endif /* !TRE_APPROX */
944
945	/* Parse comma and second number (maximum repetition count). */
946	max = min;
947	if (r < ctx->re_end && *r == CHAR_COMMA)
948	{
949	r++;
950	DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
951	max = tre_parse_int(&r, ctx->re_end);
952	}
953
954	/* Check that the repeat counts are sane. */
955	if ((max >= 0 && min > max) \|\| min > RE_DUP_MAX \|\| max > RE_DUP_MAX)
956	return REG_BADBR;
957
958
959	#ifdef TRE_APPROX
960	/*
961	'{'
962	optionally followed immediately by a number == minimum repcount
963	optionally followed by , then a number == maximum repcount
964	+ then a number == maximum insertion count
965	- then a number == maximum deletion count
966	# then a number == maximum substitution count
967	~ then a number == maximum number of errors
968	Any of +, -, # or ~ without followed by a number means that
969	the maximum count/number of errors is infinite.
970
971	An equation of the form
972	Xi + Yd + Zs < C
973	can be specified to set costs and the cost limit to a value
974	different from the default value:
975	- X is the cost of an insertion
976	- Y is the cost of a deletion
977	- Z is the cost of a substitution
978	- C is the maximum cost
979
980	If no count limit or cost is set for an operation, the operation
981	is not allowed at all.
982	*/
983
984
985	do {
986	int done;
987	start = r;
988
989	/* Parse count limit settings */
990	done = 0;
991	if (!counts_set)
992	while (r + 1 < ctx->re_end && !done)
993	{
994	switch (*r)
995	{
996	case CHAR_PLUS: /* Insert limit */
997	DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
998	r++;
999	limit_ins = tre_parse_int(&r, ctx->re_end);
1000	if (limit_ins < 0)
1001	limit_ins = INT_MAX;
1002	counts_set = 1;
1003	break;
1004	case CHAR_MINUS: /* Delete limit */
1005	DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
1006	r++;
1007	limit_del = tre_parse_int(&r, ctx->re_end);
1008	if (limit_del < 0)
1009	limit_del = INT_MAX;
1010	counts_set = 1;
1011	break;
1012	case CHAR_HASH: /* Substitute limit */
1013	DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1014	r++;
1015	limit_subst = tre_parse_int(&r, ctx->re_end);
1016	if (limit_subst < 0)
1017	limit_subst = INT_MAX;
1018	counts_set = 1;
1019	break;
1020	case CHAR_TILDE: /* Maximum number of changes */
1021	DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1022	r++;
1023	limit_err = tre_parse_int(&r, ctx->re_end);
1024	if (limit_err < 0)
1025	limit_err = INT_MAX;
1026	approx = 1;
1027	break;
1028	case CHAR_COMMA:
1029	r++;
1030	break;
1031	case L' ':
1032	r++;
1033	break;
1034	case L'}':
1035	done = 1;
1036	break;
1037	default:
1038	done = 1;
1039	break;
1040	}
1041	}
1042
1043	/* Parse cost restriction equation. */
1044	done = 0;
1045	if (!costs_set)
1046	while (r + 1 < ctx->re_end && !done)
1047	{
1048	switch (*r)
1049	{
1050	case CHAR_PLUS:
1051	case L' ':
1052	r++;
1053	break;
1054	case L'<':
1055	DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
1056	r++;
1057	while (*r == L' ')
1058	r++;
1059	cost_max = tre_parse_int(&r, ctx->re_end);
1060	if (cost_max < 0)
1061	cost_max = INT_MAX;
1062	else
1063	cost_max--;
1064	approx = 1;
1065	break;
1066	case CHAR_COMMA:
1067	r++;
1068	done = 1;
1069	break;
1070	default:
1071	if (r >= L'0' && r <= L'9')
1072	{
1073	#ifdef TRE_DEBUG
1074	const tre_char_t *sr = r;
1075	#endif /* TRE_DEBUG */
1076	int cost = tre_parse_int(&r, ctx->re_end);
1077	/* XXX - make sure r is not past end. */
1078	switch (*r)
1079	{
1080	case L'i': /* Insert cost */
1081	DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
1082	REST(sr)));
1083	r++;
1084	cost_ins = cost;
1085	costs_set = 1;
1086	break;
1087	case L'd': /* Delete cost */
1088	DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
1089	REST(sr)));
1090	r++;
1091	cost_del = cost;
1092	costs_set = 1;
1093	break;
1094	case L's': /* Substitute cost */
1095	DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
1096	REST(sr)));
1097	r++;
1098	cost_subst = cost;
1099	costs_set = 1;
1100	break;
1101	default:
1102	return REG_BADBR;
1103	}
1104	}
1105	else
1106	{
1107	done = 1;
1108	break;
1109	}
1110	}
1111	}
1112	} while (start != r);
1113	#endif /* TRE_APPROX */
1114
1115	/{//* Missing }. */
1116	if (r >= ctx->re_end)
1117	return REG_EBRACE;
1118
1119	/* Empty contents of {}. */
1120	if (r == ctx->re)
1121	return REG_BADBR;
1122
1123	/* Parse the ending '}' or '\}'.*/
1124	if (ctx->cflags & REG_EXTENDED)
1125	{
1126	if (r >= ctx->re_end \|\| *r != CHAR_RBRACE)
1127	return REG_BADBR;
1128	r++;
1129	/* Parse trailing '?' marking minimal repetition. */
1130	if (r < ctx->re_end)
1131	{
1132	if (*r == CHAR_QUESTIONMARK)
1133	{
1134	/* Process the question mark only in enhanced mode.
1135	Otherwise, the question mark is an error in ERE
1136	or a literal in BRE */
1137	if (ctx->cflags & REG_ENHANCED)
1138	{
1139	minimal = !(ctx->cflags & REG_UNGREEDY);
1140	r++;
1141	}
1142	else return REG_BADRPT;
1143	}
1144	else if (r == CHAR_STAR \|\| r == CHAR_PLUS)
1145	{
1146	/* These are reserved for future extensions. */
1147	return REG_BADRPT;
1148	}
1149	}
1150	}
1151	else
1152	{
1153	if (r + 1 >= ctx->re_end
1154	\|\| *r != CHAR_BACKSLASH
1155	\|\| *(r + 1) != CHAR_RBRACE)
1156	return REG_BADBR;
1157	r += 2;
1158	if (r < ctx->re_end && *r == CHAR_STAR)
1159	{
1160	/* This is reserved for future extensions. */
1161	return REG_BADRPT;
1162	}
1163	}
1164
1165	if (minimal)
1166	ctx->num_reorder_tags++;
1167
1168	if (!result) goto parse_bound_exit;
1169	/* Create the AST node(s). */
1170	/* Originally, if min == 0 && max == 0, we immediately replace the whole
1171	iteration with EMPTY. This unfortunately drops any submatches, and
1172	messes up setting the pmatch values (we can get tags of -1, and
1173	tag values in the billions). So we leave it and process this case as
1174	usual, and wait until tre_expand_ast() to replace with EMPTY */
1175	#ifdef TRE_APPROX
1176	if (min < 0 && max < 0)
1177	/* Only approximate parameters set, no repetitions. */
1178	min = max = 1;
1179	#endif /* TRE_APPROX */
1180
1181	result = tre_ast_new_iter(ctx->mem, result, min, max, minimal);
1182	if (!*result)
1183	return REG_ESPACE;
1184
1185	#ifdef TRE_APPROX
1186	/* If approximate matching parameters are set, add them to the
1187	iteration node. */
1188	if (approx \|\| costs_set \|\| counts_set)
1189	{
1190	int *params;
1191	tre_iteration_t iter = (result)->obj;
1192
1193	if (costs_set \|\| counts_set)
1194	{
1195	if (limit_ins == TRE_PARAM_UNSET)
1196	{
1197	if (cost_ins == TRE_PARAM_UNSET)
1198	limit_ins = 0;
1199	else
1200	limit_ins = INT_MAX;
1201	}
1202
1203	if (limit_del == TRE_PARAM_UNSET)
1204	{
1205	if (cost_del == TRE_PARAM_UNSET)
1206	limit_del = 0;
1207	else
1208	limit_del = INT_MAX;
1209	}
1210
1211	if (limit_subst == TRE_PARAM_UNSET)
1212	{
1213	if (cost_subst == TRE_PARAM_UNSET)
1214	limit_subst = 0;
1215	else
1216	limit_subst = INT_MAX;
1217	}
1218	}
1219
1220	if (cost_max == TRE_PARAM_UNSET)
1221	cost_max = INT_MAX;
1222	if (limit_err == TRE_PARAM_UNSET)
1223	limit_err = INT_MAX;
1224
1225	ctx->have_approx = 1;
1226	params = tre_mem_alloc(ctx->mem, sizeof(params) TRE_PARAM_LAST);
1227	if (!params)
1228	return REG_ESPACE;
1229	for (i = 0; i < TRE_PARAM_LAST; i++)
1230	params[i] = TRE_PARAM_UNSET;
1231	params[TRE_PARAM_COST_INS] = cost_ins;
1232	params[TRE_PARAM_COST_DEL] = cost_del;
1233	params[TRE_PARAM_COST_SUBST] = cost_subst;
1234	params[TRE_PARAM_COST_MAX] = cost_max;
1235	params[TRE_PARAM_MAX_INS] = limit_ins;
1236	params[TRE_PARAM_MAX_DEL] = limit_del;
1237	params[TRE_PARAM_MAX_SUBST] = limit_subst;
1238	params[TRE_PARAM_MAX_ERR] = limit_err;
1239	iter->params = params;
1240	}
1241	#endif /* TRE_APPROX */
1242
1243	parse_bound_exit:
1244	#ifdef TRE_APPROX
1245	DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1246	"limits [%d,%d,%d, total %d]\n",
1247	min, max, cost_ins, cost_del, cost_subst, cost_max,
1248	limit_ins, limit_del, limit_subst, limit_err));
1249	#else /* !TRE_APPROX */
1250	DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1251	#endif /* !TRE_APPROX */
1252
1253
1254	ctx->re = r;
1255	return REG_OK;
1256	}
1257
1258	/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1259	non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1260	treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1261	Because we now set up tags for even non-capturing parenthesized
1262	subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
1263	pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1264	have it restore cflags after the subexpression, we don't need to have
1265	a separate PARSE_RESTORE_CFLAGS, and then after processing the
1266	non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1267	This has the side-benefit of now matching the perl behavior: the RE
1268	foo(?i)bar\|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1269	of foo AND (?i) (bar OR zap). */
1270	typedef enum {
1271	PARSE_RE = 0,
1272	PARSE_ATOM,
1273	PARSE_MARK_FOR_SUBMATCH,
1274	PARSE_BRANCH,
1275	PARSE_PIECE,
1276	PARSE_CATENATION,
1277	PARSE_POST_CATENATION,
1278	PARSE_UNION,
1279	PARSE_POST_UNION,
1280	PARSE_POSTFIX,
1281	} tre_parse_re_stack_symbol_t;
1282
1283
1284	reg_errcode_t
1285	tre_parse(tre_parse_ctx_t *ctx)
1286	{
1287	tre_ast_node_t *result = NULL;
1288	tre_parse_re_stack_symbol_t symbol;
1289	reg_errcode_t status = REG_OK;
1290	tre_stack_t *stack = ctx->stack;
1291	int bottom = tre_stack_num_objects(stack);
1292	int depth = 0;
1293	int temporary_cflags = 0;
1294	int bre_branch_begin;
1295	#ifdef TRE_DEBUG
1296	const tre_char_t *tmp_re;
1297	#endif
1298
1299	DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1300	ctx->len, ctx->re, ctx->len, ctx->cflags));
1301
1302	if (ctx->len <= 0) return REG_EMPTY;
1303	if (!ctx->nofirstsub)
1304	{
1305	STACK_PUSH(stack, int, ctx->cflags);
1306	STACK_PUSH(stack, int, ctx->submatch_id);
1307	STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1308	ctx->submatch_id++;
1309	}
1310	STACK_PUSH(stack, int, 0); // bre_branch_begin
1311	STACK_PUSH(stack, int, PARSE_RE);
1312	ctx->re_start = ctx->re;
1313	ctx->re_end = ctx->re + ctx->len;
1314
1315
1316	/* The following is basically just a recursive descent parser. I use
1317	an explicit stack instead of recursive functions mostly because of
1318	two reasons: compatibility with systems which have an overflowable
1319	call stack, and efficiency (both in lines of code and speed). */
1320	while (tre_stack_num_objects(stack) > bottom)
1321	{
1322	symbol = tre_stack_pop_int(stack);
1323	switch (symbol)
1324	{
1325	case PARSE_RE:
1326	/* Parse a full regexp. A regexp is one or more branches,
1327	separated by the union operator `\|'. */
1328	bre_branch_begin = tre_stack_pop_int(stack);
1329	if (
1330	#ifdef REG_LITERAL
1331	!(ctx->cflags & REG_LITERAL) &&
1332	#endif /* REG_LITERAL */
1333	ctx->cflags & (REG_EXTENDED \| REG_ENHANCED))
1334	STACK_PUSHX(stack, int, PARSE_UNION);
1335	STACK_PUSHX(stack, int, bre_branch_begin);
1336	STACK_PUSHX(stack, int, PARSE_BRANCH);
1337	break;
1338
1339	case PARSE_BRANCH:
1340	/* Parse a branch. A branch is one or more pieces, concatenated.
1341	A piece is an atom possibly followed by a postfix operator. */
1342	bre_branch_begin = tre_stack_pop_int(stack);
1343	STACK_PUSHX(stack, int, PARSE_CATENATION);
1344	STACK_PUSHX(stack, int, bre_branch_begin);
1345	STACK_PUSHX(stack, int, PARSE_PIECE);
1346	break;
1347
1348	case PARSE_PIECE:
1349	/* Parse a piece. A piece is an atom possibly followed by one
1350	or more postfix operators. */
1351	bre_branch_begin = tre_stack_pop_int(stack);
1352	STACK_PUSHX(stack, int, PARSE_POSTFIX);
1353	STACK_PUSHX(stack, int, bre_branch_begin);
1354	STACK_PUSHX(stack, int, PARSE_ATOM);
1355	break;
1356
1357	case PARSE_CATENATION:
1358	/* If the expression has not ended, parse another piece. */
1359	{
1360	tre_char_t c;
1361	if (ctx->re >= ctx->re_end)
1362	break;
1363	c = *ctx->re;
1364	#ifdef REG_LITERAL
1365	if (!(ctx->cflags & REG_LITERAL))
1366	{
1367	#endif /* REG_LITERAL */
1368	if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) \|\|
1369	((ctx->cflags & (REG_EXTENDED \| REG_ENHANCED)) == REG_ENHANCED
1370	&& ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1371	*(ctx->re + 1) == CHAR_PIPE))
1372	break;
1373	if ((ctx->cflags & REG_EXTENDED
1374	&& c == CHAR_RPAREN && depth > 0)
1375	\|\| (!(ctx->cflags & REG_EXTENDED)
1376	&& ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1377	&& *(ctx->re + 1) == CHAR_RPAREN))
1378	{
1379	if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1380	return REG_EPAREN;
1381	DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1382	REST(ctx->re)));
1383	depth--;
1384	if (!(ctx->cflags & (REG_EXTENDED \| REG_ENHANCED)))
1385	ctx->re += 2;
1386	break;
1387	}
1388	#ifdef REG_LITERAL
1389	}
1390	#endif /* REG_LITERAL */
1391
1392	#ifdef REG_LEFT_ASSOC
1393	if (ctx->cflags & REG_LEFT_ASSOC)
1394	{
1395	/* Left associative concatenation. */
1396	STACK_PUSHX(stack, int, PARSE_CATENATION);
1397	STACK_PUSHX(stack, voidptr, result);
1398	STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1399	STACK_PUSHX(stack, int, 0); // bre_branch_begin
1400	STACK_PUSHX(stack, int, PARSE_PIECE);
1401	}
1402	else
1403	#endif /* REG_LEFT_ASSOC */
1404	{
1405	/* Default case, right associative concatenation. */
1406	STACK_PUSHX(stack, voidptr, result);
1407	STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1408	STACK_PUSHX(stack, int, PARSE_CATENATION);
1409	STACK_PUSHX(stack, int, 0); // bre_branch_begin
1410	STACK_PUSHX(stack, int, PARSE_PIECE);
1411	}
1412	break;
1413	}
1414
1415	case PARSE_POST_CATENATION:
1416	{
1417	tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1418	tre_ast_node_t *tmp_node;
1419	tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1420	if (!tmp_node)
1421	return REG_ESPACE;
1422	result = tmp_node;
1423	break;
1424	}
1425
1426	case PARSE_UNION:
1427	if (ctx->re >= ctx->re_end)
1428	break;
1429	#ifdef REG_LITERAL
1430	if (ctx->cflags & REG_LITERAL)
1431	break;
1432	#endif /* REG_LITERAL */
1433	if (!(ctx->cflags & REG_EXTENDED))
1434	{
1435	if (*ctx->re != CHAR_BACKSLASH \|\| ctx->re + 1 >= ctx->re_end)
1436	break;
1437	ctx->re++;
1438	}
1439	switch (*ctx->re)
1440	{
1441	case CHAR_PIPE:
1442	DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1443	REST(ctx->re)));
1444	STACK_PUSHX(stack, int, PARSE_UNION);
1445	STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1446	STACK_PUSHX(stack, voidptr, result);
1447	STACK_PUSHX(stack, int, PARSE_POST_UNION);
1448	/* We need to pass a boolean (eventually) to PARSE_ATOM to
1449	indicate if this is the beginning of a BRE extended branch. */
1450	STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED \| REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1451	STACK_PUSHX(stack, int, PARSE_BRANCH);
1452	ctx->re++;
1453	break;
1454
1455	case CHAR_RPAREN:
1456	ctx->re++;
1457	break;
1458
1459	default:
1460	if (!(ctx->cflags & REG_EXTENDED))
1461	ctx->re--;
1462	break;
1463	}
1464	break;
1465
1466	case PARSE_POST_UNION:
1467	{
1468	tre_ast_node_t *tmp_node;
1469	tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1470	const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1471	/* error on empty expression at end of union */
1472	if (pipechar == ctx->re - 1)
1473	{
1474	return REG_EMPTY;
1475	}
1476	tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1477	if (!tmp_node)
1478	return REG_ESPACE;
1479	result = tmp_node;
1480	break;
1481	}
1482
1483	case PARSE_POSTFIX:
1484	/* Parse postfix operators. */
1485	if (ctx->re >= ctx->re_end)
1486	break;
1487	#ifdef REG_LITERAL
1488	if (ctx->cflags & REG_LITERAL)
1489	break;
1490	#endif /* REG_LITERAL */
1491	int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1492	int rep_min = 0;
1493	int rep_max = -1;
1494	#ifdef TRE_DEBUG
1495	int lbrace_off;
1496	#endif
1497	switch (*ctx->re)
1498	{
1499	case CHAR_PLUS:
1500	case CHAR_QUESTIONMARK:
1501	if (!(ctx->cflags & REG_EXTENDED))
1502	break;
1503	/FALLTHROUGH/
1504	case CHAR_STAR:
1505	{
1506	tre_ast_node_t *tmp_node;
1507	#ifdef TRE_DEBUG
1508	const char *tstr = "star";
1509	tmp_re = ctx->re;
1510	#endif
1511
1512	handle_plus_or_question:
1513	/* error on iteration of raw assertion (not in subexpression) */
1514	if (result->type == LITERAL && result->submatch_id < 0 &&
1515	IS_ASSERTION((tre_literal_t *)result->obj))
1516	{
1517	if (!(ctx->cflags & REG_EXTENDED)) break;
1518	return REG_BADRPT;
1519	}
1520	if (*ctx->re == CHAR_PLUS)
1521	{
1522	rep_min = 1;
1523	#ifdef TRE_DEBUG
1524	tstr = "plus";
1525	#endif
1526	}
1527	if (*ctx->re == CHAR_QUESTIONMARK)
1528	{
1529	rep_max = 1;
1530	#ifdef TRE_DEBUG
1531	tstr = "questionmark";
1532	#endif
1533	}
1534
1535	if (ctx->cflags & REG_EXTENDED)
1536	{
1537	if (ctx->re + 1 < ctx->re_end)
1538	{
1539	if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1540	{
1541	/* Process the question mark only in enhanced mode.
1542	Otherwise, the question mark is an error in ERE */
1543	if (ctx->cflags & REG_ENHANCED)
1544	{
1545	minimal = !(ctx->cflags & REG_UNGREEDY);
1546	ctx->re++;
1547	}
1548	else return REG_BADRPT;
1549	}
1550	else if (*(ctx->re + 1) == CHAR_STAR
1551	\|\| *(ctx->re + 1) == CHAR_PLUS)
1552	{
1553	/* These are reserved for future extensions. */
1554	return REG_BADRPT;
1555	}
1556	}
1557	}
1558	else
1559	{
1560	if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1561	{
1562	/* This is reserved for future extensions. */
1563	return REG_BADRPT;
1564	}
1565	if (ctx->re + 2 < ctx->re_end)
1566	{
1567	if ((ctx->re + 1) == CHAR_BACKSLASH && (ctx->re + 1) == CHAR_QUESTIONMARK)
1568	{
1569	/* Process the question mark only in enhanced mode.
1570	Otherwise, the question mark is a literal in BRE */
1571	if (ctx->cflags & REG_ENHANCED)
1572	{
1573	minimal = !(ctx->cflags & REG_UNGREEDY);
1574	ctx->re += 2;
1575	}
1576	}
1577	else if ((ctx->re + 1) == CHAR_BACKSLASH && (ctx->re + 2) == CHAR_PLUS)
1578	{
1579	/* This is reserved for future extensions. */
1580	return REG_BADRPT;
1581	}
1582	}
1583	}
1584
1585	if (minimal)
1586	ctx->num_reorder_tags++;
1587
1588	DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1589	minimal ? " minimal" : "greedy", tstr, REST(tmp_re)));
1590	if (result == NULL)
1591	{
1592	if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1593	else goto parse_literal;
1594	}
1595	ctx->re++;
1596	tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1597	minimal);
1598	if (tmp_node == NULL)
1599	return REG_ESPACE;
1600	result = tmp_node;
1601
1602	/* Set the iterator with a submatch id in the invisible range
1603	* (which will be overridden if a real submatch is needed) */
1604	result->submatch_id = ctx->submatch_id_invisible++;
1605
1606	#if 0
1607	/* We don't allow multiple postfixes, but this might be needed
1608	to support approximate matching */
1609	STACK_PUSHX(stack, int, PARSE_POSTFIX);
1610	#endif
1611	}
1612	break;
1613
1614	case CHAR_BACKSLASH:
1615	/* "\{" is special without REG_EXTENDED */
1616	/* "\+" and "\?" are special with REG_ENHANCED for BRE */
1617	if (!(ctx->cflags & REG_EXTENDED)
1618	&& ctx->re + 1 < ctx->re_end)
1619	{
1620	switch (*(ctx->re + 1))
1621	{
1622	case CHAR_LBRACE:
1623	ctx->re++;
1624	#ifdef TRE_DEBUG
1625	lbrace_off = 2;
1626	#endif
1627	goto parse_brace;
1628	case CHAR_PLUS:
1629	case CHAR_QUESTIONMARK:
1630	if (ctx->cflags & REG_ENHANCED)
1631	{
1632	#ifdef TRE_DEBUG
1633	tmp_re = ctx->re;
1634	#endif
1635	ctx->re++;
1636	goto handle_plus_or_question;
1637	}
1638	break;
1639	}
1640	break;
1641	}
1642	else
1643	break;
1644
1645	case CHAR_LBRACE:
1646	{
1647	int raw_assertion;
1648
1649	/* "{" is literal without REG_EXTENDED */
1650	if (!(ctx->cflags & REG_EXTENDED))
1651	break;
1652	#ifdef TRE_DEBUG
1653	lbrace_off = 1;
1654	#endif
1655
1656	parse_brace:
1657	/* error on iteration of raw assertion (not in subexpression),
1658	but wait until after parsing bounds */
1659	raw_assertion = (result->type == LITERAL
1660	&& result->submatch_id < 0
1661	&& IS_ASSERTION((tre_literal_t *)result->obj));
1662	ctx->re++;
1663
1664	status = tre_parse_bound(ctx, &result);
1665	#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1666	/* For ERE, if status is REG_NOMATCH, this mean the lbrace
1667	is to be treated as a literal. */
1668	if (status == REG_NOMATCH)
1669	{
1670	ctx->re--;
1671	break;
1672	}
1673	#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1674	DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1675	REST(ctx->re - lbrace_off)));
1676	if (status != REG_OK)
1677	return status;
1678	if (raw_assertion) return REG_BADRPT;
1679
1680	/* Set the iterator with a submatch id in the invisible range
1681	* (which will be overridden if a real submatch is needed) */
1682	if (result->type == ITERATION)
1683	result->submatch_id = ctx->submatch_id_invisible++;
1684
1685	#if 0
1686	/* We don't allow multiple postfixes, but this might be needed
1687	to support approximate matching */
1688	STACK_PUSHX(stack, int, PARSE_POSTFIX);
1689	#endif
1690	break;
1691	}
1692	}
1693	break;
1694
1695	case PARSE_ATOM:
1696	{
1697	/* Parse an atom. An atom is a regular expression enclosed in `()',
1698	an empty set of `()', a bracket expression, `.', `^', `$',
1699	a `\' followed by a character, or a single character. */
1700
1701	/* The stack contains a boolean value, whether PARSE_ATOM is
1702	being called just after the start of a group (left paren)
1703	in a BRE */
1704	bre_branch_begin = tre_stack_pop_int(stack);
1705
1706	/* End of regexp? (empty string). */
1707	if (ctx->re >= ctx->re_end)
1708	goto parse_literal;
1709
1710	#ifdef REG_LITERAL
1711	if (ctx->cflags & REG_LITERAL)
1712	goto parse_literal;
1713	#endif /* REG_LITERAL */
1714
1715	switch (*ctx->re)
1716	{
1717	case CHAR_LPAREN: /* parenthesized subexpression */
1718
1719	/* Handle "(?...)" extensions. They work in a way similar
1720	to Perls corresponding extensions. */
1721	if ((ctx->cflags & (REG_EXTENDED\|REG_ENHANCED)) ==
1722	(REG_EXTENDED\|REG_ENHANCED)
1723	&& *(ctx->re + 1) == CHAR_QUESTIONMARK)
1724	{
1725	int new_cflags = ctx->cflags;
1726	int bit = 1;
1727	int invisible_submatch = 0;
1728	DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1729	REST(ctx->re)));
1730	ctx->re += 2;
1731	while (/CONSTCOND/1)
1732	{
1733	if (*ctx->re == L'i')
1734	{
1735	DPRINT(("tre_parse: icase: '%.*" STRF "'\n",
1736	REST(ctx->re)));
1737	if (bit)
1738	new_cflags \|= REG_ICASE;
1739	else
1740	new_cflags &= ~REG_ICASE;
1741	ctx->re++;
1742	}
1743	else if (*ctx->re == L'n')
1744	{
1745	DPRINT(("tre_parse: newline: '%.*" STRF "'\n",
1746	REST(ctx->re)));
1747	if (bit)
1748	new_cflags \|= REG_NEWLINE;
1749	else
1750	new_cflags &= ~REG_NEWLINE;
1751	ctx->re++;
1752	}
1753	#ifdef REG_LEFT_ASSOC
1754	else if (*ctx->re == L'l')
1755	{
1756	DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1757	REST(ctx->re)));
1758	if (bit)
1759	new_cflags \|= REG_LEFT_ASSOC;
1760	else
1761	new_cflags &= ~REG_LEFT_ASSOC;
1762	ctx->re++;
1763	}
1764	#endif /* REG_LEFT_ASSOC */
1765	#ifdef REG_UNGREEDY
1766	else if (*ctx->re == L'U')
1767	{
1768	DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n",
1769	REST(ctx->re)));
1770	if (bit)
1771	new_cflags \|= REG_UNGREEDY;
1772	else
1773	new_cflags &= ~REG_UNGREEDY;
1774	ctx->re++;
1775	}
1776	#endif /* REG_UNGREEDY */
1777	else if (*ctx->re == CHAR_MINUS)
1778	{
1779	DPRINT(("tre_parse: turn off: '%.*" STRF "'\n",
1780	REST(ctx->re)));
1781	ctx->re++;
1782	bit = 0;
1783	}
1784	else if (*ctx->re == CHAR_COLON)
1785	{
1786	DPRINT(("tre_parse: no group: '%.*" STRF
1787	"', (invisible submatch %d)\n",
1788	REST(ctx->re), ctx->submatch_id_invisible));
1789	ctx->re++;
1790	depth++;
1791	invisible_submatch = 1;
1792	break;
1793	}
1794	else if (*ctx->re == CHAR_HASH)
1795	{
1796	DPRINT(("tre_parse: comment: '%.*" STRF "'\n",
1797	REST(ctx->re)));
1798	/* A comment can contain any character except a
1799	right parenthesis */
1800	while (*ctx->re != CHAR_RPAREN
1801	&& ctx->re < ctx->re_end)
1802	ctx->re++;
1803	if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1804	{
1805	ctx->re++;
1806	break;
1807	}
1808	else
1809	return REG_BADPAT;
1810	}
1811	else if (*ctx->re == CHAR_RPAREN)
1812	{
1813	ctx->re++;
1814	break;
1815	}
1816	else
1817	return REG_BADRPT;
1818	}
1819
1820	/* Turn on the cflags changes for the rest of the
1821	enclosing group. */
1822	if (invisible_submatch)
1823	{
1824	STACK_PUSHX(stack, int, ctx->cflags);
1825	STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1826	STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1827	ctx->submatch_id_invisible++;
1828	STACK_PUSHX(stack, int, 0); // bre_branch_begin
1829	STACK_PUSHX(stack, int, PARSE_RE);
1830	}
1831	else {
1832	STACK_PUSHX(stack, int, 0); // bre_branch_begin
1833	STACK_PUSHX(stack, int, PARSE_ATOM);
1834	}
1835	ctx->cflags = new_cflags;
1836	break;
1837	}
1838
1839	if (ctx->cflags & REG_EXTENDED)
1840	{
1841	parse_bre_lparen:
1842	DPRINT(("tre_parse: group begin: '%.*" STRF
1843	"', submatch %d\n", REST(ctx->re),
1844	ctx->submatch_id));
1845	ctx->re++;
1846	/* First parse a whole RE, then mark the resulting tree
1847	for submatching. */
1848	STACK_PUSHX(stack, int, ctx->cflags);
1849	STACK_PUSHX(stack, int, ctx->submatch_id);
1850	STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1851	/* We need to pass a boolean (eventually) to PARSE_ATOM to
1852	indicate if this is the beginning of a BRE group. */
1853	STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1854	STACK_PUSHX(stack, int, PARSE_RE);
1855	ctx->submatch_id++;
1856	depth++;
1857	}
1858	else
1859	goto parse_literal;
1860	break;
1861
1862	case CHAR_RPAREN: /* end of current subexpression */
1863	if (ctx->cflags & REG_EXTENDED && depth > 0)
1864	{
1865	parse_bre_rparen_empty:
1866	if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1867	return REG_EPAREN;
1868	DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1869	REST(ctx->re)));
1870	/* We were expecting an atom, but instead the current
1871	subexpression was closed. POSIX leaves the meaning of
1872	this to be implementation-defined. We interpret this as
1873	an empty expression (which matches an empty string). */
1874	result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1875	if (result == NULL)
1876	return REG_ESPACE;
1877	if (!(ctx->cflags & REG_EXTENDED))
1878	ctx->re--;
1879	}
1880	else
1881	goto parse_literal;
1882	break;
1883
1884	case CHAR_LBRACKET: /* bracket expression */
1885	DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1886	REST(ctx->re)));
1887	ctx->re++;
1888	status = tre_parse_bracket(ctx, &result);
1889	if (status != REG_OK)
1890	return status;
1891	break;
1892
1893	case CHAR_BACKSLASH:
1894	/* Deal with "$", "$" or "\{" for BREs */
1895	if (!(ctx->cflags & REG_EXTENDED)
1896	&& ctx->re + 1 < ctx->re_end)
1897	{
1898	if (*(ctx->re + 1) == CHAR_LPAREN)
1899	{
1900	ctx->re++;
1901	goto parse_bre_lparen;
1902	}
1903	else if (*(ctx->re + 1) == CHAR_RPAREN)
1904	{
1905	ctx->re++;
1906	goto parse_bre_rparen_empty;
1907	}
1908	if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1909	}
1910
1911	if (ctx->re + 1 >= ctx->re_end)
1912	/* Trailing backslash. */
1913	return REG_EESCAPE;
1914
1915	if (!(ctx->cflags & REG_ENHANCED))
1916	{
1917	DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1918	ctx->re++;
1919	goto unenhanced_backslash;
1920	}
1921
1922	/* If a macro is used, parse the expanded macro recursively. */
1923	{
1924	tre_char_t buf[64];
1925	tre_expand_macro(ctx->re + 1, ctx->re_end,
1926	buf, elementsof(buf));
1927	if (buf[0] != 0)
1928	{
1929	tre_parse_ctx_t subctx;
1930	memcpy(&subctx, ctx, sizeof(subctx));
1931	subctx.re = buf;
1932	subctx.len = tre_strlen(buf);
1933	subctx.nofirstsub = 1;
1934	status = tre_parse(&subctx);
1935	if (status != REG_OK)
1936	return status;
1937	ctx->re += 2;
1938	ctx->position = subctx.position;
1939	result = subctx.result;
1940	break;
1941	}
1942	}
1943
1944	#ifdef REG_LITERAL
1945	if (*(ctx->re + 1) == L'Q')
1946	{
1947	DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1948	REST(ctx->re)));
1949	ctx->cflags \|= REG_LITERAL;
1950	temporary_cflags \|= REG_LITERAL;
1951	ctx->re += 2;
1952	STACK_PUSHX(stack, int, 0);
1953	STACK_PUSHX(stack, int, PARSE_ATOM);
1954	break;
1955	}
1956	#endif /* REG_LITERAL */
1957
1958	DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1959	ctx->re++;
1960	switch (*ctx->re)
1961	{
1962	case L'b':
1963	result = tre_ast_new_literal(ctx->mem, ASSERTION,
1964	ASSERT_AT_WB, -1);
1965	ctx->re++;
1966	break;
1967	case L'B':
1968	result = tre_ast_new_literal(ctx->mem, ASSERTION,
1969	ASSERT_AT_WB_NEG, -1);
1970	ctx->re++;
1971	break;
1972	case L'<':
1973	result = tre_ast_new_literal(ctx->mem, ASSERTION,
1974	ASSERT_AT_BOW, -1);
1975	ctx->re++;
1976	break;
1977	case L'>':
1978	result = tre_ast_new_literal(ctx->mem, ASSERTION,
1979	ASSERT_AT_EOW, -1);
1980	ctx->re++;
1981	break;
1982	case L'x':
1983	ctx->re++;
1984	if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1985	{
1986	/* 8 bit hex char. */
1987	char tmp[3] = {0, 0, 0};
1988	long val;
1989	DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1990	REST(ctx->re - 2)));
1991
1992	if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1993	ctx->re < ctx->re_end)
1994	{
1995	tmp[0] = (char)ctx->re[0];
1996	ctx->re++;
1997	}
1998	if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1999	ctx->re < ctx->re_end)
2000	{
2001	tmp[1] = (char)ctx->re[0];
2002	ctx->re++;
2003	}
2004	val = strtol(tmp, NULL, 16);
2005	result = tre_ast_new_literal(ctx->mem, (int)val,
2006	(int)val, ctx->position);
2007	ctx->position++;
2008	break;
2009	}
2010	else if (ctx->re < ctx->re_end)
2011	{
2012	/* Wide char. */
2013	char tmp[32];
2014	long val;
2015	int i = 0;
2016	ctx->re++;
2017	while (ctx->re_end - ctx->re >= 0)
2018	{
2acb8998 A	2019	if (i == sizeof(tmp))
2acb8998 A	2020	return REG_EBRACE;
ad3c9f2a A	2021	if (ctx->re[0] == CHAR_RBRACE)
	2022	break;
	2023	if (tre_isxdigit_l(ctx->re[0], ctx->loc))
	2024	{
	2025	tmp[i] = (char)ctx->re[0];
	2026	i++;
	2027	ctx->re++;
	2028	continue;
	2029	}
	2030	return REG_EBRACE;
	2031	}
	2032	ctx->re++;
	2033	tmp[i] = 0;
	2034	val = strtol(tmp, NULL, 16);
	2035	result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
	2036	ctx->position);
	2037	ctx->position++;
	2038	break;
	2039	}
	2040	/FALLTHROUGH/
	2041
	2042	default:
	2043	unenhanced_backslash:
	2044	if ((ctx->cflags & (REG_EXTENDED \| REG_ENHANCED)) !=
	2045	REG_EXTENDED &&
	2046	tre_isdigit_l(ctx->re, ctx->loc) && ctx->re != L'0')
	2047	{
	2048	/* Back reference (only in BRE or enhanced). */
	2049	int val = *ctx->re - L'0';
	2050	DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
	2051	REST(ctx->re - 1)));
	2052	result = tre_ast_new_literal(ctx->mem, BACKREF, val,
	2053	ctx->position);
	2054	if (result == NULL)
	2055	return REG_ESPACE;
	2056
	2057	/* Set the backref with a submatch id in the invisible
	2058	* range (which will be overridden if a real submatch
	2059	* is needed) */
	2060	result->submatch_id = ctx->submatch_id_invisible++;
	2061
	2062	ctx->position++;
	2063	ctx->num_reorder_tags++;
	2064	ctx->max_backref = MAX(val, ctx->max_backref);
	2065	ctx->re++;
	2066	}
	2067	else
	2068	{
	2069	/* Escaped character. */
	2070	DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
	2071	REST(ctx->re - 1)));
	2072	result = tre_ast_new_literal(ctx->mem, ctx->re, ctx->re,
	2073	ctx->position);
	2074	ctx->position++;
	2075	ctx->re++;
	2076	}
	2077	break;
	2078	}
	2079	if (result == NULL)
	2080	return REG_ESPACE;
	2081	break;
	2082
	2083	case CHAR_PERIOD: /* the any-symbol */
	2084	DPRINT(("tre_parse: any: '%.*" STRF "'\n",
2085	REST(ctx->re)));
2086	if (ctx->cflags & REG_NEWLINE)
2087	{
2088	tre_ast_node_t *tmp1;
2089	tre_ast_node_t *tmp2;
2090	tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
2091	ctx->position);
2092	if (!tmp1)
2093	return REG_ESPACE;
2094	tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2095	ctx->position + 1);
2096	if (!tmp2)
2097	return REG_ESPACE;
2098	result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2099	if (!result)
2100	return REG_ESPACE;
2101	ctx->position += 2;
2102	}
2103	else
2104	{
2105	result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2106	ctx->position);
2107	if (!result)
2108	return REG_ESPACE;
2109	ctx->position++;
2110	}
2111	ctx->re++;
2112	break;
2113
2114	case CHAR_CARET: /* beginning of line assertion */
2115	/* '^' has a special meaning everywhere in EREs, at the
2116	beginning of the RE and after \( is BREs. It is also
2117	special in enhanced BREs at the beginning of each branches
2118	of a union */
2119	if (ctx->cflags & REG_EXTENDED
2120	\|\| bre_branch_begin
2121	\|\| ctx->re == ctx->re_start)
2122	{
2123	DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
2124	REST(ctx->re)));
2125	result = tre_ast_new_literal(ctx->mem, ASSERTION,
2126	ASSERT_AT_BOL, -1);
2127	if (result == NULL)
2128	return REG_ESPACE;
2129	ctx->re++;
2130	}
2131	else
2132	goto parse_literal;
2133	break;
2134
2135	case CHAR_DOLLAR: /* end of line assertion. */
2136	/* '$' is special everywhere in EREs, and in the end of the
2137	string and before \) is BREs. */
2138	if (ctx->cflags & REG_EXTENDED
2139	\|\| (ctx->re + 2 < ctx->re_end
2140	&& *(ctx->re + 1) == CHAR_BACKSLASH
2141	&& *(ctx->re + 2) == CHAR_RPAREN)
2142	\|\| ctx->re + 1 == ctx->re_end)
2143	{
2144	DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
2145	REST(ctx->re)));
2146	result = tre_ast_new_literal(ctx->mem, ASSERTION,
2147	ASSERT_AT_EOL, -1);
2148	if (result == NULL)
2149	return REG_ESPACE;
2150	ctx->re++;
2151	}
2152	else
2153	goto parse_literal;
2154	break;
2155
2156	default:
2157	parse_literal:
2158
2159	if (temporary_cflags && ctx->re + 1 < ctx->re_end
2160	&& ctx->re == CHAR_BACKSLASH && (ctx->re + 1) == L'E')
2161	{
2162	DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
2163	REST(ctx->re)));
2164	ctx->cflags &= ~temporary_cflags;
2165	temporary_cflags = 0;
2166	ctx->re += 2;
2167	if (ctx->re < ctx->re_end)
2168	{
2169	STACK_PUSHX(stack, int, 0);
2170	STACK_PUSHX(stack, int, PARSE_ATOM);
2171	}
2172	else
2173	{
2174	result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2175	if (!result) return REG_ESPACE;
2176	}
2177	break;
2178	}
2179
2180
2181	/* We are expecting an atom. If the subexpression (or the whole
2182	regexp ends here, we interpret it as an empty expression
2183	(which matches an empty string), which is an error.
2184	Iterations of an empty expression is also an error. */
2185	#ifdef REG_LITERAL
2186	if (!(ctx->cflags & REG_LITERAL))
2187	{
2188	#endif /* REG_LITERAL */
2189	/* error on end of string */
2190	if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2191	: REG_EMPTY;
2192	/* error on unions and iterations of empty expressions */
2193	if (ctx->cflags & REG_EXTENDED)
2194	{
2195	if (ctx->re < ctx->re_end)
2196	{
2197	if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2198	if (*ctx->re == CHAR_LBRACE)
2199	{
2200	ctx->re++;
2201	empty_parse_bound:
2202	/* We need to parse the bound first and return
2203	any error, before returning REG_BADRPT */
2204	status = tre_parse_bound(ctx, NULL);
2205	#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2206	/* For ERE, if REG_NOMATCH is returned, we
2207	treat the lbrace as a literal. */
2208	if (status == REG_NOMATCH)
2209	{
2210	ctx->re--;
2211	/* Drop down to literal-handling code */
2212	}
2213	else
2214	{
2215	#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2216	if (status != REG_OK)
2217	return status;
2218	return REG_BADRPT;
2219	#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2220	}
2221	#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2222	}
2223	#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2224	else
2225	#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2226	if (*ctx->re == CHAR_STAR
2227	\|\| *ctx->re == CHAR_PLUS
2228	\|\| *ctx->re == CHAR_QUESTIONMARK)
2229	{
2230	return REG_BADRPT;
2231	}
2232	}
2233	}
2234	else if (ctx->re + 1 < ctx->re_end
2235	&& *ctx->re == CHAR_BACKSLASH
2236	&& *(ctx->re + 1) == CHAR_LBRACE)
2237	{
2238	ctx->re += 2;
2239	goto empty_parse_bound;
2240	}
2241	#ifdef REG_LITERAL
2242	}
2243	#endif /* REG_LITERAL */
2244
2245	DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
2246	REST(ctx->re)));
2247	/* Note that we can't use an tre_isalpha() test here, since there
2248	may be characters which are alphabetic but neither upper or
2249	lower case. */
2250	if (ctx->cflags & REG_ICASE
2251	&& (tre_isupper_l(*ctx->re, ctx->loc) \|\|
2252	tre_islower_l(*ctx->re, ctx->loc)))
2253	{
2254	tre_ast_node_t *tmp1;
2255	tre_ast_node_t *tmp2;
2256
2257	/* XXX - Can there be more than one opposite-case
2258	counterpoints for some character in some locale? Or
2259	more than two characters which all should be regarded
2260	the same character if case is ignored? If yes, there
2261	does not seem to be a portable way to detect it. I guess
2262	that at least for multi-character collating elements there
2263	could be several opposite-case counterpoints, but they
2264	cannot be supported portably anyway. */
2265	tmp1 = tre_ast_new_literal(ctx->mem,
2266	tre_toupper_l(*ctx->re, ctx->loc),
2267	tre_toupper_l(*ctx->re, ctx->loc),
2268	ctx->position);
2269	if (!tmp1)
2270	return REG_ESPACE;
2271	tmp2 = tre_ast_new_literal(ctx->mem,
2272	tre_tolower_l(*ctx->re, ctx->loc),
2273	tre_tolower_l(*ctx->re, ctx->loc),
2274	ctx->position);
2275	if (!tmp2)
2276	return REG_ESPACE;
2277	result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2278	if (!result)
2279	return REG_ESPACE;
2280	}
2281	else
2282	{
2283	result = tre_ast_new_literal(ctx->mem, ctx->re, ctx->re,
2284	ctx->position);
2285	if (!result)
2286	return REG_ESPACE;
2287	}
2288	ctx->position++;
2289	ctx->re++;
2290	break;
2291	}
2292	break;
2293	}
2294
2295	case PARSE_MARK_FOR_SUBMATCH:
2296	{
2297	int submatch_id = tre_stack_pop_int(stack);
2298
2299	ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2300	if (result->submatch_id >= 0 &&
2301	result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
2302	{
2303	tre_ast_node_t n, tmp_node;
2304	if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2305	break;
2306	n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2307	if (n == NULL)
2308	return REG_ESPACE;
2309	tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2310	if (tmp_node == NULL)
2311	return REG_ESPACE;
2312	tmp_node->num_submatches = result->num_submatches;
2313	result = tmp_node;
2314	}
2315	result->submatch_id = submatch_id;
2316	if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2317	result->num_submatches++;
2318	break;
2319	}
2320
2321	default:
2322	assert(0);
2323	break;
2324	}
2325	}
2326
2327	/* Check for missing closing parentheses. */
2328	if (depth > 0)
2329	return REG_EPAREN;
2330
2331	ctx->result = result;
2332
2333	return REG_OK;
2334	}
2335
2336	/* EOF */