From 38d679fdb55d39e08b90cc703a44ef72273abdb3 Mon Sep 17 00:00:00 2001
From: Ryan Norton <wxprojects@comcast.net>
Date: Wed, 13 Oct 1999 02:22:17 +0000
Subject: [PATCH] Tcl regex lib

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@3949 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
---
 src/regex/regc_lex.c | 1146 ++++++++++++++++++++++++++++++++++++++++++
 src/regex/regerrs.h  |   75 +++
 src/regex/regguts.h  |  417 +++++++++++++++
 3 files changed, 1638 insertions(+)
 create mode 100644 src/regex/regc_lex.c
 create mode 100644 src/regex/regerrs.h
 create mode 100644 src/regex/regguts.h

diff --git a/src/regex/regc_lex.c b/src/regex/regc_lex.c
new file mode 100644
index 0000000000..a24290d1a1
--- /dev/null
+++ b/src/regex/regc_lex.c
@@ -0,0 +1,1146 @@
+/*
+ * lexical analyzer
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.	All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header$
+ *
+ */
+
+/* scanning macros (know about v) */
+#define ATEOS()		(v->now >= v->stop)
+#define HAVE(n)		(v->stop - v->now >= (n))
+#define NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
+#define NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
+#define NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \
+						*(v->now+1) == CHR(b) && \
+						*(v->now+2) == CHR(c))
+#define SET(c)		(v->nexttype = (c))
+#define SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
+#define RET(c)		return (SET(c), 1)
+#define RETV(c, n)	return (SETV(c, n), 1)
+#define FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
+#define LASTTYPE(t) (v->lasttype == (t))
+
+/* lexical contexts */
+#define L_ERE	1				/* mainline ERE/ARE */
+#define L_BRE	2				/* mainline BRE */
+#define L_Q 3					/* REG_QUOTE */
+#define L_EBND	4				/* ERE/ARE bound */
+#define L_BBND	5				/* BRE bound */
+#define L_BRACK 6				/* brackets */
+#define L_CEL	7				/* collating element */
+#define L_ECL	8				/* equivalence class */
+#define L_CCL	9				/* character class */
+#define INTOCON(c)	(v->lexcon = (c))
+#define INCON(con)	(v->lexcon == (con))
+
+/* construct pointer past end of chr array */
+#define ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
+
+/*
+ * lexstart - set up lexical stuff, scan leading options
+ */
+static void
+lexstart(struct vars * v)
+{
+	prefixes(v);				/* may turn on new type bits etc. */
+	NOERR();
+
+	if (v->cflags & REG_QUOTE)
+	{
+		assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
+		INTOCON(L_Q);
+	}
+	else if (v->cflags & REG_EXTENDED)
+	{
+		assert(!(v->cflags & REG_QUOTE));
+		INTOCON(L_ERE);
+	}
+	else
+	{
+		assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
+		INTOCON(L_BRE);
+	}
+
+	v->nexttype = EMPTY;		/* remember we were at the start */
+	next(v);					/* set up the first token */
+}
+
+/*
+ * prefixes - implement various special prefixes
+ */
+static void
+prefixes(struct vars * v)
+{
+	/* literal string doesn't get any of this stuff */
+	if (v->cflags & REG_QUOTE)
+		return;
+
+	/* initial "***" gets special things */
+	if (HAVE(4) && NEXT3('*', '*', '*'))
+		switch (*(v->now + 3))
+		{
+			case CHR('?'):		/* "***?" error, msg shows version */
+				ERR(REG_BADPAT);
+				return;			/* proceed no further */
+				break;
+			case CHR('='):		/* "***=" shifts to literal string */
+				NOTE(REG_UNONPOSIX);
+				v->cflags |= REG_QUOTE;
+				v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
+				v->now += 4;
+				return;			/* and there can be no more prefixes */
+				break;
+			case CHR(':'):		/* "***:" shifts to AREs */
+				NOTE(REG_UNONPOSIX);
+				v->cflags |= REG_ADVANCED;
+				v->now += 4;
+				break;
+			default:			/* otherwise *** is just an error */
+				ERR(REG_BADRPT);
+				return;
+				break;
+		}
+
+	/* BREs and EREs don't get embedded options */
+	if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
+		return;
+
+	/* embedded options (AREs only) */
+	if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
+	{
+		NOTE(REG_UNONPOSIX);
+		v->now += 2;
+		for (; !ATEOS() && iscalpha(*v->now); v->now++)
+			switch (*v->now)
+			{
+				case CHR('b'):	/* BREs (but why???) */
+					v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+					break;
+				case CHR('c'):	/* case sensitive */
+					v->cflags &= ~REG_ICASE;
+					break;
+				case CHR('e'):	/* plain EREs */
+					v->cflags |= REG_EXTENDED;
+					v->cflags &= ~(REG_ADVF | REG_QUOTE);
+					break;
+				case CHR('i'):	/* case insensitive */
+					v->cflags |= REG_ICASE;
+					break;
+				case CHR('m'):	/* Perloid synonym for n */
+				case CHR('n'):	/* \n affects ^ $ . [^ */
+					v->cflags |= REG_NEWLINE;
+					break;
+				case CHR('p'):	/* ~Perl, \n affects . [^ */
+					v->cflags |= REG_NLSTOP;
+					v->cflags &= ~REG_NLANCH;
+					break;
+				case CHR('q'):	/* literal string */
+					v->cflags |= REG_QUOTE;
+					v->cflags &= ~REG_ADVANCED;
+					break;
+				case CHR('s'):	/* single line, \n ordinary */
+					v->cflags &= ~REG_NEWLINE;
+					break;
+				case CHR('t'):	/* tight syntax */
+					v->cflags &= ~REG_EXPANDED;
+					break;
+				case CHR('w'):	/* weird, \n affects ^ $ only */
+					v->cflags &= ~REG_NLSTOP;
+					v->cflags |= REG_NLANCH;
+					break;
+				case CHR('x'):	/* expanded syntax */
+					v->cflags |= REG_EXPANDED;
+					break;
+				default:
+					ERR(REG_BADOPT);
+					return;
+			}
+		if (!NEXT1(')'))
+		{
+			ERR(REG_BADOPT);
+			return;
+		}
+		v->now++;
+		if (v->cflags & REG_QUOTE)
+			v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
+	}
+}
+
+/*
+ * lexnest - "call a subroutine", interpolating string at the lexical level
+ *
+ * Note, this is not a very general facility.  There are a number of
+ * implicit assumptions about what sorts of strings can be subroutines.
+ */
+static void
+lexnest(struct vars * v,
+		chr *beginp,			/* start of interpolation */
+		chr *endp)				/* one past end of interpolation */
+{
+	assert(v->savenow == NULL); /* only one level of nesting */
+	v->savenow = v->now;
+	v->savestop = v->stop;
+	v->now = beginp;
+	v->stop = endp;
+}
+
+/*
+ * string constants to interpolate as expansions of things like \d
+ */
+static chr	backd[] = {			/* \d */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr	backD[] = {			/* \D */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr	brbackd[] = {		/* \d within brackets */
+	CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']')
+};
+static chr	backs[] = {			/* \s */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr	backS[] = {			/* \S */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr	brbacks[] = {		/* \s within brackets */
+	CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']')
+};
+static chr	backw[] = {			/* \w */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static chr	backW[] = {			/* \W */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static chr	brbackw[] = {		/* \w within brackets */
+	CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_')
+};
+
+/*
+ * lexword - interpolate a bracket expression for word characters
+ * Possibly ought to inquire whether there is a "word" character class.
+ */
+static void
+lexword(struct vars * v)
+{
+	lexnest(v, backw, ENDOF(backw));
+}
+
+/*
+ * next - get next token
+ */
+static int						/* 1 normal, 0 failure */
+next(struct vars * v)
+{
+	chr			c;
+
+	/* errors yield an infinite sequence of failures */
+	if (ISERR())
+		return 0;				/* the error has set nexttype to EOS */
+
+	/* remember flavor of last token */
+	v->lasttype = v->nexttype;
+
+	/* REG_BOSONLY */
+	if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
+	{
+		/* at start of a REG_BOSONLY RE */
+		RETV(SBEGIN, 0);		/* same as \A */
+	}
+
+	/* if we're nested and we've hit end, return to outer level */
+	if (v->savenow != NULL && ATEOS())
+	{
+		v->now = v->savenow;
+		v->stop = v->savestop;
+		v->savenow = v->savestop = NULL;
+	}
+
+	/* skip white space etc. if appropriate (not in literal or []) */
+	if (v->cflags & REG_EXPANDED)
+		switch (v->lexcon)
+		{
+			case L_ERE:
+			case L_BRE:
+			case L_EBND:
+			case L_BBND:
+				skip(v);
+				break;
+		}
+
+	/* handle EOS, depending on context */
+	if (ATEOS())
+	{
+		switch (v->lexcon)
+		{
+			case L_ERE:
+			case L_BRE:
+			case L_Q:
+				RET(EOS);
+				break;
+			case L_EBND:
+			case L_BBND:
+				FAILW(REG_EBRACE);
+				break;
+			case L_BRACK:
+			case L_CEL:
+			case L_ECL:
+			case L_CCL:
+				FAILW(REG_EBRACK);
+				break;
+		}
+		assert(NOTREACHED);
+	}
+
+	/* okay, time to actually get a character */
+	c = *v->now++;
+
+	/* deal with the easy contexts, punt EREs to code below */
+	switch (v->lexcon)
+	{
+		case L_BRE:				/* punt BREs to separate function */
+			return brenext(v, c);
+			break;
+		case L_ERE:				/* see below */
+			break;
+		case L_Q:				/* literal strings are easy */
+			RETV(PLAIN, c);
+			break;
+		case L_BBND:			/* bounds are fairly simple */
+		case L_EBND:
+			switch (c)
+			{
+				case CHR('0'):
+				case CHR('1'):
+				case CHR('2'):
+				case CHR('3'):
+				case CHR('4'):
+				case CHR('5'):
+				case CHR('6'):
+				case CHR('7'):
+				case CHR('8'):
+				case CHR('9'):
+					RETV(DIGIT, (chr) DIGITVAL(c));
+					break;
+				case CHR(','):
+					RET(',');
+					break;
+				case CHR('}'):	/* ERE bound ends with } */
+					if (INCON(L_EBND))
+					{
+						INTOCON(L_ERE);
+						if ((v->cflags & REG_ADVF) && NEXT1('?'))
+						{
+							v->now++;
+							NOTE(REG_UNONPOSIX);
+							RETV('}', 0);
+						}
+						RETV('}', 1);
+					}
+					else
+						FAILW(REG_BADBR);
+					break;
+				case CHR('\\'):	/* BRE bound ends with \} */
+					if (INCON(L_BBND) && NEXT1('}'))
+					{
+						v->now++;
+						INTOCON(L_BRE);
+						RET('}');
+					}
+					else
+						FAILW(REG_BADBR);
+					break;
+				default:
+					FAILW(REG_BADBR);
+					break;
+			}
+			assert(NOTREACHED);
+			break;
+		case L_BRACK:			/* brackets are not too hard */
+			switch (c)
+			{
+				case CHR(']'):
+					if (LASTTYPE('['))
+						RETV(PLAIN, c);
+					else
+					{
+						INTOCON((v->cflags & REG_EXTENDED) ?
+								L_ERE : L_BRE);
+						RET(']');
+					}
+					break;
+				case CHR('\\'):
+					NOTE(REG_UBBS);
+					if (!(v->cflags & REG_ADVF))
+						RETV(PLAIN, c);
+					NOTE(REG_UNONPOSIX);
+					if (ATEOS())
+						FAILW(REG_EESCAPE);
+					(DISCARD) lexescape(v);
+					switch (v->nexttype)
+					{			/* not all escapes okay here */
+						case PLAIN:
+							return 1;
+							break;
+						case CCLASS:
+							switch (v->nextvalue)
+							{
+								case 'd':
+									lexnest(v, brbackd, ENDOF(brbackd));
+									break;
+								case 's':
+									lexnest(v, brbacks, ENDOF(brbacks));
+									break;
+								case 'w':
+									lexnest(v, brbackw, ENDOF(brbackw));
+									break;
+								default:
+									FAILW(REG_EESCAPE);
+									break;
+							}
+							/* lexnest done, back up and try again */
+							v->nexttype = v->lasttype;
+							return next(v);
+							break;
+					}
+					/* not one of the acceptable escapes */
+					FAILW(REG_EESCAPE);
+					break;
+				case CHR('-'):
+					if (LASTTYPE('[') || NEXT1(']'))
+						RETV(PLAIN, c);
+					else
+						RETV(RANGE, c);
+					break;
+				case CHR('['):
+					if (ATEOS())
+						FAILW(REG_EBRACK);
+					switch (*v->now++)
+					{
+						case CHR('.'):
+							INTOCON(L_CEL);
+							/* might or might not be locale-specific */
+							RET(COLLEL);
+							break;
+						case CHR('='):
+							INTOCON(L_ECL);
+							NOTE(REG_ULOCALE);
+							RET(ECLASS);
+							break;
+						case CHR(':'):
+							INTOCON(L_CCL);
+							NOTE(REG_ULOCALE);
+							RET(CCLASS);
+							break;
+						default:		/* oops */
+							v->now--;
+							RETV(PLAIN, c);
+							break;
+					}
+					assert(NOTREACHED);
+					break;
+				default:
+					RETV(PLAIN, c);
+					break;
+			}
+			assert(NOTREACHED);
+			break;
+		case L_CEL:				/* collating elements are easy */
+			if (c == CHR('.') && NEXT1(']'))
+			{
+				v->now++;
+				INTOCON(L_BRACK);
+				RETV(END, '.');
+			}
+			else
+				RETV(PLAIN, c);
+			break;
+		case L_ECL:				/* ditto equivalence classes */
+			if (c == CHR('=') && NEXT1(']'))
+			{
+				v->now++;
+				INTOCON(L_BRACK);
+				RETV(END, '=');
+			}
+			else
+				RETV(PLAIN, c);
+			break;
+		case L_CCL:				/* ditto character classes */
+			if (c == CHR(':') && NEXT1(']'))
+			{
+				v->now++;
+				INTOCON(L_BRACK);
+				RETV(END, ':');
+			}
+			else
+				RETV(PLAIN, c);
+			break;
+		default:
+			assert(NOTREACHED);
+			break;
+	}
+
+	/* that got rid of everything except EREs and AREs */
+	assert(INCON(L_ERE));
+
+	/* deal with EREs and AREs, except for backslashes */
+	switch (c)
+	{
+		case CHR('|'):
+			RET('|');
+			break;
+		case CHR('*'):
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				v->now++;
+				NOTE(REG_UNONPOSIX);
+				RETV('*', 0);
+			}
+			RETV('*', 1);
+			break;
+		case CHR('+'):
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				v->now++;
+				NOTE(REG_UNONPOSIX);
+				RETV('+', 0);
+			}
+			RETV('+', 1);
+			break;
+		case CHR('?'):
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				v->now++;
+				NOTE(REG_UNONPOSIX);
+				RETV('?', 0);
+			}
+			RETV('?', 1);
+			break;
+		case CHR('{'):			/* bounds start or plain character */
+			if (v->cflags & REG_EXPANDED)
+				skip(v);
+			if (ATEOS() || !iscdigit(*v->now))
+			{
+				NOTE(REG_UBRACES);
+				NOTE(REG_UUNSPEC);
+				RETV(PLAIN, c);
+			}
+			else
+			{
+				NOTE(REG_UBOUNDS);
+				INTOCON(L_EBND);
+				RET('{');
+			}
+			assert(NOTREACHED);
+			break;
+		case CHR('('):			/* parenthesis, or advanced extension */
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				NOTE(REG_UNONPOSIX);
+				v->now++;
+				switch (*v->now++)
+				{
+					case CHR(':'):		/* non-capturing paren */
+						RETV('(', 0);
+						break;
+					case CHR('#'):		/* comment */
+						while (!ATEOS() && *v->now != CHR(')'))
+							v->now++;
+						if (!ATEOS())
+							v->now++;
+						assert(v->nexttype == v->lasttype);
+						return next(v);
+						break;
+					case CHR('='):		/* positive lookahead */
+						NOTE(REG_ULOOKAHEAD);
+						RETV(LACON, 1);
+						break;
+					case CHR('!'):		/* negative lookahead */
+						NOTE(REG_ULOOKAHEAD);
+						RETV(LACON, 0);
+						break;
+					default:
+						FAILW(REG_BADRPT);
+						break;
+				}
+				assert(NOTREACHED);
+			}
+			if (v->cflags & REG_NOSUB)
+				RETV('(', 0);	/* all parens non-capturing */
+			else
+				RETV('(', 1);
+			break;
+		case CHR(')'):
+			if (LASTTYPE('('))
+				NOTE(REG_UUNSPEC);
+			RETV(')', c);
+			break;
+		case CHR('['):			/* easy except for [[:<:]] and [[:>:]] */
+			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+				*(v->now + 1) == CHR(':') &&
+				(*(v->now + 2) == CHR('<') ||
+				 *(v->now + 2) == CHR('>')) &&
+				*(v->now + 3) == CHR(':') &&
+				*(v->now + 4) == CHR(']') &&
+				*(v->now + 5) == CHR(']'))
+			{
+				c = *(v->now + 2);
+				v->now += 6;
+				NOTE(REG_UNONPOSIX);
+				RET((c == CHR('<')) ? '<' : '>');
+			}
+			INTOCON(L_BRACK);
+			if (NEXT1('^'))
+			{
+				v->now++;
+				RETV('[', 0);
+			}
+			RETV('[', 1);
+			break;
+		case CHR('.'):
+			RET('.');
+			break;
+		case CHR('^'):
+			RET('^');
+			break;
+		case CHR('$'):
+			RET('$');
+			break;
+		case CHR('\\'): /* mostly punt backslashes to code below */
+			if (ATEOS())
+				FAILW(REG_EESCAPE);
+			break;
+		default:				/* ordinary character */
+			RETV(PLAIN, c);
+			break;
+	}
+
+	/* ERE/ARE backslash handling; backslash already eaten */
+	assert(!ATEOS());
+	if (!(v->cflags & REG_ADVF))
+	{							/* only AREs have non-trivial escapes */
+		if (iscalnum(*v->now))
+		{
+			NOTE(REG_UBSALNUM);
+			NOTE(REG_UUNSPEC);
+		}
+		RETV(PLAIN, *v->now++);
+	}
+	(DISCARD) lexescape(v);
+	if (ISERR())
+		FAILW(REG_EESCAPE);
+	if (v->nexttype == CCLASS)
+	{							/* fudge at lexical level */
+		switch (v->nextvalue)
+		{
+			case 'd':
+				lexnest(v, backd, ENDOF(backd));
+				break;
+			case 'D':
+				lexnest(v, backD, ENDOF(backD));
+				break;
+			case 's':
+				lexnest(v, backs, ENDOF(backs));
+				break;
+			case 'S':
+				lexnest(v, backS, ENDOF(backS));
+				break;
+			case 'w':
+				lexnest(v, backw, ENDOF(backw));
+				break;
+			case 'W':
+				lexnest(v, backW, ENDOF(backW));
+				break;
+			default:
+				assert(NOTREACHED);
+				FAILW(REG_ASSERT);
+				break;
+		}
+		/* lexnest done, back up and try again */
+		v->nexttype = v->lasttype;
+		return next(v);
+	}
+	/* otherwise, lexescape has already done the work */
+	return !ISERR();
+}
+
+/*
+ * lexescape - parse an ARE backslash escape (backslash already eaten)
+ * Note slightly nonstandard use of the CCLASS type code.
+ */
+static int						/* not actually used, but convenient for
+								 * RETV */
+lexescape(struct vars * v)
+{
+	chr			c;
+	static chr	alert[] = {
+		CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
+	};
+	static chr	esc[] = {
+		CHR('E'), CHR('S'), CHR('C')
+	};
+	chr		   *save;
+
+	assert(v->cflags & REG_ADVF);
+
+	assert(!ATEOS());
+	c = *v->now++;
+	if (!iscalnum(c))
+		RETV(PLAIN, c);
+
+	NOTE(REG_UNONPOSIX);
+	switch (c)
+	{
+		case CHR('a'):
+			RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
+			break;
+		case CHR('A'):
+			RETV(SBEGIN, 0);
+			break;
+		case CHR('b'):
+			RETV(PLAIN, CHR('\b'));
+			break;
+		case CHR('B'):
+			RETV(PLAIN, CHR('\\'));
+			break;
+		case CHR('c'):
+			NOTE(REG_UUNPORT);
+			if (ATEOS())
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, (chr) (*v->now++ & 037));
+			break;
+		case CHR('d'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'd');
+			break;
+		case CHR('D'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'D');
+			break;
+		case CHR('e'):
+			NOTE(REG_UUNPORT);
+			RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
+			break;
+		case CHR('f'):
+			RETV(PLAIN, CHR('\f'));
+			break;
+		case CHR('m'):
+			RET('<');
+			break;
+		case CHR('M'):
+			RET('>');
+			break;
+		case CHR('n'):
+			RETV(PLAIN, CHR('\n'));
+			break;
+		case CHR('r'):
+			RETV(PLAIN, CHR('\r'));
+			break;
+		case CHR('s'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 's');
+			break;
+		case CHR('S'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'S');
+			break;
+		case CHR('t'):
+			RETV(PLAIN, CHR('\t'));
+			break;
+		case CHR('u'):
+			c = lexdigits(v, 16, 4, 4);
+			if (ISERR())
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, c);
+			break;
+		case CHR('U'):
+			c = lexdigits(v, 16, 8, 8);
+			if (ISERR())
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, c);
+			break;
+		case CHR('v'):
+			RETV(PLAIN, CHR('\v'));
+			break;
+		case CHR('w'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'w');
+			break;
+		case CHR('W'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'W');
+			break;
+		case CHR('x'):
+			NOTE(REG_UUNPORT);
+			c = lexdigits(v, 16, 1, 255);		/* REs >255 long outside
+												 * spec */
+			if (ISERR())
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, c);
+			break;
+		case CHR('y'):
+			NOTE(REG_ULOCALE);
+			RETV(WBDRY, 0);
+			break;
+		case CHR('Y'):
+			NOTE(REG_ULOCALE);
+			RETV(NWBDRY, 0);
+			break;
+		case CHR('Z'):
+			RETV(SEND, 0);
+			break;
+		case CHR('1'):
+		case CHR('2'):
+		case CHR('3'):
+		case CHR('4'):
+		case CHR('5'):
+		case CHR('6'):
+		case CHR('7'):
+		case CHR('8'):
+		case CHR('9'):
+			save = v->now;
+			v->now--;			/* put first digit back */
+			c = lexdigits(v, 10, 1, 255);		/* REs >255 long outside
+												 * spec */
+			if (ISERR())
+				FAILW(REG_EESCAPE);
+			/* ugly heuristic (first test is "exactly 1 digit?") */
+			if (v->now - save == 0 || (int) c <= v->nsubexp)
+			{
+				NOTE(REG_UBACKREF);
+				RETV(BACKREF, (chr) c);
+			}
+			/* oops, doesn't look like it's a backref after all... */
+			v->now = save;
+			/* and fall through into octal number */
+		case CHR('0'):
+			NOTE(REG_UUNPORT);
+			v->now--;			/* put first digit back */
+			c = lexdigits(v, 8, 1, 3);
+			if (ISERR())
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, c);
+			break;
+		default:
+			assert(iscalpha(c));
+			FAILW(REG_EESCAPE); /* unknown alphabetic escape */
+			break;
+	}
+	assert(NOTREACHED);
+}
+
+/*
+ * lexdigits - slurp up digits and return chr value
+ */
+static chr						/* chr value; errors signalled via ERR */
+lexdigits(struct vars * v,
+		  int base,
+		  int minlen,
+		  int maxlen)
+{
+	uchr		n;				/* unsigned to avoid overflow misbehavior */
+	int			len;
+	chr			c;
+	int			d;
+	const uchr	ub = (uchr) base;
+
+	n = 0;
+	for (len = 0; len < maxlen && !ATEOS(); len++)
+	{
+		c = *v->now++;
+		switch (c)
+		{
+			case CHR('0'):
+			case CHR('1'):
+			case CHR('2'):
+			case CHR('3'):
+			case CHR('4'):
+			case CHR('5'):
+			case CHR('6'):
+			case CHR('7'):
+			case CHR('8'):
+			case CHR('9'):
+				d = DIGITVAL(c);
+				break;
+			case CHR('a'):
+			case CHR('A'):
+				d = 10;
+				break;
+			case CHR('b'):
+			case CHR('B'):
+				d = 11;
+				break;
+			case CHR('c'):
+			case CHR('C'):
+				d = 12;
+				break;
+			case CHR('d'):
+			case CHR('D'):
+				d = 13;
+				break;
+			case CHR('e'):
+			case CHR('E'):
+				d = 14;
+				break;
+			case CHR('f'):
+			case CHR('F'):
+				d = 15;
+				break;
+			default:
+				v->now--;		/* oops, not a digit at all */
+				d = -1;
+				break;
+		}
+
+		if (d >= base)
+		{						/* not a plausible digit */
+			v->now--;
+			d = -1;
+		}
+		if (d < 0)
+			break;				/* NOTE BREAK OUT */
+		n = n * ub + (uchr) d;
+	}
+	if (len < minlen)
+		ERR(REG_EESCAPE);
+
+	return (chr) n;
+}
+
+/*
+ * brenext - get next BRE token
+ *
+ * This is much like EREs except for all the stupid backslashes and the
+ * context-dependency of some things.
+ */
+static int						/* 1 normal, 0 failure */
+brenext(struct vars * v,
+		chr pc)
+{
+	chr			c = (chr) pc;
+
+	switch (c)
+	{
+		case CHR('*'):
+			if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
+				RETV(PLAIN, c);
+			RET('*');
+			break;
+		case CHR('['):
+			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+				*(v->now + 1) == CHR(':') &&
+				(*(v->now + 2) == CHR('<') ||
+				 *(v->now + 2) == CHR('>')) &&
+				*(v->now + 3) == CHR(':') &&
+				*(v->now + 4) == CHR(']') &&
+				*(v->now + 5) == CHR(']'))
+			{
+				c = *(v->now + 2);
+				v->now += 6;
+				NOTE(REG_UNONPOSIX);
+				RET((c == CHR('<')) ? '<' : '>');
+			}
+			INTOCON(L_BRACK);
+			if (NEXT1('^'))
+			{
+				v->now++;
+				RETV('[', 0);
+			}
+			RETV('[', 1);
+			break;
+		case CHR('.'):
+			RET('.');
+			break;
+		case CHR('^'):
+			if (LASTTYPE(EMPTY))
+				RET('^');
+			if (LASTTYPE('('))
+			{
+				NOTE(REG_UUNSPEC);
+				RET('^');
+			}
+			RETV(PLAIN, c);
+			break;
+		case CHR('$'):
+			if (v->cflags & REG_EXPANDED)
+				skip(v);
+			if (ATEOS())
+				RET('$');
+			if (NEXT2('\\', ')'))
+			{
+				NOTE(REG_UUNSPEC);
+				RET('$');
+			}
+			RETV(PLAIN, c);
+			break;
+		case CHR('\\'):
+			break;				/* see below */
+		default:
+			RETV(PLAIN, c);
+			break;
+	}
+
+	assert(c == CHR('\\'));
+
+	if (ATEOS())
+		FAILW(REG_EESCAPE);
+
+	c = *v->now++;
+	switch (c)
+	{
+		case CHR('{'):
+			INTOCON(L_BBND);
+			NOTE(REG_UBOUNDS);
+			RET('{');
+			break;
+		case CHR('('):
+			RETV('(', 1);
+			break;
+		case CHR(')'):
+			RETV(')', c);
+			break;
+		case CHR('<'):
+			NOTE(REG_UNONPOSIX);
+			RET('<');
+			break;
+		case CHR('>'):
+			NOTE(REG_UNONPOSIX);
+			RET('>');
+			break;
+		case CHR('1'):
+		case CHR('2'):
+		case CHR('3'):
+		case CHR('4'):
+		case CHR('5'):
+		case CHR('6'):
+		case CHR('7'):
+		case CHR('8'):
+		case CHR('9'):
+			NOTE(REG_UBACKREF);
+			RETV(BACKREF, (chr) DIGITVAL(c));
+			break;
+		default:
+			if (iscalnum(c))
+			{
+				NOTE(REG_UBSALNUM);
+				NOTE(REG_UUNSPEC);
+			}
+			RETV(PLAIN, c);
+			break;
+	}
+
+	assert(NOTREACHED);
+}
+
+/*
+ * skip - skip white space and comments in expanded form
+ */
+static void
+skip(struct vars * v)
+{
+	chr		   *start = v->now;
+
+	assert(v->cflags & REG_EXPANDED);
+
+	for (;;)
+	{
+		while (!ATEOS() && iscspace(*v->now))
+			v->now++;
+		if (ATEOS() || *v->now != CHR('#'))
+			break;				/* NOTE BREAK OUT */
+		assert(NEXT1('#'));
+		while (!ATEOS() && *v->now != CHR('\n'))
+			v->now++;
+		/* leave the newline to be picked up by the iscspace loop */
+	}
+
+	if (v->now != start)
+		NOTE(REG_UNONPOSIX);
+}
+
+/*
+ * newline - return the chr for a newline
+ *
+ * This helps confine use of CHR to this source file.
+ */
+static chr
+newline(void)
+{
+	return CHR('\n');
+}
+
+/*
+ * chrnamed - return the chr known by a given (chr string) name
+ *
+ * The code is a bit clumsy, but this routine gets only such specialized
+ * use that it hardly matters.
+ */
+static chr
+chrnamed(struct vars * v,
+		 chr *startp,			/* start of name */
+		 chr *endp,				/* just past end of name */
+		 chr lastresort)		/* what to return if name lookup fails */
+{
+	celt		c;
+	int			errsave;
+	int			e;
+	struct cvec *cv;
+
+	errsave = v->err;
+	v->err = 0;
+	c = element(v, startp, endp);
+	e = v->err;
+	v->err = errsave;
+
+	if (e != 0)
+		return (chr) lastresort;
+
+	cv = range(v, c, c, 0);
+	if (cv->nchrs == 0)
+		return (chr) lastresort;
+	return cv->chrs[0];
+}
diff --git a/src/regex/regerrs.h b/src/regex/regerrs.h
new file mode 100644
index 0000000000..f99dbf4f73
--- /dev/null
+++ b/src/regex/regerrs.h
@@ -0,0 +1,75 @@
+/*
+ * $Id$
+ */
+
+{
+	REG_OKAY, "REG_OKAY", "no errors detected"
+},
+
+{
+	REG_NOMATCH, "REG_NOMATCH", "failed to match"
+},
+
+{
+	REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.8)"
+},
+
+{
+	REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"
+},
+
+{
+	REG_ECTYPE, "REG_ECTYPE", "invalid character class"
+},
+
+{
+	REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence"
+},
+
+{
+	REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"
+},
+
+{
+	REG_EBRACK, "REG_EBRACK", "brackets [] not balanced"
+},
+
+{
+	REG_EPAREN, "REG_EPAREN", "parentheses () not balanced"
+},
+
+{
+	REG_EBRACE, "REG_EBRACE", "braces {} not balanced"
+},
+
+{
+	REG_BADBR, "REG_BADBR", "invalid repetition count(s)"
+},
+
+{
+	REG_ERANGE, "REG_ERANGE", "invalid character range"
+},
+
+{
+	REG_ESPACE, "REG_ESPACE", "out of memory"
+},
+
+{
+	REG_BADRPT, "REG_BADRPT", "quantifier operand invalid"
+},
+
+{
+	REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"
+},
+
+{
+	REG_INVARG, "REG_INVARG", "invalid argument to regex function"
+},
+
+{
+	REG_MIXED, "REG_MIXED", "character widths of regex and string differ"
+},
+
+{
+	REG_BADOPT, "REG_BADOPT", "invalid embedded option"
+},
diff --git a/src/regex/regguts.h b/src/regex/regguts.h
new file mode 100644
index 0000000000..aa12dbf445
--- /dev/null
+++ b/src/regex/regguts.h
@@ -0,0 +1,417 @@
+/*
+ * Internal interface definitions, etc., for the reg package
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.	All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+
+
+/*
+ * Environmental customization.  It should not (I hope) be necessary to
+ * alter the file you are now reading -- regcustom.h should handle it all,
+ * given care here and elsewhere.
+ */
+#include "regcustom.h"
+
+
+
+/*
+ * Things that regcustom.h might override.
+ */
+
+/* assertions */
+#ifndef assert
+#ifndef REG_DEBUG
+#	ifndef NDEBUG
+#		define  NDEBUG					/* no assertions */
+#	endif
+#endif
+#include <assert.h>
+#endif
+
+/* voids */
+#ifndef DISCARD
+#define DISCARD void			/* for throwing values away */
+#endif
+#ifndef VS
+#define VS(x)	((void *)(x))	/* cast something to generic ptr */
+#endif
+
+/* function-pointer declarator */
+#ifndef FUNCPTR
+#define FUNCPTR(name, args) (*name) args
+#endif
+
+/* memory allocation */
+#ifndef MALLOC
+#define MALLOC(n)	malloc(n)
+#endif
+#ifndef REALLOC
+#define REALLOC(p, n)	realloc(VS(p), n)
+#endif
+#ifndef FREE
+#define FREE(p)		free(VS(p))
+#endif
+
+/* want size of a char in bits, and max value in bounded quantifiers */
+#ifndef CHAR_BIT
+#include <limits.h>
+#endif
+#ifndef _POSIX2_RE_DUP_MAX
+#define _POSIX2_RE_DUP_MAX	255 /* normally from <limits.h> */
+#endif
+
+
+
+/*
+ * misc
+ */
+
+#define NOTREACHED	0
+#define xxx		1
+
+#define DUPMAX	_POSIX2_RE_DUP_MAX
+#define INFINITY	(DUPMAX+1)
+
+#define REMAGIC 0xfed7			/* magic number for main struct */
+
+
+
+/*
+ * debugging facilities
+ */
+#ifdef REG_DEBUG
+/* FDEBUG does finite-state tracing */
+#define FDEBUG(arglist) { if (v->eflags&REG_FTRACE) printf arglist; }
+/* MDEBUG does higher-level tracing */
+#define MDEBUG(arglist) { if (v->eflags&REG_MTRACE) printf arglist; }
+#else
+#define FDEBUG(arglist) {}
+#define MDEBUG(arglist) {}
+#endif
+
+
+
+/*
+ * bitmap manipulation
+ */
+#define UBITS	(CHAR_BIT * sizeof(unsigned))
+#define BSET(uv, sn)	((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS))
+#define ISBSET(uv, sn)	((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
+
+
+
+/*
+ * We dissect a chr into byts for colormap table indexing.	Here we define
+ * a byt, which will be the same as a byte on most machines...	The exact
+ * size of a byt is not critical, but about 8 bits is good, and extraction
+ * of 8-bit chunks is sometimes especially fast.
+ */
+#ifndef BYTBITS
+#define BYTBITS 8				/* bits in a byt */
+#endif
+#define BYTTAB	(1<<BYTBITS)	/* size of table with one entry per byt
+								 * value */
+#define BYTMASK (BYTTAB-1)		/* bit mask for byt */
+#define NBYTS	((CHRBITS+BYTBITS-1)/BYTBITS)
+/* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */
+
+
+
+/*
+ * As soon as possible, we map chrs into equivalence classes -- "colors" --
+ * which are of much more manageable number.
+ */
+typedef short color;			/* colors of characters */
+typedef int pcolor;				/* what color promotes to */
+
+#define COLORLESS	(-1)		/* impossible color */
+#define WHITE		0			/* default color, parent of all others */
+
+
+
+/*
+ * A colormap is a tree -- more precisely, a DAG -- indexed at each level
+ * by a byt of the chr, to map the chr to a color efficiently.	Because
+ * lower sections of the tree can be shared, it can exploit the usual
+ * sparseness of such a mapping table.	The tree is always NBYTS levels
+ * deep (in the past it was shallower during construction but was "filled"
+ * to full depth at the end of that); areas that are unaltered as yet point
+ * to "fill blocks" which are entirely WHITE in color.
+ */
+
+/* the tree itself */
+struct colors
+{
+	color		ccolor[BYTTAB];
+};
+struct ptrs
+{
+	union tree *pptr[BYTTAB];
+};
+union tree
+{
+	struct colors colors;
+	struct ptrs ptrs;
+};
+
+#define tcolor	colors.ccolor
+#define tptr	ptrs.pptr
+
+/* internal per-color structure for the color machinery */
+struct colordesc
+{
+	uchr		nchrs;			/* number of chars of this color */
+	color		sub;			/* open subcolor (if any); free chain ptr */
+#define  NOSUB	 COLORLESS
+	struct arc *arcs;			/* color chain */
+	int			flags;
+#define  FREECOL 01				/* currently free */
+#define  PSEUDO  02				/* pseudocolor, no real chars */
+#define  UNUSEDCOLOR(cd) ((cd)->flags&FREECOL)
+	union tree *block;			/* block of solid color, if any */
+};
+
+/* the color map itself */
+struct colormap
+{
+	int			magic;
+#define  CMMAGIC 0x876
+	struct vars *v;				/* for compile error reporting */
+	size_t		ncds;			/* number of colordescs */
+	size_t		max;			/* highest in use */
+	color		free;			/* beginning of free chain (if non-0) */
+	struct colordesc *cd;
+#define  CDEND(cm)	 (&(cm)->cd[(cm)->max + 1])
+#define  NINLINECDS  ((size_t)10)
+	struct colordesc cdspace[NINLINECDS];
+	union tree	tree[NBYTS];	/* tree top, plus fill blocks */
+};
+
+/* optimization magic to do fast chr->color mapping */
+#define B0(c)	((c) & BYTMASK)
+#define B1(c)	(((c)>>BYTBITS) & BYTMASK)
+#define B2(c)	(((c)>>(2*BYTBITS)) & BYTMASK)
+#define B3(c)	(((c)>>(3*BYTBITS)) & BYTMASK)
+#if NBYTS == 1
+#define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)])
+#endif
+/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */
+#if NBYTS == 2
+#define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)])
+#endif
+#if NBYTS == 4
+#define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)])
+#endif
+
+
+
+/*
+ * Interface definitions for locale-interface functions in locale.c.
+ * Multi-character collating elements (MCCEs) cause most of the trouble.
+ */
+struct cvec
+{
+	int			nchrs;			/* number of chrs */
+	int			chrspace;		/* number of chrs possible */
+	chr		   *chrs;			/* pointer to vector of chrs */
+	int			nranges;		/* number of ranges (chr pairs) */
+	int			rangespace;		/* number of chrs possible */
+	chr		   *ranges;			/* pointer to vector of chr pairs */
+	int			nmcces;			/* number of MCCEs */
+	int			mccespace;		/* number of MCCEs possible */
+	int			nmccechrs;		/* number of chrs used for MCCEs */
+	chr		   *mcces[1];		/* pointers to 0-terminated MCCEs */
+	/* and both batches of chrs are on the end */
+};
+
+/* caution:  this value cannot be changed easily */
+#define MAXMCCE 2				/* length of longest MCCE */
+
+
+
+/*
+ * definitions for NFA internal representation
+ *
+ * Having a "from" pointer within each arc may seem redundant, but it
+ * saves a lot of hassle.
+ */
+struct state;
+
+struct arc
+{
+	int			type;
+#define  ARCFREE '\0'
+	color		co;
+	struct state *from;			/* where it's from (and contained within) */
+	struct state *to;			/* where it's to */
+	struct arc *outchain;		/* *from's outs chain or free chain */
+#define  freechain	 outchain
+	struct arc *inchain;		/* *to's ins chain */
+	struct arc *colorchain;		/* color's arc chain */
+};
+
+struct arcbatch
+{								/* for bulk allocation of arcs */
+	struct arcbatch *next;
+#define  ABSIZE  10
+	struct arc	a[ABSIZE];
+};
+
+struct state
+{
+	int			no;
+#define  FREESTATE	 (-1)
+	char		flag;			/* marks special states */
+	int			nins;			/* number of inarcs */
+	struct arc *ins;			/* chain of inarcs */
+	int			nouts;			/* number of outarcs */
+	struct arc *outs;			/* chain of outarcs */
+	struct arc *free;			/* chain of free arcs */
+	struct state *tmp;			/* temporary for traversal algorithms */
+	struct state *next;			/* chain for traversing all */
+	struct state *prev;			/* back chain */
+	struct arcbatch oas;		/* first arcbatch, avoid malloc in easy
+								 * case */
+	int			noas;			/* number of arcs used in first arcbatch */
+};
+
+struct nfa
+{
+	struct state *pre;			/* pre-initial state */
+	struct state *init;			/* initial state */
+	struct state *final;		/* final state */
+	struct state *post;			/* post-final state */
+	int			nstates;		/* for numbering states */
+	struct state *states;		/* state-chain header */
+	struct state *slast;		/* tail of the chain */
+	struct state *free;			/* free list */
+	struct colormap *cm;		/* the color map */
+	color		bos[2];			/* colors, if any, assigned to BOS and BOL */
+	color		eos[2];			/* colors, if any, assigned to EOS and EOL */
+	struct vars *v;				/* simplifies compile error reporting */
+	struct nfa *parent;			/* parent NFA, if any */
+};
+
+
+
+/*
+ * definitions for compacted NFA
+ */
+struct carc
+{
+	color		co;				/* COLORLESS is list terminator */
+	int			to;				/* state number */
+};
+
+struct cnfa
+{
+	int			nstates;		/* number of states */
+	int			ncolors;		/* number of colors */
+	int			flags;
+#define  HASLACONS	 01			/* uses lookahead constraints */
+	int			pre;			/* setup state number */
+	int			post;			/* teardown state number */
+	color		bos[2];			/* colors, if any, assigned to BOS and BOL */
+	color		eos[2];			/* colors, if any, assigned to EOS and EOL */
+	struct carc **states;		/* vector of pointers to outarc lists */
+	struct carc *arcs;			/* the area for the lists */
+};
+
+#define ZAPCNFA(cnfa)	((cnfa).nstates = 0)
+#define NULLCNFA(cnfa)	((cnfa).nstates == 0)
+
+
+
+/*
+ * subexpression tree
+ */
+struct subre
+{
+	char		op;				/* '|', '.' (concat), 'b' (backref), '(',
+								 * '=' */
+	char		flags;
+#define  LONGER  01				/* prefers longer match */
+#define  SHORTER 02				/* prefers shorter match */
+#define  MIXED	 04				/* mixed preference below */
+#define  CAP 010				/* capturing parens below */
+#define  BACKR	 020			/* back reference below */
+#define  INUSE	 0100			/* in use in final tree */
+#define  LOCAL	 03				/* bits which may not propagate up */
+#define  LMIX(f) ((f)<<2)		/* LONGER -> MIXED */
+#define  SMIX(f) ((f)<<1)		/* SHORTER -> MIXED */
+#define  UP(f)	 (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED))
+#define  MESSY(f)	 ((f)&(MIXED|CAP|BACKR))
+#define  PREF(f) ((f)&LOCAL)
+#define  PREF2(f1, f2)	 ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
+#define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
+	short		retry;			/* index into retry memory */
+	int			subno;			/* subexpression number (for 'b' and '(') */
+	short		min;			/* min repetitions, for backref only */
+	short		max;			/* max repetitions, for backref only */
+	struct subre *left;			/* left child, if any (also freelist
+								 * chain) */
+	struct subre *right;		/* right child, if any */
+	struct state *begin;		/* outarcs from here... */
+	struct state *end;			/* ...ending in inarcs here */
+	struct cnfa cnfa;			/* compacted NFA, if any */
+	struct subre *chain;		/* for bookkeeping and error cleanup */
+};
+
+
+
+/*
+ * table of function pointers for generic manipulation functions
+ * A regex_t's re_fns points to one of these.
+ */
+struct fns
+{
+	void		FUNCPTR(free, (regex_t *));
+};
+
+
+
+/*
+ * the insides of a regex_t, hidden behind a void *
+ */
+struct guts
+{
+	int			magic;
+#define  GUTSMAGIC	 0xfed9
+	int			cflags;			/* copy of compile flags */
+	long		info;			/* copy of re_info */
+	size_t		nsub;			/* copy of re_nsub */
+	struct subre *tree;
+	struct cnfa search;			/* for fast preliminary search */
+	int			ntree;
+	struct colormap cmap;
+	int			FUNCPTR(compare, (const chr *, const chr *, size_t));
+	struct subre *lacons;		/* lookahead-constraint vector */
+	int			nlacons;		/* size of lacons */
+};
-- 
2.50.0