--- /dev/null
+Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved.
+This software is not subject to any license of the American Telephone
+and Telegraph Company or of the Regents of the University of California.
+
+Permission is granted to anyone to use this software for any purpose on
+any computer system, and to alter it and redistribute it, subject
+to the following restrictions:
+
+1. The author is not responsible for the consequences of use of this
+ software, no matter how awful, even if they arise from flaws in it.
+
+2. The origin of this software must not be misrepresented, either by
+ explicit claim or by omission. Since few users ever read sources,
+ credits must appear in the documentation.
+
+3. Altered versions must be plainly marked as such, and must not be
+ misrepresented as being the original software. Since few users
+ ever read sources, credits must appear in the documentation.
+
+4. This notice may not be removed or altered.
--- /dev/null
+# You probably want to take -DREDEBUG out of CFLAGS, and put something like
+# -O in, *after* testing (-DREDEBUG strengthens testing by enabling a lot of
+# internal assertion checking and some debugging facilities).
+# Put -Dconst= in for a pre-ANSI compiler.
+# Do not take -DPOSIX_MISTAKE out.
+# REGCFLAGS isn't important to you (it's for my use in some special contexts).
+CFLAGS=-I. -DPOSIX_MISTAKE -DREDEBUG $(REGCFLAGS)
+
+# If you have a pre-ANSI compiler, put -o into MKHFLAGS. If you want
+# the Berkeley __P macro, put -b in.
+MKHFLAGS=
+
+# Flags for linking but not compiling, if any.
+LDFLAGS=
+
+# Extra libraries for linking, if any.
+LIBS=
+
+# Internal stuff, should not need changing.
+OBJPRODN=regcomp.o regexec.o regerror.o regfree.o
+OBJS=$(OBJPRODN) split.o debug.o main.o
+H=cclass.h cname.h regex2.h utils.h
+REGSRC=regcomp.c regerror.c regexec.c regfree.c
+ALLSRC=$(REGSRC) engine.c debug.c main.c split.c
+
+# Stuff that matters only if you're trying to lint the package.
+LINTFLAGS=-I. -Dstatic= -Dconst= -DREDEBUG
+LINTC=regcomp.c regexec.c regerror.c regfree.c debug.c main.c
+JUNKLINT=possible pointer alignment|null effect
+
+# arrangements to build forward-reference header files
+.SUFFIXES: .ih .h
+.c.ih:
+ sh ./mkh $(MKHFLAGS) -p $< >$@
+
+default: r
+
+lib: purge $(OBJPRODN)
+ rm -f libregex.a
+ ar crv libregex.a $(OBJPRODN)
+
+purge:
+ rm -f *.o
+
+# stuff to build regex.h
+REGEXH=regex.h
+REGEXHSRC=regex2.h $(REGSRC)
+$(REGEXH): $(REGEXHSRC) mkh
+ sh ./mkh $(MKHFLAGS) -i _REGEX_H_ $(REGEXHSRC) >regex.tmp
+ cmp -s regex.tmp regex.h 2>/dev/null || cp regex.tmp regex.h
+ rm -f regex.tmp
+
+# dependencies
+$(OBJPRODN) debug.o: utils.h regex.h regex2.h
+regcomp.o: cclass.h cname.h regcomp.ih
+regexec.o: engine.c engine.ih
+regerror.o: regerror.ih
+debug.o: debug.ih
+main.o: main.ih
+
+# tester
+re: $(OBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@
+
+# regression test
+r: re tests
+ ./re <tests
+ ./re -el <tests
+ ./re -er <tests
+
+# 57 variants, and other stuff, for development use -- not useful to you
+ra: ./re tests
+ -./re <tests
+ -./re -el <tests
+ -./re -er <tests
+
+rx: ./re tests
+ ./re -x <tests
+ ./re -x -el <tests
+ ./re -x -er <tests
+
+t: ./re tests
+ -time ./re <tests
+ -time ./re -cs <tests
+ -time ./re -el <tests
+ -time ./re -cs -el <tests
+
+l: $(LINTC)
+ lint $(LINTFLAGS) -h $(LINTC) 2>&1 | egrep -v '$(JUNKLINT)' | tee lint
+
+fullprint:
+ ti README WHATSNEW notes todo | list
+ ti *.h | list
+ list *.c
+ list regex.3 regex.7
+
+print:
+ ti README WHATSNEW notes todo | list
+ ti *.h | list
+ list reg*.c engine.c
+
+
+mf.tmp: Makefile
+ sed '/^REGEXH=/s/=.*/=regex.h/' Makefile | sed '/#DEL$$/d' >$@
+
+DTRH=cclass.h cname.h regex2.h utils.h
+PRE=COPYRIGHT README WHATSNEW
+POST=mkh regex.3 regex.7 tests $(DTRH) $(ALLSRC) fake/*.[ch]
+FILES=$(PRE) Makefile $(POST)
+DTR=$(PRE) Makefile=mf.tmp $(POST)
+dtr: $(FILES) mf.tmp
+ makedtr $(DTR) >$@
+ rm mf.tmp
+
+cio: $(FILES)
+ cio $(FILES)
+
+rdf: $(FILES)
+ rcsdiff -c $(FILES) 2>&1 | p
+
+# various forms of cleanup
+tidy:
+ rm -f junk* core core.* *.core dtr *.tmp lint
+
+clean: tidy
+ rm -f *.o *.s *.ih re libregex.a
+
+# don't do this one unless you know what you're doing
+spotless: clean
+ rm -f mkh regex.h
--- /dev/null
+alpha3.8 release.
+Tue Aug 10 15:51:48 EDT 1999
+henry@spsystems.net (formerly henry@zoo.toronto.edu)
+
+See WHATSNEW for change listing.
+
+installation notes:
+--------
+Read the comments at the beginning of Makefile before running.
+
+Utils.h contains some things that just might have to be modified on
+some systems, as well as a nested include (ugh) of <assert.h>.
+
+The "fake" directory contains quick-and-dirty fakes for some header
+files and routines that old systems may not have. Note also that
+-DUSEBCOPY will make utils.h substitute bcopy() for memmove().
+
+After that, "make r" will build regcomp.o, regexec.o, regfree.o,
+and regerror.o (the actual routines), bundle them together into a test
+program, and run regression tests on them. No output is good output.
+
+"make lib" builds just the .o files for the actual routines (when
+you're happy with testing and have adjusted CFLAGS for production),
+and puts them together into libregex.a. You can pick up either the
+library or *.o ("make lib" makes sure there are no other .o files left
+around to confuse things).
+
+Main.c, debug.c, split.c are used for regression testing but are not part
+of the RE routines themselves.
+
+Regex.h goes in /usr/include. All other .h files are internal only.
+--------
--- /dev/null
+New in alpha3.8: Bug fix for signed/unsigned mixup, found and fixed
+by the FreeBSD folks.
+
+New in alpha3.7: A bit of cleanup aimed at maximizing portability,
+possibly at slight cost in efficiency. "ul" suffixes and "unsigned long"
+no longer appear, in particular.
+
+New in alpha3.6: A couple more portability glitches fixed.
+
+New in alpha3.5: Active development of this code has been stopped --
+I'm working on a complete reimplementation -- but folks have found some
+minor portability glitches and the like, hence this release to fix them.
+One penalty: slightly reduced compatibility with old compilers, because
+the ANSI C `unsigned long' type and `ul' constant suffix are used in a
+few places (I could avoid this but it would be considerably more work).
+
+New in alpha3.4: The complex bug alluded to below has been fixed (in a
+slightly kludgey temporary way that may hurt efficiency a bit; this is
+another "get it out the door for 4.4" release). The tests at the end of
+the tests file have accordingly been uncommented. The primary sign of
+the bug was that something like a?b matching ab matched b rather than ab.
+(The bug was essentially specific to this exact situation, else it would
+have shown up earlier.)
+
+New in alpha3.3: The definition of word boundaries has been altered
+slightly, to more closely match the usual programming notion that "_"
+is an alphabetic. Stuff used for pre-ANSI systems is now in a subdir,
+and the makefile no longer alludes to it in mysterious ways. The
+makefile has generally been cleaned up some. Fixes have been made
+(again!) so that the regression test will run without -DREDEBUG, at
+the cost of weaker checking. A workaround for a bug in some folks'
+<assert.h> has been added. And some more things have been added to
+tests, including a couple right at the end which are commented out
+because the code currently flunks them (complex bug; fix coming).
+Plus the usual minor cleanup.
+
+New in alpha3.2: Assorted bits of cleanup and portability improvement
+(the development base is now a BSDI system using GCC instead of an ancient
+Sun system, and the newer compiler exposed some glitches). Fix for a
+serious bug that affected REs using many [] (including REG_ICASE REs
+because of the way they are implemented), *sometimes*, depending on
+memory-allocation patterns. The header-file prototypes no longer name
+the parameters, avoiding possible name conflicts. The possibility that
+some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is
+now handled gracefully. "uchar" is no longer used as an internal type
+name (too many people have the same idea). Still the same old lousy
+performance, alas.
+
+New in alpha3.1: Basically nothing, this release is just a bookkeeping
+convenience. Stay tuned.
+
+New in alpha3.0: Performance is no better, alas, but some fixes have been
+made and some functionality has been added. (This is basically the "get
+it out the door in time for 4.4" release.) One bug fix: regfree() didn't
+free the main internal structure (how embarrassing). It is now possible
+to put NULs in either the RE or the target string, using (resp.) a new
+REG_PEND flag and the old REG_STARTEND flag. The REG_NOSPEC flag to
+regcomp() makes all characters ordinary, so you can match a literal
+string easily (this will become more useful when performance improves!).
+There are now primitives to match beginnings and ends of words, although
+the syntax is disgusting and so is the implementation. The REG_ATOI
+debugging interface has changed a bit. And there has been considerable
+internal cleanup of various kinds.
+
+New in alpha2.3: Split change list out of README, and moved flags notes
+into Makefile. Macro-ized the name of regex(7) in regex(3), since it has
+to change for 4.4BSD. Cleanup work in engine.c, and some new regression
+tests to catch tricky cases thereof.
+
+New in alpha2.2: Out-of-date manpages updated. Regerror() acquires two
+small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges
+in my own test program and might be useful to others for similar purposes.
+The regression test will now compile (and run) without REDEBUG. The
+BRE \$ bug is fixed. Most uses of "uchar" are gone; it's all chars now.
+Char/uchar parameters are now written int/unsigned, to avoid possible
+portability problems with unpromoted parameters. Some unsigned casts have
+been introduced to minimize portability problems with shifting into sign
+bits.
+
+New in alpha2.1: Lots of little stuff, cleanup and fixes. The one big
+thing is that regex.h is now generated, using mkh, rather than being
+supplied in the distribution; due to circularities in dependencies,
+you have to build regex.h explicitly by "make h". The two known bugs
+have been fixed (and the regression test now checks for them), as has a
+problem with assertions not being suppressed in the absence of REDEBUG.
+No performance work yet.
+
+New in alpha2: Backslash-anything is an ordinary character, not an
+error (except, of course, for the handful of backslashed metacharacters
+in BREs), which should reduce script breakage. The regression test
+checks *where* null strings are supposed to match, and has generally
+been tightened up somewhat. Small bug fixes in parameter passing (not
+harmful, but technically errors) and some other areas. Debugging
+invoked by defining REDEBUG rather than not defining NDEBUG.
+
+New in alpha+3: full prototyping for internal routines, using a little
+helper program, mkh, which extracts prototypes given in stylized comments.
+More minor cleanup. Buglet fix: it's CHAR_BIT, not CHAR_BITS. Simple
+pre-screening of input when a literal string is known to be part of the
+RE; this does wonders for performance.
+
+New in alpha+2: minor bits of cleanup. Notably, the number "32" for the
+word width isn't hardwired into regexec.c any more, the public header
+file prototypes the functions if __STDC__ is defined, and some small typos
+in the manpages have been fixed.
+
+New in alpha+1: improvements to the manual pages, and an important
+extension, the REG_STARTEND option to regexec().
--- /dev/null
+/* character-class table */
+static struct cclass {
+ char *name;
+ char *chars;
+ char *multis;
+} cclasses[] = {
+ "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789", "",
+ "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+ "",
+ "blank", " \t", "",
+ "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
+\25\26\27\30\31\32\33\34\35\36\37\177", "",
+ "digit", "0123456789", "",
+ "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+ "",
+ "lower", "abcdefghijklmnopqrstuvwxyz",
+ "",
+ "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
+ "",
+ "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+ "",
+ "space", "\t\n\v\f\r ", "",
+ "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
+ "",
+ "xdigit", "0123456789ABCDEFabcdef",
+ "",
+ NULL, 0, ""
+};
--- /dev/null
+/* character-name table */
+static struct cname {
+ char *name;
+ char code;
+} cnames[] = {
+ "NUL", '\0',
+ "SOH", '\001',
+ "STX", '\002',
+ "ETX", '\003',
+ "EOT", '\004',
+ "ENQ", '\005',
+ "ACK", '\006',
+ "BEL", '\007',
+ "alert", '\007',
+ "BS", '\010',
+ "backspace", '\b',
+ "HT", '\011',
+ "tab", '\t',
+ "LF", '\012',
+ "newline", '\n',
+ "VT", '\013',
+ "vertical-tab", '\v',
+ "FF", '\014',
+ "form-feed", '\f',
+ "CR", '\015',
+ "carriage-return", '\r',
+ "SO", '\016',
+ "SI", '\017',
+ "DLE", '\020',
+ "DC1", '\021',
+ "DC2", '\022',
+ "DC3", '\023',
+ "DC4", '\024',
+ "NAK", '\025',
+ "SYN", '\026',
+ "ETB", '\027',
+ "CAN", '\030',
+ "EM", '\031',
+ "SUB", '\032',
+ "ESC", '\033',
+ "IS4", '\034',
+ "FS", '\034',
+ "IS3", '\035',
+ "GS", '\035',
+ "IS2", '\036',
+ "RS", '\036',
+ "IS1", '\037',
+ "US", '\037',
+ "space", ' ',
+ "exclamation-mark", '!',
+ "quotation-mark", '"',
+ "number-sign", '#',
+ "dollar-sign", '$',
+ "percent-sign", '%',
+ "ampersand", '&',
+ "apostrophe", '\'',
+ "left-parenthesis", '(',
+ "right-parenthesis", ')',
+ "asterisk", '*',
+ "plus-sign", '+',
+ "comma", ',',
+ "hyphen", '-',
+ "hyphen-minus", '-',
+ "period", '.',
+ "full-stop", '.',
+ "slash", '/',
+ "solidus", '/',
+ "zero", '0',
+ "one", '1',
+ "two", '2',
+ "three", '3',
+ "four", '4',
+ "five", '5',
+ "six", '6',
+ "seven", '7',
+ "eight", '8',
+ "nine", '9',
+ "colon", ':',
+ "semicolon", ';',
+ "less-than-sign", '<',
+ "equals-sign", '=',
+ "greater-than-sign", '>',
+ "question-mark", '?',
+ "commercial-at", '@',
+ "left-square-bracket", '[',
+ "backslash", '\\',
+ "reverse-solidus", '\\',
+ "right-square-bracket", ']',
+ "circumflex", '^',
+ "circumflex-accent", '^',
+ "underscore", '_',
+ "low-line", '_',
+ "grave-accent", '`',
+ "left-brace", '{',
+ "left-curly-bracket", '{',
+ "vertical-line", '|',
+ "right-brace", '}',
+ "right-curly-bracket", '}',
+ "tilde", '~',
+ "DEL", '\177',
+ NULL, 0,
+};
--- /dev/null
+#! /bin/sh
+# mkh - pull headers out of C source
+PATH=/bin:/usr/bin ; export PATH
+
+# egrep pattern to pick out marked lines
+egrep='^ =([ ]|$)'
+
+# Sed program to process marked lines into lines for the header file.
+# The markers have already been removed. Two things are done here: removal
+# of backslashed newlines, and some fudging of comments. The first is done
+# because -o needs to have prototypes on one line to strip them down.
+# Getting comments into the output is tricky; we turn C++-style // comments
+# into /* */ comments, after altering any existing */'s to avoid trouble.
+peel=' /\\$/N
+ /\\\n[ ]*/s///g
+ /\/\//s;\*/;* /;g
+ /\/\//s;//\(.*\);/*\1 */;'
+
+for a
+do
+ case "$a" in
+ -o) # old (pre-function-prototype) compiler
+ # add code to comment out argument lists
+ peel="$peel
+ "'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1(/*\2*/);'
+ shift
+ ;;
+ -b) # funny Berkeley __P macro
+ peel="$peel
+ "'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1 __P((\2));'
+ shift
+ ;;
+ -s) # compiler doesn't like `static foo();'
+ # add code to get rid of the `static'
+ peel="$peel
+ "'/^static[ ][^\/]*[a-zA-Z0-9_)](.*)/s;static.;;'
+ shift
+ ;;
+ -p) # private declarations
+ egrep='^ ==([ ]|$)'
+ shift
+ ;;
+ -i) # wrap in #ifndef, argument is name
+ ifndef="$2"
+ shift ; shift
+ ;;
+ *) break
+ ;;
+ esac
+done
+
+if test " $ifndef" != " "
+then
+ echo "#ifndef $ifndef"
+ echo "#define $ifndef /* never again */"
+fi
+echo "/* ========= begin header generated by $0 ========= */"
+echo '#ifdef __cplusplus'
+echo 'extern "C" {'
+echo '#endif'
+for f
+do
+ echo
+ echo "/* === $f === */"
+ egrep "$egrep" $f | sed 's/^ ==*[ ]//;s/^ ==*$//' | sed "$peel"
+ echo
+done
+echo '#ifdef __cplusplus'
+echo '}'
+echo '#endif'
+echo "/* ========= end header generated by $0 ========= */"
+if test " $ifndef" != " "
+then
+ echo "#endif"
+fi
+exit 0
--- /dev/null
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <regex.h>
+
+#include "utils.h"
+#include "regerror.ih"
+
+/*
+ = #define REG_OKAY 0
+ = #define REG_NOMATCH 1
+ = #define REG_BADPAT 2
+ = #define REG_ECOLLATE 3
+ = #define REG_ECTYPE 4
+ = #define REG_EESCAPE 5
+ = #define REG_ESUBREG 6
+ = #define REG_EBRACK 7
+ = #define REG_EPAREN 8
+ = #define REG_EBRACE 9
+ = #define REG_BADBR 10
+ = #define REG_ERANGE 11
+ = #define REG_ESPACE 12
+ = #define REG_BADRPT 13
+ = #define REG_EMPTY 14
+ = #define REG_ASSERT 15
+ = #define REG_INVARG 16
+ = #define REG_ATOI 255 // convert name to number (!)
+ = #define REG_ITOA 0400 // convert number to name (!)
+ */
+static struct rerr {
+ int code;
+ char *name;
+ char *explain;
+} rerrs[] = {
+ REG_OKAY, "REG_OKAY", "no errors detected",
+ REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match",
+ REG_BADPAT, "REG_BADPAT", "invalid regular expression",
+ REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element",
+ REG_ECTYPE, "REG_ECTYPE", "invalid character class",
+ REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)",
+ REG_ESUBREG, "REG_ESUBREG", "invalid backreference number",
+ REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced",
+ REG_EPAREN, "REG_EPAREN", "parentheses not balanced",
+ REG_EBRACE, "REG_EBRACE", "braces not balanced",
+ REG_BADBR, "REG_BADBR", "invalid repetition count(s)",
+ REG_ERANGE, "REG_ERANGE", "invalid character range",
+ REG_ESPACE, "REG_ESPACE", "out of memory",
+ REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid",
+ REG_EMPTY, "REG_EMPTY", "empty (sub)expression",
+ REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug",
+ REG_INVARG, "REG_INVARG", "invalid argument to regex routine",
+ -1, "", "*** unknown regexp error code ***",
+};
+
+/*
+ - regerror - the interface to error numbers
+ = extern size_t regerror(int, const regex_t *, char *, size_t);
+ */
+/* ARGSUSED */
+size_t
+regerror(errcode, preg, errbuf, errbuf_size)
+int errcode;
+const regex_t *preg;
+char *errbuf;
+size_t errbuf_size;
+{
+ register struct rerr *r;
+ register size_t len;
+ register int target = errcode &~ REG_ITOA;
+ register char *s;
+ char convbuf[50];
+
+ if (errcode == REG_ATOI)
+ s = regatoi(preg, convbuf);
+ else {
+ for (r = rerrs; r->code >= 0; r++)
+ if (r->code == target)
+ break;
+
+ if (errcode®_ITOA) {
+ if (r->code >= 0)
+ (void) strcpy(convbuf, r->name);
+ else
+ sprintf(convbuf, "REG_0x%x", target);
+ assert(strlen(convbuf) < sizeof(convbuf));
+ s = convbuf;
+ } else
+ s = r->explain;
+ }
+
+ len = strlen(s) + 1;
+ if (errbuf_size > 0) {
+ if (errbuf_size > len)
+ (void) strcpy(errbuf, s);
+ else {
+ (void) strncpy(errbuf, s, errbuf_size-1);
+ errbuf[errbuf_size-1] = '\0';
+ }
+ }
+
+ return(len);
+}
+
+/*
+ - regatoi - internal routine to implement REG_ATOI
+ == static char *regatoi(const regex_t *preg, char *localbuf);
+ */
+static char *
+regatoi(preg, localbuf)
+const regex_t *preg;
+char *localbuf;
+{
+ register struct rerr *r;
+
+ for (r = rerrs; r->code >= 0; r++)
+ if (strcmp(r->name, preg->re_endp) == 0)
+ break;
+ if (r->code < 0)
+ return("0");
+
+ sprintf(localbuf, "%d", r->code);
+ return(localbuf);
+}
--- /dev/null
+.TH REGEX 3 "25 Sept 1997"
+.BY "Henry Spencer"
+.de ZR
+.\" one other place knows this name: the SEE ALSO section
+.IR regex (7) \\$1
+..
+.SH NAME
+regcomp, regexec, regerror, regfree \- regular-expression library
+.SH SYNOPSIS
+.ft B
+.\".na
+#include <sys/types.h>
+.br
+#include <regex.h>
+.HP 10
+int regcomp(regex_t\ *preg, const\ char\ *pattern, int\ cflags);
+.HP
+int\ regexec(const\ regex_t\ *preg, const\ char\ *string,
+size_t\ nmatch, regmatch_t\ pmatch[], int\ eflags);
+.HP
+size_t\ regerror(int\ errcode, const\ regex_t\ *preg,
+char\ *errbuf, size_t\ errbuf_size);
+.HP
+void\ regfree(regex_t\ *preg);
+.\".ad
+.ft
+.SH DESCRIPTION
+These routines implement POSIX 1003.2 regular expressions (``RE''s);
+see
+.ZR .
+.I Regcomp
+compiles an RE written as a string into an internal form,
+.I regexec
+matches that internal form against a string and reports results,
+.I regerror
+transforms error codes from either into human-readable messages,
+and
+.I regfree
+frees any dynamically-allocated storage used by the internal form
+of an RE.
+.PP
+The header
+.I <regex.h>
+declares two structure types,
+.I regex_t
+and
+.IR regmatch_t ,
+the former for compiled internal forms and the latter for match reporting.
+It also declares the four functions,
+a type
+.IR regoff_t ,
+and a number of constants with names starting with ``REG_''.
+.PP
+.I Regcomp
+compiles the regular expression contained in the
+.I pattern
+string,
+subject to the flags in
+.IR cflags ,
+and places the results in the
+.I regex_t
+structure pointed to by
+.IR preg .
+.I Cflags
+is the bitwise OR of zero or more of the following flags:
+.IP REG_EXTENDED \w'REG_EXTENDED'u+2n
+Compile modern (``extended'') REs,
+rather than the obsolete (``basic'') REs that
+are the default.
+.IP REG_BASIC
+This is a synonym for 0,
+provided as a counterpart to REG_EXTENDED to improve readability.
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+.IP REG_NOSPEC
+Compile with recognition of all special characters turned off.
+All characters are thus considered ordinary,
+so the ``RE'' is a literal string.
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+REG_EXTENDED and REG_NOSPEC may not be used
+in the same call to
+.IR regcomp .
+.IP REG_ICASE
+Compile for matching that ignores upper/lower case distinctions.
+See
+.ZR .
+.IP REG_NOSUB
+Compile for matching that need only report success or failure,
+not what was matched.
+.IP REG_NEWLINE
+Compile for newline-sensitive matching.
+By default, newline is a completely ordinary character with no special
+meaning in either REs or strings.
+With this flag,
+`[^' bracket expressions and `.' never match newline,
+a `^' anchor matches the null string after any newline in the string
+in addition to its normal function,
+and the `$' anchor matches the null string before any newline in the
+string in addition to its normal function.
+.IP REG_PEND
+The regular expression ends,
+not at the first NUL,
+but just before the character pointed to by the
+.I re_endp
+member of the structure pointed to by
+.IR preg .
+The
+.I re_endp
+member is of type
+.IR const\ char\ * .
+This flag permits inclusion of NULs in the RE;
+they are considered ordinary characters.
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+.PP
+When successful,
+.I regcomp
+returns 0 and fills in the structure pointed to by
+.IR preg .
+One member of that structure
+(other than
+.IR re_endp )
+is publicized:
+.IR re_nsub ,
+of type
+.IR size_t ,
+contains the number of parenthesized subexpressions within the RE
+(except that the value of this member is undefined if the
+REG_NOSUB flag was used).
+If
+.I regcomp
+fails, it returns a non-zero error code;
+see DIAGNOSTICS.
+.PP
+.I Regexec
+matches the compiled RE pointed to by
+.I preg
+against the
+.IR string ,
+subject to the flags in
+.IR eflags ,
+and reports results using
+.IR nmatch ,
+.IR pmatch ,
+and the returned value.
+The RE must have been compiled by a previous invocation of
+.IR regcomp .
+The compiled form is not altered during execution of
+.IR regexec ,
+so a single compiled RE can be used simultaneously by multiple threads.
+.PP
+By default,
+the NUL-terminated string pointed to by
+.I string
+is considered to be the text of an entire line,
+with the NUL indicating the end of the line.
+(That is,
+any other end-of-line marker is considered to have been removed
+and replaced by the NUL.)
+The
+.I eflags
+argument is the bitwise OR of zero or more of the following flags:
+.IP REG_NOTBOL \w'REG_STARTEND'u+2n
+The first character of
+the string
+is not the beginning of a line, so the `^' anchor should not match before it.
+This does not affect the behavior of newlines under REG_NEWLINE.
+.IP REG_NOTEOL
+The NUL terminating
+the string
+does not end a line, so the `$' anchor should not match before it.
+This does not affect the behavior of newlines under REG_NEWLINE.
+.IP REG_STARTEND
+The string is considered to start at
+\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR
+and to have a terminating NUL located at
+\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR
+(there need not actually be a NUL at that location),
+regardless of the value of
+.IR nmatch .
+See below for the definition of
+.IR pmatch
+and
+.IR nmatch .
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL;
+REG_STARTEND affects only the location of the string,
+not how it is matched.
+.PP
+See
+.ZR
+for a discussion of what is matched in situations where an RE or a
+portion thereof could match any of several substrings of
+.IR string .
+.PP
+Normally,
+.I regexec
+returns 0 for success and the non-zero code REG_NOMATCH for failure.
+Other non-zero error codes may be returned in exceptional situations;
+see DIAGNOSTICS.
+.PP
+If REG_NOSUB was specified in the compilation of the RE,
+or if
+.I nmatch
+is 0,
+.I regexec
+ignores the
+.I pmatch
+argument (but see below for the case where REG_STARTEND is specified).
+Otherwise,
+.I pmatch
+points to an array of
+.I nmatch
+structures of type
+.IR regmatch_t .
+Such a structure has at least the members
+.I rm_so
+and
+.IR rm_eo ,
+both of type
+.I regoff_t
+(a signed arithmetic type at least as large as an
+.I off_t
+and a
+.IR ssize_t ),
+containing respectively the offset of the first character of a substring
+and the offset of the first character after the end of the substring.
+Offsets are measured from the beginning of the
+.I string
+argument given to
+.IR regexec .
+An empty substring is denoted by equal offsets,
+both indicating the character following the empty substring.
+.PP
+The 0th member of the
+.I pmatch
+array is filled in to indicate what substring of
+.I string
+was matched by the entire RE.
+Remaining members report what substring was matched by parenthesized
+subexpressions within the RE;
+member
+.I i
+reports subexpression
+.IR i ,
+with subexpressions counted (starting at 1) by the order of their opening
+parentheses in the RE, left to right.
+Unused entries in the array\(emcorresponding either to subexpressions that
+did not participate in the match at all, or to subexpressions that do not
+exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both
+.I rm_so
+and
+.I rm_eo
+set to \-1.
+If a subexpression participated in the match several times,
+the reported substring is the last one it matched.
+(Note, as an example in particular, that when the RE `(b*)+' matches `bbb',
+the parenthesized subexpression matches the three `b's and then
+an infinite number of empty strings following the last `b',
+so the reported substring is one of the empties.)
+.PP
+If REG_STARTEND is specified,
+.I pmatch
+must point to at least one
+.I regmatch_t
+(even if
+.I nmatch
+is 0 or REG_NOSUB was specified),
+to hold the input offsets for REG_STARTEND.
+Use for output is still entirely controlled by
+.IR nmatch ;
+if
+.I nmatch
+is 0 or REG_NOSUB was specified,
+the value of
+.IR pmatch [0]
+will not be changed by a successful
+.IR regexec .
+.PP
+.I Regerror
+maps a non-zero
+.I errcode
+from either
+.I regcomp
+or
+.I regexec
+to a human-readable, printable message.
+If
+.I preg
+is non-NULL,
+the error code should have arisen from use of
+the
+.I regex_t
+pointed to by
+.IR preg ,
+and if the error code came from
+.IR regcomp ,
+it should have been the result from the most recent
+.I regcomp
+using that
+.IR regex_t .
+.RI ( Regerror
+may be able to supply a more detailed message using information
+from the
+.IR regex_t .)
+.I Regerror
+places the NUL-terminated message into the buffer pointed to by
+.IR errbuf ,
+limiting the length (including the NUL) to at most
+.I errbuf_size
+bytes.
+If the whole message won't fit,
+as much of it as will fit before the terminating NUL is supplied.
+In any case,
+the returned value is the size of buffer needed to hold the whole
+message (including terminating NUL).
+If
+.I errbuf_size
+is 0,
+.I errbuf
+is ignored but the return value is still correct.
+.PP
+If the
+.I errcode
+given to
+.I regerror
+is first ORed with REG_ITOA,
+the ``message'' that results is the printable name of the error code,
+e.g. ``REG_NOMATCH'',
+rather than an explanation thereof.
+If
+.I errcode
+is REG_ATOI,
+then
+.I preg
+shall be non-NULL and the
+.I re_endp
+member of the structure it points to
+must point to the printable name of an error code;
+in this case, the result in
+.I errbuf
+is the decimal digits of
+the numeric value of the error code
+(0 if the name is not recognized).
+REG_ITOA and REG_ATOI are intended primarily as debugging facilities;
+they are extensions,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+Be warned also that they are considered experimental and changes are possible.
+.PP
+.I Regfree
+frees any dynamically-allocated storage associated with the compiled RE
+pointed to by
+.IR preg .
+The remaining
+.I regex_t
+is no longer a valid compiled RE
+and the effect of supplying it to
+.I regexec
+or
+.I regerror
+is undefined.
+.PP
+None of these functions references global variables except for tables
+of constants;
+all are safe for use from multiple threads if the arguments are safe.
+.SH IMPLEMENTATION CHOICES
+There are a number of decisions that 1003.2 leaves up to the implementor,
+either by explicitly saying ``undefined'' or by virtue of them being
+forbidden by the RE grammar.
+This implementation treats them as follows.
+.PP
+See
+.ZR
+for a discussion of the definition of case-independent matching.
+.PP
+There is no particular limit on the length of REs,
+except insofar as memory is limited.
+Memory usage is approximately linear in RE size, and largely insensitive
+to RE complexity, except for bounded repetitions.
+See BUGS for one short RE using them
+that will run almost any system out of memory.
+.PP
+A backslashed character other than one specifically given a magic meaning
+by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs)
+is taken as an ordinary character.
+.PP
+Any unmatched [ is a REG_EBRACK error.
+.PP
+Equivalence classes cannot begin or end bracket-expression ranges.
+The endpoint of one range cannot begin another.
+.PP
+RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255.
+.PP
+A repetition operator (?, *, +, or bounds) cannot follow another
+repetition operator.
+A repetition operator cannot begin an expression or subexpression
+or follow `^' or `|'.
+.PP
+`|' cannot appear first or last in a (sub)expression or after another `|',
+i.e. an operand of `|' cannot be an empty subexpression.
+An empty parenthesized subexpression, `()', is legal and matches an
+empty (sub)string.
+An empty string is not a legal RE.
+.PP
+A `{' followed by a digit is considered the beginning of bounds for a
+bounded repetition, which must then follow the syntax for bounds.
+A `{' \fInot\fR followed by a digit is considered an ordinary character.
+.PP
+`^' and `$' beginning and ending subexpressions in obsolete (``basic'')
+REs are anchors, not ordinary characters.
+.SH SEE ALSO
+grep(1), regex(7)
+.PP
+POSIX 1003.2, sections 2.8 (Regular Expression Notation)
+and
+B.5 (C Binding for Regular Expression Matching).
+.SH DIAGNOSTICS
+Non-zero error codes from
+.I regcomp
+and
+.I regexec
+include the following:
+.PP
+.nf
+.ta \w'REG_ECOLLATE'u+3n
+REG_NOMATCH regexec() failed to match
+REG_BADPAT invalid regular expression
+REG_ECOLLATE invalid collating element
+REG_ECTYPE invalid character class
+REG_EESCAPE \e applied to unescapable character
+REG_ESUBREG invalid backreference number
+REG_EBRACK brackets [ ] not balanced
+REG_EPAREN parentheses ( ) not balanced
+REG_EBRACE braces { } not balanced
+REG_BADBR invalid repetition count(s) in { }
+REG_ERANGE invalid character range in [ ]
+REG_ESPACE ran out of memory
+REG_BADRPT ?, *, or + operand invalid
+REG_EMPTY empty (sub)expression
+REG_ASSERT ``can't happen''\(emyou found a bug
+REG_INVARG invalid argument, e.g. negative-length string
+.fi
+.SH HISTORY
+Written by Henry Spencer,
+henry@zoo.toronto.edu.
+.SH BUGS
+This is an alpha release with known defects.
+Please report problems.
+.PP
+There is one known functionality bug.
+The implementation of internationalization is incomplete:
+the locale is always assumed to be the default one of 1003.2,
+and only the collating elements etc. of that locale are available.
+.PP
+The back-reference code is subtle and doubts linger about its correctness
+in complex cases.
+.PP
+.I Regexec
+performance is poor.
+This will improve with later releases.
+.I Nmatch
+exceeding 0 is expensive;
+.I nmatch
+exceeding 1 is worse.
+.I Regexec
+is largely insensitive to RE complexity \fIexcept\fR that back
+references are massively expensive.
+RE length does matter; in particular, there is a strong speed bonus
+for keeping RE length under about 30 characters,
+with most special characters counting roughly double.
+.PP
+.I Regcomp
+implements bounded repetitions by macro expansion,
+which is costly in time and space if counts are large
+or bounded repetitions are nested.
+An RE like, say,
+`((((a{1,100}){1,100}){1,100}){1,100}){1,100}'
+will (eventually) run almost any existing machine out of swap space.
+.PP
+There are suspected problems with response to obscure error conditions.
+Notably,
+certain kinds of internal overflow,
+produced only by truly enormous REs or by multiply nested bounded repetitions,
+are probably not handled well.
+.PP
+Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is
+a special character only in the presence of a previous unmatched `('.
+This can't be fixed until the spec is fixed.
+.PP
+The standard's definition of back references is vague.
+For example, does
+`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?
+Until the standard is clarified,
+behavior in such cases should not be relied on.
+.PP
+The implementation of word-boundary matching is a bit of a kludge,
+and bugs may lurk in combinations of word-boundary matching and anchoring.
--- /dev/null
+.TH REGEX 7 "25 Oct 1995"
+.BY "Henry Spencer"
+.SH NAME
+regex \- POSIX 1003.2 regular expressions
+.SH DESCRIPTION
+Regular expressions (``RE''s),
+as defined in POSIX 1003.2, come in two forms:
+modern REs (roughly those of
+.IR egrep ;
+1003.2 calls these ``extended'' REs)
+and obsolete REs (roughly those of
+.IR ed ;
+1003.2 ``basic'' REs).
+Obsolete REs mostly exist for backward compatibility in some old programs;
+they will be discussed at the end.
+1003.2 leaves some aspects of RE syntax and semantics open;
+`\(dg' marks decisions on these aspects that
+may not be fully portable to other 1003.2 implementations.
+.PP
+A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR,
+separated by `|'.
+It matches anything that matches one of the branches.
+.PP
+A branch is one\(dg or more \fIpieces\fR, concatenated.
+It matches a match for the first, followed by a match for the second, etc.
+.PP
+A piece is an \fIatom\fR possibly followed
+by a single\(dg `*', `+', `?', or \fIbound\fR.
+An atom followed by `*' matches a sequence of 0 or more matches of the atom.
+An atom followed by `+' matches a sequence of 1 or more matches of the atom.
+An atom followed by `?' matches a sequence of 0 or 1 matches of the atom.
+.PP
+A \fIbound\fR is `{' followed by an unsigned decimal integer,
+possibly followed by `,'
+possibly followed by another unsigned decimal integer,
+always followed by `}'.
+The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive,
+and if there are two of them, the first may not exceed the second.
+An atom followed by a bound containing one integer \fIi\fR
+and no comma matches
+a sequence of exactly \fIi\fR matches of the atom.
+An atom followed by a bound
+containing one integer \fIi\fR and a comma matches
+a sequence of \fIi\fR or more matches of the atom.
+An atom followed by a bound
+containing two integers \fIi\fR and \fIj\fR matches
+a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom.
+.PP
+An atom is a regular expression enclosed in `()' (matching a match for the
+regular expression),
+an empty set of `()' (matching the null string)\(dg,
+a \fIbracket expression\fR (see below), `.'
+(matching any single character), `^' (matching the null string at the
+beginning of a line), `$' (matching the null string at the
+end of a line), a `\e' followed by one of the characters
+`^.[$()|*+?{\e'
+(matching that character taken as an ordinary character),
+a `\e' followed by any other character\(dg
+(matching that character taken as an ordinary character,
+as if the `\e' had not been present\(dg),
+or a single character with no other significance (matching that character).
+A `{' followed by a character other than a digit is an ordinary
+character, not the beginning of a bound\(dg.
+It is illegal to end an RE with `\e'.
+.PP
+A \fIbracket expression\fR is a list of characters enclosed in `[]'.
+It normally matches any single character from the list (but see below).
+If the list begins with `^',
+it matches any single character
+(but see below) \fInot\fR from the rest of the list.
+If two characters in the list are separated by `\-', this is shorthand
+for the full \fIrange\fR of characters between those two (inclusive) in the
+collating sequence,
+e.g. `[0\-9]' in ASCII matches any decimal digit.
+It is illegal\(dg for two ranges to share an
+endpoint, e.g. `a\-c\-e'.
+Ranges are very collating-sequence-dependent,
+and portable programs should avoid relying on them.
+.PP
+To include a literal `]' in the list, make it the first character
+(following a possible `^').
+To include a literal `\-', make it the first or last character,
+or the second endpoint of a range.
+To use a literal `\-' as the first endpoint of a range,
+enclose it in `[.' and `.]' to make it a collating element (see below).
+With the exception of these and some combinations using `[' (see next
+paragraphs), all other special characters, including `\e', lose their
+special significance within a bracket expression.
+.PP
+Within a bracket expression, a collating element (a character,
+a multi-character sequence that collates as if it were a single character,
+or a collating-sequence name for either)
+enclosed in `[.' and `.]' stands for the
+sequence of characters of that collating element.
+The sequence is a single element of the bracket expression's list.
+A bracket expression containing a multi-character collating element
+can thus match more than one character,
+e.g. if the collating sequence includes a `ch' collating element,
+then the RE `[[.ch.]]*c' matches the first five characters
+of `chchcc'.
+.PP
+Within a bracket expression, a collating element enclosed in `[=' and
+`=]' is an equivalence class, standing for the sequences of characters
+of all collating elements equivalent to that one, including itself.
+(If there are no other equivalent collating elements,
+the treatment is as if the enclosing delimiters were `[.' and `.]'.)
+For example, if o and \o'o^' are the members of an equivalence class,
+then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous.
+An equivalence class may not\(dg be an endpoint
+of a range.
+.PP
+Within a bracket expression, the name of a \fIcharacter class\fR enclosed
+in `[:' and `:]' stands for the list of all characters belonging to that
+class.
+Standard character class names are:
+.PP
+.RS
+.nf
+.ta 3c 6c 9c
+alnum digit punct
+alpha graph space
+blank lower upper
+cntrl print xdigit
+.fi
+.RE
+.PP
+These stand for the character classes defined in
+.IR ctype (3).
+A locale may provide others.
+A character class may not be used as an endpoint of a range.
+.PP
+There are two special cases\(dg of bracket expressions:
+the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at
+the beginning and end of a word respectively.
+A word is defined as a sequence of
+word characters
+which is neither preceded nor followed by
+word characters.
+A word character is an
+.I alnum
+character (as defined by
+.IR ctype (3))
+or an underscore.
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+.PP
+In the event that an RE could match more than one substring of a given
+string,
+the RE matches the one starting earliest in the string.
+If the RE could match more than one substring starting at that point,
+it matches the longest.
+Subexpressions also match the longest possible substrings, subject to
+the constraint that the whole match be as long as possible,
+with subexpressions starting earlier in the RE taking priority over
+ones starting later.
+Note that higher-level subexpressions thus take priority over
+their lower-level component subexpressions.
+.PP
+Match lengths are measured in characters, not collating elements.
+A null string is considered longer than no match at all.
+For example,
+`bb*' matches the three middle characters of `abbbc',
+`(wee|week)(knights|nights)' matches all ten characters of `weeknights',
+when `(.*).*' is matched against `abc' the parenthesized subexpression
+matches all three characters, and
+when `(a*)*' is matched against `bc' both the whole RE and the parenthesized
+subexpression match the null string.
+.PP
+If case-independent matching is specified,
+the effect is much as if all case distinctions had vanished from the
+alphabet.
+When an alphabetic that exists in multiple cases appears as an
+ordinary character outside a bracket expression, it is effectively
+transformed into a bracket expression containing both cases,
+e.g. `x' becomes `[xX]'.
+When it appears inside a bracket expression, all case counterparts
+of it are added to the bracket expression, so that (e.g.) `[x]'
+becomes `[xX]' and `[^x]' becomes `[^xX]'.
+.PP
+No particular limit is imposed on the length of REs\(dg.
+Programs intended to be portable should not employ REs longer
+than 256 bytes,
+as an implementation can refuse to accept such REs and remain
+POSIX-compliant.
+.PP
+Obsolete (``basic'') regular expressions differ in several respects.
+`|', `+', and `?' are ordinary characters and there is no equivalent
+for their functionality.
+The delimiters for bounds are `\e{' and `\e}',
+with `{' and `}' by themselves ordinary characters.
+The parentheses for nested subexpressions are `\e(' and `\e)',
+with `(' and `)' by themselves ordinary characters.
+`^' is an ordinary character except at the beginning of the
+RE or\(dg the beginning of a parenthesized subexpression,
+`$' is an ordinary character except at the end of the
+RE or\(dg the end of a parenthesized subexpression,
+and `*' is an ordinary character if it appears at the beginning of the
+RE or the beginning of a parenthesized subexpression
+(after a possible leading `^').
+Finally, there is one new type of atom, a \fIback reference\fR:
+`\e' followed by a non-zero decimal digit \fId\fR
+matches the same sequence of characters
+matched by the \fId\fRth parenthesized subexpression
+(numbering subexpressions by the positions of their opening parentheses,
+left to right),
+so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'.
+.SH SEE ALSO
+regex(3)
+.PP
+POSIX 1003.2, section 2.8 (Regular Expression Notation).
+.SH HISTORY
+Written by Henry Spencer, based on the 1003.2 spec.
+.SH BUGS
+Having two kinds of REs is a botch.
+.PP
+The current 1003.2 spec says that `)' is an ordinary character in
+the absence of an unmatched `(';
+this was an unintentional result of a wording error,
+and change is likely.
+Avoid relying on it.
+.PP
+Back references are a dreadful botch,
+posing major problems for efficient implementations.
+They are also somewhat vaguely defined
+(does
+`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?).
+Avoid using them.
+.PP
+1003.2's specification of case-independent matching is vague.
+The ``one case implies all cases'' definition given above
+is current consensus among implementors as to the right interpretation.
+.PP
+The syntax for word boundaries is incredibly ugly.
--- /dev/null
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <regex.h>
+
+#include "utils.h"
+#include "regex2.h"
+
+/*
+ - regfree - free everything
+ = extern void regfree(regex_t *);
+ */
+void
+regfree(preg)
+regex_t *preg;
+{
+ register struct re_guts *g;
+
+ if (preg->re_magic != MAGIC1) /* oops */
+ return; /* nice to complain, but hard */
+
+ g = preg->re_g;
+ if (g == NULL || g->magic != MAGIC2) /* oops again */
+ return;
+ preg->re_magic = 0; /* mark it invalid */
+ g->magic = 0; /* mark it invalid */
+
+ if (g->strip != NULL)
+ free((char *)g->strip);
+ if (g->sets != NULL)
+ free((char *)g->sets);
+ if (g->setbits != NULL)
+ free((char *)g->setbits);
+ if (g->must != NULL)
+ free(g->must);
+ free((char *)g);
+}
--- /dev/null
+# regular expression test set
+# Lines are at least three fields, separated by one or more tabs. "" stands
+# for an empty field. First field is an RE. Second field is flags. If
+# C flag given, regcomp() is expected to fail, and the third field is the
+# error name (minus the leading REG_).
+#
+# Otherwise it is expected to succeed, and the third field is the string to
+# try matching it against. If there is no fourth field, the match is
+# expected to fail. If there is a fourth field, it is the substring that
+# the RE is expected to match. If there is a fifth field, it is a comma-
+# separated list of what the subexpressions should match, with - indicating
+# no match for that one. In both the fourth and fifth fields, a (sub)field
+# starting with @ indicates that the (sub)expression is expected to match
+# a null string followed by the stuff after the @; this provides a way to
+# test where null strings match. The character `N' in REs and strings
+# is newline, `S' is space, `T' is tab, `Z' is NUL.
+#
+# The full list of flags:
+# - placeholder, does nothing
+# b RE is a BRE, not an ERE
+# & try it as both an ERE and a BRE
+# C regcomp() error expected, third field is error name
+# i REG_ICASE
+# m ("mundane") REG_NOSPEC
+# s REG_NOSUB (not really testable)
+# n REG_NEWLINE
+# ^ REG_NOTBOL
+# $ REG_NOTEOL
+# # REG_STARTEND (see below)
+# p REG_PEND
+#
+# For REG_STARTEND, the start/end offsets are those of the substring
+# enclosed in ().
+
+# basics
+a & a a
+abc & abc abc
+abc|de - abc abc
+a|b|c - abc a
+
+# parentheses and perversions thereof
+a(b)c - abc abc
+a\(b\)c b abc abc
+a( C EPAREN
+a( b a( a(
+a\( - a( a(
+a\( bC EPAREN
+a\(b bC EPAREN
+a(b C EPAREN
+a(b b a(b a(b
+# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
+a) - a) a)
+) - ) )
+# end gagging (in a just world, those *should* give EPAREN)
+a) b a) a)
+a\) bC EPAREN
+\) bC EPAREN
+a()b - ab ab
+a\(\)b b ab ab
+
+# anchoring and REG_NEWLINE
+^abc$ & abc abc
+a^b - a^b
+a^b b a^b a^b
+a$b - a$b
+a$b b a$b a$b
+^ & abc @abc
+$ & abc @
+^$ & "" @
+$^ - "" @
+\($\)\(^\) b "" @
+# stop retching, those are legitimate (although disgusting)
+^^ - "" @
+$$ - "" @
+b$ & abNc
+b$ &n abNc b
+^b$ & aNbNc
+^b$ &n aNbNc b
+^$ &n aNNb @Nb
+^$ n abc
+^$ n abcN @
+$^ n aNNb @Nb
+\($\)\(^\) bn aNNb @Nb
+^^ n^ aNNb @Nb
+$$ n aNNb @NN
+^a ^ a
+a$ $ a
+^a ^n aNb
+^b ^n aNb b
+a$ $n bNa
+b$ $n bNa b
+a*(^b$)c* - b b
+a*\(^b$\)c* b b b
+
+# certain syntax errors and non-errors
+| C EMPTY
+| b | |
+* C BADRPT
+* b * *
++ C BADRPT
+? C BADRPT
+"" &C EMPTY
+() - abc @abc
+\(\) b abc @abc
+a||b C EMPTY
+|ab C EMPTY
+ab| C EMPTY
+(|a)b C EMPTY
+(a|)b C EMPTY
+(*a) C BADRPT
+(+a) C BADRPT
+(?a) C BADRPT
+({1}a) C BADRPT
+\(\{1\}a\) bC BADRPT
+(a|*b) C BADRPT
+(a|+b) C BADRPT
+(a|?b) C BADRPT
+(a|{1}b) C BADRPT
+^* C BADRPT
+^* b * *
+^+ C BADRPT
+^? C BADRPT
+^{1} C BADRPT
+^\{1\} bC BADRPT
+
+# metacharacters, backslashes
+a.c & abc abc
+a[bc]d & abd abd
+a\*c & a*c a*c
+a\\b & a\b a\b
+a\\\*b & a\*b a\*b
+a\bc & abc abc
+a\ &C EESCAPE
+a\\bc & a\bc a\bc
+\{ bC BADRPT
+a\[b & a[b a[b
+a[b &C EBRACK
+# trailing $ is a peculiar special case for the BRE code
+a$ & a a
+a$ & a$
+a\$ & a
+a\$ & a$ a$
+a\\$ & a
+a\\$ & a$
+a\\$ & a\$
+a\\$ & a\ a\
+
+# back references, ugh
+a\(b\)\2c bC ESUBREG
+a\(b\1\)c bC ESUBREG
+a\(b*\)c\1d b abbcbbd abbcbbd bb
+a\(b*\)c\1d b abbcbd
+a\(b*\)c\1d b abbcbbbd
+^\(.\)\1 b abc
+a\([bc]\)\1d b abcdabbd abbd b
+a\(\([bc]\)\2\)*d b abbccd abbccd
+a\(\([bc]\)\2\)*d b abbcbd
+# actually, this next one probably ought to fail, but the spec is unclear
+a\(\(b\)*\2\)*d b abbbd abbbd
+# here is a case that no NFA implementation does right
+\(ab*\)[ab]*\1 b ababaaa ababaaa a
+# check out normal matching in the presence of back refs
+\(a\)\1bcd b aabcd aabcd
+\(a\)\1bc*d b aabcd aabcd
+\(a\)\1bc*d b aabd aabd
+\(a\)\1bc*d b aabcccd aabcccd
+\(a\)\1bc*[ce]d b aabcccd aabcccd
+^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd
+
+# ordinary repetitions
+ab*c & abc abc
+ab+c - abc abc
+ab?c - abc abc
+a\(*\)b b a*b a*b
+a\(**\)b b ab ab
+a\(***\)b bC BADRPT
+*a b *a *a
+**a b a a
+***a bC BADRPT
+
+# the dreaded bounded repetitions
+{ & { {
+{abc & {abc {abc
+{1 C BADRPT
+{1} C BADRPT
+a{b & a{b a{b
+a{1}b - ab ab
+a\{1\}b b ab ab
+a{1,}b - ab ab
+a\{1,\}b b ab ab
+a{1,2}b - aab aab
+a\{1,2\}b b aab aab
+a{1 C EBRACE
+a\{1 bC EBRACE
+a{1a C EBRACE
+a\{1a bC EBRACE
+a{1a} C BADBR
+a\{1a\} bC BADBR
+a{,2} - a{,2} a{,2}
+a\{,2\} bC BADBR
+a{,} - a{,} a{,}
+a\{,\} bC BADBR
+a{1,x} C BADBR
+a\{1,x\} bC BADBR
+a{1,x C EBRACE
+a\{1,x bC EBRACE
+a{300} C BADBR
+a\{300\} bC BADBR
+a{1,0} C BADBR
+a\{1,0\} bC BADBR
+ab{0,0}c - abcac ac
+ab\{0,0\}c b abcac ac
+ab{0,1}c - abcac abc
+ab\{0,1\}c b abcac abc
+ab{0,3}c - abbcac abbc
+ab\{0,3\}c b abbcac abbc
+ab{1,1}c - acabc abc
+ab\{1,1\}c b acabc abc
+ab{1,3}c - acabc abc
+ab\{1,3\}c b acabc abc
+ab{2,2}c - abcabbc abbc
+ab\{2,2\}c b abcabbc abbc
+ab{2,4}c - abcabbc abbc
+ab\{2,4\}c b abcabbc abbc
+((a{1,10}){1,10}){1,10} - a a a,a
+
+# multiple repetitions
+a** &C BADRPT
+a++ C BADRPT
+a?? C BADRPT
+a*+ C BADRPT
+a*? C BADRPT
+a+* C BADRPT
+a+? C BADRPT
+a?* C BADRPT
+a?+ C BADRPT
+a{1}{1} C BADRPT
+a*{1} C BADRPT
+a+{1} C BADRPT
+a?{1} C BADRPT
+a{1}* C BADRPT
+a{1}+ C BADRPT
+a{1}? C BADRPT
+a*{b} - a{b} a{b}
+a\{1\}\{1\} bC BADRPT
+a*\{1\} bC BADRPT
+a\{1\}* bC BADRPT
+
+# brackets, and numerous perversions thereof
+a[b]c & abc abc
+a[ab]c & abc abc
+a[^ab]c & adc adc
+a[]b]c & a]c a]c
+a[[b]c & a[c a[c
+a[-b]c & a-c a-c
+a[^]b]c & adc adc
+a[^-b]c & adc adc
+a[b-]c & a-c a-c
+a[b &C EBRACK
+a[] &C EBRACK
+a[1-3]c & a2c a2c
+a[3-1]c &C ERANGE
+a[1-3-5]c &C ERANGE
+a[[.-.]--]c & a-c a-c
+a[1- &C ERANGE
+a[[. &C EBRACK
+a[[.x &C EBRACK
+a[[.x. &C EBRACK
+a[[.x.] &C EBRACK
+a[[.x.]] & ax ax
+a[[.x,.]] &C ECOLLATE
+a[[.one.]]b & a1b a1b
+a[[.notdef.]]b &C ECOLLATE
+a[[.].]]b & a]b a]b
+a[[:alpha:]]c & abc abc
+a[[:notdef:]]c &C ECTYPE
+a[[: &C EBRACK
+a[[:alpha &C EBRACK
+a[[:alpha:] &C EBRACK
+a[[:alpha,:] &C ECTYPE
+a[[:]:]]b &C ECTYPE
+a[[:-:]]b &C ECTYPE
+a[[:alph:]] &C ECTYPE
+a[[:alphabet:]] &C ECTYPE
+[[:alnum:]]+ - -%@a0X- a0X
+[[:alpha:]]+ - -%@aX0- aX
+[[:blank:]]+ - aSSTb SST
+[[:cntrl:]]+ - aNTb NT
+[[:digit:]]+ - a019b 019
+[[:graph:]]+ - Sa%bS a%b
+[[:lower:]]+ - AabC ab
+[[:print:]]+ - NaSbN aSb
+[[:punct:]]+ - S%-&T %-&
+[[:space:]]+ - aSNTb SNT
+[[:upper:]]+ - aBCd BC
+[[:xdigit:]]+ - p0f3Cq 0f3C
+a[[=b=]]c & abc abc
+a[[= &C EBRACK
+a[[=b &C EBRACK
+a[[=b= &C EBRACK
+a[[=b=] &C EBRACK
+a[[=b,=]] &C ECOLLATE
+a[[=one=]]b & a1b a1b
+
+# complexities
+a(((b)))c - abc abc
+a(b|(c))d - abd abd
+a(b*|c)d - abbd abbd
+# just gotta have one DFA-buster, of course
+a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
+# and an inline expansion in case somebody gets tricky
+a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
+# and in case somebody just slips in an NFA...
+a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
+# fish for anomalies as the number of states passes 32
+12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789
+123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890
+1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901
+12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012
+123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123
+# and one really big one, beyond any plausible word width
+1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890
+# fish for problems as brackets go past 8
+[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm
+[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo
+[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq
+[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq
+
+# subtleties of matching
+abc & xabcy abc
+a\(b\)?c\1d b acd
+aBc i Abc Abc
+a[Bc]*d i abBCcd abBCcd
+0[[:upper:]]1 &i 0a1 0a1
+0[[:lower:]]1 &i 0A1 0A1
+a[^b]c &i abc
+a[^b]c &i aBc
+a[^b]c &i adc adc
+[a]b[c] - abc abc
+[a]b[a] - aba aba
+[abc]b[abc] - abc abc
+[abc]b[abd] - abd abd
+a(b?c)+d - accd accd
+(wee|week)(knights|night) - weeknights weeknights
+(we|wee|week|frob)(knights|night|day) - weeknights weeknights
+a[bc]d - xyzaaabcaababdacd abd
+a[ab]c - aaabc abc
+abc s abc abc
+a* & b @b
+
+# Let's have some fun -- try to match a C comment.
+# first the obvious, which looks okay at first glance...
+/\*.*\*/ - /*x*/ /*x*/
+# but...
+/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/
+# okay, we must not match */ inside; try to do that...
+/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/
+/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/
+# but...
+/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/
+# and a still fancier version, which does it right (I think)...
+/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/
+/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/
+/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/
+/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/
+/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/
+/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/
+
+# subexpressions
+.* - abc abc -
+a(b)(c)d - abcd abcd b,c
+a(((b)))c - abc abc b,b,b
+a(b|(c))d - abd abd b,-
+a(b*|c|e)d - abbd abbd bb
+a(b*|c|e)d - acd acd c
+a(b*|c|e)d - ad ad @d
+a(b?)c - abc abc b
+a(b?)c - ac ac @c
+a(b+)c - abc abc b
+a(b+)c - abbbc abbbc bbb
+a(b*)c - ac ac @c
+(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de
+# the regression tester only asks for 9 subexpressions
+a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
+a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k
+a([bc]?)c - abc abc b
+a([bc]?)c - ac ac @c
+a([bc]+)c - abc abc b
+a([bc]+)c - abcc abcc bc
+a([bc]+)bc - abcbc abcbc bc
+a(bb+|b)b - abb abb b
+a(bbb+|bb+|b)b - abb abb b
+a(bbb+|bb+|b)b - abbb abbb bb
+a(bbb+|bb+|b)bb - abbb abbb b
+(.*).* - abcdef abcdef abcdef
+(a*)* - bc @b @b
+
+# do we get the right subexpression when it is used more than once?
+a(b|c)*d - ad ad -
+a(b|c)*d - abcd abcd c
+a(b|c)+d - abd abd b
+a(b|c)+d - abcd abcd c
+a(b|c?)+d - ad ad @d
+a(b|c?)+d - abcd abcd @d
+a(b|c){0,0}d - ad ad -
+a(b|c){0,1}d - ad ad -
+a(b|c){0,1}d - abd abd b
+a(b|c){0,2}d - ad ad -
+a(b|c){0,2}d - abcd abcd c
+a(b|c){0,}d - ad ad -
+a(b|c){0,}d - abcd abcd c
+a(b|c){1,1}d - abd abd b
+a(b|c){1,1}d - acd acd c
+a(b|c){1,2}d - abd abd b
+a(b|c){1,2}d - abcd abcd c
+a(b|c){1,}d - abd abd b
+a(b|c){1,}d - abcd abcd c
+a(b|c){2,2}d - acbd acbd b
+a(b|c){2,2}d - abcd abcd c
+a(b|c){2,4}d - abcd abcd c
+a(b|c){2,4}d - abcbd abcbd b
+a(b|c){2,4}d - abcbcd abcbcd c
+a(b|c){2,}d - abcd abcd c
+a(b|c){2,}d - abcbd abcbd b
+a(b+|((c)*))+d - abd abd @d,@d,-
+a(b+|((c)*))+d - abcd abcd @d,@d,-
+
+# check out the STARTEND option
+[abc] &# a(b)c b
+[abc] &# a(d)c
+[abc] &# a(bc)d b
+[abc] &# a(dc)d c
+. &# a()c
+b.*c &# b(bc)c bc
+b.* &# b(bc)c bc
+.*c &# b(bc)c bc
+
+# plain strings, with the NOSPEC flag
+abc m abc abc
+abc m xabcy abc
+abc m xyz
+a*b m aba*b a*b
+a*b m ab
+"" mC EMPTY
+
+# cases involving NULs
+aZb & a a
+aZb &p a
+aZb &p# (aZb) aZb
+aZ*b &p# (ab) ab
+a.b &# (aZb) aZb
+a.* &# (aZb)c aZb
+
+# word boundaries (ick)
+[[:<:]]a & a a
+[[:<:]]a & ba
+[[:<:]]a & -a a
+a[[:>:]] & a a
+a[[:>:]] & ab
+a[[:>:]] & a- a
+[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc
+[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc
+[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc
+[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc
+[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_
+[[:<:]]a_b[[:>:]] & x_a_b
+
+# past problems, and suspected problems
+(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1
+abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop
+abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv
+(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11
+CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11
+Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz
+a?b - ab ab
+-\{0,1\}[0-9]*$ b -5 -5
+a*a*a*a*a*a*a* & aaaaaa aaaaaa