From: Vadim Zeitlin Date: Wed, 2 Jun 1999 01:53:28 +0000 (+0000) Subject: Initial revision X-Git-Url: https://git.saurik.com/wxWidgets.git/commitdiff_plain/07dcc2173f3545a38736762f817fd6486ec17639 Initial revision git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@2629 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- diff --git a/src/regex/COPYRIGHT b/src/regex/COPYRIGHT new file mode 100644 index 0000000000..30c1f7a488 --- /dev/null +++ b/src/regex/COPYRIGHT @@ -0,0 +1,20 @@ +Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved. +This software is not subject to any license of the American Telephone +and Telegraph Company or of the Regents of the University of California. + +Permission is granted to anyone to use this software for any purpose on +any computer system, and to alter it and redistribute it, subject +to the following restrictions: + +1. The author is not responsible for the consequences of use of this + software, no matter how awful, even if they arise from flaws in it. + +2. The origin of this software must not be misrepresented, either by + explicit claim or by omission. Since few users ever read sources, + credits must appear in the documentation. + +3. Altered versions must be plainly marked as such, and must not be + misrepresented as being the original software. Since few users + ever read sources, credits must appear in the documentation. + +4. This notice may not be removed or altered. diff --git a/src/regex/Makefile b/src/regex/Makefile new file mode 100644 index 0000000000..3882b37864 --- /dev/null +++ b/src/regex/Makefile @@ -0,0 +1,130 @@ +# You probably want to take -DREDEBUG out of CFLAGS, and put something like +# -O in, *after* testing (-DREDEBUG strengthens testing by enabling a lot of +# internal assertion checking and some debugging facilities). +# Put -Dconst= in for a pre-ANSI compiler. +# Do not take -DPOSIX_MISTAKE out. +# REGCFLAGS isn't important to you (it's for my use in some special contexts). +CFLAGS=-I. -DPOSIX_MISTAKE -DREDEBUG $(REGCFLAGS) + +# If you have a pre-ANSI compiler, put -o into MKHFLAGS. If you want +# the Berkeley __P macro, put -b in. +MKHFLAGS= + +# Flags for linking but not compiling, if any. +LDFLAGS= + +# Extra libraries for linking, if any. +LIBS= + +# Internal stuff, should not need changing. +OBJPRODN=regcomp.o regexec.o regerror.o regfree.o +OBJS=$(OBJPRODN) split.o debug.o main.o +H=cclass.h cname.h regex2.h utils.h +REGSRC=regcomp.c regerror.c regexec.c regfree.c +ALLSRC=$(REGSRC) engine.c debug.c main.c split.c + +# Stuff that matters only if you're trying to lint the package. +LINTFLAGS=-I. -Dstatic= -Dconst= -DREDEBUG +LINTC=regcomp.c regexec.c regerror.c regfree.c debug.c main.c +JUNKLINT=possible pointer alignment|null effect + +# arrangements to build forward-reference header files +.SUFFIXES: .ih .h +.c.ih: + sh ./mkh $(MKHFLAGS) -p $< >$@ + +default: r + +lib: purge $(OBJPRODN) + rm -f libregex.a + ar crv libregex.a $(OBJPRODN) + +purge: + rm -f *.o + +# stuff to build regex.h +REGEXH=regex.h +REGEXHSRC=regex2.h $(REGSRC) +$(REGEXH): $(REGEXHSRC) mkh + sh ./mkh $(MKHFLAGS) -i _REGEX_H_ $(REGEXHSRC) >regex.tmp + cmp -s regex.tmp regex.h 2>/dev/null || cp regex.tmp regex.h + rm -f regex.tmp + +# dependencies +$(OBJPRODN) debug.o: utils.h regex.h regex2.h +regcomp.o: cclass.h cname.h regcomp.ih +regexec.o: engine.c engine.ih +regerror.o: regerror.ih +debug.o: debug.ih +main.o: main.ih + +# tester +re: $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ + +# regression test +r: re tests + ./re &1 | egrep -v '$(JUNKLINT)' | tee lint + +fullprint: + ti README WHATSNEW notes todo | list + ti *.h | list + list *.c + list regex.3 regex.7 + +print: + ti README WHATSNEW notes todo | list + ti *.h | list + list reg*.c engine.c + + +mf.tmp: Makefile + sed '/^REGEXH=/s/=.*/=regex.h/' Makefile | sed '/#DEL$$/d' >$@ + +DTRH=cclass.h cname.h regex2.h utils.h +PRE=COPYRIGHT README WHATSNEW +POST=mkh regex.3 regex.7 tests $(DTRH) $(ALLSRC) fake/*.[ch] +FILES=$(PRE) Makefile $(POST) +DTR=$(PRE) Makefile=mf.tmp $(POST) +dtr: $(FILES) mf.tmp + makedtr $(DTR) >$@ + rm mf.tmp + +cio: $(FILES) + cio $(FILES) + +rdf: $(FILES) + rcsdiff -c $(FILES) 2>&1 | p + +# various forms of cleanup +tidy: + rm -f junk* core core.* *.core dtr *.tmp lint + +clean: tidy + rm -f *.o *.s *.ih re libregex.a + +# don't do this one unless you know what you're doing +spotless: clean + rm -f mkh regex.h diff --git a/src/regex/README b/src/regex/README new file mode 100644 index 0000000000..e6ce373444 --- /dev/null +++ b/src/regex/README @@ -0,0 +1,32 @@ +alpha3.8 release. +Tue Aug 10 15:51:48 EDT 1999 +henry@spsystems.net (formerly henry@zoo.toronto.edu) + +See WHATSNEW for change listing. + +installation notes: +-------- +Read the comments at the beginning of Makefile before running. + +Utils.h contains some things that just might have to be modified on +some systems, as well as a nested include (ugh) of . + +The "fake" directory contains quick-and-dirty fakes for some header +files and routines that old systems may not have. Note also that +-DUSEBCOPY will make utils.h substitute bcopy() for memmove(). + +After that, "make r" will build regcomp.o, regexec.o, regfree.o, +and regerror.o (the actual routines), bundle them together into a test +program, and run regression tests on them. No output is good output. + +"make lib" builds just the .o files for the actual routines (when +you're happy with testing and have adjusted CFLAGS for production), +and puts them together into libregex.a. You can pick up either the +library or *.o ("make lib" makes sure there are no other .o files left +around to confuse things). + +Main.c, debug.c, split.c are used for regression testing but are not part +of the RE routines themselves. + +Regex.h goes in /usr/include. All other .h files are internal only. +-------- diff --git a/src/regex/WHATSNEW b/src/regex/WHATSNEW new file mode 100644 index 0000000000..12953433d3 --- /dev/null +++ b/src/regex/WHATSNEW @@ -0,0 +1,108 @@ +New in alpha3.8: Bug fix for signed/unsigned mixup, found and fixed +by the FreeBSD folks. + +New in alpha3.7: A bit of cleanup aimed at maximizing portability, +possibly at slight cost in efficiency. "ul" suffixes and "unsigned long" +no longer appear, in particular. + +New in alpha3.6: A couple more portability glitches fixed. + +New in alpha3.5: Active development of this code has been stopped -- +I'm working on a complete reimplementation -- but folks have found some +minor portability glitches and the like, hence this release to fix them. +One penalty: slightly reduced compatibility with old compilers, because +the ANSI C `unsigned long' type and `ul' constant suffix are used in a +few places (I could avoid this but it would be considerably more work). + +New in alpha3.4: The complex bug alluded to below has been fixed (in a +slightly kludgey temporary way that may hurt efficiency a bit; this is +another "get it out the door for 4.4" release). The tests at the end of +the tests file have accordingly been uncommented. The primary sign of +the bug was that something like a?b matching ab matched b rather than ab. +(The bug was essentially specific to this exact situation, else it would +have shown up earlier.) + +New in alpha3.3: The definition of word boundaries has been altered +slightly, to more closely match the usual programming notion that "_" +is an alphabetic. Stuff used for pre-ANSI systems is now in a subdir, +and the makefile no longer alludes to it in mysterious ways. The +makefile has generally been cleaned up some. Fixes have been made +(again!) so that the regression test will run without -DREDEBUG, at +the cost of weaker checking. A workaround for a bug in some folks' + has been added. And some more things have been added to +tests, including a couple right at the end which are commented out +because the code currently flunks them (complex bug; fix coming). +Plus the usual minor cleanup. + +New in alpha3.2: Assorted bits of cleanup and portability improvement +(the development base is now a BSDI system using GCC instead of an ancient +Sun system, and the newer compiler exposed some glitches). Fix for a +serious bug that affected REs using many [] (including REG_ICASE REs +because of the way they are implemented), *sometimes*, depending on +memory-allocation patterns. The header-file prototypes no longer name +the parameters, avoiding possible name conflicts. The possibility that +some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is +now handled gracefully. "uchar" is no longer used as an internal type +name (too many people have the same idea). Still the same old lousy +performance, alas. + +New in alpha3.1: Basically nothing, this release is just a bookkeeping +convenience. Stay tuned. + +New in alpha3.0: Performance is no better, alas, but some fixes have been +made and some functionality has been added. (This is basically the "get +it out the door in time for 4.4" release.) One bug fix: regfree() didn't +free the main internal structure (how embarrassing). It is now possible +to put NULs in either the RE or the target string, using (resp.) a new +REG_PEND flag and the old REG_STARTEND flag. The REG_NOSPEC flag to +regcomp() makes all characters ordinary, so you can match a literal +string easily (this will become more useful when performance improves!). +There are now primitives to match beginnings and ends of words, although +the syntax is disgusting and so is the implementation. The REG_ATOI +debugging interface has changed a bit. And there has been considerable +internal cleanup of various kinds. + +New in alpha2.3: Split change list out of README, and moved flags notes +into Makefile. Macro-ized the name of regex(7) in regex(3), since it has +to change for 4.4BSD. Cleanup work in engine.c, and some new regression +tests to catch tricky cases thereof. + +New in alpha2.2: Out-of-date manpages updated. Regerror() acquires two +small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges +in my own test program and might be useful to others for similar purposes. +The regression test will now compile (and run) without REDEBUG. The +BRE \$ bug is fixed. Most uses of "uchar" are gone; it's all chars now. +Char/uchar parameters are now written int/unsigned, to avoid possible +portability problems with unpromoted parameters. Some unsigned casts have +been introduced to minimize portability problems with shifting into sign +bits. + +New in alpha2.1: Lots of little stuff, cleanup and fixes. The one big +thing is that regex.h is now generated, using mkh, rather than being +supplied in the distribution; due to circularities in dependencies, +you have to build regex.h explicitly by "make h". The two known bugs +have been fixed (and the regression test now checks for them), as has a +problem with assertions not being suppressed in the absence of REDEBUG. +No performance work yet. + +New in alpha2: Backslash-anything is an ordinary character, not an +error (except, of course, for the handful of backslashed metacharacters +in BREs), which should reduce script breakage. The regression test +checks *where* null strings are supposed to match, and has generally +been tightened up somewhat. Small bug fixes in parameter passing (not +harmful, but technically errors) and some other areas. Debugging +invoked by defining REDEBUG rather than not defining NDEBUG. + +New in alpha+3: full prototyping for internal routines, using a little +helper program, mkh, which extracts prototypes given in stylized comments. +More minor cleanup. Buglet fix: it's CHAR_BIT, not CHAR_BITS. Simple +pre-screening of input when a literal string is known to be part of the +RE; this does wonders for performance. + +New in alpha+2: minor bits of cleanup. Notably, the number "32" for the +word width isn't hardwired into regexec.c any more, the public header +file prototypes the functions if __STDC__ is defined, and some small typos +in the manpages have been fixed. + +New in alpha+1: improvements to the manual pages, and an important +extension, the REG_STARTEND option to regexec(). diff --git a/src/regex/cclass.h b/src/regex/cclass.h new file mode 100644 index 0000000000..0c293028e9 --- /dev/null +++ b/src/regex/cclass.h @@ -0,0 +1,31 @@ +/* character-class table */ +static struct cclass { + char *name; + char *chars; + char *multis; +} cclasses[] = { + "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789", "", + "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", + "", + "blank", " \t", "", + "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ +\25\26\27\30\31\32\33\34\35\36\37\177", "", + "digit", "0123456789", "", + "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + "", + "lower", "abcdefghijklmnopqrstuvwxyz", + "", + "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", + "", + "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + "", + "space", "\t\n\v\f\r ", "", + "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + "", + "xdigit", "0123456789ABCDEFabcdef", + "", + NULL, 0, "" +}; diff --git a/src/regex/cname.h b/src/regex/cname.h new file mode 100644 index 0000000000..02e86e912e --- /dev/null +++ b/src/regex/cname.h @@ -0,0 +1,102 @@ +/* character-name table */ +static struct cname { + char *name; + char code; +} cnames[] = { + "NUL", '\0', + "SOH", '\001', + "STX", '\002', + "ETX", '\003', + "EOT", '\004', + "ENQ", '\005', + "ACK", '\006', + "BEL", '\007', + "alert", '\007', + "BS", '\010', + "backspace", '\b', + "HT", '\011', + "tab", '\t', + "LF", '\012', + "newline", '\n', + "VT", '\013', + "vertical-tab", '\v', + "FF", '\014', + "form-feed", '\f', + "CR", '\015', + "carriage-return", '\r', + "SO", '\016', + "SI", '\017', + "DLE", '\020', + "DC1", '\021', + "DC2", '\022', + "DC3", '\023', + "DC4", '\024', + "NAK", '\025', + "SYN", '\026', + "ETB", '\027', + "CAN", '\030', + "EM", '\031', + "SUB", '\032', + "ESC", '\033', + "IS4", '\034', + "FS", '\034', + "IS3", '\035', + "GS", '\035', + "IS2", '\036', + "RS", '\036', + "IS1", '\037', + "US", '\037', + "space", ' ', + "exclamation-mark", '!', + "quotation-mark", '"', + "number-sign", '#', + "dollar-sign", '$', + "percent-sign", '%', + "ampersand", '&', + "apostrophe", '\'', + "left-parenthesis", '(', + "right-parenthesis", ')', + "asterisk", '*', + "plus-sign", '+', + "comma", ',', + "hyphen", '-', + "hyphen-minus", '-', + "period", '.', + "full-stop", '.', + "slash", '/', + "solidus", '/', + "zero", '0', + "one", '1', + "two", '2', + "three", '3', + "four", '4', + "five", '5', + "six", '6', + "seven", '7', + "eight", '8', + "nine", '9', + "colon", ':', + "semicolon", ';', + "less-than-sign", '<', + "equals-sign", '=', + "greater-than-sign", '>', + "question-mark", '?', + "commercial-at", '@', + "left-square-bracket", '[', + "backslash", '\\', + "reverse-solidus", '\\', + "right-square-bracket", ']', + "circumflex", '^', + "circumflex-accent", '^', + "underscore", '_', + "low-line", '_', + "grave-accent", '`', + "left-brace", '{', + "left-curly-bracket", '{', + "vertical-line", '|', + "right-brace", '}', + "right-curly-bracket", '}', + "tilde", '~', + "DEL", '\177', + NULL, 0, +}; diff --git a/src/regex/mkh b/src/regex/mkh new file mode 100644 index 0000000000..252b246c7b --- /dev/null +++ b/src/regex/mkh @@ -0,0 +1,76 @@ +#! /bin/sh +# mkh - pull headers out of C source +PATH=/bin:/usr/bin ; export PATH + +# egrep pattern to pick out marked lines +egrep='^ =([ ]|$)' + +# Sed program to process marked lines into lines for the header file. +# The markers have already been removed. Two things are done here: removal +# of backslashed newlines, and some fudging of comments. The first is done +# because -o needs to have prototypes on one line to strip them down. +# Getting comments into the output is tricky; we turn C++-style // comments +# into /* */ comments, after altering any existing */'s to avoid trouble. +peel=' /\\$/N + /\\\n[ ]*/s///g + /\/\//s;\*/;* /;g + /\/\//s;//\(.*\);/*\1 */;' + +for a +do + case "$a" in + -o) # old (pre-function-prototype) compiler + # add code to comment out argument lists + peel="$peel + "'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1(/*\2*/);' + shift + ;; + -b) # funny Berkeley __P macro + peel="$peel + "'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1 __P((\2));' + shift + ;; + -s) # compiler doesn't like `static foo();' + # add code to get rid of the `static' + peel="$peel + "'/^static[ ][^\/]*[a-zA-Z0-9_)](.*)/s;static.;;' + shift + ;; + -p) # private declarations + egrep='^ ==([ ]|$)' + shift + ;; + -i) # wrap in #ifndef, argument is name + ifndef="$2" + shift ; shift + ;; + *) break + ;; + esac +done + +if test " $ifndef" != " " +then + echo "#ifndef $ifndef" + echo "#define $ifndef /* never again */" +fi +echo "/* ========= begin header generated by $0 ========= */" +echo '#ifdef __cplusplus' +echo 'extern "C" {' +echo '#endif' +for f +do + echo + echo "/* === $f === */" + egrep "$egrep" $f | sed 's/^ ==*[ ]//;s/^ ==*$//' | sed "$peel" + echo +done +echo '#ifdef __cplusplus' +echo '}' +echo '#endif' +echo "/* ========= end header generated by $0 ========= */" +if test " $ifndef" != " " +then + echo "#endif" +fi +exit 0 diff --git a/src/regex/regerror.c b/src/regex/regerror.c new file mode 100644 index 0000000000..e53dafcfbf --- /dev/null +++ b/src/regex/regerror.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "regerror.ih" + +/* + = #define REG_OKAY 0 + = #define REG_NOMATCH 1 + = #define REG_BADPAT 2 + = #define REG_ECOLLATE 3 + = #define REG_ECTYPE 4 + = #define REG_EESCAPE 5 + = #define REG_ESUBREG 6 + = #define REG_EBRACK 7 + = #define REG_EPAREN 8 + = #define REG_EBRACE 9 + = #define REG_BADBR 10 + = #define REG_ERANGE 11 + = #define REG_ESPACE 12 + = #define REG_BADRPT 13 + = #define REG_EMPTY 14 + = #define REG_ASSERT 15 + = #define REG_INVARG 16 + = #define REG_ATOI 255 // convert name to number (!) + = #define REG_ITOA 0400 // convert number to name (!) + */ +static struct rerr { + int code; + char *name; + char *explain; +} rerrs[] = { + REG_OKAY, "REG_OKAY", "no errors detected", + REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match", + REG_BADPAT, "REG_BADPAT", "invalid regular expression", + REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element", + REG_ECTYPE, "REG_ECTYPE", "invalid character class", + REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)", + REG_ESUBREG, "REG_ESUBREG", "invalid backreference number", + REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced", + REG_EPAREN, "REG_EPAREN", "parentheses not balanced", + REG_EBRACE, "REG_EBRACE", "braces not balanced", + REG_BADBR, "REG_BADBR", "invalid repetition count(s)", + REG_ERANGE, "REG_ERANGE", "invalid character range", + REG_ESPACE, "REG_ESPACE", "out of memory", + REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid", + REG_EMPTY, "REG_EMPTY", "empty (sub)expression", + REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug", + REG_INVARG, "REG_INVARG", "invalid argument to regex routine", + -1, "", "*** unknown regexp error code ***", +}; + +/* + - regerror - the interface to error numbers + = extern size_t regerror(int, const regex_t *, char *, size_t); + */ +/* ARGSUSED */ +size_t +regerror(errcode, preg, errbuf, errbuf_size) +int errcode; +const regex_t *preg; +char *errbuf; +size_t errbuf_size; +{ + register struct rerr *r; + register size_t len; + register int target = errcode &~ REG_ITOA; + register char *s; + char convbuf[50]; + + if (errcode == REG_ATOI) + s = regatoi(preg, convbuf); + else { + for (r = rerrs; r->code >= 0; r++) + if (r->code == target) + break; + + if (errcode®_ITOA) { + if (r->code >= 0) + (void) strcpy(convbuf, r->name); + else + sprintf(convbuf, "REG_0x%x", target); + assert(strlen(convbuf) < sizeof(convbuf)); + s = convbuf; + } else + s = r->explain; + } + + len = strlen(s) + 1; + if (errbuf_size > 0) { + if (errbuf_size > len) + (void) strcpy(errbuf, s); + else { + (void) strncpy(errbuf, s, errbuf_size-1); + errbuf[errbuf_size-1] = '\0'; + } + } + + return(len); +} + +/* + - regatoi - internal routine to implement REG_ATOI + == static char *regatoi(const regex_t *preg, char *localbuf); + */ +static char * +regatoi(preg, localbuf) +const regex_t *preg; +char *localbuf; +{ + register struct rerr *r; + + for (r = rerrs; r->code >= 0; r++) + if (strcmp(r->name, preg->re_endp) == 0) + break; + if (r->code < 0) + return("0"); + + sprintf(localbuf, "%d", r->code); + return(localbuf); +} diff --git a/src/regex/regex.3 b/src/regex/regex.3 new file mode 100644 index 0000000000..bc747096d6 --- /dev/null +++ b/src/regex/regex.3 @@ -0,0 +1,509 @@ +.TH REGEX 3 "25 Sept 1997" +.BY "Henry Spencer" +.de ZR +.\" one other place knows this name: the SEE ALSO section +.IR regex (7) \\$1 +.. +.SH NAME +regcomp, regexec, regerror, regfree \- regular-expression library +.SH SYNOPSIS +.ft B +.\".na +#include +.br +#include +.HP 10 +int regcomp(regex_t\ *preg, const\ char\ *pattern, int\ cflags); +.HP +int\ regexec(const\ regex_t\ *preg, const\ char\ *string, +size_t\ nmatch, regmatch_t\ pmatch[], int\ eflags); +.HP +size_t\ regerror(int\ errcode, const\ regex_t\ *preg, +char\ *errbuf, size_t\ errbuf_size); +.HP +void\ regfree(regex_t\ *preg); +.\".ad +.ft +.SH DESCRIPTION +These routines implement POSIX 1003.2 regular expressions (``RE''s); +see +.ZR . +.I Regcomp +compiles an RE written as a string into an internal form, +.I regexec +matches that internal form against a string and reports results, +.I regerror +transforms error codes from either into human-readable messages, +and +.I regfree +frees any dynamically-allocated storage used by the internal form +of an RE. +.PP +The header +.I +declares two structure types, +.I regex_t +and +.IR regmatch_t , +the former for compiled internal forms and the latter for match reporting. +It also declares the four functions, +a type +.IR regoff_t , +and a number of constants with names starting with ``REG_''. +.PP +.I Regcomp +compiles the regular expression contained in the +.I pattern +string, +subject to the flags in +.IR cflags , +and places the results in the +.I regex_t +structure pointed to by +.IR preg . +.I Cflags +is the bitwise OR of zero or more of the following flags: +.IP REG_EXTENDED \w'REG_EXTENDED'u+2n +Compile modern (``extended'') REs, +rather than the obsolete (``basic'') REs that +are the default. +.IP REG_BASIC +This is a synonym for 0, +provided as a counterpart to REG_EXTENDED to improve readability. +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +.IP REG_NOSPEC +Compile with recognition of all special characters turned off. +All characters are thus considered ordinary, +so the ``RE'' is a literal string. +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +REG_EXTENDED and REG_NOSPEC may not be used +in the same call to +.IR regcomp . +.IP REG_ICASE +Compile for matching that ignores upper/lower case distinctions. +See +.ZR . +.IP REG_NOSUB +Compile for matching that need only report success or failure, +not what was matched. +.IP REG_NEWLINE +Compile for newline-sensitive matching. +By default, newline is a completely ordinary character with no special +meaning in either REs or strings. +With this flag, +`[^' bracket expressions and `.' never match newline, +a `^' anchor matches the null string after any newline in the string +in addition to its normal function, +and the `$' anchor matches the null string before any newline in the +string in addition to its normal function. +.IP REG_PEND +The regular expression ends, +not at the first NUL, +but just before the character pointed to by the +.I re_endp +member of the structure pointed to by +.IR preg . +The +.I re_endp +member is of type +.IR const\ char\ * . +This flag permits inclusion of NULs in the RE; +they are considered ordinary characters. +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +.PP +When successful, +.I regcomp +returns 0 and fills in the structure pointed to by +.IR preg . +One member of that structure +(other than +.IR re_endp ) +is publicized: +.IR re_nsub , +of type +.IR size_t , +contains the number of parenthesized subexpressions within the RE +(except that the value of this member is undefined if the +REG_NOSUB flag was used). +If +.I regcomp +fails, it returns a non-zero error code; +see DIAGNOSTICS. +.PP +.I Regexec +matches the compiled RE pointed to by +.I preg +against the +.IR string , +subject to the flags in +.IR eflags , +and reports results using +.IR nmatch , +.IR pmatch , +and the returned value. +The RE must have been compiled by a previous invocation of +.IR regcomp . +The compiled form is not altered during execution of +.IR regexec , +so a single compiled RE can be used simultaneously by multiple threads. +.PP +By default, +the NUL-terminated string pointed to by +.I string +is considered to be the text of an entire line, +with the NUL indicating the end of the line. +(That is, +any other end-of-line marker is considered to have been removed +and replaced by the NUL.) +The +.I eflags +argument is the bitwise OR of zero or more of the following flags: +.IP REG_NOTBOL \w'REG_STARTEND'u+2n +The first character of +the string +is not the beginning of a line, so the `^' anchor should not match before it. +This does not affect the behavior of newlines under REG_NEWLINE. +.IP REG_NOTEOL +The NUL terminating +the string +does not end a line, so the `$' anchor should not match before it. +This does not affect the behavior of newlines under REG_NEWLINE. +.IP REG_STARTEND +The string is considered to start at +\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR +and to have a terminating NUL located at +\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR +(there need not actually be a NUL at that location), +regardless of the value of +.IR nmatch . +See below for the definition of +.IR pmatch +and +.IR nmatch . +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL; +REG_STARTEND affects only the location of the string, +not how it is matched. +.PP +See +.ZR +for a discussion of what is matched in situations where an RE or a +portion thereof could match any of several substrings of +.IR string . +.PP +Normally, +.I regexec +returns 0 for success and the non-zero code REG_NOMATCH for failure. +Other non-zero error codes may be returned in exceptional situations; +see DIAGNOSTICS. +.PP +If REG_NOSUB was specified in the compilation of the RE, +or if +.I nmatch +is 0, +.I regexec +ignores the +.I pmatch +argument (but see below for the case where REG_STARTEND is specified). +Otherwise, +.I pmatch +points to an array of +.I nmatch +structures of type +.IR regmatch_t . +Such a structure has at least the members +.I rm_so +and +.IR rm_eo , +both of type +.I regoff_t +(a signed arithmetic type at least as large as an +.I off_t +and a +.IR ssize_t ), +containing respectively the offset of the first character of a substring +and the offset of the first character after the end of the substring. +Offsets are measured from the beginning of the +.I string +argument given to +.IR regexec . +An empty substring is denoted by equal offsets, +both indicating the character following the empty substring. +.PP +The 0th member of the +.I pmatch +array is filled in to indicate what substring of +.I string +was matched by the entire RE. +Remaining members report what substring was matched by parenthesized +subexpressions within the RE; +member +.I i +reports subexpression +.IR i , +with subexpressions counted (starting at 1) by the order of their opening +parentheses in the RE, left to right. +Unused entries in the array\(emcorresponding either to subexpressions that +did not participate in the match at all, or to subexpressions that do not +exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both +.I rm_so +and +.I rm_eo +set to \-1. +If a subexpression participated in the match several times, +the reported substring is the last one it matched. +(Note, as an example in particular, that when the RE `(b*)+' matches `bbb', +the parenthesized subexpression matches the three `b's and then +an infinite number of empty strings following the last `b', +so the reported substring is one of the empties.) +.PP +If REG_STARTEND is specified, +.I pmatch +must point to at least one +.I regmatch_t +(even if +.I nmatch +is 0 or REG_NOSUB was specified), +to hold the input offsets for REG_STARTEND. +Use for output is still entirely controlled by +.IR nmatch ; +if +.I nmatch +is 0 or REG_NOSUB was specified, +the value of +.IR pmatch [0] +will not be changed by a successful +.IR regexec . +.PP +.I Regerror +maps a non-zero +.I errcode +from either +.I regcomp +or +.I regexec +to a human-readable, printable message. +If +.I preg +is non-NULL, +the error code should have arisen from use of +the +.I regex_t +pointed to by +.IR preg , +and if the error code came from +.IR regcomp , +it should have been the result from the most recent +.I regcomp +using that +.IR regex_t . +.RI ( Regerror +may be able to supply a more detailed message using information +from the +.IR regex_t .) +.I Regerror +places the NUL-terminated message into the buffer pointed to by +.IR errbuf , +limiting the length (including the NUL) to at most +.I errbuf_size +bytes. +If the whole message won't fit, +as much of it as will fit before the terminating NUL is supplied. +In any case, +the returned value is the size of buffer needed to hold the whole +message (including terminating NUL). +If +.I errbuf_size +is 0, +.I errbuf +is ignored but the return value is still correct. +.PP +If the +.I errcode +given to +.I regerror +is first ORed with REG_ITOA, +the ``message'' that results is the printable name of the error code, +e.g. ``REG_NOMATCH'', +rather than an explanation thereof. +If +.I errcode +is REG_ATOI, +then +.I preg +shall be non-NULL and the +.I re_endp +member of the structure it points to +must point to the printable name of an error code; +in this case, the result in +.I errbuf +is the decimal digits of +the numeric value of the error code +(0 if the name is not recognized). +REG_ITOA and REG_ATOI are intended primarily as debugging facilities; +they are extensions, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +Be warned also that they are considered experimental and changes are possible. +.PP +.I Regfree +frees any dynamically-allocated storage associated with the compiled RE +pointed to by +.IR preg . +The remaining +.I regex_t +is no longer a valid compiled RE +and the effect of supplying it to +.I regexec +or +.I regerror +is undefined. +.PP +None of these functions references global variables except for tables +of constants; +all are safe for use from multiple threads if the arguments are safe. +.SH IMPLEMENTATION CHOICES +There are a number of decisions that 1003.2 leaves up to the implementor, +either by explicitly saying ``undefined'' or by virtue of them being +forbidden by the RE grammar. +This implementation treats them as follows. +.PP +See +.ZR +for a discussion of the definition of case-independent matching. +.PP +There is no particular limit on the length of REs, +except insofar as memory is limited. +Memory usage is approximately linear in RE size, and largely insensitive +to RE complexity, except for bounded repetitions. +See BUGS for one short RE using them +that will run almost any system out of memory. +.PP +A backslashed character other than one specifically given a magic meaning +by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs) +is taken as an ordinary character. +.PP +Any unmatched [ is a REG_EBRACK error. +.PP +Equivalence classes cannot begin or end bracket-expression ranges. +The endpoint of one range cannot begin another. +.PP +RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255. +.PP +A repetition operator (?, *, +, or bounds) cannot follow another +repetition operator. +A repetition operator cannot begin an expression or subexpression +or follow `^' or `|'. +.PP +`|' cannot appear first or last in a (sub)expression or after another `|', +i.e. an operand of `|' cannot be an empty subexpression. +An empty parenthesized subexpression, `()', is legal and matches an +empty (sub)string. +An empty string is not a legal RE. +.PP +A `{' followed by a digit is considered the beginning of bounds for a +bounded repetition, which must then follow the syntax for bounds. +A `{' \fInot\fR followed by a digit is considered an ordinary character. +.PP +`^' and `$' beginning and ending subexpressions in obsolete (``basic'') +REs are anchors, not ordinary characters. +.SH SEE ALSO +grep(1), regex(7) +.PP +POSIX 1003.2, sections 2.8 (Regular Expression Notation) +and +B.5 (C Binding for Regular Expression Matching). +.SH DIAGNOSTICS +Non-zero error codes from +.I regcomp +and +.I regexec +include the following: +.PP +.nf +.ta \w'REG_ECOLLATE'u+3n +REG_NOMATCH regexec() failed to match +REG_BADPAT invalid regular expression +REG_ECOLLATE invalid collating element +REG_ECTYPE invalid character class +REG_EESCAPE \e applied to unescapable character +REG_ESUBREG invalid backreference number +REG_EBRACK brackets [ ] not balanced +REG_EPAREN parentheses ( ) not balanced +REG_EBRACE braces { } not balanced +REG_BADBR invalid repetition count(s) in { } +REG_ERANGE invalid character range in [ ] +REG_ESPACE ran out of memory +REG_BADRPT ?, *, or + operand invalid +REG_EMPTY empty (sub)expression +REG_ASSERT ``can't happen''\(emyou found a bug +REG_INVARG invalid argument, e.g. negative-length string +.fi +.SH HISTORY +Written by Henry Spencer, +henry@zoo.toronto.edu. +.SH BUGS +This is an alpha release with known defects. +Please report problems. +.PP +There is one known functionality bug. +The implementation of internationalization is incomplete: +the locale is always assumed to be the default one of 1003.2, +and only the collating elements etc. of that locale are available. +.PP +The back-reference code is subtle and doubts linger about its correctness +in complex cases. +.PP +.I Regexec +performance is poor. +This will improve with later releases. +.I Nmatch +exceeding 0 is expensive; +.I nmatch +exceeding 1 is worse. +.I Regexec +is largely insensitive to RE complexity \fIexcept\fR that back +references are massively expensive. +RE length does matter; in particular, there is a strong speed bonus +for keeping RE length under about 30 characters, +with most special characters counting roughly double. +.PP +.I Regcomp +implements bounded repetitions by macro expansion, +which is costly in time and space if counts are large +or bounded repetitions are nested. +An RE like, say, +`((((a{1,100}){1,100}){1,100}){1,100}){1,100}' +will (eventually) run almost any existing machine out of swap space. +.PP +There are suspected problems with response to obscure error conditions. +Notably, +certain kinds of internal overflow, +produced only by truly enormous REs or by multiply nested bounded repetitions, +are probably not handled well. +.PP +Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is +a special character only in the presence of a previous unmatched `('. +This can't be fixed until the spec is fixed. +.PP +The standard's definition of back references is vague. +For example, does +`a\e(\e(b\e)*\e2\e)*d' match `abbbd'? +Until the standard is clarified, +behavior in such cases should not be relied on. +.PP +The implementation of word-boundary matching is a bit of a kludge, +and bugs may lurk in combinations of word-boundary matching and anchoring. diff --git a/src/regex/regex.7 b/src/regex/regex.7 new file mode 100644 index 0000000000..0fa180269e --- /dev/null +++ b/src/regex/regex.7 @@ -0,0 +1,235 @@ +.TH REGEX 7 "25 Oct 1995" +.BY "Henry Spencer" +.SH NAME +regex \- POSIX 1003.2 regular expressions +.SH DESCRIPTION +Regular expressions (``RE''s), +as defined in POSIX 1003.2, come in two forms: +modern REs (roughly those of +.IR egrep ; +1003.2 calls these ``extended'' REs) +and obsolete REs (roughly those of +.IR ed ; +1003.2 ``basic'' REs). +Obsolete REs mostly exist for backward compatibility in some old programs; +they will be discussed at the end. +1003.2 leaves some aspects of RE syntax and semantics open; +`\(dg' marks decisions on these aspects that +may not be fully portable to other 1003.2 implementations. +.PP +A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR, +separated by `|'. +It matches anything that matches one of the branches. +.PP +A branch is one\(dg or more \fIpieces\fR, concatenated. +It matches a match for the first, followed by a match for the second, etc. +.PP +A piece is an \fIatom\fR possibly followed +by a single\(dg `*', `+', `?', or \fIbound\fR. +An atom followed by `*' matches a sequence of 0 or more matches of the atom. +An atom followed by `+' matches a sequence of 1 or more matches of the atom. +An atom followed by `?' matches a sequence of 0 or 1 matches of the atom. +.PP +A \fIbound\fR is `{' followed by an unsigned decimal integer, +possibly followed by `,' +possibly followed by another unsigned decimal integer, +always followed by `}'. +The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive, +and if there are two of them, the first may not exceed the second. +An atom followed by a bound containing one integer \fIi\fR +and no comma matches +a sequence of exactly \fIi\fR matches of the atom. +An atom followed by a bound +containing one integer \fIi\fR and a comma matches +a sequence of \fIi\fR or more matches of the atom. +An atom followed by a bound +containing two integers \fIi\fR and \fIj\fR matches +a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom. +.PP +An atom is a regular expression enclosed in `()' (matching a match for the +regular expression), +an empty set of `()' (matching the null string)\(dg, +a \fIbracket expression\fR (see below), `.' +(matching any single character), `^' (matching the null string at the +beginning of a line), `$' (matching the null string at the +end of a line), a `\e' followed by one of the characters +`^.[$()|*+?{\e' +(matching that character taken as an ordinary character), +a `\e' followed by any other character\(dg +(matching that character taken as an ordinary character, +as if the `\e' had not been present\(dg), +or a single character with no other significance (matching that character). +A `{' followed by a character other than a digit is an ordinary +character, not the beginning of a bound\(dg. +It is illegal to end an RE with `\e'. +.PP +A \fIbracket expression\fR is a list of characters enclosed in `[]'. +It normally matches any single character from the list (but see below). +If the list begins with `^', +it matches any single character +(but see below) \fInot\fR from the rest of the list. +If two characters in the list are separated by `\-', this is shorthand +for the full \fIrange\fR of characters between those two (inclusive) in the +collating sequence, +e.g. `[0\-9]' in ASCII matches any decimal digit. +It is illegal\(dg for two ranges to share an +endpoint, e.g. `a\-c\-e'. +Ranges are very collating-sequence-dependent, +and portable programs should avoid relying on them. +.PP +To include a literal `]' in the list, make it the first character +(following a possible `^'). +To include a literal `\-', make it the first or last character, +or the second endpoint of a range. +To use a literal `\-' as the first endpoint of a range, +enclose it in `[.' and `.]' to make it a collating element (see below). +With the exception of these and some combinations using `[' (see next +paragraphs), all other special characters, including `\e', lose their +special significance within a bracket expression. +.PP +Within a bracket expression, a collating element (a character, +a multi-character sequence that collates as if it were a single character, +or a collating-sequence name for either) +enclosed in `[.' and `.]' stands for the +sequence of characters of that collating element. +The sequence is a single element of the bracket expression's list. +A bracket expression containing a multi-character collating element +can thus match more than one character, +e.g. if the collating sequence includes a `ch' collating element, +then the RE `[[.ch.]]*c' matches the first five characters +of `chchcc'. +.PP +Within a bracket expression, a collating element enclosed in `[=' and +`=]' is an equivalence class, standing for the sequences of characters +of all collating elements equivalent to that one, including itself. +(If there are no other equivalent collating elements, +the treatment is as if the enclosing delimiters were `[.' and `.]'.) +For example, if o and \o'o^' are the members of an equivalence class, +then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous. +An equivalence class may not\(dg be an endpoint +of a range. +.PP +Within a bracket expression, the name of a \fIcharacter class\fR enclosed +in `[:' and `:]' stands for the list of all characters belonging to that +class. +Standard character class names are: +.PP +.RS +.nf +.ta 3c 6c 9c +alnum digit punct +alpha graph space +blank lower upper +cntrl print xdigit +.fi +.RE +.PP +These stand for the character classes defined in +.IR ctype (3). +A locale may provide others. +A character class may not be used as an endpoint of a range. +.PP +There are two special cases\(dg of bracket expressions: +the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at +the beginning and end of a word respectively. +A word is defined as a sequence of +word characters +which is neither preceded nor followed by +word characters. +A word character is an +.I alnum +character (as defined by +.IR ctype (3)) +or an underscore. +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +.PP +In the event that an RE could match more than one substring of a given +string, +the RE matches the one starting earliest in the string. +If the RE could match more than one substring starting at that point, +it matches the longest. +Subexpressions also match the longest possible substrings, subject to +the constraint that the whole match be as long as possible, +with subexpressions starting earlier in the RE taking priority over +ones starting later. +Note that higher-level subexpressions thus take priority over +their lower-level component subexpressions. +.PP +Match lengths are measured in characters, not collating elements. +A null string is considered longer than no match at all. +For example, +`bb*' matches the three middle characters of `abbbc', +`(wee|week)(knights|nights)' matches all ten characters of `weeknights', +when `(.*).*' is matched against `abc' the parenthesized subexpression +matches all three characters, and +when `(a*)*' is matched against `bc' both the whole RE and the parenthesized +subexpression match the null string. +.PP +If case-independent matching is specified, +the effect is much as if all case distinctions had vanished from the +alphabet. +When an alphabetic that exists in multiple cases appears as an +ordinary character outside a bracket expression, it is effectively +transformed into a bracket expression containing both cases, +e.g. `x' becomes `[xX]'. +When it appears inside a bracket expression, all case counterparts +of it are added to the bracket expression, so that (e.g.) `[x]' +becomes `[xX]' and `[^x]' becomes `[^xX]'. +.PP +No particular limit is imposed on the length of REs\(dg. +Programs intended to be portable should not employ REs longer +than 256 bytes, +as an implementation can refuse to accept such REs and remain +POSIX-compliant. +.PP +Obsolete (``basic'') regular expressions differ in several respects. +`|', `+', and `?' are ordinary characters and there is no equivalent +for their functionality. +The delimiters for bounds are `\e{' and `\e}', +with `{' and `}' by themselves ordinary characters. +The parentheses for nested subexpressions are `\e(' and `\e)', +with `(' and `)' by themselves ordinary characters. +`^' is an ordinary character except at the beginning of the +RE or\(dg the beginning of a parenthesized subexpression, +`$' is an ordinary character except at the end of the +RE or\(dg the end of a parenthesized subexpression, +and `*' is an ordinary character if it appears at the beginning of the +RE or the beginning of a parenthesized subexpression +(after a possible leading `^'). +Finally, there is one new type of atom, a \fIback reference\fR: +`\e' followed by a non-zero decimal digit \fId\fR +matches the same sequence of characters +matched by the \fId\fRth parenthesized subexpression +(numbering subexpressions by the positions of their opening parentheses, +left to right), +so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'. +.SH SEE ALSO +regex(3) +.PP +POSIX 1003.2, section 2.8 (Regular Expression Notation). +.SH HISTORY +Written by Henry Spencer, based on the 1003.2 spec. +.SH BUGS +Having two kinds of REs is a botch. +.PP +The current 1003.2 spec says that `)' is an ordinary character in +the absence of an unmatched `('; +this was an unintentional result of a wording error, +and change is likely. +Avoid relying on it. +.PP +Back references are a dreadful botch, +posing major problems for efficient implementations. +They are also somewhat vaguely defined +(does +`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?). +Avoid using them. +.PP +1003.2's specification of case-independent matching is vague. +The ``one case implies all cases'' definition given above +is current consensus among implementors as to the right interpretation. +.PP +The syntax for word boundaries is incredibly ugly. diff --git a/src/regex/regfree.c b/src/regex/regfree.c new file mode 100644 index 0000000000..9a6acf1733 --- /dev/null +++ b/src/regex/regfree.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "utils.h" +#include "regex2.h" + +/* + - regfree - free everything + = extern void regfree(regex_t *); + */ +void +regfree(preg) +regex_t *preg; +{ + register struct re_guts *g; + + if (preg->re_magic != MAGIC1) /* oops */ + return; /* nice to complain, but hard */ + + g = preg->re_g; + if (g == NULL || g->magic != MAGIC2) /* oops again */ + return; + preg->re_magic = 0; /* mark it invalid */ + g->magic = 0; /* mark it invalid */ + + if (g->strip != NULL) + free((char *)g->strip); + if (g->sets != NULL) + free((char *)g->sets); + if (g->setbits != NULL) + free((char *)g->setbits); + if (g->must != NULL) + free(g->must); + free((char *)g); +} diff --git a/src/regex/tests b/src/regex/tests new file mode 100644 index 0000000000..e4d928dad6 --- /dev/null +++ b/src/regex/tests @@ -0,0 +1,477 @@ +# regular expression test set +# Lines are at least three fields, separated by one or more tabs. "" stands +# for an empty field. First field is an RE. Second field is flags. If +# C flag given, regcomp() is expected to fail, and the third field is the +# error name (minus the leading REG_). +# +# Otherwise it is expected to succeed, and the third field is the string to +# try matching it against. If there is no fourth field, the match is +# expected to fail. If there is a fourth field, it is the substring that +# the RE is expected to match. If there is a fifth field, it is a comma- +# separated list of what the subexpressions should match, with - indicating +# no match for that one. In both the fourth and fifth fields, a (sub)field +# starting with @ indicates that the (sub)expression is expected to match +# a null string followed by the stuff after the @; this provides a way to +# test where null strings match. The character `N' in REs and strings +# is newline, `S' is space, `T' is tab, `Z' is NUL. +# +# The full list of flags: +# - placeholder, does nothing +# b RE is a BRE, not an ERE +# & try it as both an ERE and a BRE +# C regcomp() error expected, third field is error name +# i REG_ICASE +# m ("mundane") REG_NOSPEC +# s REG_NOSUB (not really testable) +# n REG_NEWLINE +# ^ REG_NOTBOL +# $ REG_NOTEOL +# # REG_STARTEND (see below) +# p REG_PEND +# +# For REG_STARTEND, the start/end offsets are those of the substring +# enclosed in (). + +# basics +a & a a +abc & abc abc +abc|de - abc abc +a|b|c - abc a + +# parentheses and perversions thereof +a(b)c - abc abc +a\(b\)c b abc abc +a( C EPAREN +a( b a( a( +a\( - a( a( +a\( bC EPAREN +a\(b bC EPAREN +a(b C EPAREN +a(b b a(b a(b +# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) +a) - a) a) +) - ) ) +# end gagging (in a just world, those *should* give EPAREN) +a) b a) a) +a\) bC EPAREN +\) bC EPAREN +a()b - ab ab +a\(\)b b ab ab + +# anchoring and REG_NEWLINE +^abc$ & abc abc +a^b - a^b +a^b b a^b a^b +a$b - a$b +a$b b a$b a$b +^ & abc @abc +$ & abc @ +^$ & "" @ +$^ - "" @ +\($\)\(^\) b "" @ +# stop retching, those are legitimate (although disgusting) +^^ - "" @ +$$ - "" @ +b$ & abNc +b$ &n abNc b +^b$ & aNbNc +^b$ &n aNbNc b +^$ &n aNNb @Nb +^$ n abc +^$ n abcN @ +$^ n aNNb @Nb +\($\)\(^\) bn aNNb @Nb +^^ n^ aNNb @Nb +$$ n aNNb @NN +^a ^ a +a$ $ a +^a ^n aNb +^b ^n aNb b +a$ $n bNa +b$ $n bNa b +a*(^b$)c* - b b +a*\(^b$\)c* b b b + +# certain syntax errors and non-errors +| C EMPTY +| b | | +* C BADRPT +* b * * ++ C BADRPT +? C BADRPT +"" &C EMPTY +() - abc @abc +\(\) b abc @abc +a||b C EMPTY +|ab C EMPTY +ab| C EMPTY +(|a)b C EMPTY +(a|)b C EMPTY +(*a) C BADRPT +(+a) C BADRPT +(?a) C BADRPT +({1}a) C BADRPT +\(\{1\}a\) bC BADRPT +(a|*b) C BADRPT +(a|+b) C BADRPT +(a|?b) C BADRPT +(a|{1}b) C BADRPT +^* C BADRPT +^* b * * +^+ C BADRPT +^? C BADRPT +^{1} C BADRPT +^\{1\} bC BADRPT + +# metacharacters, backslashes +a.c & abc abc +a[bc]d & abd abd +a\*c & a*c a*c +a\\b & a\b a\b +a\\\*b & a\*b a\*b +a\bc & abc abc +a\ &C EESCAPE +a\\bc & a\bc a\bc +\{ bC BADRPT +a\[b & a[b a[b +a[b &C EBRACK +# trailing $ is a peculiar special case for the BRE code +a$ & a a +a$ & a$ +a\$ & a +a\$ & a$ a$ +a\\$ & a +a\\$ & a$ +a\\$ & a\$ +a\\$ & a\ a\ + +# back references, ugh +a\(b\)\2c bC ESUBREG +a\(b\1\)c bC ESUBREG +a\(b*\)c\1d b abbcbbd abbcbbd bb +a\(b*\)c\1d b abbcbd +a\(b*\)c\1d b abbcbbbd +^\(.\)\1 b abc +a\([bc]\)\1d b abcdabbd abbd b +a\(\([bc]\)\2\)*d b abbccd abbccd +a\(\([bc]\)\2\)*d b abbcbd +# actually, this next one probably ought to fail, but the spec is unclear +a\(\(b\)*\2\)*d b abbbd abbbd +# here is a case that no NFA implementation does right +\(ab*\)[ab]*\1 b ababaaa ababaaa a +# check out normal matching in the presence of back refs +\(a\)\1bcd b aabcd aabcd +\(a\)\1bc*d b aabcd aabcd +\(a\)\1bc*d b aabd aabd +\(a\)\1bc*d b aabcccd aabcccd +\(a\)\1bc*[ce]d b aabcccd aabcccd +^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd + +# ordinary repetitions +ab*c & abc abc +ab+c - abc abc +ab?c - abc abc +a\(*\)b b a*b a*b +a\(**\)b b ab ab +a\(***\)b bC BADRPT +*a b *a *a +**a b a a +***a bC BADRPT + +# the dreaded bounded repetitions +{ & { { +{abc & {abc {abc +{1 C BADRPT +{1} C BADRPT +a{b & a{b a{b +a{1}b - ab ab +a\{1\}b b ab ab +a{1,}b - ab ab +a\{1,\}b b ab ab +a{1,2}b - aab aab +a\{1,2\}b b aab aab +a{1 C EBRACE +a\{1 bC EBRACE +a{1a C EBRACE +a\{1a bC EBRACE +a{1a} C BADBR +a\{1a\} bC BADBR +a{,2} - a{,2} a{,2} +a\{,2\} bC BADBR +a{,} - a{,} a{,} +a\{,\} bC BADBR +a{1,x} C BADBR +a\{1,x\} bC BADBR +a{1,x C EBRACE +a\{1,x bC EBRACE +a{300} C BADBR +a\{300\} bC BADBR +a{1,0} C BADBR +a\{1,0\} bC BADBR +ab{0,0}c - abcac ac +ab\{0,0\}c b abcac ac +ab{0,1}c - abcac abc +ab\{0,1\}c b abcac abc +ab{0,3}c - abbcac abbc +ab\{0,3\}c b abbcac abbc +ab{1,1}c - acabc abc +ab\{1,1\}c b acabc abc +ab{1,3}c - acabc abc +ab\{1,3\}c b acabc abc +ab{2,2}c - abcabbc abbc +ab\{2,2\}c b abcabbc abbc +ab{2,4}c - abcabbc abbc +ab\{2,4\}c b abcabbc abbc +((a{1,10}){1,10}){1,10} - a a a,a + +# multiple repetitions +a** &C BADRPT +a++ C BADRPT +a?? C BADRPT +a*+ C BADRPT +a*? C BADRPT +a+* C BADRPT +a+? C BADRPT +a?* C BADRPT +a?+ C BADRPT +a{1}{1} C BADRPT +a*{1} C BADRPT +a+{1} C BADRPT +a?{1} C BADRPT +a{1}* C BADRPT +a{1}+ C BADRPT +a{1}? C BADRPT +a*{b} - a{b} a{b} +a\{1\}\{1\} bC BADRPT +a*\{1\} bC BADRPT +a\{1\}* bC BADRPT + +# brackets, and numerous perversions thereof +a[b]c & abc abc +a[ab]c & abc abc +a[^ab]c & adc adc +a[]b]c & a]c a]c +a[[b]c & a[c a[c +a[-b]c & a-c a-c +a[^]b]c & adc adc +a[^-b]c & adc adc +a[b-]c & a-c a-c +a[b &C EBRACK +a[] &C EBRACK +a[1-3]c & a2c a2c +a[3-1]c &C ERANGE +a[1-3-5]c &C ERANGE +a[[.-.]--]c & a-c a-c +a[1- &C ERANGE +a[[. &C EBRACK +a[[.x &C EBRACK +a[[.x. &C EBRACK +a[[.x.] &C EBRACK +a[[.x.]] & ax ax +a[[.x,.]] &C ECOLLATE +a[[.one.]]b & a1b a1b +a[[.notdef.]]b &C ECOLLATE +a[[.].]]b & a]b a]b +a[[:alpha:]]c & abc abc +a[[:notdef:]]c &C ECTYPE +a[[: &C EBRACK +a[[:alpha &C EBRACK +a[[:alpha:] &C EBRACK +a[[:alpha,:] &C ECTYPE +a[[:]:]]b &C ECTYPE +a[[:-:]]b &C ECTYPE +a[[:alph:]] &C ECTYPE +a[[:alphabet:]] &C ECTYPE +[[:alnum:]]+ - -%@a0X- a0X +[[:alpha:]]+ - -%@aX0- aX +[[:blank:]]+ - aSSTb SST +[[:cntrl:]]+ - aNTb NT +[[:digit:]]+ - a019b 019 +[[:graph:]]+ - Sa%bS a%b +[[:lower:]]+ - AabC ab +[[:print:]]+ - NaSbN aSb +[[:punct:]]+ - S%-&T %-& +[[:space:]]+ - aSNTb SNT +[[:upper:]]+ - aBCd BC +[[:xdigit:]]+ - p0f3Cq 0f3C +a[[=b=]]c & abc abc +a[[= &C EBRACK +a[[=b &C EBRACK +a[[=b= &C EBRACK +a[[=b=] &C EBRACK +a[[=b,=]] &C ECOLLATE +a[[=one=]]b & a1b a1b + +# complexities +a(((b)))c - abc abc +a(b|(c))d - abd abd +a(b*|c)d - abbd abbd +# just gotta have one DFA-buster, of course +a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab +# and an inline expansion in case somebody gets tricky +a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab +# and in case somebody just slips in an NFA... +a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights +# fish for anomalies as the number of states passes 32 +12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 +123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 +1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 +12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 +123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 +# and one really big one, beyond any plausible word width +1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 +# fish for problems as brackets go past 8 +[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm +[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo +[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq +[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq + +# subtleties of matching +abc & xabcy abc +a\(b\)?c\1d b acd +aBc i Abc Abc +a[Bc]*d i abBCcd abBCcd +0[[:upper:]]1 &i 0a1 0a1 +0[[:lower:]]1 &i 0A1 0A1 +a[^b]c &i abc +a[^b]c &i aBc +a[^b]c &i adc adc +[a]b[c] - abc abc +[a]b[a] - aba aba +[abc]b[abc] - abc abc +[abc]b[abd] - abd abd +a(b?c)+d - accd accd +(wee|week)(knights|night) - weeknights weeknights +(we|wee|week|frob)(knights|night|day) - weeknights weeknights +a[bc]d - xyzaaabcaababdacd abd +a[ab]c - aaabc abc +abc s abc abc +a* & b @b + +# Let's have some fun -- try to match a C comment. +# first the obvious, which looks okay at first glance... +/\*.*\*/ - /*x*/ /*x*/ +# but... +/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ +# okay, we must not match */ inside; try to do that... +/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ +/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ +# but... +/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ +# and a still fancier version, which does it right (I think)... +/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ +/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ +/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ + +# subexpressions +.* - abc abc - +a(b)(c)d - abcd abcd b,c +a(((b)))c - abc abc b,b,b +a(b|(c))d - abd abd b,- +a(b*|c|e)d - abbd abbd bb +a(b*|c|e)d - acd acd c +a(b*|c|e)d - ad ad @d +a(b?)c - abc abc b +a(b?)c - ac ac @c +a(b+)c - abc abc b +a(b+)c - abbbc abbbc bbb +a(b*)c - ac ac @c +(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de +# the regression tester only asks for 9 subexpressions +a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j +a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k +a([bc]?)c - abc abc b +a([bc]?)c - ac ac @c +a([bc]+)c - abc abc b +a([bc]+)c - abcc abcc bc +a([bc]+)bc - abcbc abcbc bc +a(bb+|b)b - abb abb b +a(bbb+|bb+|b)b - abb abb b +a(bbb+|bb+|b)b - abbb abbb bb +a(bbb+|bb+|b)bb - abbb abbb b +(.*).* - abcdef abcdef abcdef +(a*)* - bc @b @b + +# do we get the right subexpression when it is used more than once? +a(b|c)*d - ad ad - +a(b|c)*d - abcd abcd c +a(b|c)+d - abd abd b +a(b|c)+d - abcd abcd c +a(b|c?)+d - ad ad @d +a(b|c?)+d - abcd abcd @d +a(b|c){0,0}d - ad ad - +a(b|c){0,1}d - ad ad - +a(b|c){0,1}d - abd abd b +a(b|c){0,2}d - ad ad - +a(b|c){0,2}d - abcd abcd c +a(b|c){0,}d - ad ad - +a(b|c){0,}d - abcd abcd c +a(b|c){1,1}d - abd abd b +a(b|c){1,1}d - acd acd c +a(b|c){1,2}d - abd abd b +a(b|c){1,2}d - abcd abcd c +a(b|c){1,}d - abd abd b +a(b|c){1,}d - abcd abcd c +a(b|c){2,2}d - acbd acbd b +a(b|c){2,2}d - abcd abcd c +a(b|c){2,4}d - abcd abcd c +a(b|c){2,4}d - abcbd abcbd b +a(b|c){2,4}d - abcbcd abcbcd c +a(b|c){2,}d - abcd abcd c +a(b|c){2,}d - abcbd abcbd b +a(b+|((c)*))+d - abd abd @d,@d,- +a(b+|((c)*))+d - abcd abcd @d,@d,- + +# check out the STARTEND option +[abc] &# a(b)c b +[abc] &# a(d)c +[abc] &# a(bc)d b +[abc] &# a(dc)d c +. &# a()c +b.*c &# b(bc)c bc +b.* &# b(bc)c bc +.*c &# b(bc)c bc + +# plain strings, with the NOSPEC flag +abc m abc abc +abc m xabcy abc +abc m xyz +a*b m aba*b a*b +a*b m ab +"" mC EMPTY + +# cases involving NULs +aZb & a a +aZb &p a +aZb &p# (aZb) aZb +aZ*b &p# (ab) ab +a.b &# (aZb) aZb +a.* &# (aZb)c aZb + +# word boundaries (ick) +[[:<:]]a & a a +[[:<:]]a & ba +[[:<:]]a & -a a +a[[:>:]] & a a +a[[:>:]] & ab +a[[:>:]] & a- a +[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc +[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc +[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc +[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc +[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ +[[:<:]]a_b[[:>:]] & x_a_b + +# past problems, and suspected problems +(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 +abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop +abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv +(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 +CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 +Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz +a?b - ab ab +-\{0,1\}[0-9]*$ b -5 -5 +a*a*a*a*a*a*a* & aaaaaa aaaaaa