From 38d679fdb55d39e08b90cc703a44ef72273abdb3 Mon Sep 17 00:00:00 2001 From: Ryan Norton Date: Wed, 13 Oct 1999 02:22:17 +0000 Subject: [PATCH] Tcl regex lib git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@3949 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- src/regex/regc_lex.c | 1146 ++++++++++++++++++++++++++++++++++++++++++ src/regex/regerrs.h | 75 +++ src/regex/regguts.h | 417 +++++++++++++++ 3 files changed, 1638 insertions(+) create mode 100644 src/regex/regc_lex.c create mode 100644 src/regex/regerrs.h create mode 100644 src/regex/regguts.h diff --git a/src/regex/regc_lex.c b/src/regex/regc_lex.c new file mode 100644 index 0000000000..a24290d1a1 --- /dev/null +++ b/src/regex/regc_lex.c @@ -0,0 +1,1146 @@ +/* + * lexical analyzer + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Header$ + * + */ + +/* scanning macros (know about v) */ +#define ATEOS() (v->now >= v->stop) +#define HAVE(n) (v->stop - v->now >= (n)) +#define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) +#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) +#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \ + *(v->now+1) == CHR(b) && \ + *(v->now+2) == CHR(c)) +#define SET(c) (v->nexttype = (c)) +#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) +#define RET(c) return (SET(c), 1) +#define RETV(c, n) return (SETV(c, n), 1) +#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ +#define LASTTYPE(t) (v->lasttype == (t)) + +/* lexical contexts */ +#define L_ERE 1 /* mainline ERE/ARE */ +#define L_BRE 2 /* mainline BRE */ +#define L_Q 3 /* REG_QUOTE */ +#define L_EBND 4 /* ERE/ARE bound */ +#define L_BBND 5 /* BRE bound */ +#define L_BRACK 6 /* brackets */ +#define L_CEL 7 /* collating element */ +#define L_ECL 8 /* equivalence class */ +#define L_CCL 9 /* character class */ +#define INTOCON(c) (v->lexcon = (c)) +#define INCON(con) (v->lexcon == (con)) + +/* construct pointer past end of chr array */ +#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) + +/* + * lexstart - set up lexical stuff, scan leading options + */ +static void +lexstart(struct vars * v) +{ + prefixes(v); /* may turn on new type bits etc. */ + NOERR(); + + if (v->cflags & REG_QUOTE) + { + assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE))); + INTOCON(L_Q); + } + else if (v->cflags & REG_EXTENDED) + { + assert(!(v->cflags & REG_QUOTE)); + INTOCON(L_ERE); + } + else + { + assert(!(v->cflags & (REG_QUOTE | REG_ADVF))); + INTOCON(L_BRE); + } + + v->nexttype = EMPTY; /* remember we were at the start */ + next(v); /* set up the first token */ +} + +/* + * prefixes - implement various special prefixes + */ +static void +prefixes(struct vars * v) +{ + /* literal string doesn't get any of this stuff */ + if (v->cflags & REG_QUOTE) + return; + + /* initial "***" gets special things */ + if (HAVE(4) && NEXT3('*', '*', '*')) + switch (*(v->now + 3)) + { + case CHR('?'): /* "***?" error, msg shows version */ + ERR(REG_BADPAT); + return; /* proceed no further */ + break; + case CHR('='): /* "***=" shifts to literal string */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_QUOTE; + v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE); + v->now += 4; + return; /* and there can be no more prefixes */ + break; + case CHR(':'): /* "***:" shifts to AREs */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_ADVANCED; + v->now += 4; + break; + default: /* otherwise *** is just an error */ + ERR(REG_BADRPT); + return; + break; + } + + /* BREs and EREs don't get embedded options */ + if ((v->cflags & REG_ADVANCED) != REG_ADVANCED) + return; + + /* embedded options (AREs only) */ + if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) + { + NOTE(REG_UNONPOSIX); + v->now += 2; + for (; !ATEOS() && iscalpha(*v->now); v->now++) + switch (*v->now) + { + case CHR('b'): /* BREs (but why???) */ + v->cflags &= ~(REG_ADVANCED | REG_QUOTE); + break; + case CHR('c'): /* case sensitive */ + v->cflags &= ~REG_ICASE; + break; + case CHR('e'): /* plain EREs */ + v->cflags |= REG_EXTENDED; + v->cflags &= ~(REG_ADVF | REG_QUOTE); + break; + case CHR('i'): /* case insensitive */ + v->cflags |= REG_ICASE; + break; + case CHR('m'): /* Perloid synonym for n */ + case CHR('n'): /* \n affects ^ $ . [^ */ + v->cflags |= REG_NEWLINE; + break; + case CHR('p'): /* ~Perl, \n affects . [^ */ + v->cflags |= REG_NLSTOP; + v->cflags &= ~REG_NLANCH; + break; + case CHR('q'): /* literal string */ + v->cflags |= REG_QUOTE; + v->cflags &= ~REG_ADVANCED; + break; + case CHR('s'): /* single line, \n ordinary */ + v->cflags &= ~REG_NEWLINE; + break; + case CHR('t'): /* tight syntax */ + v->cflags &= ~REG_EXPANDED; + break; + case CHR('w'): /* weird, \n affects ^ $ only */ + v->cflags &= ~REG_NLSTOP; + v->cflags |= REG_NLANCH; + break; + case CHR('x'): /* expanded syntax */ + v->cflags |= REG_EXPANDED; + break; + default: + ERR(REG_BADOPT); + return; + } + if (!NEXT1(')')) + { + ERR(REG_BADOPT); + return; + } + v->now++; + if (v->cflags & REG_QUOTE) + v->cflags &= ~(REG_EXPANDED | REG_NEWLINE); + } +} + +/* + * lexnest - "call a subroutine", interpolating string at the lexical level + * + * Note, this is not a very general facility. There are a number of + * implicit assumptions about what sorts of strings can be subroutines. + */ +static void +lexnest(struct vars * v, + chr *beginp, /* start of interpolation */ + chr *endp) /* one past end of interpolation */ +{ + assert(v->savenow == NULL); /* only one level of nesting */ + v->savenow = v->now; + v->savestop = v->stop; + v->now = beginp; + v->stop = endp; +} + +/* + * string constants to interpolate as expansions of things like \d + */ +static chr backd[] = { /* \d */ + CHR('['), CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']'), CHR(']') +}; +static chr backD[] = { /* \D */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']'), CHR(']') +}; +static chr brbackd[] = { /* \d within brackets */ + CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']') +}; +static chr backs[] = { /* \s */ + CHR('['), CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']'), CHR(']') +}; +static chr backS[] = { /* \S */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']'), CHR(']') +}; +static chr brbacks[] = { /* \s within brackets */ + CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']') +}; +static chr backw[] = { /* \w */ + CHR('['), CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), CHR('_'), CHR(']') +}; +static chr backW[] = { /* \W */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), CHR('_'), CHR(']') +}; +static chr brbackw[] = { /* \w within brackets */ + CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), CHR('_') +}; + +/* + * lexword - interpolate a bracket expression for word characters + * Possibly ought to inquire whether there is a "word" character class. + */ +static void +lexword(struct vars * v) +{ + lexnest(v, backw, ENDOF(backw)); +} + +/* + * next - get next token + */ +static int /* 1 normal, 0 failure */ +next(struct vars * v) +{ + chr c; + + /* errors yield an infinite sequence of failures */ + if (ISERR()) + return 0; /* the error has set nexttype to EOS */ + + /* remember flavor of last token */ + v->lasttype = v->nexttype; + + /* REG_BOSONLY */ + if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY)) + { + /* at start of a REG_BOSONLY RE */ + RETV(SBEGIN, 0); /* same as \A */ + } + + /* if we're nested and we've hit end, return to outer level */ + if (v->savenow != NULL && ATEOS()) + { + v->now = v->savenow; + v->stop = v->savestop; + v->savenow = v->savestop = NULL; + } + + /* skip white space etc. if appropriate (not in literal or []) */ + if (v->cflags & REG_EXPANDED) + switch (v->lexcon) + { + case L_ERE: + case L_BRE: + case L_EBND: + case L_BBND: + skip(v); + break; + } + + /* handle EOS, depending on context */ + if (ATEOS()) + { + switch (v->lexcon) + { + case L_ERE: + case L_BRE: + case L_Q: + RET(EOS); + break; + case L_EBND: + case L_BBND: + FAILW(REG_EBRACE); + break; + case L_BRACK: + case L_CEL: + case L_ECL: + case L_CCL: + FAILW(REG_EBRACK); + break; + } + assert(NOTREACHED); + } + + /* okay, time to actually get a character */ + c = *v->now++; + + /* deal with the easy contexts, punt EREs to code below */ + switch (v->lexcon) + { + case L_BRE: /* punt BREs to separate function */ + return brenext(v, c); + break; + case L_ERE: /* see below */ + break; + case L_Q: /* literal strings are easy */ + RETV(PLAIN, c); + break; + case L_BBND: /* bounds are fairly simple */ + case L_EBND: + switch (c) + { + case CHR('0'): + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + RETV(DIGIT, (chr) DIGITVAL(c)); + break; + case CHR(','): + RET(','); + break; + case CHR('}'): /* ERE bound ends with } */ + if (INCON(L_EBND)) + { + INTOCON(L_ERE); + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('}', 0); + } + RETV('}', 1); + } + else + FAILW(REG_BADBR); + break; + case CHR('\\'): /* BRE bound ends with \} */ + if (INCON(L_BBND) && NEXT1('}')) + { + v->now++; + INTOCON(L_BRE); + RET('}'); + } + else + FAILW(REG_BADBR); + break; + default: + FAILW(REG_BADBR); + break; + } + assert(NOTREACHED); + break; + case L_BRACK: /* brackets are not too hard */ + switch (c) + { + case CHR(']'): + if (LASTTYPE('[')) + RETV(PLAIN, c); + else + { + INTOCON((v->cflags & REG_EXTENDED) ? + L_ERE : L_BRE); + RET(']'); + } + break; + case CHR('\\'): + NOTE(REG_UBBS); + if (!(v->cflags & REG_ADVF)) + RETV(PLAIN, c); + NOTE(REG_UNONPOSIX); + if (ATEOS()) + FAILW(REG_EESCAPE); + (DISCARD) lexescape(v); + switch (v->nexttype) + { /* not all escapes okay here */ + case PLAIN: + return 1; + break; + case CCLASS: + switch (v->nextvalue) + { + case 'd': + lexnest(v, brbackd, ENDOF(brbackd)); + break; + case 's': + lexnest(v, brbacks, ENDOF(brbacks)); + break; + case 'w': + lexnest(v, brbackw, ENDOF(brbackw)); + break; + default: + FAILW(REG_EESCAPE); + break; + } + /* lexnest done, back up and try again */ + v->nexttype = v->lasttype; + return next(v); + break; + } + /* not one of the acceptable escapes */ + FAILW(REG_EESCAPE); + break; + case CHR('-'): + if (LASTTYPE('[') || NEXT1(']')) + RETV(PLAIN, c); + else + RETV(RANGE, c); + break; + case CHR('['): + if (ATEOS()) + FAILW(REG_EBRACK); + switch (*v->now++) + { + case CHR('.'): + INTOCON(L_CEL); + /* might or might not be locale-specific */ + RET(COLLEL); + break; + case CHR('='): + INTOCON(L_ECL); + NOTE(REG_ULOCALE); + RET(ECLASS); + break; + case CHR(':'): + INTOCON(L_CCL); + NOTE(REG_ULOCALE); + RET(CCLASS); + break; + default: /* oops */ + v->now--; + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + default: + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + case L_CEL: /* collating elements are easy */ + if (c == CHR('.') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, '.'); + } + else + RETV(PLAIN, c); + break; + case L_ECL: /* ditto equivalence classes */ + if (c == CHR('=') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, '='); + } + else + RETV(PLAIN, c); + break; + case L_CCL: /* ditto character classes */ + if (c == CHR(':') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, ':'); + } + else + RETV(PLAIN, c); + break; + default: + assert(NOTREACHED); + break; + } + + /* that got rid of everything except EREs and AREs */ + assert(INCON(L_ERE)); + + /* deal with EREs and AREs, except for backslashes */ + switch (c) + { + case CHR('|'): + RET('|'); + break; + case CHR('*'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('*', 0); + } + RETV('*', 1); + break; + case CHR('+'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('+', 0); + } + RETV('+', 1); + break; + case CHR('?'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('?', 0); + } + RETV('?', 1); + break; + case CHR('{'): /* bounds start or plain character */ + if (v->cflags & REG_EXPANDED) + skip(v); + if (ATEOS() || !iscdigit(*v->now)) + { + NOTE(REG_UBRACES); + NOTE(REG_UUNSPEC); + RETV(PLAIN, c); + } + else + { + NOTE(REG_UBOUNDS); + INTOCON(L_EBND); + RET('{'); + } + assert(NOTREACHED); + break; + case CHR('('): /* parenthesis, or advanced extension */ + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + NOTE(REG_UNONPOSIX); + v->now++; + switch (*v->now++) + { + case CHR(':'): /* non-capturing paren */ + RETV('(', 0); + break; + case CHR('#'): /* comment */ + while (!ATEOS() && *v->now != CHR(')')) + v->now++; + if (!ATEOS()) + v->now++; + assert(v->nexttype == v->lasttype); + return next(v); + break; + case CHR('='): /* positive lookahead */ + NOTE(REG_ULOOKAHEAD); + RETV(LACON, 1); + break; + case CHR('!'): /* negative lookahead */ + NOTE(REG_ULOOKAHEAD); + RETV(LACON, 0); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); + } + if (v->cflags & REG_NOSUB) + RETV('(', 0); /* all parens non-capturing */ + else + RETV('(', 1); + break; + case CHR(')'): + if (LASTTYPE('(')) + NOTE(REG_UUNSPEC); + RETV(')', c); + break; + case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ + if (HAVE(6) && *(v->now + 0) == CHR('[') && + *(v->now + 1) == CHR(':') && + (*(v->now + 2) == CHR('<') || + *(v->now + 2) == CHR('>')) && + *(v->now + 3) == CHR(':') && + *(v->now + 4) == CHR(']') && + *(v->now + 5) == CHR(']')) + { + c = *(v->now + 2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? '<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) + { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + RET('^'); + break; + case CHR('$'): + RET('$'); + break; + case CHR('\\'): /* mostly punt backslashes to code below */ + if (ATEOS()) + FAILW(REG_EESCAPE); + break; + default: /* ordinary character */ + RETV(PLAIN, c); + break; + } + + /* ERE/ARE backslash handling; backslash already eaten */ + assert(!ATEOS()); + if (!(v->cflags & REG_ADVF)) + { /* only AREs have non-trivial escapes */ + if (iscalnum(*v->now)) + { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, *v->now++); + } + (DISCARD) lexescape(v); + if (ISERR()) + FAILW(REG_EESCAPE); + if (v->nexttype == CCLASS) + { /* fudge at lexical level */ + switch (v->nextvalue) + { + case 'd': + lexnest(v, backd, ENDOF(backd)); + break; + case 'D': + lexnest(v, backD, ENDOF(backD)); + break; + case 's': + lexnest(v, backs, ENDOF(backs)); + break; + case 'S': + lexnest(v, backS, ENDOF(backS)); + break; + case 'w': + lexnest(v, backw, ENDOF(backw)); + break; + case 'W': + lexnest(v, backW, ENDOF(backW)); + break; + default: + assert(NOTREACHED); + FAILW(REG_ASSERT); + break; + } + /* lexnest done, back up and try again */ + v->nexttype = v->lasttype; + return next(v); + } + /* otherwise, lexescape has already done the work */ + return !ISERR(); +} + +/* + * lexescape - parse an ARE backslash escape (backslash already eaten) + * Note slightly nonstandard use of the CCLASS type code. + */ +static int /* not actually used, but convenient for + * RETV */ +lexescape(struct vars * v) +{ + chr c; + static chr alert[] = { + CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') + }; + static chr esc[] = { + CHR('E'), CHR('S'), CHR('C') + }; + chr *save; + + assert(v->cflags & REG_ADVF); + + assert(!ATEOS()); + c = *v->now++; + if (!iscalnum(c)) + RETV(PLAIN, c); + + NOTE(REG_UNONPOSIX); + switch (c) + { + case CHR('a'): + RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); + break; + case CHR('A'): + RETV(SBEGIN, 0); + break; + case CHR('b'): + RETV(PLAIN, CHR('\b')); + break; + case CHR('B'): + RETV(PLAIN, CHR('\\')); + break; + case CHR('c'): + NOTE(REG_UUNPORT); + if (ATEOS()) + FAILW(REG_EESCAPE); + RETV(PLAIN, (chr) (*v->now++ & 037)); + break; + case CHR('d'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'd'); + break; + case CHR('D'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'D'); + break; + case CHR('e'): + NOTE(REG_UUNPORT); + RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); + break; + case CHR('f'): + RETV(PLAIN, CHR('\f')); + break; + case CHR('m'): + RET('<'); + break; + case CHR('M'): + RET('>'); + break; + case CHR('n'): + RETV(PLAIN, CHR('\n')); + break; + case CHR('r'): + RETV(PLAIN, CHR('\r')); + break; + case CHR('s'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 's'); + break; + case CHR('S'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'S'); + break; + case CHR('t'): + RETV(PLAIN, CHR('\t')); + break; + case CHR('u'): + c = lexdigits(v, 16, 4, 4); + if (ISERR()) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('U'): + c = lexdigits(v, 16, 8, 8); + if (ISERR()) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('v'): + RETV(PLAIN, CHR('\v')); + break; + case CHR('w'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'w'); + break; + case CHR('W'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'W'); + break; + case CHR('x'): + NOTE(REG_UUNPORT); + c = lexdigits(v, 16, 1, 255); /* REs >255 long outside + * spec */ + if (ISERR()) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('y'): + NOTE(REG_ULOCALE); + RETV(WBDRY, 0); + break; + case CHR('Y'): + NOTE(REG_ULOCALE); + RETV(NWBDRY, 0); + break; + case CHR('Z'): + RETV(SEND, 0); + break; + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + save = v->now; + v->now--; /* put first digit back */ + c = lexdigits(v, 10, 1, 255); /* REs >255 long outside + * spec */ + if (ISERR()) + FAILW(REG_EESCAPE); + /* ugly heuristic (first test is "exactly 1 digit?") */ + if (v->now - save == 0 || (int) c <= v->nsubexp) + { + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr) c); + } + /* oops, doesn't look like it's a backref after all... */ + v->now = save; + /* and fall through into octal number */ + case CHR('0'): + NOTE(REG_UUNPORT); + v->now--; /* put first digit back */ + c = lexdigits(v, 8, 1, 3); + if (ISERR()) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + default: + assert(iscalpha(c)); + FAILW(REG_EESCAPE); /* unknown alphabetic escape */ + break; + } + assert(NOTREACHED); +} + +/* + * lexdigits - slurp up digits and return chr value + */ +static chr /* chr value; errors signalled via ERR */ +lexdigits(struct vars * v, + int base, + int minlen, + int maxlen) +{ + uchr n; /* unsigned to avoid overflow misbehavior */ + int len; + chr c; + int d; + const uchr ub = (uchr) base; + + n = 0; + for (len = 0; len < maxlen && !ATEOS(); len++) + { + c = *v->now++; + switch (c) + { + case CHR('0'): + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + d = DIGITVAL(c); + break; + case CHR('a'): + case CHR('A'): + d = 10; + break; + case CHR('b'): + case CHR('B'): + d = 11; + break; + case CHR('c'): + case CHR('C'): + d = 12; + break; + case CHR('d'): + case CHR('D'): + d = 13; + break; + case CHR('e'): + case CHR('E'): + d = 14; + break; + case CHR('f'): + case CHR('F'): + d = 15; + break; + default: + v->now--; /* oops, not a digit at all */ + d = -1; + break; + } + + if (d >= base) + { /* not a plausible digit */ + v->now--; + d = -1; + } + if (d < 0) + break; /* NOTE BREAK OUT */ + n = n * ub + (uchr) d; + } + if (len < minlen) + ERR(REG_EESCAPE); + + return (chr) n; +} + +/* + * brenext - get next BRE token + * + * This is much like EREs except for all the stupid backslashes and the + * context-dependency of some things. + */ +static int /* 1 normal, 0 failure */ +brenext(struct vars * v, + chr pc) +{ + chr c = (chr) pc; + + switch (c) + { + case CHR('*'): + if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) + RETV(PLAIN, c); + RET('*'); + break; + case CHR('['): + if (HAVE(6) && *(v->now + 0) == CHR('[') && + *(v->now + 1) == CHR(':') && + (*(v->now + 2) == CHR('<') || + *(v->now + 2) == CHR('>')) && + *(v->now + 3) == CHR(':') && + *(v->now + 4) == CHR(']') && + *(v->now + 5) == CHR(']')) + { + c = *(v->now + 2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? '<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) + { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + if (LASTTYPE(EMPTY)) + RET('^'); + if (LASTTYPE('(')) + { + NOTE(REG_UUNSPEC); + RET('^'); + } + RETV(PLAIN, c); + break; + case CHR('$'): + if (v->cflags & REG_EXPANDED) + skip(v); + if (ATEOS()) + RET('$'); + if (NEXT2('\\', ')')) + { + NOTE(REG_UUNSPEC); + RET('$'); + } + RETV(PLAIN, c); + break; + case CHR('\\'): + break; /* see below */ + default: + RETV(PLAIN, c); + break; + } + + assert(c == CHR('\\')); + + if (ATEOS()) + FAILW(REG_EESCAPE); + + c = *v->now++; + switch (c) + { + case CHR('{'): + INTOCON(L_BBND); + NOTE(REG_UBOUNDS); + RET('{'); + break; + case CHR('('): + RETV('(', 1); + break; + case CHR(')'): + RETV(')', c); + break; + case CHR('<'): + NOTE(REG_UNONPOSIX); + RET('<'); + break; + case CHR('>'): + NOTE(REG_UNONPOSIX); + RET('>'); + break; + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr) DIGITVAL(c)); + break; + default: + if (iscalnum(c)) + { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, c); + break; + } + + assert(NOTREACHED); +} + +/* + * skip - skip white space and comments in expanded form + */ +static void +skip(struct vars * v) +{ + chr *start = v->now; + + assert(v->cflags & REG_EXPANDED); + + for (;;) + { + while (!ATEOS() && iscspace(*v->now)) + v->now++; + if (ATEOS() || *v->now != CHR('#')) + break; /* NOTE BREAK OUT */ + assert(NEXT1('#')); + while (!ATEOS() && *v->now != CHR('\n')) + v->now++; + /* leave the newline to be picked up by the iscspace loop */ + } + + if (v->now != start) + NOTE(REG_UNONPOSIX); +} + +/* + * newline - return the chr for a newline + * + * This helps confine use of CHR to this source file. + */ +static chr +newline(void) +{ + return CHR('\n'); +} + +/* + * chrnamed - return the chr known by a given (chr string) name + * + * The code is a bit clumsy, but this routine gets only such specialized + * use that it hardly matters. + */ +static chr +chrnamed(struct vars * v, + chr *startp, /* start of name */ + chr *endp, /* just past end of name */ + chr lastresort) /* what to return if name lookup fails */ +{ + celt c; + int errsave; + int e; + struct cvec *cv; + + errsave = v->err; + v->err = 0; + c = element(v, startp, endp); + e = v->err; + v->err = errsave; + + if (e != 0) + return (chr) lastresort; + + cv = range(v, c, c, 0); + if (cv->nchrs == 0) + return (chr) lastresort; + return cv->chrs[0]; +} diff --git a/src/regex/regerrs.h b/src/regex/regerrs.h new file mode 100644 index 0000000000..f99dbf4f73 --- /dev/null +++ b/src/regex/regerrs.h @@ -0,0 +1,75 @@ +/* + * $Id$ + */ + +{ + REG_OKAY, "REG_OKAY", "no errors detected" +}, + +{ + REG_NOMATCH, "REG_NOMATCH", "failed to match" +}, + +{ + REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.8)" +}, + +{ + REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" +}, + +{ + REG_ECTYPE, "REG_ECTYPE", "invalid character class" +}, + +{ + REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence" +}, + +{ + REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" +}, + +{ + REG_EBRACK, "REG_EBRACK", "brackets [] not balanced" +}, + +{ + REG_EPAREN, "REG_EPAREN", "parentheses () not balanced" +}, + +{ + REG_EBRACE, "REG_EBRACE", "braces {} not balanced" +}, + +{ + REG_BADBR, "REG_BADBR", "invalid repetition count(s)" +}, + +{ + REG_ERANGE, "REG_ERANGE", "invalid character range" +}, + +{ + REG_ESPACE, "REG_ESPACE", "out of memory" +}, + +{ + REG_BADRPT, "REG_BADRPT", "quantifier operand invalid" +}, + +{ + REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" +}, + +{ + REG_INVARG, "REG_INVARG", "invalid argument to regex function" +}, + +{ + REG_MIXED, "REG_MIXED", "character widths of regex and string differ" +}, + +{ + REG_BADOPT, "REG_BADOPT", "invalid embedded option" +}, diff --git a/src/regex/regguts.h b/src/regex/regguts.h new file mode 100644 index 0000000000..aa12dbf445 --- /dev/null +++ b/src/regex/regguts.h @@ -0,0 +1,417 @@ +/* + * Internal interface definitions, etc., for the reg package + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id$ + */ + + + +/* + * Environmental customization. It should not (I hope) be necessary to + * alter the file you are now reading -- regcustom.h should handle it all, + * given care here and elsewhere. + */ +#include "regcustom.h" + + + +/* + * Things that regcustom.h might override. + */ + +/* assertions */ +#ifndef assert +#ifndef REG_DEBUG +# ifndef NDEBUG +# define NDEBUG /* no assertions */ +# endif +#endif +#include +#endif + +/* voids */ +#ifndef DISCARD +#define DISCARD void /* for throwing values away */ +#endif +#ifndef VS +#define VS(x) ((void *)(x)) /* cast something to generic ptr */ +#endif + +/* function-pointer declarator */ +#ifndef FUNCPTR +#define FUNCPTR(name, args) (*name) args +#endif + +/* memory allocation */ +#ifndef MALLOC +#define MALLOC(n) malloc(n) +#endif +#ifndef REALLOC +#define REALLOC(p, n) realloc(VS(p), n) +#endif +#ifndef FREE +#define FREE(p) free(VS(p)) +#endif + +/* want size of a char in bits, and max value in bounded quantifiers */ +#ifndef CHAR_BIT +#include +#endif +#ifndef _POSIX2_RE_DUP_MAX +#define _POSIX2_RE_DUP_MAX 255 /* normally from */ +#endif + + + +/* + * misc + */ + +#define NOTREACHED 0 +#define xxx 1 + +#define DUPMAX _POSIX2_RE_DUP_MAX +#define INFINITY (DUPMAX+1) + +#define REMAGIC 0xfed7 /* magic number for main struct */ + + + +/* + * debugging facilities + */ +#ifdef REG_DEBUG +/* FDEBUG does finite-state tracing */ +#define FDEBUG(arglist) { if (v->eflags®_FTRACE) printf arglist; } +/* MDEBUG does higher-level tracing */ +#define MDEBUG(arglist) { if (v->eflags®_MTRACE) printf arglist; } +#else +#define FDEBUG(arglist) {} +#define MDEBUG(arglist) {} +#endif + + + +/* + * bitmap manipulation + */ +#define UBITS (CHAR_BIT * sizeof(unsigned)) +#define BSET(uv, sn) ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS)) +#define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) + + + +/* + * We dissect a chr into byts for colormap table indexing. Here we define + * a byt, which will be the same as a byte on most machines... The exact + * size of a byt is not critical, but about 8 bits is good, and extraction + * of 8-bit chunks is sometimes especially fast. + */ +#ifndef BYTBITS +#define BYTBITS 8 /* bits in a byt */ +#endif +#define BYTTAB (1<flags&FREECOL) + union tree *block; /* block of solid color, if any */ +}; + +/* the color map itself */ +struct colormap +{ + int magic; +#define CMMAGIC 0x876 + struct vars *v; /* for compile error reporting */ + size_t ncds; /* number of colordescs */ + size_t max; /* highest in use */ + color free; /* beginning of free chain (if non-0) */ + struct colordesc *cd; +#define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) +#define NINLINECDS ((size_t)10) + struct colordesc cdspace[NINLINECDS]; + union tree tree[NBYTS]; /* tree top, plus fill blocks */ +}; + +/* optimization magic to do fast chr->color mapping */ +#define B0(c) ((c) & BYTMASK) +#define B1(c) (((c)>>BYTBITS) & BYTMASK) +#define B2(c) (((c)>>(2*BYTBITS)) & BYTMASK) +#define B3(c) (((c)>>(3*BYTBITS)) & BYTMASK) +#if NBYTS == 1 +#define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)]) +#endif +/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */ +#if NBYTS == 2 +#define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)]) +#endif +#if NBYTS == 4 +#define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)]) +#endif + + + +/* + * Interface definitions for locale-interface functions in locale.c. + * Multi-character collating elements (MCCEs) cause most of the trouble. + */ +struct cvec +{ + int nchrs; /* number of chrs */ + int chrspace; /* number of chrs possible */ + chr *chrs; /* pointer to vector of chrs */ + int nranges; /* number of ranges (chr pairs) */ + int rangespace; /* number of chrs possible */ + chr *ranges; /* pointer to vector of chr pairs */ + int nmcces; /* number of MCCEs */ + int mccespace; /* number of MCCEs possible */ + int nmccechrs; /* number of chrs used for MCCEs */ + chr *mcces[1]; /* pointers to 0-terminated MCCEs */ + /* and both batches of chrs are on the end */ +}; + +/* caution: this value cannot be changed easily */ +#define MAXMCCE 2 /* length of longest MCCE */ + + + +/* + * definitions for NFA internal representation + * + * Having a "from" pointer within each arc may seem redundant, but it + * saves a lot of hassle. + */ +struct state; + +struct arc +{ + int type; +#define ARCFREE '\0' + color co; + struct state *from; /* where it's from (and contained within) */ + struct state *to; /* where it's to */ + struct arc *outchain; /* *from's outs chain or free chain */ +#define freechain outchain + struct arc *inchain; /* *to's ins chain */ + struct arc *colorchain; /* color's arc chain */ +}; + +struct arcbatch +{ /* for bulk allocation of arcs */ + struct arcbatch *next; +#define ABSIZE 10 + struct arc a[ABSIZE]; +}; + +struct state +{ + int no; +#define FREESTATE (-1) + char flag; /* marks special states */ + int nins; /* number of inarcs */ + struct arc *ins; /* chain of inarcs */ + int nouts; /* number of outarcs */ + struct arc *outs; /* chain of outarcs */ + struct arc *free; /* chain of free arcs */ + struct state *tmp; /* temporary for traversal algorithms */ + struct state *next; /* chain for traversing all */ + struct state *prev; /* back chain */ + struct arcbatch oas; /* first arcbatch, avoid malloc in easy + * case */ + int noas; /* number of arcs used in first arcbatch */ +}; + +struct nfa +{ + struct state *pre; /* pre-initial state */ + struct state *init; /* initial state */ + struct state *final; /* final state */ + struct state *post; /* post-final state */ + int nstates; /* for numbering states */ + struct state *states; /* state-chain header */ + struct state *slast; /* tail of the chain */ + struct state *free; /* free list */ + struct colormap *cm; /* the color map */ + color bos[2]; /* colors, if any, assigned to BOS and BOL */ + color eos[2]; /* colors, if any, assigned to EOS and EOL */ + struct vars *v; /* simplifies compile error reporting */ + struct nfa *parent; /* parent NFA, if any */ +}; + + + +/* + * definitions for compacted NFA + */ +struct carc +{ + color co; /* COLORLESS is list terminator */ + int to; /* state number */ +}; + +struct cnfa +{ + int nstates; /* number of states */ + int ncolors; /* number of colors */ + int flags; +#define HASLACONS 01 /* uses lookahead constraints */ + int pre; /* setup state number */ + int post; /* teardown state number */ + color bos[2]; /* colors, if any, assigned to BOS and BOL */ + color eos[2]; /* colors, if any, assigned to EOS and EOL */ + struct carc **states; /* vector of pointers to outarc lists */ + struct carc *arcs; /* the area for the lists */ +}; + +#define ZAPCNFA(cnfa) ((cnfa).nstates = 0) +#define NULLCNFA(cnfa) ((cnfa).nstates == 0) + + + +/* + * subexpression tree + */ +struct subre +{ + char op; /* '|', '.' (concat), 'b' (backref), '(', + * '=' */ + char flags; +#define LONGER 01 /* prefers longer match */ +#define SHORTER 02 /* prefers shorter match */ +#define MIXED 04 /* mixed preference below */ +#define CAP 010 /* capturing parens below */ +#define BACKR 020 /* back reference below */ +#define INUSE 0100 /* in use in final tree */ +#define LOCAL 03 /* bits which may not propagate up */ +#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ +#define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */ +#define UP(f) (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED)) +#define MESSY(f) ((f)&(MIXED|CAP|BACKR)) +#define PREF(f) ((f)&LOCAL) +#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) +#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) + short retry; /* index into retry memory */ + int subno; /* subexpression number (for 'b' and '(') */ + short min; /* min repetitions, for backref only */ + short max; /* max repetitions, for backref only */ + struct subre *left; /* left child, if any (also freelist + * chain) */ + struct subre *right; /* right child, if any */ + struct state *begin; /* outarcs from here... */ + struct state *end; /* ...ending in inarcs here */ + struct cnfa cnfa; /* compacted NFA, if any */ + struct subre *chain; /* for bookkeeping and error cleanup */ +}; + + + +/* + * table of function pointers for generic manipulation functions + * A regex_t's re_fns points to one of these. + */ +struct fns +{ + void FUNCPTR(free, (regex_t *)); +}; + + + +/* + * the insides of a regex_t, hidden behind a void * + */ +struct guts +{ + int magic; +#define GUTSMAGIC 0xfed9 + int cflags; /* copy of compile flags */ + long info; /* copy of re_info */ + size_t nsub; /* copy of re_nsub */ + struct subre *tree; + struct cnfa search; /* for fast preliminary search */ + int ntree; + struct colormap cmap; + int FUNCPTR(compare, (const chr *, const chr *, size_t)); + struct subre *lacons; /* lookahead-constraint vector */ + int nlacons; /* size of lacons */ +}; -- 2.45.2