X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/734aad71947a79037af64f74c683f5eb36fe6065..0a7506c9bdc0d0d560d4b9c8a3d1089f5db425b9:/regex/engine.c diff --git a/regex/engine.c b/regex/engine.c index 4bcabfb..5d2543f 100644 --- a/regex/engine.c +++ b/regex/engine.c @@ -1,28 +1,5 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * @@ -56,8 +33,13 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * @(#)engine.c 8.5 (Berkeley) 3/20/94 */ +#include +__FBSDID("$FreeBSD: src/lib/libc/regex/engine.c,v 1.14 2004/07/12 07:35:59 tjr Exp $"); + /* * The matching engine and friends. This file is #included by regexec.c * after suitable #defines of a variety of macros used herein, so that @@ -87,6 +69,17 @@ #define at lat #define match lmat #endif +#ifdef MNAMES +#define matcher mmatcher +#define fast mfast +#define slow mslow +#define dissect mdissect +#define backref mbackref +#define step mstep +#define print mprint +#define at mat +#define match mmat +#endif /* another structure passed up and down to avoid zillions of parameters */ struct match { @@ -103,6 +96,7 @@ struct match { states fresh; /* states for a fresh start */ states tmp; /* temporary */ states empty; /* empty set of states */ + mbstate_t mbs; /* multibyte conversion state */ }; /* ========= begin header generated by ./mkh ========= */ @@ -111,29 +105,28 @@ extern "C" { #endif /* === engine.c === */ -static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); -static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); -static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); -static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); -static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); -static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); -#define BOL (OUT+1) -#define EOL (BOL+1) -#define BOLEOL (BOL+2) -#define NOTHING (BOL+3) -#define BOW (BOL+4) -#define EOW (BOL+5) -#define CODEMAX (BOL+5) /* highest code used */ -#define NONCHAR(c) ((c) > CHAR_MAX) -#define NNONCHAR (CODEMAX-CHAR_MAX) +static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); +static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); +static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); +static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); +static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); +#define BOL (OUT-1) +#define EOL (BOL-1) +#define BOLEOL (BOL-2) +#define NOTHING (BOL-3) +#define BOW (BOL-4) +#define EOW (BOL-5) +#define BADCHAR (BOL-6) +#define NONCHAR(c) ((c) <= OUT) #ifdef REDEBUG -static void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); +static void print(struct match *m, char *caption, states st, int ch, FILE *d); #endif #ifdef REDEBUG -static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); +static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); #endif #ifdef REDEBUG -static char *pchar __P((int ch)); +static char *pchar(int ch); #endif #ifdef __cplusplus @@ -153,26 +146,33 @@ static char *pchar __P((int ch)); /* - matcher - the actual matching engine - == static int matcher(register struct re_guts *g, char *string, \ + == static int matcher(struct re_guts *g, char *string, \ == size_t nmatch, regmatch_t pmatch[], int eflags); */ static int /* 0 success, REG_NOMATCH failure */ matcher(g, string, nmatch, pmatch, eflags) -register struct re_guts *g; +struct re_guts *g; char *string; size_t nmatch; regmatch_t pmatch[]; int eflags; { - register char *endp; - register int i; + char *endp; + int i; struct match mv; - register struct match *m = &mv; - register char *dp; - const register sopno gf = g->firststate+1; /* +1 for OEND */ - const register sopno gl = g->laststate; + struct match *m = &mv; + char *dp; + const sopno gf = g->firststate+1; /* +1 for OEND */ + const sopno gl = g->laststate; char *start; char *stop; + /* Boyer-Moore algorithms variables */ + char *pp; + int cj, mj; + char *mustfirst; + char *mustlast; + int *matchjump; + int *charjump; /* simplify the situation where possible */ if (g->cflags®_NOSUB) @@ -189,12 +189,46 @@ int eflags; /* prescreening; this does wonders for this rather slow code */ if (g->must != NULL) { - for (dp = start; dp < stop; dp++) - if (*dp == g->must[0] && stop - dp >= g->mlen && - memcmp(dp, g->must, (size_t)g->mlen) == 0) - break; - if (dp == stop) /* we didn't find g->must */ - return(REG_NOMATCH); + if (g->charjump != NULL && g->matchjump != NULL) { + mustfirst = g->must; + mustlast = g->must + g->mlen - 1; + charjump = g->charjump; + matchjump = g->matchjump; + pp = mustlast; + for (dp = start+g->mlen-1; dp < stop;) { + /* Fast skip non-matches */ + while (dp < stop && charjump[(int)*dp]) + dp += charjump[(int)*dp]; + + if (dp >= stop) + break; + + /* Greedy matcher */ + /* We depend on not being used for + * for strings of length 1 + */ + while (*--dp == *--pp && pp != mustfirst); + + if (*dp == *pp) + break; + + /* Jump to next possible match */ + mj = matchjump[pp - mustfirst]; + cj = charjump[(int)*dp]; + dp += (cj < mj ? mj : cj); + pp = mustlast; + } + if (pp != mustfirst) + return(REG_NOMATCH); + } else { + for (dp = start; dp < stop; dp++) + if (*dp == g->must[0] && + stop - dp >= g->mlen && + memcmp(dp, g->must, (size_t)g->mlen) == 0) + break; + if (dp == stop) /* we didn't find g->must */ + return(REG_NOMATCH); + } } /* match struct setup */ @@ -211,6 +245,11 @@ int eflags; SETUP(m->tmp); SETUP(m->empty); CLEAR(m->empty); + ZAPSTATE(&m->mbs); + + /* Adjust start according to moffset, to speed things up */ + if (g->moffset > -1) + start = ((dp - g->moffset) < start) ? start : dp - g->moffset; /* this loop does only one repetition except for backrefs */ for (;;) { @@ -230,7 +269,8 @@ int eflags; if (endp != NULL) break; assert(m->coldp < m->endp); - m->coldp++; + m->coldp += XMBRTOWC(NULL, m->coldp, + m->endp - m->coldp, &m->mbs, 0, g->loc); } if (nmatch == 1 && !g->backrefs) break; /* no further info needed */ @@ -289,7 +329,9 @@ int eflags; /* despite initial appearances, there is no match here */ NOTE("false alarm"); - start = m->coldp + 1; /* recycle starting later */ + /* recycle starting later */ + start = m->coldp + XMBRTOWC(NULL, m->coldp, + m->endp - m->coldp, &m->mbs, 0, g->loc); assert(start <= stop); } @@ -319,30 +361,30 @@ int eflags; /* - dissect - figure out what matched what, no back references - == static char *dissect(register struct match *m, char *start, \ + == static char *dissect(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ static char * /* == stop (success) always */ dissect(m, start, stop, startst, stopst) -register struct match *m; +struct match *m; char *start; char *stop; sopno startst; sopno stopst; { - register int i; - register sopno ss; /* start sop of current subRE */ - register sopno es; /* end sop of current subRE */ - register char *sp; /* start of string matched by it */ - register char *stp; /* string matched by it cannot pass here */ - register char *rest; /* start of rest of string */ - register char *tail; /* string unmatched by rest of RE */ - register sopno ssub; /* start sop of subsubRE */ - register sopno esub; /* end sop of subsubRE */ - register char *ssp; /* start of string matched by subsubRE */ - register char *sep; /* end of string matched by subsubRE */ - register char *oldssp; /* previous ssp */ - register char *dp; + int i; + sopno ss; /* start sop of current subRE */ + sopno es; /* end sop of current subRE */ + char *sp; /* start of string matched by it */ + char *stp; /* string matched by it cannot pass here */ + char *rest; /* start of rest of string */ + char *tail; /* string unmatched by rest of RE */ + sopno ssub; /* start sop of subsubRE */ + sopno esub; /* end sop of subsubRE */ + char *ssp; /* start of string matched by subsubRE */ + char *sep; /* end of string matched by subsubRE */ + char *oldssp; /* previous ssp */ + char *dp; AT("diss", start, stop, startst, stopst); sp = start; @@ -367,7 +409,7 @@ sopno stopst; assert(nope); break; case OCHAR: - sp++; + sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0, m->g->loc); break; case OBOL: case OEOL: @@ -376,7 +418,7 @@ sopno stopst; break; case OANY: case OANYOF: - sp++; + sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0, m->g->loc); break; case OBACK_: case O_BACK: @@ -437,6 +479,10 @@ sopno stopst; sep = ssp; ssp = oldssp; } + else if (tail==rest) { + /* Fix for test expr 105 */ + ssp = oldssp; + } assert(sep == rest); /* must exhaust substring */ assert(slow(m, ssp, sep, ssub, esub) == rest); dp = dissect(m, ssp, sep, ssub, esub); @@ -489,6 +535,14 @@ sopno stopst; i = OPND(m->g->strip[ss]); assert(0 < i && i <= m->g->nsub); m->pmatch[i].rm_so = sp - m->offp; + /* fix for T.regcomp 43: don't remember previous + subexpression matches beyond the current one (i) */ + i++; + while (i<= m->g->nsub) { + m->pmatch[i].rm_so = -1; + m->pmatch[i].rm_eo = -1; + i++; + } break; case ORPAREN: i = OPND(m->g->strip[ss]); @@ -507,30 +561,31 @@ sopno stopst; /* - backref - figure out what matched what, figuring in back references - == static char *backref(register struct match *m, char *start, \ + == static char *backref(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst, sopno lev); */ static char * /* == stop (success) or NULL (failure) */ backref(m, start, stop, startst, stopst, lev) -register struct match *m; +struct match *m; char *start; char *stop; sopno startst; sopno stopst; sopno lev; /* PLUS nesting level */ { - register int i; - register sopno ss; /* start sop of current subRE */ - register char *sp; /* start of string matched by it */ - register sopno ssub; /* start sop of subsubRE */ - register sopno esub; /* end sop of subsubRE */ - register char *ssp; /* start of string matched by subsubRE */ - register char *dp; - register size_t len; - register int hard; - register sop s; - register regoff_t offsave; - register cset *cs; + int i; + sopno ss; /* start sop of current subRE */ + char *sp; /* start of string matched by it */ + sopno ssub; /* start sop of subsubRE */ + sopno esub; /* end sop of subsubRE */ + char *ssp; /* start of string matched by subsubRE */ + char *dp; + size_t len; + int hard; + sop s; + regoff_t offsave; + cset *cs; + wint_t wc; AT("back", start, stop, startst, stopst); sp = start; @@ -540,17 +595,25 @@ sopno lev; /* PLUS nesting level */ for (ss = startst; !hard && ss < stopst; ss++) switch (OP(s = m->g->strip[ss])) { case OCHAR: - if (sp == stop || *sp++ != (char)OPND(s)) + if (sp == stop) + return(NULL); + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR, m->g->loc); + if (wc != OPND(s)) return(NULL); break; case OANY: if (sp == stop) return(NULL); - sp++; + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR, m->g->loc); + if (wc == BADCHAR) + return (NULL); break; case OANYOF: + if (sp == stop) + return (NULL); cs = &m->g->sets[OPND(s)]; - if (sp == stop || !CHIN(cs, *sp++)) + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR, m->g->loc); + if (wc == BADCHAR || !CHIN(cs, wc, m->g->loc)) return(NULL); break; case OBOL: @@ -574,8 +637,8 @@ sopno lev; /* PLUS nesting level */ (sp < m->endp && *(sp-1) == '\n' && (m->g->cflags®_NEWLINE)) || (sp > m->beginp && - !ISWORD(*(sp-1))) ) && - (sp < m->endp && ISWORD(*sp)) ) + !ISWORD(*(sp-1), m->g->loc)) ) && + (sp < m->endp && ISWORD(*sp, m->g->loc)) ) { /* yes */ } else return(NULL); @@ -584,8 +647,8 @@ sopno lev; /* PLUS nesting level */ if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) || - (sp < m->endp && !ISWORD(*sp)) ) && - (sp > m->beginp && ISWORD(*(sp-1))) ) + (sp < m->endp && !ISWORD(*sp, m->g->loc)) ) && + (sp > m->beginp && ISWORD(*(sp-1), m->g->loc)) ) { /* yes */ } else return(NULL); @@ -707,30 +770,32 @@ sopno lev; /* PLUS nesting level */ /* "can't happen" */ assert(nope); /* NOTREACHED */ + return "shut up gcc"; } /* - fast - step through the string at top speed - == static char *fast(register struct match *m, char *start, \ + == static char *fast(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ static char * /* where tentative match ended, or NULL */ fast(m, start, stop, startst, stopst) -register struct match *m; +struct match *m; char *start; char *stop; sopno startst; sopno stopst; { - register states st = m->st; - register states fresh = m->fresh; - register states tmp = m->tmp; - register char *p = start; - register int c = (start == m->beginp) ? OUT : *(start-1); - register int lastc; /* previous c */ - register int flagch; - register int i; - register char *coldp; /* last p after which no match was underway */ + states st = m->st; + states fresh = m->fresh; + states tmp = m->tmp; + char *p = start; + wint_t c; + wint_t lastc; /* previous c */ + wint_t flagch; + int i; + char *coldp; /* last p after which no match was underway */ + size_t clen; CLEAR(st); SET1(st, startst); @@ -738,10 +803,23 @@ sopno stopst; ASSIGN(fresh, st); SP("start", st, *p); coldp = NULL; + if (start == m->beginp) + c = OUT; + else { + /* + * XXX Wrong if the previous character was multi-byte. + * Newline never is (in encodings supported by FreeBSD), + * so this only breaks the ISWORD tests below. + */ + c = (uch)*(start - 1); + } for (;;) { /* next character */ lastc = c; - c = (p == m->endp) ? OUT : *p; + if (p == m->endp) + c = OUT; + else + clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR, m->g->loc); if (EQ(st, fresh)) coldp = p; @@ -765,12 +843,12 @@ sopno stopst; } /* how about a word boundary? */ - if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c)) ) { + if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc, m->g->loc))) && + (c != OUT && ISWORD(c, m->g->loc)) ) { flagch = BOW; } - if ( (lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c))) ) { + if ( (lastc != OUT && ISWORD(lastc, m->g->loc)) && + (flagch == EOL || (c != OUT && !ISWORD(c, m->g->loc))) ) { flagch = EOW; } if (flagch == BOW || flagch == EOW) { @@ -789,39 +867,40 @@ sopno stopst; st = step(m->g, startst, stopst, tmp, c, st); SP("aft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; + p += clen; } assert(coldp != NULL); m->coldp = coldp; if (ISSET(st, stopst)) - return(p+1); + return(p+XMBRTOWC(NULL, p, m->endp - p, &m->mbs, 0, m->g->loc)); else return(NULL); } /* - slow - step through the string more deliberately - == static char *slow(register struct match *m, char *start, \ + == static char *slow(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ static char * /* where it ended */ slow(m, start, stop, startst, stopst) -register struct match *m; +struct match *m; char *start; char *stop; sopno startst; sopno stopst; { - register states st = m->st; - register states empty = m->empty; - register states tmp = m->tmp; - register char *p = start; - register int c = (start == m->beginp) ? OUT : *(start-1); - register int lastc; /* previous c */ - register int flagch; - register int i; - register char *matchp; /* last p at which a match ended */ + states st = m->st; + states empty = m->empty; + states tmp = m->tmp; + char *p = start; + wint_t c; + wint_t lastc; /* previous c */ + wint_t flagch; + int i; + char *matchp; /* last p at which a match ended */ + size_t clen; AT("slow", start, stop, startst, stopst); CLEAR(st); @@ -829,10 +908,24 @@ sopno stopst; SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); matchp = NULL; + if (start == m->beginp) + c = OUT; + else { + /* + * XXX Wrong if the previous character was multi-byte. + * Newline never is (in encodings supported by FreeBSD), + * so this only breaks the ISWORD tests below. + */ + c = (uch)*(start - 1); + } for (;;) { /* next character */ lastc = c; - c = (p == m->endp) ? OUT : *p; + if (p == m->endp) { + c = OUT; + clen = 0; + } else + clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR, m->g->loc); /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; @@ -854,12 +947,12 @@ sopno stopst; } /* how about a word boundary? */ - if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c)) ) { + if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc, m->g->loc))) && + (c != OUT && ISWORD(c, m->g->loc)) ) { flagch = BOW; } - if ( (lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c))) ) { + if ( (lastc != OUT && ISWORD(lastc, m->g->loc)) && + (flagch == EOL || (c != OUT && !ISWORD(c, m->g->loc))) ) { flagch = EOW; } if (flagch == BOW || flagch == EOW) { @@ -880,7 +973,7 @@ sopno stopst; st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; + p += clen; } return(matchp); @@ -889,33 +982,32 @@ sopno stopst; /* - step - map set of states reachable before char to set reachable after - == static states step(register struct re_guts *g, sopno start, sopno stop, \ - == register states bef, int ch, register states aft); - == #define BOL (OUT+1) - == #define EOL (BOL+1) - == #define BOLEOL (BOL+2) - == #define NOTHING (BOL+3) - == #define BOW (BOL+4) - == #define EOW (BOL+5) - == #define CODEMAX (BOL+5) // highest code used - == #define NONCHAR(c) ((c) > CHAR_MAX) - == #define NNONCHAR (CODEMAX-CHAR_MAX) + == static states step(struct re_guts *g, sopno start, sopno stop, \ + == states bef, int ch, states aft); + == #define BOL (OUT-1) + == #define EOL (BOL-1) + == #define BOLEOL (BOL-2) + == #define NOTHING (BOL-3) + == #define BOW (BOL-4) + == #define EOW (BOL-5) + == #define BADCHAR (BOL-6) + == #define NONCHAR(c) ((c) <= OUT) */ static states step(g, start, stop, bef, ch, aft) -register struct re_guts *g; +struct re_guts *g; sopno start; /* start state within strip */ sopno stop; /* state after stop state within strip */ -register states bef; /* states reachable before */ -int ch; /* character or NONCHAR code */ -register states aft; /* states already known reachable after */ +states bef; /* states reachable before */ +wint_t ch; /* character or NONCHAR code */ +states aft; /* states already known reachable after */ { - register cset *cs; - register sop s; - register sopno pc; - register onestate here; /* note, macros know this name */ - register sopno look; - register int i; + cset *cs; + sop s; + sopno pc; + onestate here; /* note, macros know this name */ + sopno look; + int i; for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { s = g->strip[pc]; @@ -925,8 +1017,8 @@ register states aft; /* states already known reachable after */ break; case OCHAR: /* only characters can match */ - assert(!NONCHAR(ch) || ch != (char)OPND(s)); - if (ch == (char)OPND(s)) + assert(!NONCHAR(ch) || ch != OPND(s)); + if (ch == OPND(s)) FWD(aft, bef, 1); break; case OBOL: @@ -951,7 +1043,7 @@ register states aft; /* states already known reachable after */ break; case OANYOF: cs = &g->sets[OPND(s)]; - if (!NONCHAR(ch) && CHIN(cs, ch)) + if (!NONCHAR(ch) && CHIN(cs, ch, g->loc)) FWD(aft, bef, 1); break; case OBACK_: /* ignored here */ @@ -1031,9 +1123,9 @@ states st; int ch; FILE *d; { - register struct re_guts *g = m->g; - register int i; - register int first = 1; + struct re_guts *g = m->g; + int i; + int first = 1; if (!(m->eflags®_TRACE)) return; @@ -1049,7 +1141,7 @@ FILE *d; fprintf(d, "\n"); } -/* +/* - at - print current situation == #ifdef REDEBUG == static void at(struct match *m, char *title, char *start, char *stop, \ @@ -1092,7 +1184,7 @@ int ch; { static char pbuf[10]; - if (isprint(ch) || ch == ' ') + if (isprint((uch)ch) || ch == ' ') sprintf(pbuf, "%c", ch); else sprintf(pbuf, "\\%o", ch);