-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-/*
+/*-
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * @(#)engine.c 8.5 (Berkeley) 3/20/94
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/lib/libc/regex/engine.c,v 1.14 2004/07/12 07:35:59 tjr Exp $");
+
/*
* The matching engine and friends. This file is #included by regexec.c
* after suitable #defines of a variety of macros used herein, so that
#define at lat
#define match lmat
#endif
+#ifdef MNAMES
+#define matcher mmatcher
+#define fast mfast
+#define slow mslow
+#define dissect mdissect
+#define backref mbackref
+#define step mstep
+#define print mprint
+#define at mat
+#define match mmat
+#endif
/* another structure passed up and down to avoid zillions of parameters */
struct match {
states fresh; /* states for a fresh start */
states tmp; /* temporary */
states empty; /* empty set of states */
+ mbstate_t mbs; /* multibyte conversion state */
};
/* ========= begin header generated by ./mkh ========= */
#endif
/* === engine.c === */
-static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
-static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
-static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
-static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
-static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
-static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
-#define BOL (OUT+1)
-#define EOL (BOL+1)
-#define BOLEOL (BOL+2)
-#define NOTHING (BOL+3)
-#define BOW (BOL+4)
-#define EOW (BOL+5)
-#define CODEMAX (BOL+5) /* highest code used */
-#define NONCHAR(c) ((c) > CHAR_MAX)
-#define NNONCHAR (CODEMAX-CHAR_MAX)
+static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
+static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
+static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
+static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
+static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
+static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
+#define BOL (OUT-1)
+#define EOL (BOL-1)
+#define BOLEOL (BOL-2)
+#define NOTHING (BOL-3)
+#define BOW (BOL-4)
+#define EOW (BOL-5)
+#define BADCHAR (BOL-6)
+#define NONCHAR(c) ((c) <= OUT)
#ifdef REDEBUG
-static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
+static void print(struct match *m, char *caption, states st, int ch, FILE *d);
#endif
#ifdef REDEBUG
-static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
+static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
#endif
#ifdef REDEBUG
-static char *pchar __P((int ch));
+static char *pchar(int ch);
#endif
#ifdef __cplusplus
/*
- matcher - the actual matching engine
- == static int matcher(register struct re_guts *g, char *string, \
+ == static int matcher(struct re_guts *g, char *string, \
== size_t nmatch, regmatch_t pmatch[], int eflags);
*/
static int /* 0 success, REG_NOMATCH failure */
matcher(g, string, nmatch, pmatch, eflags)
-register struct re_guts *g;
+struct re_guts *g;
char *string;
size_t nmatch;
regmatch_t pmatch[];
int eflags;
{
- register char *endp;
- register int i;
+ char *endp;
+ int i;
struct match mv;
- register struct match *m = &mv;
- register char *dp;
- const register sopno gf = g->firststate+1; /* +1 for OEND */
- const register sopno gl = g->laststate;
+ struct match *m = &mv;
+ char *dp;
+ const sopno gf = g->firststate+1; /* +1 for OEND */
+ const sopno gl = g->laststate;
char *start;
char *stop;
+ /* Boyer-Moore algorithms variables */
+ char *pp;
+ int cj, mj;
+ char *mustfirst;
+ char *mustlast;
+ int *matchjump;
+ int *charjump;
/* simplify the situation where possible */
if (g->cflags®_NOSUB)
/* prescreening; this does wonders for this rather slow code */
if (g->must != NULL) {
- for (dp = start; dp < stop; dp++)
- if (*dp == g->must[0] && stop - dp >= g->mlen &&
- memcmp(dp, g->must, (size_t)g->mlen) == 0)
- break;
- if (dp == stop) /* we didn't find g->must */
- return(REG_NOMATCH);
+ if (g->charjump != NULL && g->matchjump != NULL) {
+ mustfirst = g->must;
+ mustlast = g->must + g->mlen - 1;
+ charjump = g->charjump;
+ matchjump = g->matchjump;
+ pp = mustlast;
+ for (dp = start+g->mlen-1; dp < stop;) {
+ /* Fast skip non-matches */
+ while (dp < stop && charjump[(int)*dp])
+ dp += charjump[(int)*dp];
+
+ if (dp >= stop)
+ break;
+
+ /* Greedy matcher */
+ /* We depend on not being used for
+ * for strings of length 1
+ */
+ while (*--dp == *--pp && pp != mustfirst);
+
+ if (*dp == *pp)
+ break;
+
+ /* Jump to next possible match */
+ mj = matchjump[pp - mustfirst];
+ cj = charjump[(int)*dp];
+ dp += (cj < mj ? mj : cj);
+ pp = mustlast;
+ }
+ if (pp != mustfirst)
+ return(REG_NOMATCH);
+ } else {
+ for (dp = start; dp < stop; dp++)
+ if (*dp == g->must[0] &&
+ stop - dp >= g->mlen &&
+ memcmp(dp, g->must, (size_t)g->mlen) == 0)
+ break;
+ if (dp == stop) /* we didn't find g->must */
+ return(REG_NOMATCH);
+ }
}
/* match struct setup */
SETUP(m->tmp);
SETUP(m->empty);
CLEAR(m->empty);
+ ZAPSTATE(&m->mbs);
+
+ /* Adjust start according to moffset, to speed things up */
+ if (g->moffset > -1)
+ start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
/* this loop does only one repetition except for backrefs */
for (;;) {
if (endp != NULL)
break;
assert(m->coldp < m->endp);
- m->coldp++;
+ m->coldp += XMBRTOWC(NULL, m->coldp,
+ m->endp - m->coldp, &m->mbs, 0, g->loc);
}
if (nmatch == 1 && !g->backrefs)
break; /* no further info needed */
/* despite initial appearances, there is no match here */
NOTE("false alarm");
- start = m->coldp + 1; /* recycle starting later */
+ /* recycle starting later */
+ start = m->coldp + XMBRTOWC(NULL, m->coldp,
+ m->endp - m->coldp, &m->mbs, 0, g->loc);
assert(start <= stop);
}
/*
- dissect - figure out what matched what, no back references
- == static char *dissect(register struct match *m, char *start, \
+ == static char *dissect(struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst);
*/
static char * /* == stop (success) always */
dissect(m, start, stop, startst, stopst)
-register struct match *m;
+struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
{
- register int i;
- register sopno ss; /* start sop of current subRE */
- register sopno es; /* end sop of current subRE */
- register char *sp; /* start of string matched by it */
- register char *stp; /* string matched by it cannot pass here */
- register char *rest; /* start of rest of string */
- register char *tail; /* string unmatched by rest of RE */
- register sopno ssub; /* start sop of subsubRE */
- register sopno esub; /* end sop of subsubRE */
- register char *ssp; /* start of string matched by subsubRE */
- register char *sep; /* end of string matched by subsubRE */
- register char *oldssp; /* previous ssp */
- register char *dp;
+ int i;
+ sopno ss; /* start sop of current subRE */
+ sopno es; /* end sop of current subRE */
+ char *sp; /* start of string matched by it */
+ char *stp; /* string matched by it cannot pass here */
+ char *rest; /* start of rest of string */
+ char *tail; /* string unmatched by rest of RE */
+ sopno ssub; /* start sop of subsubRE */
+ sopno esub; /* end sop of subsubRE */
+ char *ssp; /* start of string matched by subsubRE */
+ char *sep; /* end of string matched by subsubRE */
+ char *oldssp; /* previous ssp */
+ char *dp;
AT("diss", start, stop, startst, stopst);
sp = start;
assert(nope);
break;
case OCHAR:
- sp++;
+ sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0, m->g->loc);
break;
case OBOL:
case OEOL:
break;
case OANY:
case OANYOF:
- sp++;
+ sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0, m->g->loc);
break;
case OBACK_:
case O_BACK:
sep = ssp;
ssp = oldssp;
}
+ else if (tail==rest) {
+ /* Fix for test expr 105 */
+ ssp = oldssp;
+ }
assert(sep == rest); /* must exhaust substring */
assert(slow(m, ssp, sep, ssub, esub) == rest);
dp = dissect(m, ssp, sep, ssub, esub);
i = OPND(m->g->strip[ss]);
assert(0 < i && i <= m->g->nsub);
m->pmatch[i].rm_so = sp - m->offp;
+ /* fix for T.regcomp 43: don't remember previous
+ subexpression matches beyond the current one (i) */
+ i++;
+ while (i<= m->g->nsub) {
+ m->pmatch[i].rm_so = -1;
+ m->pmatch[i].rm_eo = -1;
+ i++;
+ }
break;
case ORPAREN:
i = OPND(m->g->strip[ss]);
/*
- backref - figure out what matched what, figuring in back references
- == static char *backref(register struct match *m, char *start, \
+ == static char *backref(struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst, sopno lev);
*/
static char * /* == stop (success) or NULL (failure) */
backref(m, start, stop, startst, stopst, lev)
-register struct match *m;
+struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
sopno lev; /* PLUS nesting level */
{
- register int i;
- register sopno ss; /* start sop of current subRE */
- register char *sp; /* start of string matched by it */
- register sopno ssub; /* start sop of subsubRE */
- register sopno esub; /* end sop of subsubRE */
- register char *ssp; /* start of string matched by subsubRE */
- register char *dp;
- register size_t len;
- register int hard;
- register sop s;
- register regoff_t offsave;
- register cset *cs;
+ int i;
+ sopno ss; /* start sop of current subRE */
+ char *sp; /* start of string matched by it */
+ sopno ssub; /* start sop of subsubRE */
+ sopno esub; /* end sop of subsubRE */
+ char *ssp; /* start of string matched by subsubRE */
+ char *dp;
+ size_t len;
+ int hard;
+ sop s;
+ regoff_t offsave;
+ cset *cs;
+ wint_t wc;
AT("back", start, stop, startst, stopst);
sp = start;
for (ss = startst; !hard && ss < stopst; ss++)
switch (OP(s = m->g->strip[ss])) {
case OCHAR:
- if (sp == stop || *sp++ != (char)OPND(s))
+ if (sp == stop)
+ return(NULL);
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR, m->g->loc);
+ if (wc != OPND(s))
return(NULL);
break;
case OANY:
if (sp == stop)
return(NULL);
- sp++;
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR, m->g->loc);
+ if (wc == BADCHAR)
+ return (NULL);
break;
case OANYOF:
+ if (sp == stop)
+ return (NULL);
cs = &m->g->sets[OPND(s)];
- if (sp == stop || !CHIN(cs, *sp++))
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR, m->g->loc);
+ if (wc == BADCHAR || !CHIN(cs, wc, m->g->loc))
return(NULL);
break;
case OBOL:
(sp < m->endp && *(sp-1) == '\n' &&
(m->g->cflags®_NEWLINE)) ||
(sp > m->beginp &&
- !ISWORD(*(sp-1))) ) &&
- (sp < m->endp && ISWORD(*sp)) )
+ !ISWORD(*(sp-1), m->g->loc)) ) &&
+ (sp < m->endp && ISWORD(*sp, m->g->loc)) )
{ /* yes */ }
else
return(NULL);
if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
(sp < m->endp && *sp == '\n' &&
(m->g->cflags®_NEWLINE)) ||
- (sp < m->endp && !ISWORD(*sp)) ) &&
- (sp > m->beginp && ISWORD(*(sp-1))) )
+ (sp < m->endp && !ISWORD(*sp, m->g->loc)) ) &&
+ (sp > m->beginp && ISWORD(*(sp-1), m->g->loc)) )
{ /* yes */ }
else
return(NULL);
/* "can't happen" */
assert(nope);
/* NOTREACHED */
+ return "shut up gcc";
}
/*
- fast - step through the string at top speed
- == static char *fast(register struct match *m, char *start, \
+ == static char *fast(struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst);
*/
static char * /* where tentative match ended, or NULL */
fast(m, start, stop, startst, stopst)
-register struct match *m;
+struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
{
- register states st = m->st;
- register states fresh = m->fresh;
- register states tmp = m->tmp;
- register char *p = start;
- register int c = (start == m->beginp) ? OUT : *(start-1);
- register int lastc; /* previous c */
- register int flagch;
- register int i;
- register char *coldp; /* last p after which no match was underway */
+ states st = m->st;
+ states fresh = m->fresh;
+ states tmp = m->tmp;
+ char *p = start;
+ wint_t c;
+ wint_t lastc; /* previous c */
+ wint_t flagch;
+ int i;
+ char *coldp; /* last p after which no match was underway */
+ size_t clen;
CLEAR(st);
SET1(st, startst);
ASSIGN(fresh, st);
SP("start", st, *p);
coldp = NULL;
+ if (start == m->beginp)
+ c = OUT;
+ else {
+ /*
+ * XXX Wrong if the previous character was multi-byte.
+ * Newline never is (in encodings supported by FreeBSD),
+ * so this only breaks the ISWORD tests below.
+ */
+ c = (uch)*(start - 1);
+ }
for (;;) {
/* next character */
lastc = c;
- c = (p == m->endp) ? OUT : *p;
+ if (p == m->endp)
+ c = OUT;
+ else
+ clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR, m->g->loc);
if (EQ(st, fresh))
coldp = p;
}
/* how about a word boundary? */
- if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
- (c != OUT && ISWORD(c)) ) {
+ if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc, m->g->loc))) &&
+ (c != OUT && ISWORD(c, m->g->loc)) ) {
flagch = BOW;
}
- if ( (lastc != OUT && ISWORD(lastc)) &&
- (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
+ if ( (lastc != OUT && ISWORD(lastc, m->g->loc)) &&
+ (flagch == EOL || (c != OUT && !ISWORD(c, m->g->loc))) ) {
flagch = EOW;
}
if (flagch == BOW || flagch == EOW) {
st = step(m->g, startst, stopst, tmp, c, st);
SP("aft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
+ p += clen;
}
assert(coldp != NULL);
m->coldp = coldp;
if (ISSET(st, stopst))
- return(p+1);
+ return(p+XMBRTOWC(NULL, p, m->endp - p, &m->mbs, 0, m->g->loc));
else
return(NULL);
}
/*
- slow - step through the string more deliberately
- == static char *slow(register struct match *m, char *start, \
+ == static char *slow(struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst);
*/
static char * /* where it ended */
slow(m, start, stop, startst, stopst)
-register struct match *m;
+struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
{
- register states st = m->st;
- register states empty = m->empty;
- register states tmp = m->tmp;
- register char *p = start;
- register int c = (start == m->beginp) ? OUT : *(start-1);
- register int lastc; /* previous c */
- register int flagch;
- register int i;
- register char *matchp; /* last p at which a match ended */
+ states st = m->st;
+ states empty = m->empty;
+ states tmp = m->tmp;
+ char *p = start;
+ wint_t c;
+ wint_t lastc; /* previous c */
+ wint_t flagch;
+ int i;
+ char *matchp; /* last p at which a match ended */
+ size_t clen;
AT("slow", start, stop, startst, stopst);
CLEAR(st);
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
matchp = NULL;
+ if (start == m->beginp)
+ c = OUT;
+ else {
+ /*
+ * XXX Wrong if the previous character was multi-byte.
+ * Newline never is (in encodings supported by FreeBSD),
+ * so this only breaks the ISWORD tests below.
+ */
+ c = (uch)*(start - 1);
+ }
for (;;) {
/* next character */
lastc = c;
- c = (p == m->endp) ? OUT : *p;
+ if (p == m->endp) {
+ c = OUT;
+ clen = 0;
+ } else
+ clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR, m->g->loc);
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
}
/* how about a word boundary? */
- if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
- (c != OUT && ISWORD(c)) ) {
+ if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc, m->g->loc))) &&
+ (c != OUT && ISWORD(c, m->g->loc)) ) {
flagch = BOW;
}
- if ( (lastc != OUT && ISWORD(lastc)) &&
- (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
+ if ( (lastc != OUT && ISWORD(lastc, m->g->loc)) &&
+ (flagch == EOL || (c != OUT && !ISWORD(c, m->g->loc))) ) {
flagch = EOW;
}
if (flagch == BOW || flagch == EOW) {
st = step(m->g, startst, stopst, tmp, c, st);
SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
+ p += clen;
}
return(matchp);
/*
- step - map set of states reachable before char to set reachable after
- == static states step(register struct re_guts *g, sopno start, sopno stop, \
- == register states bef, int ch, register states aft);
- == #define BOL (OUT+1)
- == #define EOL (BOL+1)
- == #define BOLEOL (BOL+2)
- == #define NOTHING (BOL+3)
- == #define BOW (BOL+4)
- == #define EOW (BOL+5)
- == #define CODEMAX (BOL+5) // highest code used
- == #define NONCHAR(c) ((c) > CHAR_MAX)
- == #define NNONCHAR (CODEMAX-CHAR_MAX)
+ == static states step(struct re_guts *g, sopno start, sopno stop, \
+ == states bef, int ch, states aft);
+ == #define BOL (OUT-1)
+ == #define EOL (BOL-1)
+ == #define BOLEOL (BOL-2)
+ == #define NOTHING (BOL-3)
+ == #define BOW (BOL-4)
+ == #define EOW (BOL-5)
+ == #define BADCHAR (BOL-6)
+ == #define NONCHAR(c) ((c) <= OUT)
*/
static states
step(g, start, stop, bef, ch, aft)
-register struct re_guts *g;
+struct re_guts *g;
sopno start; /* start state within strip */
sopno stop; /* state after stop state within strip */
-register states bef; /* states reachable before */
-int ch; /* character or NONCHAR code */
-register states aft; /* states already known reachable after */
+states bef; /* states reachable before */
+wint_t ch; /* character or NONCHAR code */
+states aft; /* states already known reachable after */
{
- register cset *cs;
- register sop s;
- register sopno pc;
- register onestate here; /* note, macros know this name */
- register sopno look;
- register int i;
+ cset *cs;
+ sop s;
+ sopno pc;
+ onestate here; /* note, macros know this name */
+ sopno look;
+ int i;
for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
s = g->strip[pc];
break;
case OCHAR:
/* only characters can match */
- assert(!NONCHAR(ch) || ch != (char)OPND(s));
- if (ch == (char)OPND(s))
+ assert(!NONCHAR(ch) || ch != OPND(s));
+ if (ch == OPND(s))
FWD(aft, bef, 1);
break;
case OBOL:
break;
case OANYOF:
cs = &g->sets[OPND(s)];
- if (!NONCHAR(ch) && CHIN(cs, ch))
+ if (!NONCHAR(ch) && CHIN(cs, ch, g->loc))
FWD(aft, bef, 1);
break;
case OBACK_: /* ignored here */
int ch;
FILE *d;
{
- register struct re_guts *g = m->g;
- register int i;
- register int first = 1;
+ struct re_guts *g = m->g;
+ int i;
+ int first = 1;
if (!(m->eflags®_TRACE))
return;
fprintf(d, "\n");
}
-/*
+/*
- at - print current situation
== #ifdef REDEBUG
== static void at(struct match *m, char *title, char *start, char *stop, \
{
static char pbuf[10];
- if (isprint(ch) || ch == ' ')
+ if (isprint((uch)ch) || ch == ' ')
sprintf(pbuf, "%c", ch);
else
sprintf(pbuf, "\\%o", ch);