From 5a41989248f5af10f67bcd526970968c0ba7439d Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 13 Jul 2001 13:31:22 +0000 Subject: [PATCH] Initial revision git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@10992 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- src/regex/debug.c | 242 +++++++ src/regex/engine.c | 1019 +++++++++++++++++++++++++++ src/regex/main.c | 510 ++++++++++++++ src/regex/regcomp.c | 1603 +++++++++++++++++++++++++++++++++++++++++++ src/regex/regex2.h | 134 ++++ src/regex/split.c | 316 +++++++++ src/regex/utils.h | 22 + 7 files changed, 3846 insertions(+) create mode 100644 src/regex/debug.c create mode 100644 src/regex/engine.c create mode 100644 src/regex/main.c create mode 100644 src/regex/regcomp.c create mode 100644 src/regex/regex2.h create mode 100644 src/regex/split.c create mode 100644 src/regex/utils.h diff --git a/src/regex/debug.c b/src/regex/debug.c new file mode 100644 index 0000000000..99ce7da6dd --- /dev/null +++ b/src/regex/debug.c @@ -0,0 +1,242 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "regex2.h" +#include "debug.ih" + +/* + - regprint - print a regexp for debugging + == void regprint(regex_t *r, FILE *d); + */ +void +regprint(r, d) +regex_t *r; +FILE *d; +{ + register struct re_guts *g = r->re_g; + register int i; + register int c; + register int last; + int nincat[NC]; + + fprintf(d, "%ld states, %d categories", (long)g->nstates, + g->ncategories); + fprintf(d, ", first %ld last %ld", (long)g->firststate, + (long)g->laststate); + if (g->iflags&USEBOL) + fprintf(d, ", USEBOL"); + if (g->iflags&USEEOL) + fprintf(d, ", USEEOL"); + if (g->iflags&BAD) + fprintf(d, ", BAD"); + if (g->nsub > 0) + fprintf(d, ", nsub=%ld", (long)g->nsub); + if (g->must != NULL) + fprintf(d, ", must(%ld) `%*s'", (long)g->mlen, (int)g->mlen, + g->must); + if (g->backrefs) + fprintf(d, ", backrefs"); + if (g->nplus > 0) + fprintf(d, ", nplus %ld", (long)g->nplus); + fprintf(d, "\n"); + s_print(g, d); + for (i = 0; i < g->ncategories; i++) { + nincat[i] = 0; + for (c = CHAR_MIN; c <= CHAR_MAX; c++) + if (g->categories[c] == i) + nincat[i]++; + } + fprintf(d, "cc0#%d", nincat[0]); + for (i = 1; i < g->ncategories; i++) + if (nincat[i] == 1) { + for (c = CHAR_MIN; c <= CHAR_MAX; c++) + if (g->categories[c] == i) + break; + fprintf(d, ", %d=%s", i, regchar(c)); + } + fprintf(d, "\n"); + for (i = 1; i < g->ncategories; i++) + if (nincat[i] != 1) { + fprintf(d, "cc%d\t", i); + last = -1; + for (c = CHAR_MIN; c <= CHAR_MAX+1; c++) /* +1 does flush */ + if (c <= CHAR_MAX && g->categories[c] == i) { + if (last < 0) { + fprintf(d, "%s", regchar(c)); + last = c; + } + } else { + if (last >= 0) { + if (last != c-1) + fprintf(d, "-%s", + regchar(c-1)); + last = -1; + } + } + fprintf(d, "\n"); + } +} + +/* + - s_print - print the strip for debugging + == static void s_print(register struct re_guts *g, FILE *d); + */ +static void +s_print(g, d) +register struct re_guts *g; +FILE *d; +{ + register sop *s; + register cset *cs; + register int i; + register int done = 0; + register sop opnd; + register int col = 0; + register int last; + register sopno offset = 2; +# define GAP() { if (offset % 5 == 0) { \ + if (col > 40) { \ + fprintf(d, "\n\t"); \ + col = 0; \ + } else { \ + fprintf(d, " "); \ + col++; \ + } \ + } else \ + col++; \ + offset++; \ + } + + if (OP(g->strip[0]) != OEND) + fprintf(d, "missing initial OEND!\n"); + for (s = &g->strip[1]; !done; s++) { + opnd = OPND(*s); + switch (OP(*s)) { + case OEND: + fprintf(d, "\n"); + done = 1; + break; + case OCHAR: + if (strchr("\\|()^$.[+*?{}!<> ", (char)opnd) != NULL) + fprintf(d, "\\%c", (char)opnd); + else + fprintf(d, "%s", regchar((char)opnd)); + break; + case OBOL: + fprintf(d, "^"); + break; + case OEOL: + fprintf(d, "$"); + break; + case OBOW: + fprintf(d, "\\{"); + break; + case OEOW: + fprintf(d, "\\}"); + break; + case OANY: + fprintf(d, "."); + break; + case OANYOF: + fprintf(d, "[(%ld)", (long)opnd); + cs = &g->sets[opnd]; + last = -1; + for (i = 0; i < g->csetsize+1; i++) /* +1 flushes */ + if (CHIN(cs, i) && i < g->csetsize) { + if (last < 0) { + fprintf(d, "%s", regchar(i)); + last = i; + } + } else { + if (last >= 0) { + if (last != i-1) + fprintf(d, "-%s", + regchar(i-1)); + last = -1; + } + } + fprintf(d, "]"); + break; + case OBACK_: + fprintf(d, "(\\<%ld>", (long)opnd); + break; + case O_BACK: + fprintf(d, "<%ld>\\)", (long)opnd); + break; + case OPLUS_: + fprintf(d, "(+"); + if (OP(*(s+opnd)) != O_PLUS) + fprintf(d, "<%ld>", (long)opnd); + break; + case O_PLUS: + if (OP(*(s-opnd)) != OPLUS_) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, "+)"); + break; + case OQUEST_: + fprintf(d, "(?"); + if (OP(*(s+opnd)) != O_QUEST) + fprintf(d, "<%ld>", (long)opnd); + break; + case O_QUEST: + if (OP(*(s-opnd)) != OQUEST_) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, "?)"); + break; + case OLPAREN: + fprintf(d, "((<%ld>", (long)opnd); + break; + case ORPAREN: + fprintf(d, "<%ld>))", (long)opnd); + break; + case OCH_: + fprintf(d, "<"); + if (OP(*(s+opnd)) != OOR2) + fprintf(d, "<%ld>", (long)opnd); + break; + case OOR1: + if (OP(*(s-opnd)) != OOR1 && OP(*(s-opnd)) != OCH_) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, "|"); + break; + case OOR2: + fprintf(d, "|"); + if (OP(*(s+opnd)) != OOR2 && OP(*(s+opnd)) != O_CH) + fprintf(d, "<%ld>", (long)opnd); + break; + case O_CH: + if (OP(*(s-opnd)) != OOR1) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, ">"); + break; + default: + fprintf(d, "!%d(%d)!", OP(*s), opnd); + break; + } + if (!done) + GAP(); + } +} + +/* + - regchar - make a character printable + == static char *regchar(int ch); + */ +static char * /* -> representation */ +regchar(ch) +int ch; +{ + static char buf[10]; + + if (isprint(ch) || ch == ' ') + sprintf(buf, "%c", ch); + else + sprintf(buf, "\\%o", ch); + return(buf); +} diff --git a/src/regex/engine.c b/src/regex/engine.c new file mode 100644 index 0000000000..919fe3f641 --- /dev/null +++ b/src/regex/engine.c @@ -0,0 +1,1019 @@ +/* + * The matching engine and friends. This file is #included by regexec.c + * after suitable #defines of a variety of macros used herein, so that + * different state representations can be used without duplicating masses + * of code. + */ + +#ifdef SNAMES +#define matcher smatcher +#define fast sfast +#define slow sslow +#define dissect sdissect +#define backref sbackref +#define step sstep +#define print sprint +#define at sat +#define match smat +#endif +#ifdef LNAMES +#define matcher lmatcher +#define fast lfast +#define slow lslow +#define dissect ldissect +#define backref lbackref +#define step lstep +#define print lprint +#define at lat +#define match lmat +#endif + +/* another structure passed up and down to avoid zillions of parameters */ +struct match { + struct re_guts *g; + int eflags; + regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ + char *offp; /* offsets work from here */ + char *beginp; /* start of string -- virtual NUL precedes */ + char *endp; /* end of string -- virtual NUL here */ + char *coldp; /* can be no match starting before here */ + char **lastpos; /* [nplus+1] */ + STATEVARS; + states st; /* current states */ + states fresh; /* states for a fresh start */ + states tmp; /* temporary */ + states empty; /* empty set of states */ +}; + +#include "engine.ih" + +#ifdef REDEBUG +#define SP(t, s, c) print(m, t, s, c, stdout) +#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) +#define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } +#else +#define SP(t, s, c) /* nothing */ +#define AT(t, p1, p2, s1, s2) /* nothing */ +#define NOTE(s) /* nothing */ +#endif + +/* + - matcher - the actual matching engine + == static int matcher(register struct re_guts *g, char *string, \ + == size_t nmatch, regmatch_t pmatch[], int eflags); + */ +static int /* 0 success, REG_NOMATCH failure */ +matcher(g, string, nmatch, pmatch, eflags) +register struct re_guts *g; +char *string; +size_t nmatch; +regmatch_t pmatch[]; +int eflags; +{ + register char *endp; + register int i; + struct match mv; + register struct match *m = &mv; + register char *dp; + const register sopno gf = g->firststate+1; /* +1 for OEND */ + const register sopno gl = g->laststate; + char *start; + char *stop; + + /* simplify the situation where possible */ + if (g->cflags®_NOSUB) + nmatch = 0; + if (eflags®_STARTEND) { + start = string + pmatch[0].rm_so; + stop = string + pmatch[0].rm_eo; + } else { + start = string; + stop = start + strlen(start); + } + if (stop < start) + return(REG_INVARG); + + /* prescreening; this does wonders for this rather slow code */ + if (g->must != NULL) { + for (dp = start; dp < stop; dp++) + if (*dp == g->must[0] && stop - dp >= g->mlen && + memcmp(dp, g->must, (size_t)g->mlen) == 0) + break; + if (dp == stop) /* we didn't find g->must */ + return(REG_NOMATCH); + } + + /* match struct setup */ + m->g = g; + m->eflags = eflags; + m->pmatch = NULL; + m->lastpos = NULL; + m->offp = string; + m->beginp = start; + m->endp = stop; + STATESETUP(m, 4); + SETUP(m->st); + SETUP(m->fresh); + SETUP(m->tmp); + SETUP(m->empty); + CLEAR(m->empty); + + /* this loop does only one repetition except for backrefs */ + for (;;) { + endp = fast(m, start, stop, gf, gl); + if (endp == NULL) { /* a miss */ + STATETEARDOWN(m); + return(REG_NOMATCH); + } + if (nmatch == 0 && !g->backrefs) + break; /* no further info needed */ + + /* where? */ + assert(m->coldp != NULL); + for (;;) { + NOTE("finding start"); + endp = slow(m, m->coldp, stop, gf, gl); + if (endp != NULL) + break; + assert(m->coldp < m->endp); + m->coldp++; + } + if (nmatch == 1 && !g->backrefs) + break; /* no further info needed */ + + /* oh my, he wants the subexpressions... */ + if (m->pmatch == NULL) + m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * + sizeof(regmatch_t)); + if (m->pmatch == NULL) { + STATETEARDOWN(m); + return(REG_ESPACE); + } + for (i = 1; i <= m->g->nsub; i++) + m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; + if (!g->backrefs && !(m->eflags®_BACKR)) { + NOTE("dissecting"); + dp = dissect(m, m->coldp, endp, gf, gl); + } else { + if (g->nplus > 0 && m->lastpos == NULL) + m->lastpos = (char **)malloc((g->nplus+1) * + sizeof(char *)); + if (g->nplus > 0 && m->lastpos == NULL) { + free(m->pmatch); + STATETEARDOWN(m); + return(REG_ESPACE); + } + NOTE("backref dissect"); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); + } + if (dp != NULL) + break; + + /* uh-oh... we couldn't find a subexpression-level match */ + assert(g->backrefs); /* must be back references doing it */ + assert(g->nplus == 0 || m->lastpos != NULL); + for (;;) { + if (dp != NULL || endp <= m->coldp) + break; /* defeat */ + NOTE("backoff"); + endp = slow(m, m->coldp, endp-1, gf, gl); + if (endp == NULL) + break; /* defeat */ + /* try it on a shorter possibility */ +#ifndef NDEBUG + for (i = 1; i <= m->g->nsub; i++) { + assert(m->pmatch[i].rm_so == -1); + assert(m->pmatch[i].rm_eo == -1); + } +#endif + NOTE("backoff dissect"); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); + } + assert(dp == NULL || dp == endp); + if (dp != NULL) /* found a shorter one */ + break; + + /* despite initial appearances, there is no match here */ + NOTE("false alarm"); + start = m->coldp + 1; /* recycle starting later */ + assert(start <= stop); + } + + /* fill in the details if requested */ + if (nmatch > 0) { + pmatch[0].rm_so = m->coldp - m->offp; + pmatch[0].rm_eo = endp - m->offp; + } + if (nmatch > 1) { + assert(m->pmatch != NULL); + for (i = 1; i < nmatch; i++) + if (i <= m->g->nsub) + pmatch[i] = m->pmatch[i]; + else { + pmatch[i].rm_so = -1; + pmatch[i].rm_eo = -1; + } + } + + if (m->pmatch != NULL) + free((char *)m->pmatch); + if (m->lastpos != NULL) + free((char *)m->lastpos); + STATETEARDOWN(m); + return(0); +} + +/* + - dissect - figure out what matched what, no back references + == static char *dissect(register struct match *m, char *start, \ + == char *stop, sopno startst, sopno stopst); + */ +static char * /* == stop (success) always */ +dissect(m, start, stop, startst, stopst) +register struct match *m; +char *start; +char *stop; +sopno startst; +sopno stopst; +{ + register int i; + register sopno ss; /* start sop of current subRE */ + register sopno es; /* end sop of current subRE */ + register char *sp; /* start of string matched by it */ + register char *stp; /* string matched by it cannot pass here */ + register char *rest; /* start of rest of string */ + register char *tail; /* string unmatched by rest of RE */ + register sopno ssub; /* start sop of subsubRE */ + register sopno esub; /* end sop of subsubRE */ + register char *ssp; /* start of string matched by subsubRE */ + register char *sep; /* end of string matched by subsubRE */ + register char *oldssp; /* previous ssp */ + register char *dp; + + AT("diss", start, stop, startst, stopst); + sp = start; + for (ss = startst; ss < stopst; ss = es) { + /* identify end of subRE */ + es = ss; + switch (OP(m->g->strip[es])) { + case OPLUS_: + case OQUEST_: + es += OPND(m->g->strip[es]); + break; + case OCH_: + while (OP(m->g->strip[es]) != O_CH) + es += OPND(m->g->strip[es]); + break; + } + es++; + + /* figure out what it matched */ + switch (OP(m->g->strip[ss])) { + case OEND: + assert(nope); + break; + case OCHAR: + sp++; + break; + case OBOL: + case OEOL: + case OBOW: + case OEOW: + break; + case OANY: + case OANYOF: + sp++; + break; + case OBACK_: + case O_BACK: + assert(nope); + break; + /* cases where length of match is hard to find */ + case OQUEST_: + stp = stop; + for (;;) { + /* how long could this one be? */ + rest = slow(m, sp, stp, ss, es); + assert(rest != NULL); /* it did match */ + /* could the rest match the rest? */ + tail = slow(m, rest, stop, es, stopst); + if (tail == stop) + break; /* yes! */ + /* no -- try a shorter match for this one */ + stp = rest - 1; + assert(stp >= sp); /* it did work */ + } + ssub = ss + 1; + esub = es - 1; + /* did innards match? */ + if (slow(m, sp, rest, ssub, esub) != NULL) { + dp = dissect(m, sp, rest, ssub, esub); + assert(dp == rest); + } else /* no */ + assert(sp == rest); + sp = rest; + break; + case OPLUS_: + stp = stop; + for (;;) { + /* how long could this one be? */ + rest = slow(m, sp, stp, ss, es); + assert(rest != NULL); /* it did match */ + /* could the rest match the rest? */ + tail = slow(m, rest, stop, es, stopst); + if (tail == stop) + break; /* yes! */ + /* no -- try a shorter match for this one */ + stp = rest - 1; + assert(stp >= sp); /* it did work */ + } + ssub = ss + 1; + esub = es - 1; + ssp = sp; + oldssp = ssp; + for (;;) { /* find last match of innards */ + sep = slow(m, ssp, rest, ssub, esub); + if (sep == NULL || sep == ssp) + break; /* failed or matched null */ + oldssp = ssp; /* on to next try */ + ssp = sep; + } + if (sep == NULL) { + /* last successful match */ + sep = ssp; + ssp = oldssp; + } + assert(sep == rest); /* must exhaust substring */ + assert(slow(m, ssp, sep, ssub, esub) == rest); + dp = dissect(m, ssp, sep, ssub, esub); + assert(dp == sep); + sp = rest; + break; + case OCH_: + stp = stop; + for (;;) { + /* how long could this one be? */ + rest = slow(m, sp, stp, ss, es); + assert(rest != NULL); /* it did match */ + /* could the rest match the rest? */ + tail = slow(m, rest, stop, es, stopst); + if (tail == stop) + break; /* yes! */ + /* no -- try a shorter match for this one */ + stp = rest - 1; + assert(stp >= sp); /* it did work */ + } + ssub = ss + 1; + esub = ss + OPND(m->g->strip[ss]) - 1; + assert(OP(m->g->strip[esub]) == OOR1); + for (;;) { /* find first matching branch */ + if (slow(m, sp, rest, ssub, esub) == rest) + break; /* it matched all of it */ + /* that one missed, try next one */ + assert(OP(m->g->strip[esub]) == OOR1); + esub++; + assert(OP(m->g->strip[esub]) == OOR2); + ssub = esub + 1; + esub += OPND(m->g->strip[esub]); + if (OP(m->g->strip[esub]) == OOR2) + esub--; + else + assert(OP(m->g->strip[esub]) == O_CH); + } + dp = dissect(m, sp, rest, ssub, esub); + assert(dp == rest); + sp = rest; + break; + case O_PLUS: + case O_QUEST: + case OOR1: + case OOR2: + case O_CH: + assert(nope); + break; + case OLPAREN: + i = OPND(m->g->strip[ss]); + assert(0 < i && i <= m->g->nsub); + m->pmatch[i].rm_so = sp - m->offp; + break; + case ORPAREN: + i = OPND(m->g->strip[ss]); + assert(0 < i && i <= m->g->nsub); + m->pmatch[i].rm_eo = sp - m->offp; + break; + default: /* uh oh */ + assert(nope); + break; + } + } + + assert(sp == stop); + return(sp); +} + +/* + - backref - figure out what matched what, figuring in back references + == static char *backref(register struct match *m, char *start, \ + == char *stop, sopno startst, sopno stopst, sopno lev); + */ +static char * /* == stop (success) or NULL (failure) */ +backref(m, start, stop, startst, stopst, lev) +register struct match *m; +char *start; +char *stop; +sopno startst; +sopno stopst; +sopno lev; /* PLUS nesting level */ +{ + register int i; + register sopno ss; /* start sop of current subRE */ + register char *sp; /* start of string matched by it */ + register sopno ssub; /* start sop of subsubRE */ + register sopno esub; /* end sop of subsubRE */ + register char *ssp; /* start of string matched by subsubRE */ + register char *dp; + register size_t len; + register int hard; + register sop s; + register regoff_t offsave; + register cset *cs; + + AT("back", start, stop, startst, stopst); + sp = start; + + /* get as far as we can with easy stuff */ + hard = 0; + for (ss = startst; !hard && ss < stopst; ss++) + switch (OP(s = m->g->strip[ss])) { + case OCHAR: + if (sp == stop || *sp++ != (char)OPND(s)) + return(NULL); + break; + case OANY: + if (sp == stop) + return(NULL); + sp++; + break; + case OANYOF: + cs = &m->g->sets[OPND(s)]; + if (sp == stop || !CHIN(cs, *sp++)) + return(NULL); + break; + case OBOL: + if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || + (sp < m->endp && *(sp-1) == '\n' && + (m->g->cflags®_NEWLINE)) ) + { /* yes */ } + else + return(NULL); + break; + case OEOL: + if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || + (sp < m->endp && *sp == '\n' && + (m->g->cflags®_NEWLINE)) ) + { /* yes */ } + else + return(NULL); + break; + case OBOW: + if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || + (sp < m->endp && *(sp-1) == '\n' && + (m->g->cflags®_NEWLINE)) || + (sp > m->beginp && + !ISWORD(*(sp-1))) ) && + (sp < m->endp && ISWORD(*sp)) ) + { /* yes */ } + else + return(NULL); + break; + case OEOW: + if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || + (sp < m->endp && *sp == '\n' && + (m->g->cflags®_NEWLINE)) || + (sp < m->endp && !ISWORD(*sp)) ) && + (sp > m->beginp && ISWORD(*(sp-1))) ) + { /* yes */ } + else + return(NULL); + break; + case O_QUEST: + break; + case OOR1: /* matches null but needs to skip */ + ss++; + s = m->g->strip[ss]; + do { + assert(OP(s) == OOR2); + ss += OPND(s); + } while (OP(s = m->g->strip[ss]) != O_CH); + /* note that the ss++ gets us past the O_CH */ + break; + default: /* have to make a choice */ + hard = 1; + break; + } + if (!hard) { /* that was it! */ + if (sp != stop) + return(NULL); + return(sp); + } + ss--; /* adjust for the for's final increment */ + + /* the hard stuff */ + AT("hard", sp, stop, ss, stopst); + s = m->g->strip[ss]; + switch (OP(s)) { + case OBACK_: /* the vilest depths */ + i = OPND(s); + assert(0 < i && i <= m->g->nsub); + if (m->pmatch[i].rm_eo == -1) + return(NULL); + assert(m->pmatch[i].rm_so != -1); + len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; + assert(stop - m->beginp >= len); + if (sp > stop - len) + return(NULL); /* not enough left to match */ + ssp = m->offp + m->pmatch[i].rm_so; + if (memcmp(sp, ssp, len) != 0) + return(NULL); + while (m->g->strip[ss] != SOP(O_BACK, i)) + ss++; + return(backref(m, sp+len, stop, ss+1, stopst, lev)); + break; + case OQUEST_: /* to null or not */ + dp = backref(m, sp, stop, ss+1, stopst, lev); + if (dp != NULL) + return(dp); /* not */ + return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); + break; + case OPLUS_: + assert(m->lastpos != NULL); + assert(lev+1 <= m->g->nplus); + m->lastpos[lev+1] = sp; + return(backref(m, sp, stop, ss+1, stopst, lev+1)); + break; + case O_PLUS: + if (sp == m->lastpos[lev]) /* last pass matched null */ + return(backref(m, sp, stop, ss+1, stopst, lev-1)); + /* try another pass */ + m->lastpos[lev] = sp; + dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); + if (dp == NULL) + return(backref(m, sp, stop, ss+1, stopst, lev-1)); + else + return(dp); + break; + case OCH_: /* find the right one, if any */ + ssub = ss + 1; + esub = ss + OPND(s) - 1; + assert(OP(m->g->strip[esub]) == OOR1); + for (;;) { /* find first matching branch */ + dp = backref(m, sp, stop, ssub, esub, lev); + if (dp != NULL) + return(dp); + /* that one missed, try next one */ + if (OP(m->g->strip[esub]) == O_CH) + return(NULL); /* there is none */ + esub++; + assert(OP(m->g->strip[esub]) == OOR2); + ssub = esub + 1; + esub += OPND(m->g->strip[esub]); + if (OP(m->g->strip[esub]) == OOR2) + esub--; + else + assert(OP(m->g->strip[esub]) == O_CH); + } + break; + case OLPAREN: /* must undo assignment if rest fails */ + i = OPND(s); + assert(0 < i && i <= m->g->nsub); + offsave = m->pmatch[i].rm_so; + m->pmatch[i].rm_so = sp - m->offp; + dp = backref(m, sp, stop, ss+1, stopst, lev); + if (dp != NULL) + return(dp); + m->pmatch[i].rm_so = offsave; + return(NULL); + break; + case ORPAREN: /* must undo assignment if rest fails */ + i = OPND(s); + assert(0 < i && i <= m->g->nsub); + offsave = m->pmatch[i].rm_eo; + m->pmatch[i].rm_eo = sp - m->offp; + dp = backref(m, sp, stop, ss+1, stopst, lev); + if (dp != NULL) + return(dp); + m->pmatch[i].rm_eo = offsave; + return(NULL); + break; + default: /* uh oh */ + assert(nope); + break; + } + + /* "can't happen" */ + assert(nope); + /* NOTREACHED */ + return((char *)NULL); /* dummy */ +} + +/* + - fast - step through the string at top speed + == static char *fast(register struct match *m, char *start, \ + == char *stop, sopno startst, sopno stopst); + */ +static char * /* where tentative match ended, or NULL */ +fast(m, start, stop, startst, stopst) +register struct match *m; +char *start; +char *stop; +sopno startst; +sopno stopst; +{ + register states st = m->st; + register states fresh = m->fresh; + register states tmp = m->tmp; + register char *p = start; + register int c = (start == m->beginp) ? OUT : *(start-1); + register int lastc; /* previous c */ + register int flagch; + register int i; + register char *coldp; /* last p after which no match was underway */ + + CLEAR(st); + SET1(st, startst); + st = step(m->g, startst, stopst, st, NOTHING, st); + ASSIGN(fresh, st); + SP("start", st, *p); + coldp = NULL; + for (;;) { + /* next character */ + lastc = c; + c = (p == m->endp) ? OUT : *p; + if (EQ(st, fresh)) + coldp = p; + + /* is there an EOL and/or BOL between lastc and c? */ + flagch = '\0'; + i = 0; + if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || + (lastc == OUT && !(m->eflags®_NOTBOL)) ) { + flagch = BOL; + i = m->g->nbol; + } + if ( (c == '\n' && m->g->cflags®_NEWLINE) || + (c == OUT && !(m->eflags®_NOTEOL)) ) { + flagch = (flagch == BOL) ? BOLEOL : EOL; + i += m->g->neol; + } + if (i != 0) { + for (; i > 0; i--) + st = step(m->g, startst, stopst, st, flagch, st); + SP("boleol", st, c); + } + + /* how about a word boundary? */ + if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && + (c != OUT && ISWORD(c)) ) { + flagch = BOW; + } + if ( (lastc != OUT && ISWORD(lastc)) && + (flagch == EOL || (c != OUT && !ISWORD(c))) ) { + flagch = EOW; + } + if (flagch == BOW || flagch == EOW) { + st = step(m->g, startst, stopst, st, flagch, st); + SP("boweow", st, c); + } + + /* are we done? */ + if (ISSET(st, stopst) || p == stop) + break; /* NOTE BREAK OUT */ + + /* no, we must deal with this character */ + ASSIGN(tmp, st); + ASSIGN(st, fresh); + assert(c != OUT); + st = step(m->g, startst, stopst, tmp, c, st); + SP("aft", st, c); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + p++; + } + + assert(coldp != NULL); + m->coldp = coldp; + if (ISSET(st, stopst)) + return(p+1); + else + return(NULL); +} + +/* + - slow - step through the string more deliberately + == static char *slow(register struct match *m, char *start, \ + == char *stop, sopno startst, sopno stopst); + */ +static char * /* where it ended */ +slow(m, start, stop, startst, stopst) +register struct match *m; +char *start; +char *stop; +sopno startst; +sopno stopst; +{ + register states st = m->st; + register states empty = m->empty; + register states tmp = m->tmp; + register char *p = start; + register int c = (start == m->beginp) ? OUT : *(start-1); + register int lastc; /* previous c */ + register int flagch; + register int i; + register char *matchp; /* last p at which a match ended */ + + AT("slow", start, stop, startst, stopst); + CLEAR(st); + SET1(st, startst); + SP("sstart", st, *p); + st = step(m->g, startst, stopst, st, NOTHING, st); + matchp = NULL; + for (;;) { + /* next character */ + lastc = c; + c = (p == m->endp) ? OUT : *p; + + /* is there an EOL and/or BOL between lastc and c? */ + flagch = '\0'; + i = 0; + if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || + (lastc == OUT && !(m->eflags®_NOTBOL)) ) { + flagch = BOL; + i = m->g->nbol; + } + if ( (c == '\n' && m->g->cflags®_NEWLINE) || + (c == OUT && !(m->eflags®_NOTEOL)) ) { + flagch = (flagch == BOL) ? BOLEOL : EOL; + i += m->g->neol; + } + if (i != 0) { + for (; i > 0; i--) + st = step(m->g, startst, stopst, st, flagch, st); + SP("sboleol", st, c); + } + + /* how about a word boundary? */ + if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && + (c != OUT && ISWORD(c)) ) { + flagch = BOW; + } + if ( (lastc != OUT && ISWORD(lastc)) && + (flagch == EOL || (c != OUT && !ISWORD(c))) ) { + flagch = EOW; + } + if (flagch == BOW || flagch == EOW) { + st = step(m->g, startst, stopst, st, flagch, st); + SP("sboweow", st, c); + } + + /* are we done? */ + if (ISSET(st, stopst)) + matchp = p; + if (EQ(st, empty) || p == stop) + break; /* NOTE BREAK OUT */ + + /* no, we must deal with this character */ + ASSIGN(tmp, st); + ASSIGN(st, empty); + assert(c != OUT); + st = step(m->g, startst, stopst, tmp, c, st); + SP("saft", st, c); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + p++; + } + + return(matchp); +} + + +/* + - step - map set of states reachable before char to set reachable after + == static states step(register struct re_guts *g, sopno start, sopno stop, \ + == register states bef, int ch, register states aft); + == #define BOL (OUT+1) + == #define EOL (BOL+1) + == #define BOLEOL (BOL+2) + == #define NOTHING (BOL+3) + == #define BOW (BOL+4) + == #define EOW (BOL+5) + == #define CODEMAX (BOL+5) // highest code used + == #define NONCHAR(c) ((c) > CHAR_MAX) + == #define NNONCHAR (CODEMAX-CHAR_MAX) + */ +static states +step(g, start, stop, bef, ch, aft) +register struct re_guts *g; +sopno start; /* start state within strip */ +sopno stop; /* state after stop state within strip */ +register states bef; /* states reachable before */ +int ch; /* character or NONCHAR code */ +register states aft; /* states already known reachable after */ +{ + register cset *cs; + register sop s; + register sopno pc; + register onestate here; /* note, macros know this name */ + register sopno look; + register long i; + + for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { + s = g->strip[pc]; + switch (OP(s)) { + case OEND: + assert(pc == stop-1); + break; + case OCHAR: + /* only characters can match */ + assert(!NONCHAR(ch) || ch != (char)OPND(s)); + if (ch == (char)OPND(s)) + FWD(aft, bef, 1); + break; + case OBOL: + if (ch == BOL || ch == BOLEOL) + FWD(aft, bef, 1); + break; + case OEOL: + if (ch == EOL || ch == BOLEOL) + FWD(aft, bef, 1); + break; + case OBOW: + if (ch == BOW) + FWD(aft, bef, 1); + break; + case OEOW: + if (ch == EOW) + FWD(aft, bef, 1); + break; + case OANY: + if (!NONCHAR(ch)) + FWD(aft, bef, 1); + break; + case OANYOF: + cs = &g->sets[OPND(s)]; + if (!NONCHAR(ch) && CHIN(cs, ch)) + FWD(aft, bef, 1); + break; + case OBACK_: /* ignored here */ + case O_BACK: + FWD(aft, aft, 1); + break; + case OPLUS_: /* forward, this is just an empty */ + FWD(aft, aft, 1); + break; + case O_PLUS: /* both forward and back */ + FWD(aft, aft, 1); + i = ISSETBACK(aft, OPND(s)); + BACK(aft, aft, OPND(s)); + if (!i && ISSETBACK(aft, OPND(s))) { + /* oho, must reconsider loop body */ + pc -= OPND(s) + 1; + INIT(here, pc); + } + break; + case OQUEST_: /* two branches, both forward */ + FWD(aft, aft, 1); + FWD(aft, aft, OPND(s)); + break; + case O_QUEST: /* just an empty */ + FWD(aft, aft, 1); + break; + case OLPAREN: /* not significant here */ + case ORPAREN: + FWD(aft, aft, 1); + break; + case OCH_: /* mark the first two branches */ + FWD(aft, aft, 1); + assert(OP(g->strip[pc+OPND(s)]) == OOR2); + FWD(aft, aft, OPND(s)); + break; + case OOR1: /* done a branch, find the O_CH */ + if (ISSTATEIN(aft, here)) { + for (look = 1; + OP(s = g->strip[pc+look]) != O_CH; + look += OPND(s)) + assert(OP(s) == OOR2); + FWD(aft, aft, look); + } + break; + case OOR2: /* propagate OCH_'s marking */ + FWD(aft, aft, 1); + if (OP(g->strip[pc+OPND(s)]) != O_CH) { + assert(OP(g->strip[pc+OPND(s)]) == OOR2); + FWD(aft, aft, OPND(s)); + } + break; + case O_CH: /* just empty */ + FWD(aft, aft, 1); + break; + default: /* ooooops... */ + assert(nope); + break; + } + } + + return(aft); +} + +#ifdef REDEBUG +/* + - print - print a set of states + == #ifdef REDEBUG + == static void print(struct match *m, char *caption, states st, \ + == int ch, FILE *d); + == #endif + */ +static void +print(m, caption, st, ch, d) +struct match *m; +char *caption; +states st; +int ch; +FILE *d; +{ + register struct re_guts *g = m->g; + register int i; + register int first = 1; + + if (!(m->eflags®_TRACE)) + return; + + fprintf(d, "%s", caption); + if (ch != '\0') + fprintf(d, " %s", pchar(ch)); + for (i = 0; i < g->nstates; i++) + if (ISSET(st, i)) { + fprintf(d, "%s%d", (first) ? "\t" : ", ", i); + first = 0; + } + fprintf(d, "\n"); +} + +/* + - at - print current situation + == #ifdef REDEBUG + == static void at(struct match *m, char *title, char *start, char *stop, \ + == sopno startst, sopno stopst); + == #endif + */ +static void +at(m, title, start, stop, startst, stopst) +struct match *m; +char *title; +char *start; +char *stop; +sopno startst; +sopno stopst; +{ + if (!(m->eflags®_TRACE)) + return; + + printf("%s %s-", title, pchar(*start)); + printf("%s ", pchar(*stop)); + printf("%ld-%ld\n", (long)startst, (long)stopst); +} + +#ifndef PCHARDONE +#define PCHARDONE /* never again */ +/* + - pchar - make a character printable + == #ifdef REDEBUG + == static char *pchar(int ch); + == #endif + * + * Is this identical to regchar() over in debug.c? Well, yes. But a + * duplicate here avoids having a debugging-capable regexec.o tied to + * a matching debug.o, and this is convenient. It all disappears in + * the non-debug compilation anyway, so it doesn't matter much. + */ +static char * /* -> representation */ +pchar(ch) +int ch; +{ + static char pbuf[10]; + + if (isprint(ch) || ch == ' ') + sprintf(pbuf, "%c", ch); + else + sprintf(pbuf, "\\%o", ch); + return(pbuf); +} +#endif +#endif + +#undef matcher +#undef fast +#undef slow +#undef dissect +#undef backref +#undef step +#undef print +#undef at +#undef match diff --git a/src/regex/main.c b/src/regex/main.c new file mode 100644 index 0000000000..0221e7713d --- /dev/null +++ b/src/regex/main.c @@ -0,0 +1,510 @@ +#include +#include +#include +#include +#include + +#include "main.ih" + +char *progname; +int debug = 0; +int line = 0; +int status = 0; + +int copts = REG_EXTENDED; +int eopts = 0; +regoff_t startoff = 0; +regoff_t endoff = 0; + + +extern int split(); +extern void regprint(); + +/* + - main - do the simple case, hand off to regress() for regression + */ +main(argc, argv) +int argc; +char *argv[]; +{ + regex_t re; +# define NS 10 + regmatch_t subs[NS]; + char erbuf[100]; + int err; + size_t len; + int c; + int errflg = 0; + register int i; + extern int optind; + extern char *optarg; + + progname = argv[0]; + + while ((c = getopt(argc, argv, "c:e:S:E:x")) != EOF) + switch (c) { + case 'c': /* compile options */ + copts = options('c', optarg); + break; + case 'e': /* execute options */ + eopts = options('e', optarg); + break; + case 'S': /* start offset */ + startoff = (regoff_t)atoi(optarg); + break; + case 'E': /* end offset */ + endoff = (regoff_t)atoi(optarg); + break; + case 'x': /* Debugging. */ + debug++; + break; + case '?': + default: + errflg++; + break; + } + if (errflg) { + fprintf(stderr, "usage: %s ", progname); + fprintf(stderr, "[-c copt][-C][-d] [re]\n"); + exit(2); + } + + if (optind >= argc) { + regress(stdin); + exit(status); + } + + err = regcomp(&re, argv[optind++], copts); + if (err) { + len = regerror(err, &re, erbuf, sizeof(erbuf)); + fprintf(stderr, "error %s, %d/%d `%s'\n", + eprint(err), len, sizeof(erbuf), erbuf); + exit(status); + } + regprint(&re, stdout); + + if (optind >= argc) { + regfree(&re); + exit(status); + } + + if (eopts®_STARTEND) { + subs[0].rm_so = startoff; + subs[0].rm_eo = strlen(argv[optind]) - endoff; + } + err = regexec(&re, argv[optind], (size_t)NS, subs, eopts); + if (err) { + len = regerror(err, &re, erbuf, sizeof(erbuf)); + fprintf(stderr, "error %s, %d/%d `%s'\n", + eprint(err), len, sizeof(erbuf), erbuf); + exit(status); + } + if (!(copts®_NOSUB)) { + len = (int)(subs[0].rm_eo - subs[0].rm_so); + if (subs[0].rm_so != -1) { + if (len != 0) + printf("match `%.*s'\n", len, + argv[optind] + subs[0].rm_so); + else + printf("match `'@%.1s\n", + argv[optind] + subs[0].rm_so); + } + for (i = 1; i < NS; i++) + if (subs[i].rm_so != -1) + printf("(%d) `%.*s'\n", i, + (int)(subs[i].rm_eo - subs[i].rm_so), + argv[optind] + subs[i].rm_so); + } + exit(status); +} + +/* + - regress - main loop of regression test + == void regress(FILE *in); + */ +void +regress(in) +FILE *in; +{ + char inbuf[1000]; +# define MAXF 10 + char *f[MAXF]; + int nf; + int i; + char erbuf[100]; + size_t ne; + char *badpat = "invalid regular expression"; +# define SHORT 10 + char *bpname = "REG_BADPAT"; + regex_t re; + + while (fgets(inbuf, sizeof(inbuf), in) != NULL) { + line++; + if (inbuf[0] == '#' || inbuf[0] == '\n') + continue; /* NOTE CONTINUE */ + inbuf[strlen(inbuf)-1] = '\0'; /* get rid of stupid \n */ + if (debug) + fprintf(stdout, "%d:\n", line); + nf = split(inbuf, f, MAXF, "\t\t"); + if (nf < 3) { + fprintf(stderr, "bad input, line %d\n", line); + exit(1); + } + for (i = 0; i < nf; i++) + if (strcmp(f[i], "\"\"") == 0) + f[i] = ""; + if (nf <= 3) + f[3] = NULL; + if (nf <= 4) + f[4] = NULL; + try(f[0], f[1], f[2], f[3], f[4], options('c', f[1])); + if (opt('&', f[1])) /* try with either type of RE */ + try(f[0], f[1], f[2], f[3], f[4], + options('c', f[1]) &~ REG_EXTENDED); + } + + ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf)); + if (strcmp(erbuf, badpat) != 0 || ne != strlen(badpat)+1) { + fprintf(stderr, "end: regerror() test gave `%s' not `%s'\n", + erbuf, badpat); + status = 1; + } + ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, (size_t)SHORT); + if (strncmp(erbuf, badpat, SHORT-1) != 0 || erbuf[SHORT-1] != '\0' || + ne != strlen(badpat)+1) { + fprintf(stderr, "end: regerror() short test gave `%s' not `%.*s'\n", + erbuf, SHORT-1, badpat); + status = 1; + } + ne = regerror(REG_ITOA|REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf)); + if (strcmp(erbuf, bpname) != 0 || ne != strlen(bpname)+1) { + fprintf(stderr, "end: regerror() ITOA test gave `%s' not `%s'\n", + erbuf, bpname); + status = 1; + } + re.re_endp = bpname; + ne = regerror(REG_ATOI, &re, erbuf, sizeof(erbuf)); + if (atoi(erbuf) != (int)REG_BADPAT) { + fprintf(stderr, "end: regerror() ATOI test gave `%s' not `%ld'\n", + erbuf, (long)REG_BADPAT); + status = 1; + } else if (ne != strlen(erbuf)+1) { + fprintf(stderr, "end: regerror() ATOI test len(`%s') = %ld\n", + erbuf, (long)REG_BADPAT); + status = 1; + } +} + +/* + - try - try it, and report on problems + == void try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts); + */ +void +try(f0, f1, f2, f3, f4, opts) +char *f0; +char *f1; +char *f2; +char *f3; +char *f4; +int opts; /* may not match f1 */ +{ + regex_t re; +# define NSUBS 10 + regmatch_t subs[NSUBS]; +# define NSHOULD 15 + char *should[NSHOULD]; + int nshould; + char erbuf[100]; + int err; + int len; + char *type = (opts & REG_EXTENDED) ? "ERE" : "BRE"; + register int i; + char *grump; + char f0copy[1000]; + char f2copy[1000]; + + strcpy(f0copy, f0); + re.re_endp = (opts®_PEND) ? f0copy + strlen(f0copy) : NULL; + fixstr(f0copy); + err = regcomp(&re, f0copy, opts); + if (err != 0 && (!opt('C', f1) || err != efind(f2))) { + /* unexpected error or wrong error */ + len = regerror(err, &re, erbuf, sizeof(erbuf)); + fprintf(stderr, "%d: %s error %s, %d/%d `%s'\n", + line, type, eprint(err), len, + sizeof(erbuf), erbuf); + status = 1; + } else if (err == 0 && opt('C', f1)) { + /* unexpected success */ + fprintf(stderr, "%d: %s should have given REG_%s\n", + line, type, f2); + status = 1; + err = 1; /* so we won't try regexec */ + } + + if (err != 0) { + regfree(&re); + return; + } + + strcpy(f2copy, f2); + fixstr(f2copy); + + if (options('e', f1)®_STARTEND) { + if (strchr(f2, '(') == NULL || strchr(f2, ')') == NULL) + fprintf(stderr, "%d: bad STARTEND syntax\n", line); + subs[0].rm_so = strchr(f2, '(') - f2 + 1; + subs[0].rm_eo = strchr(f2, ')') - f2; + } + err = regexec(&re, f2copy, NSUBS, subs, options('e', f1)); + + if (err != 0 && (f3 != NULL || err != REG_NOMATCH)) { + /* unexpected error or wrong error */ + len = regerror(err, &re, erbuf, sizeof(erbuf)); + fprintf(stderr, "%d: %s exec error %s, %d/%d `%s'\n", + line, type, eprint(err), len, + sizeof(erbuf), erbuf); + status = 1; + } else if (err != 0) { + /* nothing more to check */ + } else if (f3 == NULL) { + /* unexpected success */ + fprintf(stderr, "%d: %s exec should have failed\n", + line, type); + status = 1; + err = 1; /* just on principle */ + } else if (opts®_NOSUB) { + /* nothing more to check */ + } else if ((grump = check(f2, subs[0], f3)) != NULL) { + fprintf(stderr, "%d: %s %s\n", line, type, grump); + status = 1; + err = 1; + } + + if (err != 0 || f4 == NULL) { + regfree(&re); + return; + } + + for (i = 1; i < NSHOULD; i++) + should[i] = NULL; + nshould = split(f4, should+1, NSHOULD-1, ","); + if (nshould == 0) { + nshould = 1; + should[1] = ""; + } + for (i = 1; i < NSUBS; i++) { + grump = check(f2, subs[i], should[i]); + if (grump != NULL) { + fprintf(stderr, "%d: %s $%d %s\n", line, + type, i, grump); + status = 1; + err = 1; + } + } + + regfree(&re); +} + +/* + - options - pick options out of a regression-test string + == int options(int type, char *s); + */ +int +options(type, s) +int type; /* 'c' compile, 'e' exec */ +char *s; +{ + register char *p; + register int o = (type == 'c') ? copts : eopts; + register char *legal = (type == 'c') ? "bisnmp" : "^$#tl"; + + for (p = s; *p != '\0'; p++) + if (strchr(legal, *p) != NULL) + switch (*p) { + case 'b': + o &= ~REG_EXTENDED; + break; + case 'i': + o |= REG_ICASE; + break; + case 's': + o |= REG_NOSUB; + break; + case 'n': + o |= REG_NEWLINE; + break; + case 'm': + o &= ~REG_EXTENDED; + o |= REG_NOSPEC; + break; + case 'p': + o |= REG_PEND; + break; + case '^': + o |= REG_NOTBOL; + break; + case '$': + o |= REG_NOTEOL; + break; + case '#': + o |= REG_STARTEND; + break; + case 't': /* trace */ + o |= REG_TRACE; + break; + case 'l': /* force long representation */ + o |= REG_LARGE; + break; + case 'r': /* force backref use */ + o |= REG_BACKR; + break; + } + return(o); +} + +/* + - opt - is a particular option in a regression string? + == int opt(int c, char *s); + */ +int /* predicate */ +opt(c, s) +int c; +char *s; +{ + return(strchr(s, c) != NULL); +} + +/* + - fixstr - transform magic characters in strings + == void fixstr(register char *p); + */ +void +fixstr(p) +register char *p; +{ + if (p == NULL) + return; + + for (; *p != '\0'; p++) + if (*p == 'N') + *p = '\n'; + else if (*p == 'T') + *p = '\t'; + else if (*p == 'S') + *p = ' '; + else if (*p == 'Z') + *p = '\0'; +} + +/* + - check - check a substring match + == char *check(char *str, regmatch_t sub, char *should); + */ +char * /* NULL or complaint */ +check(str, sub, should) +char *str; +regmatch_t sub; +char *should; +{ + register int len; + register int shlen; + register char *p; + static char grump[500]; + register char *at = NULL; + + if (should != NULL && strcmp(should, "-") == 0) + should = NULL; + if (should != NULL && should[0] == '@') { + at = should + 1; + should = ""; + } + + /* check rm_so and rm_eo for consistency */ + if (sub.rm_so > sub.rm_eo || (sub.rm_so == -1 && sub.rm_eo != -1) || + (sub.rm_so != -1 && sub.rm_eo == -1) || + (sub.rm_so != -1 && sub.rm_so < 0) || + (sub.rm_eo != -1 && sub.rm_eo < 0) ) { + sprintf(grump, "start %ld end %ld", (long)sub.rm_so, + (long)sub.rm_eo); + return(grump); + } + + /* check for no match */ + if (sub.rm_so == -1 && should == NULL) + return(NULL); + if (sub.rm_so == -1) + return("did not match"); + + /* check for in range */ + if (sub.rm_eo > strlen(str)) { + sprintf(grump, "start %ld end %ld, past end of string", + (long)sub.rm_so, (long)sub.rm_eo); + return(grump); + } + + len = (int)(sub.rm_eo - sub.rm_so); + shlen = (int)strlen(should); + p = str + sub.rm_so; + + /* check for not supposed to match */ + if (should == NULL) { + sprintf(grump, "matched `%.*s'", len, p); + return(grump); + } + + /* check for wrong match */ + if (len != shlen || strncmp(p, should, (size_t)shlen) != 0) { + sprintf(grump, "matched `%.*s' instead", len, p); + return(grump); + } + if (shlen > 0) + return(NULL); + + /* check null match in right place */ + if (at == NULL) + return(NULL); + shlen = strlen(at); + if (shlen == 0) + shlen = 1; /* force check for end-of-string */ + if (strncmp(p, at, shlen) != 0) { + sprintf(grump, "matched null at `%.20s'", p); + return(grump); + } + return(NULL); +} + +/* + - eprint - convert error number to name + == static char *eprint(int err); + */ +static char * +eprint(err) +int err; +{ + static char epbuf[100]; + size_t len; + + len = regerror(REG_ITOA|err, (regex_t *)NULL, epbuf, sizeof(epbuf)); + assert(len <= sizeof(epbuf)); + return(epbuf); +} + +/* + - efind - convert error name to number + == static int efind(char *name); + */ +static int +efind(name) +char *name; +{ + static char efbuf[100]; + size_t n; + regex_t re; + + sprintf(efbuf, "REG_%s", name); + assert(strlen(efbuf) < sizeof(efbuf)); + re.re_endp = efbuf; + (void) regerror(REG_ATOI, &re, efbuf, sizeof(efbuf)); + return(atoi(efbuf)); +} diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c new file mode 100644 index 0000000000..dc48601109 --- /dev/null +++ b/src/regex/regcomp.c @@ -0,0 +1,1603 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "regex2.h" + +#include "cclass.h" +#include "cname.h" + +/* + * parse structure, passed up and down to avoid global variables and + * other clumsinesses + */ +struct parse { + char *next; /* next character in RE */ + char *end; /* end of string (-> NUL normally) */ + int error; /* has an error been seen? */ + sop *strip; /* malloced strip */ + sopno ssize; /* malloced strip size (allocated) */ + sopno slen; /* malloced strip length (used) */ + int ncsalloc; /* number of csets allocated */ + struct re_guts *g; +# define NPAREN 10 /* we need to remember () 1-9 for back refs */ + sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ + sopno pend[NPAREN]; /* -> ) ([0] unused) */ +}; + +#include "regcomp.ih" + +static char nuls[10]; /* place to point scanner in event of error */ + +/* + * macros for use with parse structure + * BEWARE: these know that the parse structure is named `p' !!! + */ +#define PEEK() (*p->next) +#define PEEK2() (*(p->next+1)) +#define MORE() (p->next < p->end) +#define MORE2() (p->next+1 < p->end) +#define SEE(c) (MORE() && PEEK() == (c)) +#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) +#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) +#define NEXT() (p->next++) +#define NEXT2() (p->next += 2) +#define NEXTn(n) (p->next += (n)) +#define GETNEXT() (*p->next++) +#define SETERROR(e) seterr(p, (e)) +#define REQUIRE(co, e) ((co) || SETERROR(e)) +#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) +#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) +#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) +#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) +#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) +#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) +#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) +#define HERE() (p->slen) +#define THERE() (p->slen - 1) +#define THERETHERE() (p->slen - 2) +#define DROP(n) (p->slen -= (n)) + +#ifndef NDEBUG +static int never = 0; /* for use in asserts; shuts lint up */ +#else +#define never 0 /* some s have bugs too */ +#endif + +/* + - regcomp - interface for parser and compilation + = extern int regcomp(regex_t *, const char *, int); + = #define REG_BASIC 0000 + = #define REG_EXTENDED 0001 + = #define REG_ICASE 0002 + = #define REG_NOSUB 0004 + = #define REG_NEWLINE 0010 + = #define REG_NOSPEC 0020 + = #define REG_PEND 0040 + = #define REG_DUMP 0200 + */ +int /* 0 success, otherwise REG_something */ +regcomp(preg, pattern, cflags) +regex_t *preg; +const char *pattern; +int cflags; +{ + struct parse pa; + register struct re_guts *g; + register struct parse *p = &pa; + register int i; + register size_t len; +#ifdef REDEBUG +# define GOODFLAGS(f) (f) +#else +# define GOODFLAGS(f) ((f)&~REG_DUMP) +#endif + + cflags = GOODFLAGS(cflags); + if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) + return(REG_INVARG); + + if (cflags®_PEND) { + if (preg->re_endp < pattern) + return(REG_INVARG); + len = preg->re_endp - pattern; + } else + len = strlen((char *)pattern); + + /* do the mallocs early so failure handling is easy */ + g = (struct re_guts *)malloc(sizeof(struct re_guts) + + (NC-1)*sizeof(cat_t)); + if (g == NULL) + return(REG_ESPACE); + p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ + p->strip = (sop *)malloc(p->ssize * sizeof(sop)); + p->slen = 0; + if (p->strip == NULL) { + free((char *)g); + return(REG_ESPACE); + } + + /* set things up */ + p->g = g; + p->next = (char *)pattern; /* convenience; we do not modify it */ + p->end = p->next + len; + p->error = 0; + p->ncsalloc = 0; + for (i = 0; i < NPAREN; i++) { + p->pbegin[i] = 0; + p->pend[i] = 0; + } + g->csetsize = NC; + g->sets = NULL; + g->setbits = NULL; + g->ncsets = 0; + g->cflags = cflags; + g->iflags = 0; + g->nbol = 0; + g->neol = 0; + g->must = NULL; + g->mlen = 0; + g->nsub = 0; + g->ncategories = 1; /* category 0 is "everything else" */ + g->categories = &g->catspace[-(CHAR_MIN)]; + (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); + g->backrefs = 0; + + /* do it */ + EMIT(OEND, 0); + g->firststate = THERE(); + if (cflags®_EXTENDED) + p_ere(p, OUT); + else if (cflags®_NOSPEC) + p_str(p); + else + p_bre(p, OUT, OUT); + EMIT(OEND, 0); + g->laststate = THERE(); + + /* tidy up loose ends and fill things in */ + categorize(p, g); + stripsnug(p, g); + findmust(p, g); + g->nplus = pluscount(p, g); + g->magic = MAGIC2; + preg->re_nsub = g->nsub; + preg->re_g = g; + preg->re_magic = MAGIC1; +#ifndef REDEBUG + /* not debugging, so can't rely on the assert() in regexec() */ + if (g->iflags&BAD) + SETERROR(REG_ASSERT); +#endif + + /* win or lose, we're done */ + if (p->error != 0) /* lose */ + regfree(preg); + return(p->error); +} + +/* + - p_ere - ERE parser top level, concatenation and alternation + == static void p_ere(register struct parse *p, int stop); + */ +static void +p_ere(p, stop) +register struct parse *p; +int stop; /* character this ERE should end at */ +{ + register char c; + register sopno prevback; + register sopno prevfwd; + register sopno conc; + register int first = 1; /* is this the first alternative? */ + + for (;;) { + /* do a bunch of concatenated expressions */ + conc = HERE(); + while (MORE() && (c = PEEK()) != '|' && c != stop) + p_ere_exp(p); + REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ + + if (!EAT('|')) + break; /* NOTE BREAK OUT */ + + if (first) { + INSERT(OCH_, conc); /* offset is wrong */ + prevfwd = conc; + prevback = conc; + first = 0; + } + ASTERN(OOR1, prevback); + prevback = THERE(); + AHEAD(prevfwd); /* fix previous offset */ + prevfwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + } + + if (!first) { /* tail-end fixups */ + AHEAD(prevfwd); + ASTERN(O_CH, prevback); + } + + assert(!MORE() || SEE(stop)); +} + +/* + - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op + == static void p_ere_exp(register struct parse *p); + */ +static void +p_ere_exp(p) +register struct parse *p; +{ + register char c; + register sopno pos; + register int count; + register int count2; + register sopno subno; + int wascaret = 0; + + assert(MORE()); /* caller should have ensured this */ + c = GETNEXT(); + + pos = HERE(); + switch (c) { + case '(': + REQUIRE(MORE(), REG_EPAREN); + p->g->nsub++; + subno = p->g->nsub; + if (subno < NPAREN) + p->pbegin[subno] = HERE(); + EMIT(OLPAREN, subno); + if (!SEE(')')) + p_ere(p, ')'); + if (subno < NPAREN) { + p->pend[subno] = HERE(); + assert(p->pend[subno] != 0); + } + EMIT(ORPAREN, subno); + MUSTEAT(')', REG_EPAREN); + break; +#ifndef POSIX_MISTAKE + case ')': /* happens only if no current unmatched ( */ + /* + * You may ask, why the ifndef? Because I didn't notice + * this until slightly too late for 1003.2, and none of the + * other 1003.2 regular-expression reviewers noticed it at + * all. So an unmatched ) is legal POSIX, at least until + * we can get it fixed. + */ + SETERROR(REG_EPAREN); + break; +#endif + case '^': + EMIT(OBOL, 0); + p->g->iflags |= USEBOL; + p->g->nbol++; + wascaret = 1; + break; + case '$': + EMIT(OEOL, 0); + p->g->iflags |= USEEOL; + p->g->neol++; + break; + case '|': + SETERROR(REG_EMPTY); + break; + case '*': + case '+': + case '?': + SETERROR(REG_BADRPT); + break; + case '.': + if (p->g->cflags®_NEWLINE) + nonnewline(p); + else + EMIT(OANY, 0); + break; + case '[': + p_bracket(p); + break; + case '\\': + REQUIRE(MORE(), REG_EESCAPE); + c = GETNEXT(); + ordinary(p, c); + break; + case '{': /* okay as ordinary except if digit follows */ + REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); + /* FALLTHROUGH */ + default: + ordinary(p, c); + break; + } + + if (!MORE()) + return; + c = PEEK(); + /* we call { a repetition if followed by a digit */ + if (!( c == '*' || c == '+' || c == '?' || + (c == '{' && MORE2() && isdigit(PEEK2())) )) + return; /* no repetition, we're done */ + NEXT(); + + REQUIRE(!wascaret, REG_BADRPT); + switch (c) { + case '*': /* implemented as +? */ + /* this case does not require the (y|) trick, noKLUDGE */ + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); + INSERT(OQUEST_, pos); + ASTERN(O_QUEST, pos); + break; + case '+': + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); + break; + case '?': + /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ + INSERT(OCH_, pos); /* offset slightly wrong */ + ASTERN(OOR1, pos); /* this one's right */ + AHEAD(pos); /* fix the OCH_ */ + EMIT(OOR2, 0); /* offset very wrong... */ + AHEAD(THERE()); /* ...so fix it */ + ASTERN(O_CH, THERETHERE()); + break; + case '{': + count = p_count(p); + if (EAT(',')) { + if (isdigit(PEEK())) { + count2 = p_count(p); + REQUIRE(count <= count2, REG_BADBR); + } else /* single number with comma */ + count2 = INFINITY; + } else /* just a single number */ + count2 = count; + repeat(p, pos, count, count2); + if (!EAT('}')) { /* error heuristics */ + while (MORE() && PEEK() != '}') + NEXT(); + REQUIRE(MORE(), REG_EBRACE); + SETERROR(REG_BADBR); + } + break; + } + + if (!MORE()) + return; + c = PEEK(); + if (!( c == '*' || c == '+' || c == '?' || + (c == '{' && MORE2() && isdigit(PEEK2())) ) ) + return; + SETERROR(REG_BADRPT); +} + +/* + - p_str - string (no metacharacters) "parser" + == static void p_str(register struct parse *p); + */ +static void +p_str(p) +register struct parse *p; +{ + REQUIRE(MORE(), REG_EMPTY); + while (MORE()) + ordinary(p, GETNEXT()); +} + +/* + - p_bre - BRE parser top level, anchoring and concatenation + == static void p_bre(register struct parse *p, register int end1, \ + == register int end2); + * Giving end1 as OUT essentially eliminates the end1/end2 check. + * + * This implementation is a bit of a kludge, in that a trailing $ is first + * taken as an ordinary character and then revised to be an anchor. The + * only undesirable side effect is that '$' gets included as a character + * category in such cases. This is fairly harmless; not worth fixing. + * The amount of lookahead needed to avoid this kludge is excessive. + */ +static void +p_bre(p, end1, end2) +register struct parse *p; +register int end1; /* first terminating character */ +register int end2; /* second terminating character */ +{ + register sopno start = HERE(); + register int first = 1; /* first subexpression? */ + register int wasdollar = 0; + + if (EAT('^')) { + EMIT(OBOL, 0); + p->g->iflags |= USEBOL; + p->g->nbol++; + } + while (MORE() && !SEETWO(end1, end2)) { + wasdollar = p_simp_re(p, first); + first = 0; + } + if (wasdollar) { /* oops, that was a trailing anchor */ + DROP(1); + EMIT(OEOL, 0); + p->g->iflags |= USEEOL; + p->g->neol++; + } + + REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ +} + +/* + - p_simp_re - parse a simple RE, an atom possibly followed by a repetition + == static int p_simp_re(register struct parse *p, int starordinary); + */ +static int /* was the simple RE an unbackslashed $? */ +p_simp_re(p, starordinary) +register struct parse *p; +int starordinary; /* is a leading * an ordinary character? */ +{ + register int c; + register int count; + register int count2; + register sopno pos; + register int i; + register sopno subno; +# define BACKSL (1<g->cflags®_NEWLINE) + nonnewline(p); + else + EMIT(OANY, 0); + break; + case '[': + p_bracket(p); + break; + case BACKSL|'{': + SETERROR(REG_BADRPT); + break; + case BACKSL|'(': + p->g->nsub++; + subno = p->g->nsub; + if (subno < NPAREN) + p->pbegin[subno] = HERE(); + EMIT(OLPAREN, subno); + /* the MORE here is an error heuristic */ + if (MORE() && !SEETWO('\\', ')')) + p_bre(p, '\\', ')'); + if (subno < NPAREN) { + p->pend[subno] = HERE(); + assert(p->pend[subno] != 0); + } + EMIT(ORPAREN, subno); + REQUIRE(EATTWO('\\', ')'), REG_EPAREN); + break; + case BACKSL|')': /* should not get here -- must be user */ + case BACKSL|'}': + SETERROR(REG_EPAREN); + break; + case BACKSL|'1': + case BACKSL|'2': + case BACKSL|'3': + case BACKSL|'4': + case BACKSL|'5': + case BACKSL|'6': + case BACKSL|'7': + case BACKSL|'8': + case BACKSL|'9': + i = (c&~BACKSL) - '0'; + assert(i < NPAREN); + if (p->pend[i] != 0) { + assert(i <= p->g->nsub); + EMIT(OBACK_, i); + assert(p->pbegin[i] != 0); + assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); + assert(OP(p->strip[p->pend[i]]) == ORPAREN); + (void) dupl(p, p->pbegin[i]+1, p->pend[i]); + EMIT(O_BACK, i); + } else + SETERROR(REG_ESUBREG); + p->g->backrefs = 1; + break; + case '*': + REQUIRE(starordinary, REG_BADRPT); + /* FALLTHROUGH */ + default: + ordinary(p, (char)c); /* takes off BACKSL, if any */ + break; + } + + if (EAT('*')) { /* implemented as +? */ + /* this case does not require the (y|) trick, noKLUDGE */ + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); + INSERT(OQUEST_, pos); + ASTERN(O_QUEST, pos); + } else if (EATTWO('\\', '{')) { + count = p_count(p); + if (EAT(',')) { + if (MORE() && isdigit(PEEK())) { + count2 = p_count(p); + REQUIRE(count <= count2, REG_BADBR); + } else /* single number with comma */ + count2 = INFINITY; + } else /* just a single number */ + count2 = count; + repeat(p, pos, count, count2); + if (!EATTWO('\\', '}')) { /* error heuristics */ + while (MORE() && !SEETWO('\\', '}')) + NEXT(); + REQUIRE(MORE(), REG_EBRACE); + SETERROR(REG_BADBR); + } + } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ + return(1); + + return(0); +} + +/* + - p_count - parse a repetition count + == static int p_count(register struct parse *p); + */ +static int /* the value */ +p_count(p) +register struct parse *p; +{ + register int count = 0; + register int ndigits = 0; + + while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { + count = count*10 + (GETNEXT() - '0'); + ndigits++; + } + + REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); + return(count); +} + +/* + - p_bracket - parse a bracketed character list + == static void p_bracket(register struct parse *p); + * + * Note a significant property of this code: if the allocset() did SETERROR, + * no set operations are done. + */ +static void +p_bracket(p) +register struct parse *p; +{ + register cset *cs = allocset(p); + register int invert = 0; + + /* Dept of Truly Sickening Special-Case Kludges */ + if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { + EMIT(OBOW, 0); + NEXTn(6); + return; + } + if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { + EMIT(OEOW, 0); + NEXTn(6); + return; + } + + if (EAT('^')) + invert++; /* make note to invert set at end */ + if (EAT(']')) + CHadd(cs, ']'); + else if (EAT('-')) + CHadd(cs, '-'); + while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) + p_b_term(p, cs); + if (EAT('-')) + CHadd(cs, '-'); + MUSTEAT(']', REG_EBRACK); + + if (p->error != 0) /* don't mess things up further */ + return; + + if (p->g->cflags®_ICASE) { + register int i; + register int ci; + + for (i = p->g->csetsize - 1; i >= 0; i--) + if (CHIN(cs, i) && isalpha(i)) { + ci = othercase(i); + if (ci != i) + CHadd(cs, ci); + } + if (cs->multis != NULL) + mccase(p, cs); + } + if (invert) { + register int i; + + for (i = p->g->csetsize - 1; i >= 0; i--) + if (CHIN(cs, i)) + CHsub(cs, i); + else + CHadd(cs, i); + if (p->g->cflags®_NEWLINE) + CHsub(cs, '\n'); + if (cs->multis != NULL) + mcinvert(p, cs); + } + + assert(cs->multis == NULL); /* xxx */ + + if (nch(p, cs) == 1) { /* optimize singleton sets */ + ordinary(p, firstch(p, cs)); + freeset(p, cs); + } else + EMIT(OANYOF, freezeset(p, cs)); +} + +/* + - p_b_term - parse one term of a bracketed character list + == static void p_b_term(register struct parse *p, register cset *cs); + */ +static void +p_b_term(p, cs) +register struct parse *p; +register cset *cs; +{ + register char c; + register char start, finish; + register int i; + + /* classify what we've got */ + switch ((MORE()) ? PEEK() : '\0') { + case '[': + c = (MORE2()) ? PEEK2() : '\0'; + break; + case '-': + SETERROR(REG_ERANGE); + return; /* NOTE RETURN */ + break; + default: + c = '\0'; + break; + } + + switch (c) { + case ':': /* character class */ + NEXT2(); + REQUIRE(MORE(), REG_EBRACK); + c = PEEK(); + REQUIRE(c != '-' && c != ']', REG_ECTYPE); + p_b_cclass(p, cs); + REQUIRE(MORE(), REG_EBRACK); + REQUIRE(EATTWO(':', ']'), REG_ECTYPE); + break; + case '=': /* equivalence class */ + NEXT2(); + REQUIRE(MORE(), REG_EBRACK); + c = PEEK(); + REQUIRE(c != '-' && c != ']', REG_ECOLLATE); + p_b_eclass(p, cs); + REQUIRE(MORE(), REG_EBRACK); + REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); + break; + default: /* symbol, ordinary character, or range */ +/* xxx revision needed for multichar stuff */ + start = p_b_symbol(p); + if (SEE('-') && MORE2() && PEEK2() != ']') { + /* range */ + NEXT(); + if (EAT('-')) + finish = '-'; + else + finish = p_b_symbol(p); + } else + finish = start; +/* xxx what about signed chars here... */ + REQUIRE(start <= finish, REG_ERANGE); + for (i = start; i <= finish; i++) + CHadd(cs, i); + break; + } +} + +/* + - p_b_cclass - parse a character-class name and deal with it + == static void p_b_cclass(register struct parse *p, register cset *cs); + */ +static void +p_b_cclass(p, cs) +register struct parse *p; +register cset *cs; +{ + register char *sp = p->next; + register struct cclass *cp; + register size_t len; + register char *u; + register char c; + + while (MORE() && isalpha(PEEK())) + NEXT(); + len = p->next - sp; + for (cp = cclasses; cp->name != NULL; cp++) + if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') + break; + if (cp->name == NULL) { + /* oops, didn't find it */ + SETERROR(REG_ECTYPE); + return; + } + + u = cp->chars; + while ((c = *u++) != '\0') + CHadd(cs, c); + for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) + MCadd(p, cs, u); +} + +/* + - p_b_eclass - parse an equivalence-class name and deal with it + == static void p_b_eclass(register struct parse *p, register cset *cs); + * + * This implementation is incomplete. xxx + */ +static void +p_b_eclass(p, cs) +register struct parse *p; +register cset *cs; +{ + register char c; + + c = p_b_coll_elem(p, '='); + CHadd(cs, c); +} + +/* + - p_b_symbol - parse a character or [..]ed multicharacter collating symbol + == static char p_b_symbol(register struct parse *p); + */ +static char /* value of symbol */ +p_b_symbol(p) +register struct parse *p; +{ + register char value; + + REQUIRE(MORE(), REG_EBRACK); + if (!EATTWO('[', '.')) + return(GETNEXT()); + + /* collating symbol */ + value = p_b_coll_elem(p, '.'); + REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); + return(value); +} + +/* + - p_b_coll_elem - parse a collating-element name and look it up + == static char p_b_coll_elem(register struct parse *p, int endc); + */ +static char /* value of collating element */ +p_b_coll_elem(p, endc) +register struct parse *p; +int endc; /* name ended by endc,']' */ +{ + register char *sp = p->next; + register struct cname *cp; + register int len; + + while (MORE() && !SEETWO(endc, ']')) + NEXT(); + if (!MORE()) { + SETERROR(REG_EBRACK); + return(0); + } + len = p->next - sp; + for (cp = cnames; cp->name != NULL; cp++) + if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') + return(cp->code); /* known name */ + if (len == 1) + return(*sp); /* single character */ + SETERROR(REG_ECOLLATE); /* neither */ + return(0); +} + +/* + - othercase - return the case counterpart of an alphabetic + == static char othercase(int ch); + */ +static char /* if no counterpart, return ch */ +othercase(ch) +int ch; +{ + assert(isalpha(ch)); + if (isupper(ch)) + return(tolower(ch)); + else if (islower(ch)) + return(toupper(ch)); + else /* peculiar, but could happen */ + return(ch); +} + +/* + - bothcases - emit a dualcase version of a two-case character + == static void bothcases(register struct parse *p, int ch); + * + * Boy, is this implementation ever a kludge... + */ +static void +bothcases(p, ch) +register struct parse *p; +int ch; +{ + register char *oldnext = p->next; + register char *oldend = p->end; + char bracket[3]; + + assert(othercase(ch) != ch); /* p_bracket() would recurse */ + p->next = bracket; + p->end = bracket+2; + bracket[0] = ch; + bracket[1] = ']'; + bracket[2] = '\0'; + p_bracket(p); + assert(p->next == bracket+2); + p->next = oldnext; + p->end = oldend; +} + +/* + - ordinary - emit an ordinary character + == static void ordinary(register struct parse *p, register int ch); + */ +static void +ordinary(p, ch) +register struct parse *p; +register int ch; +{ + register cat_t *cap = p->g->categories; + + if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) + bothcases(p, ch); + else { + EMIT(OCHAR, (unsigned char)ch); + if (cap[ch] == 0) + cap[ch] = p->g->ncategories++; + } +} + +/* + - nonnewline - emit REG_NEWLINE version of OANY + == static void nonnewline(register struct parse *p); + * + * Boy, is this implementation ever a kludge... + */ +static void +nonnewline(p) +register struct parse *p; +{ + register char *oldnext = p->next; + register char *oldend = p->end; + char bracket[4]; + + p->next = bracket; + p->end = bracket+3; + bracket[0] = '^'; + bracket[1] = '\n'; + bracket[2] = ']'; + bracket[3] = '\0'; + p_bracket(p); + assert(p->next == bracket+3); + p->next = oldnext; + p->end = oldend; +} + +/* + - repeat - generate code for a bounded repetition, recursively if needed + == static void repeat(register struct parse *p, sopno start, int from, int to); + */ +static void +repeat(p, start, from, to) +register struct parse *p; +sopno start; /* operand from here to end of strip */ +int from; /* repeated from this number */ +int to; /* to this number of times (maybe INFINITY) */ +{ + register sopno finish = HERE(); +# define N 2 +# define INF 3 +# define REP(f, t) ((f)*8 + (t)) +# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) + register sopno copy; + + if (p->error != 0) /* head off possible runaway recursion */ + return; + + assert(from <= to); + + switch (REP(MAP(from), MAP(to))) { + case REP(0, 0): /* must be user doing this */ + DROP(finish-start); /* drop the operand */ + break; + case REP(0, 1): /* as x{1,1}? */ + case REP(0, N): /* as x{1,n}? */ + case REP(0, INF): /* as x{1,}? */ + /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ + INSERT(OCH_, start); /* offset is wrong... */ + repeat(p, start+1, 1, to); + ASTERN(OOR1, start); + AHEAD(start); /* ... fix it */ + EMIT(OOR2, 0); + AHEAD(THERE()); + ASTERN(O_CH, THERETHERE()); + break; + case REP(1, 1): /* trivial case */ + /* done */ + break; + case REP(1, N): /* as x?x{1,n-1} */ + /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ + INSERT(OCH_, start); + ASTERN(OOR1, start); + AHEAD(start); + EMIT(OOR2, 0); /* offset very wrong... */ + AHEAD(THERE()); /* ...so fix it */ + ASTERN(O_CH, THERETHERE()); + copy = dupl(p, start+1, finish+1); + assert(copy == finish+4); + repeat(p, copy, 1, to-1); + break; + case REP(1, INF): /* as x+ */ + INSERT(OPLUS_, start); + ASTERN(O_PLUS, start); + break; + case REP(N, N): /* as xx{m-1,n-1} */ + copy = dupl(p, start, finish); + repeat(p, copy, from-1, to-1); + break; + case REP(N, INF): /* as xx{n-1,INF} */ + copy = dupl(p, start, finish); + repeat(p, copy, from-1, to); + break; + default: /* "can't happen" */ + SETERROR(REG_ASSERT); /* just in case */ + break; + } +} + +/* + - seterr - set an error condition + == static int seterr(register struct parse *p, int e); + */ +static int /* useless but makes type checking happy */ +seterr(p, e) +register struct parse *p; +int e; +{ + if (p->error == 0) /* keep earliest error condition */ + p->error = e; + p->next = nuls; /* try to bring things to a halt */ + p->end = nuls; + return(0); /* make the return value well-defined */ +} + +/* + - allocset - allocate a set of characters for [] + == static cset *allocset(register struct parse *p); + */ +static cset * +allocset(p) +register struct parse *p; +{ + register int no = p->g->ncsets++; + register size_t nc; + register size_t nbytes; + register cset *cs; + register size_t css = (size_t)p->g->csetsize; + register int i; + + if (no >= p->ncsalloc) { /* need another column of space */ + p->ncsalloc += CHAR_BIT; + nc = p->ncsalloc; + assert(nc % CHAR_BIT == 0); + nbytes = nc / CHAR_BIT * css; + if (p->g->sets == NULL) + p->g->sets = (cset *)malloc(nc * sizeof(cset)); + else + p->g->sets = (cset *)realloc((char *)p->g->sets, + nc * sizeof(cset)); + if (p->g->setbits == NULL) + p->g->setbits = (uch *)malloc(nbytes); + else { + p->g->setbits = (uch *)realloc((char *)p->g->setbits, + nbytes); + /* xxx this isn't right if setbits is now NULL */ + for (i = 0; i < no; i++) + p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); + } + if (p->g->sets != NULL && p->g->setbits != NULL) + (void) memset((char *)p->g->setbits + (nbytes - css), + 0, css); + else { + no = 0; + SETERROR(REG_ESPACE); + /* caller's responsibility not to do set ops */ + } + } + + assert(p->g->sets != NULL); /* xxx */ + cs = &p->g->sets[no]; + cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); + cs->mask = 1 << ((no) % CHAR_BIT); + cs->hash = 0; + cs->smultis = 0; + cs->multis = NULL; + + return(cs); +} + +/* + - freeset - free a now-unused set + == static void freeset(register struct parse *p, register cset *cs); + */ +static void +freeset(p, cs) +register struct parse *p; +register cset *cs; +{ + register int i; + register cset *top = &p->g->sets[p->g->ncsets]; + register size_t css = (size_t)p->g->csetsize; + + for (i = 0; i < css; i++) + CHsub(cs, i); + if (cs == top-1) /* recover only the easy case */ + p->g->ncsets--; +} + +/* + - freezeset - final processing on a set of characters + == static int freezeset(register struct parse *p, register cset *cs); + * + * The main task here is merging identical sets. This is usually a waste + * of time (although the hash code minimizes the overhead), but can win + * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash + * is done using addition rather than xor -- all ASCII [aA] sets xor to + * the same value! + */ +static int /* set number */ +freezeset(p, cs) +register struct parse *p; +register cset *cs; +{ + register uch h = cs->hash; + register int i; + register cset *top = &p->g->sets[p->g->ncsets]; + register cset *cs2; + register size_t css = (size_t)p->g->csetsize; + + /* look for an earlier one which is the same */ + for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) + if (cs2->hash == h && cs2 != cs) { + /* maybe */ + for (i = 0; i < css; i++) + if (!!CHIN(cs2, i) != !!CHIN(cs, i)) + break; /* no */ + if (i == css) + break; /* yes */ + } + + if (cs2 < top) { /* found one */ + freeset(p, cs); + cs = cs2; + } + + return((int)(cs - p->g->sets)); +} + +/* + - firstch - return first character in a set (which must have at least one) + == static int firstch(register struct parse *p, register cset *cs); + */ +static int /* character; there is no "none" value */ +firstch(p, cs) +register struct parse *p; +register cset *cs; +{ + register int i; + register size_t css = (size_t)p->g->csetsize; + + for (i = 0; i < css; i++) + if (CHIN(cs, i)) + return((char)i); + assert(never); + return(0); /* arbitrary */ +} + +/* + - nch - number of characters in a set + == static int nch(register struct parse *p, register cset *cs); + */ +static int +nch(p, cs) +register struct parse *p; +register cset *cs; +{ + register int i; + register size_t css = (size_t)p->g->csetsize; + register int n = 0; + + for (i = 0; i < css; i++) + if (CHIN(cs, i)) + n++; + return(n); +} + +/* + - mcadd - add a collating element to a cset + == static void mcadd(register struct parse *p, register cset *cs, \ + == register char *cp); + */ +static void +mcadd(p, cs, cp) +register struct parse *p; +register cset *cs; +register char *cp; +{ + register size_t oldend = cs->smultis; + + cs->smultis += strlen(cp) + 1; + if (cs->multis == NULL) + cs->multis = malloc(cs->smultis); + else + cs->multis = realloc(cs->multis, cs->smultis); + if (cs->multis == NULL) { + SETERROR(REG_ESPACE); + return; + } + + (void) strcpy(cs->multis + oldend - 1, cp); + cs->multis[cs->smultis - 1] = '\0'; +} + +/* + - mcsub - subtract a collating element from a cset + == static void mcsub(register cset *cs, register char *cp); + */ +static void +mcsub(cs, cp) +register cset *cs; +register char *cp; +{ + register char *fp = mcfind(cs, cp); + register size_t len = strlen(fp); + + assert(fp != NULL); + (void) memmove(fp, fp + len + 1, + cs->smultis - (fp + len + 1 - cs->multis)); + cs->smultis -= len; + + if (cs->smultis == 0) { + free(cs->multis); + cs->multis = NULL; + return; + } + + cs->multis = realloc(cs->multis, cs->smultis); + assert(cs->multis != NULL); +} + +/* + - mcin - is a collating element in a cset? + == static int mcin(register cset *cs, register char *cp); + */ +static int +mcin(cs, cp) +register cset *cs; +register char *cp; +{ + return(mcfind(cs, cp) != NULL); +} + +/* + - mcfind - find a collating element in a cset + == static char *mcfind(register cset *cs, register char *cp); + */ +static char * +mcfind(cs, cp) +register cset *cs; +register char *cp; +{ + register char *p; + + if (cs->multis == NULL) + return(NULL); + for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) + if (strcmp(cp, p) == 0) + return(p); + return(NULL); +} + +/* + - mcinvert - invert the list of collating elements in a cset + == static void mcinvert(register struct parse *p, register cset *cs); + * + * This would have to know the set of possibilities. Implementation + * is deferred. + */ +static void +mcinvert(p, cs) +register struct parse *p; +register cset *cs; +{ + assert(cs->multis == NULL); /* xxx */ +} + +/* + - mccase - add case counterparts of the list of collating elements in a cset + == static void mccase(register struct parse *p, register cset *cs); + * + * This would have to know the set of possibilities. Implementation + * is deferred. + */ +static void +mccase(p, cs) +register struct parse *p; +register cset *cs; +{ + assert(cs->multis == NULL); /* xxx */ +} + +/* + - isinsets - is this character in any sets? + == static int isinsets(register struct re_guts *g, int c); + */ +static int /* predicate */ +isinsets(g, c) +register struct re_guts *g; +int c; +{ + register uch *col; + register int i; + register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; + register unsigned uc = (unsigned char)c; + + for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) + if (col[uc] != 0) + return(1); + return(0); +} + +/* + - samesets - are these two characters in exactly the same sets? + == static int samesets(register struct re_guts *g, int c1, int c2); + */ +static int /* predicate */ +samesets(g, c1, c2) +register struct re_guts *g; +int c1; +int c2; +{ + register uch *col; + register int i; + register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; + register unsigned uc1 = (unsigned char)c1; + register unsigned uc2 = (unsigned char)c2; + + for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) + if (col[uc1] != col[uc2]) + return(0); + return(1); +} + +/* + - categorize - sort out character categories + == static void categorize(struct parse *p, register struct re_guts *g); + */ +static void +categorize(p, g) +struct parse *p; +register struct re_guts *g; +{ + register cat_t *cats = g->categories; + register int c; + register int c2; + register cat_t cat; + + /* avoid making error situations worse */ + if (p->error != 0) + return; + + for (c = CHAR_MIN; c <= CHAR_MAX; c++) + if (cats[c] == 0 && isinsets(g, c)) { + cat = g->ncategories++; + cats[c] = cat; + for (c2 = c+1; c2 <= CHAR_MAX; c2++) + if (cats[c2] == 0 && samesets(g, c, c2)) + cats[c2] = cat; + } +} + +/* + - dupl - emit a duplicate of a bunch of sops + == static sopno dupl(register struct parse *p, sopno start, sopno finish); + */ +static sopno /* start of duplicate */ +dupl(p, start, finish) +register struct parse *p; +sopno start; /* from here */ +sopno finish; /* to this less one */ +{ + register sopno ret = HERE(); + register sopno len = finish - start; + + assert(finish >= start); + if (len == 0) + return(ret); + enlarge(p, p->ssize + len); /* this many unexpected additions */ + assert(p->ssize >= p->slen + len); + (void) memcpy((char *)(p->strip + p->slen), + (char *)(p->strip + start), (size_t)len*sizeof(sop)); + p->slen += len; + return(ret); +} + +/* + - doemit - emit a strip operator + == static void doemit(register struct parse *p, sop op, size_t opnd); + * + * It might seem better to implement this as a macro with a function as + * hard-case backup, but it's just too big and messy unless there are + * some changes to the data structures. Maybe later. + */ +static void +doemit(p, op, opnd) +register struct parse *p; +sop op; +size_t opnd; +{ + /* avoid making error situations worse */ + if (p->error != 0) + return; + + /* deal with oversize operands ("can't happen", more or less) */ + assert(opnd < 1<slen >= p->ssize) + enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ + assert(p->slen < p->ssize); + + /* finally, it's all reduced to the easy case */ + p->strip[p->slen++] = SOP(op, opnd); +} + +/* + - doinsert - insert a sop into the strip + == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); + */ +static void +doinsert(p, op, opnd, pos) +register struct parse *p; +sop op; +size_t opnd; +sopno pos; +{ + register sopno sn; + register sop s; + register int i; + + /* avoid making error situations worse */ + if (p->error != 0) + return; + + sn = HERE(); + EMIT(op, opnd); /* do checks, ensure space */ + assert(HERE() == sn+1); + s = p->strip[sn]; + + /* adjust paren pointers */ + assert(pos > 0); + for (i = 1; i < NPAREN; i++) { + if (p->pbegin[i] >= pos) { + p->pbegin[i]++; + } + if (p->pend[i] >= pos) { + p->pend[i]++; + } + } + + memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], + (HERE()-pos-1)*sizeof(sop)); + p->strip[pos] = s; +} + +/* + - dofwd - complete a forward reference + == static void dofwd(register struct parse *p, sopno pos, sop value); + */ +static void +dofwd(p, pos, value) +register struct parse *p; +register sopno pos; +sop value; +{ + /* avoid making error situations worse */ + if (p->error != 0) + return; + + assert(value < 1<strip[pos] = OP(p->strip[pos]) | value; +} + +/* + - enlarge - enlarge the strip + == static void enlarge(register struct parse *p, sopno size); + */ +static void +enlarge(p, size) +register struct parse *p; +register sopno size; +{ + register sop *sp; + + if (p->ssize >= size) + return; + + sp = (sop *)realloc(p->strip, size*sizeof(sop)); + if (sp == NULL) { + SETERROR(REG_ESPACE); + return; + } + p->strip = sp; + p->ssize = size; +} + +/* + - stripsnug - compact the strip + == static void stripsnug(register struct parse *p, register struct re_guts *g); + */ +static void +stripsnug(p, g) +register struct parse *p; +register struct re_guts *g; +{ + g->nstates = p->slen; + g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); + if (g->strip == NULL) { + SETERROR(REG_ESPACE); + g->strip = p->strip; + } +} + +/* + - findmust - fill in must and mlen with longest mandatory literal string + == static void findmust(register struct parse *p, register struct re_guts *g); + * + * This algorithm could do fancy things like analyzing the operands of | + * for common subsequences. Someday. This code is simple and finds most + * of the interesting cases. + * + * Note that must and mlen got initialized during setup. + */ +static void +findmust(p, g) +struct parse *p; +register struct re_guts *g; +{ + register sop *scan; + sop *start; + register sop *newstart; + register sopno newlen; + register sop s; + register char *cp; + register sopno i; + + /* avoid making error situations worse */ + if (p->error != 0) + return; + + /* find the longest OCHAR sequence in strip */ + newlen = 0; + scan = g->strip + 1; + do { + s = *scan++; + switch (OP(s)) { + case OCHAR: /* sequence member */ + if (newlen == 0) /* new sequence */ + newstart = scan - 1; + newlen++; + break; + case OPLUS_: /* things that don't break one */ + case OLPAREN: + case ORPAREN: + break; + case OQUEST_: /* things that must be skipped */ + case OCH_: + scan--; + do { + scan += OPND(s); + s = *scan; + /* assert() interferes w debug printouts */ + if (OP(s) != O_QUEST && OP(s) != O_CH && + OP(s) != OOR2) { + g->iflags |= BAD; + return; + } + } while (OP(s) != O_QUEST && OP(s) != O_CH); + /* fallthrough */ + default: /* things that break a sequence */ + if (newlen > g->mlen) { /* ends one */ + start = newstart; + g->mlen = newlen; + } + newlen = 0; + break; + } + } while (OP(s) != OEND); + + if (g->mlen == 0) /* there isn't one */ + return; + + /* turn it into a character string */ + g->must = malloc((size_t)g->mlen + 1); + if (g->must == NULL) { /* argh; just forget it */ + g->mlen = 0; + return; + } + cp = g->must; + scan = start; + for (i = g->mlen; i > 0; i--) { + while (OP(s = *scan++) != OCHAR) + continue; + assert(cp < g->must + g->mlen); + *cp++ = (char)OPND(s); + } + assert(cp == g->must + g->mlen); + *cp++ = '\0'; /* just on general principles */ +} + +/* + - pluscount - count + nesting + == static sopno pluscount(register struct parse *p, register struct re_guts *g); + */ +static sopno /* nesting depth */ +pluscount(p, g) +struct parse *p; +register struct re_guts *g; +{ + register sop *scan; + register sop s; + register sopno plusnest = 0; + register sopno maxnest = 0; + + if (p->error != 0) + return(0); /* there may not be an OEND */ + + scan = g->strip + 1; + do { + s = *scan++; + switch (OP(s)) { + case OPLUS_: + plusnest++; + break; + case O_PLUS: + if (plusnest > maxnest) + maxnest = plusnest; + plusnest--; + break; + } + } while (OP(s) != OEND); + if (plusnest != 0) + g->iflags |= BAD; + return(maxnest); +} diff --git a/src/regex/regex2.h b/src/regex/regex2.h new file mode 100644 index 0000000000..58fd8d8a43 --- /dev/null +++ b/src/regex/regex2.h @@ -0,0 +1,134 @@ +/* + * First, the stuff that ends up in the outside-world include file + = typedef off_t regoff_t; + = typedef struct { + = int re_magic; + = size_t re_nsub; // number of parenthesized subexpressions + = const char *re_endp; // end pointer for REG_PEND + = struct re_guts *re_g; // none of your business :-) + = } regex_t; + = typedef struct { + = regoff_t rm_so; // start of match + = regoff_t rm_eo; // end of match + = } regmatch_t; + */ +/* + * internals of regex_t + */ +#define MAGIC1 ((('r'^0200)<<8) | 'e') + +/* + * The internal representation is a *strip*, a sequence of + * operators ending with an endmarker. (Some terminology etc. is a + * historical relic of earlier versions which used multiple strips.) + * Certain oddities in the representation are there to permit running + * the machinery backwards; in particular, any deviation from sequential + * flow must be marked at both its source and its destination. Some + * fine points: + * + * - OPLUS_ and O_PLUS are *inside* the loop they create. + * - OQUEST_ and O_QUEST are *outside* the bypass they create. + * - OCH_ and O_CH are *outside* the multi-way branch they create, while + * OOR1 and OOR2 are respectively the end and the beginning of one of + * the branches. Note that there is an implicit OOR2 following OCH_ + * and an implicit OOR1 preceding O_CH. + * + * In state representations, an operator's bit is on to signify a state + * immediately *preceding* "execution" of that operator. + */ +typedef long sop; /* strip operator */ +typedef long sopno; +#define OPRMASK 0x7c000000 +#define OPDMASK 0x03ffffff +#define OPSHIFT (26) +#define OP(n) ((n)&OPRMASK) +#define OPND(n) ((n)&OPDMASK) +#define SOP(op, opnd) ((op)|(opnd)) +/* operators meaning operand */ +/* (back, fwd are offsets) */ +#define OEND (1< uch [csetsize] */ + uch mask; /* bit within array */ + uch hash; /* hash code */ + size_t smultis; + char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ +} cset; +/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ +#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) +#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) +#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) +#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ +#define MCsub(p, cs, cp) mcsub(p, cs, cp) +#define MCin(p, cs, cp) mcin(p, cs, cp) + +/* stuff for character categories */ +typedef unsigned char cat_t; + +/* + * main compiled-expression structure + */ +struct re_guts { + int magic; +# define MAGIC2 ((('R'^0200)<<8)|'E') + sop *strip; /* malloced area for strip */ + int csetsize; /* number of bits in a cset vector */ + int ncsets; /* number of csets in use */ + cset *sets; /* -> cset [ncsets] */ + uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */ + int cflags; /* copy of regcomp() cflags argument */ + sopno nstates; /* = number of sops */ + sopno firststate; /* the initial OEND (normally 0) */ + sopno laststate; /* the final OEND */ + int iflags; /* internal flags */ +# define USEBOL 01 /* used ^ */ +# define USEEOL 02 /* used $ */ +# define BAD 04 /* something wrong */ + int nbol; /* number of ^ used */ + int neol; /* number of $ used */ + int ncategories; /* how many character categories */ + cat_t *categories; /* ->catspace[-CHAR_MIN] */ + char *must; /* match must contain this string */ + int mlen; /* length of must */ + size_t nsub; /* copy of re_nsub */ + int backrefs; /* does it use back references? */ + sopno nplus; /* how deep does it nest +s? */ + /* catspace must be last */ + cat_t catspace[1]; /* actually [NC] */ +}; + +/* misc utilities */ +#define OUT (CHAR_MAX+1) /* a non-character value */ +#define ISWORD(c) (isalnum(c) || (c) == '_') diff --git a/src/regex/split.c b/src/regex/split.c new file mode 100644 index 0000000000..188bdb775b --- /dev/null +++ b/src/regex/split.c @@ -0,0 +1,316 @@ +#include +#include + +/* + - split - divide a string into fields, like awk split() + = int split(char *string, char *fields[], int nfields, char *sep); + */ +int /* number of fields, including overflow */ +split(string, fields, nfields, sep) +char *string; +char *fields[]; /* list is not NULL-terminated */ +int nfields; /* number of entries available in fields[] */ +char *sep; /* "" white, "c" single char, "ab" [ab]+ */ +{ + register char *p = string; + register char c; /* latest character */ + register char sepc = sep[0]; + register char sepc2; + register int fn; + register char **fp = fields; + register char *sepp; + register int trimtrail; + + /* white space */ + if (sepc == '\0') { + while ((c = *p++) == ' ' || c == '\t') + continue; + p--; + trimtrail = 1; + sep = " \t"; /* note, code below knows this is 2 long */ + sepc = ' '; + } else + trimtrail = 0; + sepc2 = sep[1]; /* now we can safely pick this up */ + + /* catch empties */ + if (*p == '\0') + return(0); + + /* single separator */ + if (sepc2 == '\0') { + fn = nfields; + for (;;) { + *fp++ = p; + fn--; + if (fn == 0) + break; + while ((c = *p++) != sepc) + if (c == '\0') + return(nfields - fn); + *(p-1) = '\0'; + } + /* we have overflowed the fields vector -- just count them */ + fn = nfields; + for (;;) { + while ((c = *p++) != sepc) + if (c == '\0') + return(fn); + fn++; + } + /* not reached */ + } + + /* two separators */ + if (sep[2] == '\0') { + fn = nfields; + for (;;) { + *fp++ = p; + fn--; + while ((c = *p++) != sepc && c != sepc2) + if (c == '\0') { + if (trimtrail && **(fp-1) == '\0') + fn++; + return(nfields - fn); + } + if (fn == 0) + break; + *(p-1) = '\0'; + while ((c = *p++) == sepc || c == sepc2) + continue; + p--; + } + /* we have overflowed the fields vector -- just count them */ + fn = nfields; + while (c != '\0') { + while ((c = *p++) == sepc || c == sepc2) + continue; + p--; + fn++; + while ((c = *p++) != '\0' && c != sepc && c != sepc2) + continue; + } + /* might have to trim trailing white space */ + if (trimtrail) { + p--; + while ((c = *--p) == sepc || c == sepc2) + continue; + p++; + if (*p != '\0') { + if (fn == nfields+1) + *p = '\0'; + fn--; + } + } + return(fn); + } + + /* n separators */ + fn = 0; + for (;;) { + if (fn < nfields) + *fp++ = p; + fn++; + for (;;) { + c = *p++; + if (c == '\0') + return(fn); + sepp = sep; + while ((sepc = *sepp++) != '\0' && sepc != c) + continue; + if (sepc != '\0') /* it was a separator */ + break; + } + if (fn < nfields) + *(p-1) = '\0'; + for (;;) { + c = *p++; + sepp = sep; + while ((sepc = *sepp++) != '\0' && sepc != c) + continue; + if (sepc == '\0') /* it wasn't a separator */ + break; + } + p--; + } + + /* not reached */ +} + +#ifdef TEST_SPLIT + + +/* + * test program + * pgm runs regression + * pgm sep splits stdin lines by sep + * pgm str sep splits str by sep + * pgm str sep n splits str by sep n times + */ +int +main(argc, argv) +int argc; +char *argv[]; +{ + char buf[512]; + register int n; +# define MNF 10 + char *fields[MNF]; + + if (argc > 4) + for (n = atoi(argv[3]); n > 0; n--) { + (void) strcpy(buf, argv[1]); + } + else if (argc > 3) + for (n = atoi(argv[3]); n > 0; n--) { + (void) strcpy(buf, argv[1]); + (void) split(buf, fields, MNF, argv[2]); + } + else if (argc > 2) + dosplit(argv[1], argv[2]); + else if (argc > 1) + while (fgets(buf, sizeof(buf), stdin) != NULL) { + buf[strlen(buf)-1] = '\0'; /* stomp newline */ + dosplit(buf, argv[1]); + } + else + regress(); + + exit(0); +} + +dosplit(string, seps) +char *string; +char *seps; +{ +# define NF 5 + char *fields[NF]; + register int nf; + + nf = split(string, fields, NF, seps); + print(nf, NF, fields); +} + +print(nf, nfp, fields) +int nf; +int nfp; +char *fields[]; +{ + register int fn; + register int bound; + + bound = (nf > nfp) ? nfp : nf; + printf("%d:\t", nf); + for (fn = 0; fn < bound; fn++) + printf("\"%s\"%s", fields[fn], (fn+1 < nf) ? ", " : "\n"); +} + +#define RNF 5 /* some table entries know this */ +struct { + char *str; + char *seps; + int nf; + char *fi[RNF]; +} tests[] = { + "", " ", 0, { "" }, + " ", " ", 2, { "", "" }, + "x", " ", 1, { "x" }, + "xy", " ", 1, { "xy" }, + "x y", " ", 2, { "x", "y" }, + "abc def g ", " ", 5, { "abc", "def", "", "g", "" }, + " a bcd", " ", 4, { "", "", "a", "bcd" }, + "a b c d e f", " ", 6, { "a", "b", "c", "d", "e f" }, + " a b c d ", " ", 6, { "", "a", "b", "c", "d " }, + + "", " _", 0, { "" }, + " ", " _", 2, { "", "" }, + "x", " _", 1, { "x" }, + "x y", " _", 2, { "x", "y" }, + "ab _ cd", " _", 2, { "ab", "cd" }, + " a_b c ", " _", 5, { "", "a", "b", "c", "" }, + "a b c_d e f", " _", 6, { "a", "b", "c", "d", "e f" }, + " a b c d ", " _", 6, { "", "a", "b", "c", "d " }, + + "", " _~", 0, { "" }, + " ", " _~", 2, { "", "" }, + "x", " _~", 1, { "x" }, + "x y", " _~", 2, { "x", "y" }, + "ab _~ cd", " _~", 2, { "ab", "cd" }, + " a_b c~", " _~", 5, { "", "a", "b", "c", "" }, + "a b_c d~e f", " _~", 6, { "a", "b", "c", "d", "e f" }, + "~a b c d ", " _~", 6, { "", "a", "b", "c", "d " }, + + "", " _~-", 0, { "" }, + " ", " _~-", 2, { "", "" }, + "x", " _~-", 1, { "x" }, + "x y", " _~-", 2, { "x", "y" }, + "ab _~- cd", " _~-", 2, { "ab", "cd" }, + " a_b c~", " _~-", 5, { "", "a", "b", "c", "" }, + "a b_c-d~e f", " _~-", 6, { "a", "b", "c", "d", "e f" }, + "~a-b c d ", " _~-", 6, { "", "a", "b", "c", "d " }, + + "", " ", 0, { "" }, + " ", " ", 2, { "", "" }, + "x", " ", 1, { "x" }, + "xy", " ", 1, { "xy" }, + "x y", " ", 2, { "x", "y" }, + "abc def g ", " ", 4, { "abc", "def", "g", "" }, + " a bcd", " ", 3, { "", "a", "bcd" }, + "a b c d e f", " ", 6, { "a", "b", "c", "d", "e f" }, + " a b c d ", " ", 6, { "", "a", "b", "c", "d " }, + + "", "", 0, { "" }, + " ", "", 0, { "" }, + "x", "", 1, { "x" }, + "xy", "", 1, { "xy" }, + "x y", "", 2, { "x", "y" }, + "abc def g ", "", 3, { "abc", "def", "g" }, + "\t a bcd", "", 2, { "a", "bcd" }, + " a \tb\t c ", "", 3, { "a", "b", "c" }, + "a b c d e ", "", 5, { "a", "b", "c", "d", "e" }, + "a b\tc d e f", "", 6, { "a", "b", "c", "d", "e f" }, + " a b c d e f ", "", 6, { "a", "b", "c", "d", "e f " }, + + NULL, NULL, 0, { NULL }, +}; + +regress() +{ + char buf[512]; + register int n; + char *fields[RNF+1]; + register int nf; + register int i; + register int printit; + register char *f; + + for (n = 0; tests[n].str != NULL; n++) { + (void) strcpy(buf, tests[n].str); + fields[RNF] = NULL; + nf = split(buf, fields, RNF, tests[n].seps); + printit = 0; + if (nf != tests[n].nf) { + printf("split `%s' by `%s' gave %d fields, not %d\n", + tests[n].str, tests[n].seps, nf, tests[n].nf); + printit = 1; + } else if (fields[RNF] != NULL) { + printf("split() went beyond array end\n"); + printit = 1; + } else { + for (i = 0; i < nf && i < RNF; i++) { + f = fields[i]; + if (f == NULL) + f = "(NULL)"; + if (strcmp(f, tests[n].fi[i]) != 0) { + printf("split `%s' by `%s', field %d is `%s', not `%s'\n", + tests[n].str, tests[n].seps, + i, fields[i], tests[n].fi[i]); + printit = 1; + } + } + } + if (printit) + print(nf, RNF, fields); + } +} +#endif diff --git a/src/regex/utils.h b/src/regex/utils.h new file mode 100644 index 0000000000..1a997ac8fc --- /dev/null +++ b/src/regex/utils.h @@ -0,0 +1,22 @@ +/* utility definitions */ +#ifdef _POSIX2_RE_DUP_MAX +#define DUPMAX _POSIX2_RE_DUP_MAX +#else +#define DUPMAX 255 +#endif +#define INFINITY (DUPMAX + 1) +#define NC (CHAR_MAX - CHAR_MIN + 1) +typedef unsigned char uch; + +/* switch off assertions (if not already off) if no REDEBUG */ +#ifndef REDEBUG +#ifndef NDEBUG +#define NDEBUG /* no assertions please */ +#endif +#endif +#include + +/* for old systems with bcopy() but no memmove() */ +#ifdef USEBCOPY +#define memmove(d, s, c) bcopy(s, d, c) +#endif -- 2.45.2