Tcl regex lib

author Ryan Norton <wxprojects@comcast.net>

Wed, 13 Oct 1999 02:22:17 +0000 (02:22 +0000)

committer Ryan Norton <wxprojects@comcast.net>

Wed, 13 Oct 1999 02:22:17 +0000 (02:22 +0000)
author Ryan Norton <wxprojects@comcast.net>
Wed, 13 Oct 1999 02:22:17 +0000 (02:22 +0000)
committer Ryan Norton <wxprojects@comcast.net>
Wed, 13 Oct 1999 02:22:17 +0000 (02:22 +0000)
diff --git a/src/regex/regc_lex.c b/src/regex/regc_lex.c

new file mode 100644 (file)

index 0000000..a24290d
--- /dev/null
+++ b/src/regex/regc_lex.c
@@ -0,0 +1,1146 @@
+/*
+ * lexical analyzer
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.     All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header$
+ *
+ */
+
+/* scanning macros (know about v) */
+#define ATEOS()                (v->now >= v->stop)
+#define HAVE(n)                (v->stop - v->now >= (n))
+#define NEXT1(c)       (!ATEOS() && *v->now == CHR(c))
+#define NEXT2(a,b)     (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
+#define NEXT3(a,b,c)   (HAVE(3) && *v->now == CHR(a) && \
+                                               *(v->now+1) == CHR(b) && \
+                                               *(v->now+2) == CHR(c))
+#define SET(c)         (v->nexttype = (c))
+#define SETV(c, n)     (v->nexttype = (c), v->nextvalue = (n))
+#define RET(c)         return (SET(c), 1)
+#define RETV(c, n)     return (SETV(c, n), 1)
+#define FAILW(e)       return (ERR(e), 0)      /* ERR does SET(EOS) */
+#define LASTTYPE(t) (v->lasttype == (t))
+
+/* lexical contexts */
+#define L_ERE  1                               /* mainline ERE/ARE */
+#define L_BRE  2                               /* mainline BRE */
+#define L_Q 3                                  /* REG_QUOTE */
+#define L_EBND 4                               /* ERE/ARE bound */
+#define L_BBND 5                               /* BRE bound */
+#define L_BRACK 6                              /* brackets */
+#define L_CEL  7                               /* collating element */
+#define L_ECL  8                               /* equivalence class */
+#define L_CCL  9                               /* character class */
+#define INTOCON(c)     (v->lexcon = (c))
+#define INCON(con)     (v->lexcon == (con))
+
+/* construct pointer past end of chr array */
+#define ENDOF(array)   ((array) + sizeof(array)/sizeof(chr))
+
+/*
+ * lexstart - set up lexical stuff, scan leading options
+ */
+static void
+lexstart(struct vars * v)
+{
+       prefixes(v);                            /* may turn on new type bits etc. */
+       NOERR();
+
+       if (v->cflags & REG_QUOTE)
+       {
+               assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
+               INTOCON(L_Q);
+       }
+       else if (v->cflags & REG_EXTENDED)
+       {
+               assert(!(v->cflags & REG_QUOTE));
+               INTOCON(L_ERE);
+       }
+       else
+       {
+               assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
+               INTOCON(L_BRE);
+       }
+
+       v->nexttype = EMPTY;            /* remember we were at the start */
+       next(v);                                        /* set up the first token */
+}
+
+/*
+ * prefixes - implement various special prefixes
+ */
+static void
+prefixes(struct vars * v)
+{
+       /* literal string doesn't get any of this stuff */
+       if (v->cflags & REG_QUOTE)
+               return;
+
+       /* initial "***" gets special things */
+       if (HAVE(4) && NEXT3('*', '*', '*'))
+               switch (*(v->now + 3))
+               {
+                       case CHR('?'):          /* "***?" error, msg shows version */
+                               ERR(REG_BADPAT);
+                               return;                 /* proceed no further */
+                               break;
+                       case CHR('='):          /* "***=" shifts to literal string */
+                               NOTE(REG_UNONPOSIX);
+                               v->cflags |= REG_QUOTE;
+                               v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
+                               v->now += 4;
+                               return;                 /* and there can be no more prefixes */
+                               break;
+                       case CHR(':'):          /* "***:" shifts to AREs */
+                               NOTE(REG_UNONPOSIX);
+                               v->cflags |= REG_ADVANCED;
+                               v->now += 4;
+                               break;
+                       default:                        /* otherwise *** is just an error */
+                               ERR(REG_BADRPT);
+                               return;
+                               break;
+               }
+
+       /* BREs and EREs don't get embedded options */
+       if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
+               return;
+
+       /* embedded options (AREs only) */
+       if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
+       {
+               NOTE(REG_UNONPOSIX);
+               v->now += 2;
+               for (; !ATEOS() && iscalpha(*v->now); v->now++)
+                       switch (*v->now)
+                       {
+                               case CHR('b'):  /* BREs (but why???) */
+                                       v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+                                       break;
+                               case CHR('c'):  /* case sensitive */
+                                       v->cflags &= ~REG_ICASE;
+                                       break;
+                               case CHR('e'):  /* plain EREs */
+                                       v->cflags |= REG_EXTENDED;
+                                       v->cflags &= ~(REG_ADVF | REG_QUOTE);
+                                       break;
+                               case CHR('i'):  /* case insensitive */
+                                       v->cflags |= REG_ICASE;
+                                       break;
+                               case CHR('m'):  /* Perloid synonym for n */
+                               case CHR('n'):  /* \n affects ^ $ . [^ */
+                                       v->cflags |= REG_NEWLINE;
+                                       break;
+                               case CHR('p'):  /* ~Perl, \n affects . [^ */
+                                       v->cflags |= REG_NLSTOP;
+                                       v->cflags &= ~REG_NLANCH;
+                                       break;
+                               case CHR('q'):  /* literal string */
+                                       v->cflags |= REG_QUOTE;
+                                       v->cflags &= ~REG_ADVANCED;
+                                       break;
+                               case CHR('s'):  /* single line, \n ordinary */
+                                       v->cflags &= ~REG_NEWLINE;
+                                       break;
+                               case CHR('t'):  /* tight syntax */
+                                       v->cflags &= ~REG_EXPANDED;
+                                       break;
+                               case CHR('w'):  /* weird, \n affects ^ $ only */
+                                       v->cflags &= ~REG_NLSTOP;
+                                       v->cflags |= REG_NLANCH;
+                                       break;
+                               case CHR('x'):  /* expanded syntax */
+                                       v->cflags |= REG_EXPANDED;
+                                       break;
+                               default:
+                                       ERR(REG_BADOPT);
+                                       return;
+                       }
+               if (!NEXT1(')'))
+               {
+                       ERR(REG_BADOPT);
+                       return;
+               }
+               v->now++;
+               if (v->cflags & REG_QUOTE)
+                       v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
+       }
+}
+
+/*
+ * lexnest - "call a subroutine", interpolating string at the lexical level
+ *
+ * Note, this is not a very general facility.  There are a number of
+ * implicit assumptions about what sorts of strings can be subroutines.
+ */
+static void
+lexnest(struct vars * v,
+               chr *beginp,                    /* start of interpolation */
+               chr *endp)                              /* one past end of interpolation */
+{
+       assert(v->savenow == NULL); /* only one level of nesting */
+       v->savenow = v->now;
+       v->savestop = v->stop;
+       v->now = beginp;
+       v->stop = endp;
+}
+
+/*
+ * string constants to interpolate as expansions of things like \d
+ */
+static chr     backd[] = {                     /* \d */
+       CHR('['), CHR('['), CHR(':'),
+       CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+       CHR(':'), CHR(']'), CHR(']')
+};
+static chr     backD[] = {                     /* \D */
+       CHR('['), CHR('^'), CHR('['), CHR(':'),
+       CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+       CHR(':'), CHR(']'), CHR(']')
+};
+static chr     brbackd[] = {           /* \d within brackets */
+       CHR('['), CHR(':'),
+       CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+       CHR(':'), CHR(']')
+};
+static chr     backs[] = {                     /* \s */
+       CHR('['), CHR('['), CHR(':'),
+       CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+       CHR(':'), CHR(']'), CHR(']')
+};
+static chr     backS[] = {                     /* \S */
+       CHR('['), CHR('^'), CHR('['), CHR(':'),
+       CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+       CHR(':'), CHR(']'), CHR(']')
+};
+static chr     brbacks[] = {           /* \s within brackets */
+       CHR('['), CHR(':'),
+       CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+       CHR(':'), CHR(']')
+};
+static chr     backw[] = {                     /* \w */
+       CHR('['), CHR('['), CHR(':'),
+       CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+       CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static chr     backW[] = {                     /* \W */
+       CHR('['), CHR('^'), CHR('['), CHR(':'),
+       CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+       CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static chr     brbackw[] = {           /* \w within brackets */
+       CHR('['), CHR(':'),
+       CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+       CHR(':'), CHR(']'), CHR('_')
+};
+
+/*
+ * lexword - interpolate a bracket expression for word characters
+ * Possibly ought to inquire whether there is a "word" character class.
+ */
+static void
+lexword(struct vars * v)
+{
+       lexnest(v, backw, ENDOF(backw));
+}
+
+/*
+ * next - get next token
+ */
+static int                                             /* 1 normal, 0 failure */
+next(struct vars * v)
+{
+       chr                     c;
+
+       /* errors yield an infinite sequence of failures */
+       if (ISERR())
+               return 0;                               /* the error has set nexttype to EOS */
+
+       /* remember flavor of last token */
+       v->lasttype = v->nexttype;
+
+       /* REG_BOSONLY */
+       if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
+       {
+               /* at start of a REG_BOSONLY RE */
+               RETV(SBEGIN, 0);                /* same as \A */
+       }
+
+       /* if we're nested and we've hit end, return to outer level */
+       if (v->savenow != NULL && ATEOS())
+       {
+               v->now = v->savenow;
+               v->stop = v->savestop;
+               v->savenow = v->savestop = NULL;
+       }
+
+       /* skip white space etc. if appropriate (not in literal or []) */
+       if (v->cflags & REG_EXPANDED)
+               switch (v->lexcon)
+               {
+                       case L_ERE:
+                       case L_BRE:
+                       case L_EBND:
+                       case L_BBND:
+                               skip(v);
+                               break;
+               }
+
+       /* handle EOS, depending on context */
+       if (ATEOS())
+       {
+               switch (v->lexcon)
+               {
+                       case L_ERE:
+                       case L_BRE:
+                       case L_Q:
+                               RET(EOS);
+                               break;
+                       case L_EBND:
+                       case L_BBND:
+                               FAILW(REG_EBRACE);
+                               break;
+                       case L_BRACK:
+                       case L_CEL:
+                       case L_ECL:
+                       case L_CCL:
+                               FAILW(REG_EBRACK);
+                               break;
+               }
+               assert(NOTREACHED);
+       }
+
+       /* okay, time to actually get a character */
+       c = *v->now++;
+
+       /* deal with the easy contexts, punt EREs to code below */
+       switch (v->lexcon)
+       {
+               case L_BRE:                             /* punt BREs to separate function */
+                       return brenext(v, c);
+                       break;
+               case L_ERE:                             /* see below */
+                       break;
+               case L_Q:                               /* literal strings are easy */
+                       RETV(PLAIN, c);
+                       break;
+               case L_BBND:                    /* bounds are fairly simple */
+               case L_EBND:
+                       switch (c)
+                       {
+                               case CHR('0'):
+                               case CHR('1'):
+                               case CHR('2'):
+                               case CHR('3'):
+                               case CHR('4'):
+                               case CHR('5'):
+                               case CHR('6'):
+                               case CHR('7'):
+                               case CHR('8'):
+                               case CHR('9'):
+                                       RETV(DIGIT, (chr) DIGITVAL(c));
+                                       break;
+                               case CHR(','):
+                                       RET(',');
+                                       break;
+                               case CHR('}'):  /* ERE bound ends with } */
+                                       if (INCON(L_EBND))
+                                       {
+                                               INTOCON(L_ERE);
+                                               if ((v->cflags & REG_ADVF) && NEXT1('?'))
+                                               {
+                                                       v->now++;
+                                                       NOTE(REG_UNONPOSIX);
+                                                       RETV('}', 0);
+                                               }
+                                               RETV('}', 1);
+                                       }
+                                       else
+                                               FAILW(REG_BADBR);
+                                       break;
+                               case CHR('\\'): /* BRE bound ends with \} */
+                                       if (INCON(L_BBND) && NEXT1('}'))
+                                       {
+                                               v->now++;
+                                               INTOCON(L_BRE);
+                                               RET('}');
+                                       }
+                                       else
+                                               FAILW(REG_BADBR);
+                                       break;
+                               default:
+                                       FAILW(REG_BADBR);
+                                       break;
+                       }
+                       assert(NOTREACHED);
+                       break;
+               case L_BRACK:                   /* brackets are not too hard */
+                       switch (c)
+                       {
+                               case CHR(']'):
+                                       if (LASTTYPE('['))
+                                               RETV(PLAIN, c);
+                                       else
+                                       {
+                                               INTOCON((v->cflags & REG_EXTENDED) ?
+                                                               L_ERE : L_BRE);
+                                               RET(']');
+                                       }
+                                       break;
+                               case CHR('\\'):
+                                       NOTE(REG_UBBS);
+                                       if (!(v->cflags & REG_ADVF))
+                                               RETV(PLAIN, c);
+                                       NOTE(REG_UNONPOSIX);
+                                       if (ATEOS())
+                                               FAILW(REG_EESCAPE);
+                                       (DISCARD) lexescape(v);
+                                       switch (v->nexttype)
+                                       {                       /* not all escapes okay here */
+                                               case PLAIN:
+                                                       return 1;
+                                                       break;
+                                               case CCLASS:
+                                                       switch (v->nextvalue)
+                                                       {
+                                                               case 'd':
+                                                                       lexnest(v, brbackd, ENDOF(brbackd));
+                                                                       break;
+                                                               case 's':
+                                                                       lexnest(v, brbacks, ENDOF(brbacks));
+                                                                       break;
+                                                               case 'w':
+                                                                       lexnest(v, brbackw, ENDOF(brbackw));
+                                                                       break;
+                                                               default:
+                                                                       FAILW(REG_EESCAPE);
+                                                                       break;
+                                                       }
+                                                       /* lexnest done, back up and try again */
+                                                       v->nexttype = v->lasttype;
+                                                       return next(v);
+                                                       break;
+                                       }
+                                       /* not one of the acceptable escapes */
+                                       FAILW(REG_EESCAPE);
+                                       break;
+                               case CHR('-'):
+                                       if (LASTTYPE('[') || NEXT1(']'))
+                                               RETV(PLAIN, c);
+                                       else
+                                               RETV(RANGE, c);
+                                       break;
+                               case CHR('['):
+                                       if (ATEOS())
+                                               FAILW(REG_EBRACK);
+                                       switch (*v->now++)
+                                       {
+                                               case CHR('.'):
+                                                       INTOCON(L_CEL);
+                                                       /* might or might not be locale-specific */
+                                                       RET(COLLEL);
+                                                       break;
+                                               case CHR('='):
+                                                       INTOCON(L_ECL);
+                                                       NOTE(REG_ULOCALE);
+                                                       RET(ECLASS);
+                                                       break;
+                                               case CHR(':'):
+                                                       INTOCON(L_CCL);
+                                                       NOTE(REG_ULOCALE);
+                                                       RET(CCLASS);
+                                                       break;
+                                               default:                /* oops */
+                                                       v->now--;
+                                                       RETV(PLAIN, c);
+                                                       break;
+                                       }
+                                       assert(NOTREACHED);
+                                       break;
+                               default:
+                                       RETV(PLAIN, c);
+                                       break;
+                       }
+                       assert(NOTREACHED);
+                       break;
+               case L_CEL:                             /* collating elements are easy */
+                       if (c == CHR('.') && NEXT1(']'))
+                       {
+                               v->now++;
+                               INTOCON(L_BRACK);
+                               RETV(END, '.');
+                       }
+                       else
+                               RETV(PLAIN, c);
+                       break;
+               case L_ECL:                             /* ditto equivalence classes */
+                       if (c == CHR('=') && NEXT1(']'))
+                       {
+                               v->now++;
+                               INTOCON(L_BRACK);
+                               RETV(END, '=');
+                       }
+                       else
+                               RETV(PLAIN, c);
+                       break;
+               case L_CCL:                             /* ditto character classes */
+                       if (c == CHR(':') && NEXT1(']'))
+                       {
+                               v->now++;
+                               INTOCON(L_BRACK);
+                               RETV(END, ':');
+                       }
+                       else
+                               RETV(PLAIN, c);
+                       break;
+               default:
+                       assert(NOTREACHED);
+                       break;
+       }
+
+       /* that got rid of everything except EREs and AREs */
+       assert(INCON(L_ERE));
+
+       /* deal with EREs and AREs, except for backslashes */
+       switch (c)
+       {
+               case CHR('|'):
+                       RET('|');
+                       break;
+               case CHR('*'):
+                       if ((v->cflags & REG_ADVF) && NEXT1('?'))
+                       {
+                               v->now++;
+                               NOTE(REG_UNONPOSIX);
+                               RETV('*', 0);
+                       }
+                       RETV('*', 1);
+                       break;
+               case CHR('+'):
+                       if ((v->cflags & REG_ADVF) && NEXT1('?'))
+                       {
+                               v->now++;
+                               NOTE(REG_UNONPOSIX);
+                               RETV('+', 0);
+                       }
+                       RETV('+', 1);
+                       break;
+               case CHR('?'):
+                       if ((v->cflags & REG_ADVF) && NEXT1('?'))
+                       {
+                               v->now++;
+                               NOTE(REG_UNONPOSIX);
+                               RETV('?', 0);
+                       }
+                       RETV('?', 1);
+                       break;
+               case CHR('{'):                  /* bounds start or plain character */
+                       if (v->cflags & REG_EXPANDED)
+                               skip(v);
+                       if (ATEOS() || !iscdigit(*v->now))
+                       {
+                               NOTE(REG_UBRACES);
+                               NOTE(REG_UUNSPEC);
+                               RETV(PLAIN, c);
+                       }
+                       else
+                       {
+                               NOTE(REG_UBOUNDS);
+                               INTOCON(L_EBND);
+                               RET('{');
+                       }
+                       assert(NOTREACHED);
+                       break;
+               case CHR('('):                  /* parenthesis, or advanced extension */
+                       if ((v->cflags & REG_ADVF) && NEXT1('?'))
+                       {
+                               NOTE(REG_UNONPOSIX);
+                               v->now++;
+                               switch (*v->now++)
+                               {
+                                       case CHR(':'):          /* non-capturing paren */
+                                               RETV('(', 0);
+                                               break;
+                                       case CHR('#'):          /* comment */
+                                               while (!ATEOS() && *v->now != CHR(')'))
+                                                       v->now++;
+                                               if (!ATEOS())
+                                                       v->now++;
+                                               assert(v->nexttype == v->lasttype);
+                                               return next(v);
+                                               break;
+                                       case CHR('='):          /* positive lookahead */
+                                               NOTE(REG_ULOOKAHEAD);
+                                               RETV(LACON, 1);
+                                               break;
+                                       case CHR('!'):          /* negative lookahead */
+                                               NOTE(REG_ULOOKAHEAD);
+                                               RETV(LACON, 0);
+                                               break;
+                                       default:
+                                               FAILW(REG_BADRPT);
+                                               break;
+                               }
+                               assert(NOTREACHED);
+                       }
+                       if (v->cflags & REG_NOSUB)
+                               RETV('(', 0);   /* all parens non-capturing */
+                       else
+                               RETV('(', 1);
+                       break;
+               case CHR(')'):
+                       if (LASTTYPE('('))
+                               NOTE(REG_UUNSPEC);
+                       RETV(')', c);
+                       break;
+               case CHR('['):                  /* easy except for [[:<:]] and [[:>:]] */
+                       if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+                               *(v->now + 1) == CHR(':') &&
+                               (*(v->now + 2) == CHR('<') ||
+                                *(v->now + 2) == CHR('>')) &&
+                               *(v->now + 3) == CHR(':') &&
+                               *(v->now + 4) == CHR(']') &&
+                               *(v->now + 5) == CHR(']'))
+                       {
+                               c = *(v->now + 2);
+                               v->now += 6;
+                               NOTE(REG_UNONPOSIX);
+                               RET((c == CHR('<')) ? '<' : '>');
+                       }
+                       INTOCON(L_BRACK);
+                       if (NEXT1('^'))
+                       {
+                               v->now++;
+                               RETV('[', 0);
+                       }
+                       RETV('[', 1);
+                       break;
+               case CHR('.'):
+                       RET('.');
+                       break;
+               case CHR('^'):
+                       RET('^');
+                       break;
+               case CHR('$'):
+                       RET('$');
+                       break;
+               case CHR('\\'): /* mostly punt backslashes to code below */
+                       if (ATEOS())
+                               FAILW(REG_EESCAPE);
+                       break;
+               default:                                /* ordinary character */
+                       RETV(PLAIN, c);
+                       break;
+       }
+
+       /* ERE/ARE backslash handling; backslash already eaten */
+       assert(!ATEOS());
+       if (!(v->cflags & REG_ADVF))
+       {                                                       /* only AREs have non-trivial escapes */
+               if (iscalnum(*v->now))
+               {
+                       NOTE(REG_UBSALNUM);
+                       NOTE(REG_UUNSPEC);
+               }
+               RETV(PLAIN, *v->now++);
+       }
+       (DISCARD) lexescape(v);
+       if (ISERR())
+               FAILW(REG_EESCAPE);
+       if (v->nexttype == CCLASS)
+       {                                                       /* fudge at lexical level */
+               switch (v->nextvalue)
+               {
+                       case 'd':
+                               lexnest(v, backd, ENDOF(backd));
+                               break;
+                       case 'D':
+                               lexnest(v, backD, ENDOF(backD));
+                               break;
+                       case 's':
+                               lexnest(v, backs, ENDOF(backs));
+                               break;
+                       case 'S':
+                               lexnest(v, backS, ENDOF(backS));
+                               break;
+                       case 'w':
+                               lexnest(v, backw, ENDOF(backw));
+                               break;
+                       case 'W':
+                               lexnest(v, backW, ENDOF(backW));
+                               break;
+                       default:
+                               assert(NOTREACHED);
+                               FAILW(REG_ASSERT);
+                               break;
+               }
+               /* lexnest done, back up and try again */
+               v->nexttype = v->lasttype;
+               return next(v);
+       }
+       /* otherwise, lexescape has already done the work */
+       return !ISERR();
+}
+
+/*
+ * lexescape - parse an ARE backslash escape (backslash already eaten)
+ * Note slightly nonstandard use of the CCLASS type code.
+ */
+static int                                             /* not actually used, but convenient for
+                                                                * RETV */
+lexescape(struct vars * v)
+{
+       chr                     c;
+       static chr      alert[] = {
+               CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
+       };
+       static chr      esc[] = {
+               CHR('E'), CHR('S'), CHR('C')
+       };
+       chr                *save;
+
+       assert(v->cflags & REG_ADVF);
+
+       assert(!ATEOS());
+       c = *v->now++;
+       if (!iscalnum(c))
+               RETV(PLAIN, c);
+
+       NOTE(REG_UNONPOSIX);
+       switch (c)
+       {
+               case CHR('a'):
+                       RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
+                       break;
+               case CHR('A'):
+                       RETV(SBEGIN, 0);
+                       break;
+               case CHR('b'):
+                       RETV(PLAIN, CHR('\b'));
+                       break;
+               case CHR('B'):
+                       RETV(PLAIN, CHR('\\'));
+                       break;
+               case CHR('c'):
+                       NOTE(REG_UUNPORT);
+                       if (ATEOS())
+                               FAILW(REG_EESCAPE);
+                       RETV(PLAIN, (chr) (*v->now++ & 037));
+                       break;
+               case CHR('d'):
+                       NOTE(REG_ULOCALE);
+                       RETV(CCLASS, 'd');
+                       break;
+               case CHR('D'):
+                       NOTE(REG_ULOCALE);
+                       RETV(CCLASS, 'D');
+                       break;
+               case CHR('e'):
+                       NOTE(REG_UUNPORT);
+                       RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
+                       break;
+               case CHR('f'):
+                       RETV(PLAIN, CHR('\f'));
+                       break;
+               case CHR('m'):
+                       RET('<');
+                       break;
+               case CHR('M'):
+                       RET('>');
+                       break;
+               case CHR('n'):
+                       RETV(PLAIN, CHR('\n'));
+                       break;
+               case CHR('r'):
+                       RETV(PLAIN, CHR('\r'));
+                       break;
+               case CHR('s'):
+                       NOTE(REG_ULOCALE);
+                       RETV(CCLASS, 's');
+                       break;
+               case CHR('S'):
+                       NOTE(REG_ULOCALE);
+                       RETV(CCLASS, 'S');
+                       break;
+               case CHR('t'):
+                       RETV(PLAIN, CHR('\t'));
+                       break;
+               case CHR('u'):
+                       c = lexdigits(v, 16, 4, 4);
+                       if (ISERR())
+                               FAILW(REG_EESCAPE);
+                       RETV(PLAIN, c);
+                       break;
+               case CHR('U'):
+                       c = lexdigits(v, 16, 8, 8);
+                       if (ISERR())
+                               FAILW(REG_EESCAPE);
+                       RETV(PLAIN, c);
+                       break;
+               case CHR('v'):
+                       RETV(PLAIN, CHR('\v'));
+                       break;
+               case CHR('w'):
+                       NOTE(REG_ULOCALE);
+                       RETV(CCLASS, 'w');
+                       break;
+               case CHR('W'):
+                       NOTE(REG_ULOCALE);
+                       RETV(CCLASS, 'W');
+                       break;
+               case CHR('x'):
+                       NOTE(REG_UUNPORT);
+                       c = lexdigits(v, 16, 1, 255);           /* REs >255 long outside
+                                                                                                * spec */
+                       if (ISERR())
+                               FAILW(REG_EESCAPE);
+                       RETV(PLAIN, c);
+                       break;
+               case CHR('y'):
+                       NOTE(REG_ULOCALE);
+                       RETV(WBDRY, 0);
+                       break;
+               case CHR('Y'):
+                       NOTE(REG_ULOCALE);
+                       RETV(NWBDRY, 0);
+                       break;
+               case CHR('Z'):
+                       RETV(SEND, 0);
+                       break;
+               case CHR('1'):
+               case CHR('2'):
+               case CHR('3'):
+               case CHR('4'):
+               case CHR('5'):
+               case CHR('6'):
+               case CHR('7'):
+               case CHR('8'):
+               case CHR('9'):
+                       save = v->now;
+                       v->now--;                       /* put first digit back */
+                       c = lexdigits(v, 10, 1, 255);           /* REs >255 long outside
+                                                                                                * spec */
+                       if (ISERR())
+                               FAILW(REG_EESCAPE);
+                       /* ugly heuristic (first test is "exactly 1 digit?") */
+                       if (v->now - save == 0 || (int) c <= v->nsubexp)
+                       {
+                               NOTE(REG_UBACKREF);
+                               RETV(BACKREF, (chr) c);
+                       }
+                       /* oops, doesn't look like it's a backref after all... */
+                       v->now = save;
+                       /* and fall through into octal number */
+               case CHR('0'):
+                       NOTE(REG_UUNPORT);
+                       v->now--;                       /* put first digit back */
+                       c = lexdigits(v, 8, 1, 3);
+                       if (ISERR())
+                               FAILW(REG_EESCAPE);
+                       RETV(PLAIN, c);
+                       break;
+               default:
+                       assert(iscalpha(c));
+                       FAILW(REG_EESCAPE); /* unknown alphabetic escape */
+                       break;
+       }
+       assert(NOTREACHED);
+}
+
+/*
+ * lexdigits - slurp up digits and return chr value
+ */
+static chr                                             /* chr value; errors signalled via ERR */
+lexdigits(struct vars * v,
+                 int base,
+                 int minlen,
+                 int maxlen)
+{
+       uchr            n;                              /* unsigned to avoid overflow misbehavior */
+       int                     len;
+       chr                     c;
+       int                     d;
+       const uchr      ub = (uchr) base;
+
+       n = 0;
+       for (len = 0; len < maxlen && !ATEOS(); len++)
+       {
+               c = *v->now++;
+               switch (c)
+               {
+                       case CHR('0'):
+                       case CHR('1'):
+                       case CHR('2'):
+                       case CHR('3'):
+                       case CHR('4'):
+                       case CHR('5'):
+                       case CHR('6'):
+                       case CHR('7'):
+                       case CHR('8'):
+                       case CHR('9'):
+                               d = DIGITVAL(c);
+                               break;
+                       case CHR('a'):
+                       case CHR('A'):
+                               d = 10;
+                               break;
+                       case CHR('b'):
+                       case CHR('B'):
+                               d = 11;
+                               break;
+                       case CHR('c'):
+                       case CHR('C'):
+                               d = 12;
+                               break;
+                       case CHR('d'):
+                       case CHR('D'):
+                               d = 13;
+                               break;
+                       case CHR('e'):
+                       case CHR('E'):
+                               d = 14;
+                               break;
+                       case CHR('f'):
+                       case CHR('F'):
+                               d = 15;
+                               break;
+                       default:
+                               v->now--;               /* oops, not a digit at all */
+                               d = -1;
+                               break;
+               }
+
+               if (d >= base)
+               {                                               /* not a plausible digit */
+                       v->now--;
+                       d = -1;
+               }
+               if (d < 0)
+                       break;                          /* NOTE BREAK OUT */
+               n = n * ub + (uchr) d;
+       }
+       if (len < minlen)
+               ERR(REG_EESCAPE);
+
+       return (chr) n;
+}
+
+/*
+ * brenext - get next BRE token
+ *
+ * This is much like EREs except for all the stupid backslashes and the
+ * context-dependency of some things.
+ */
+static int                                             /* 1 normal, 0 failure */
+brenext(struct vars * v,
+               chr pc)
+{
+       chr                     c = (chr) pc;
+
+       switch (c)
+       {
+               case CHR('*'):
+                       if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
+                               RETV(PLAIN, c);
+                       RET('*');
+                       break;
+               case CHR('['):
+                       if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+                               *(v->now + 1) == CHR(':') &&
+                               (*(v->now + 2) == CHR('<') ||
+                                *(v->now + 2) == CHR('>')) &&
+                               *(v->now + 3) == CHR(':') &&
+                               *(v->now + 4) == CHR(']') &&
+                               *(v->now + 5) == CHR(']'))
+                       {
+                               c = *(v->now + 2);
+                               v->now += 6;
+                               NOTE(REG_UNONPOSIX);
+                               RET((c == CHR('<')) ? '<' : '>');
+                       }
+                       INTOCON(L_BRACK);
+                       if (NEXT1('^'))
+                       {
+                               v->now++;
+                               RETV('[', 0);
+                       }
+                       RETV('[', 1);
+                       break;
+               case CHR('.'):
+                       RET('.');
+                       break;
+               case CHR('^'):
+                       if (LASTTYPE(EMPTY))
+                               RET('^');
+                       if (LASTTYPE('('))
+                       {
+                               NOTE(REG_UUNSPEC);
+                               RET('^');
+                       }
+                       RETV(PLAIN, c);
+                       break;
+               case CHR('$'):
+                       if (v->cflags & REG_EXPANDED)
+                               skip(v);
+                       if (ATEOS())
+                               RET('$');
+                       if (NEXT2('\\', ')'))
+                       {
+                               NOTE(REG_UUNSPEC);
+                               RET('$');
+                       }
+                       RETV(PLAIN, c);
+                       break;
+               case CHR('\\'):
+                       break;                          /* see below */
+               default:
+                       RETV(PLAIN, c);
+                       break;
+       }
+
+       assert(c == CHR('\\'));
+
+       if (ATEOS())
+               FAILW(REG_EESCAPE);
+
+       c = *v->now++;
+       switch (c)
+       {
+               case CHR('{'):
+                       INTOCON(L_BBND);
+                       NOTE(REG_UBOUNDS);
+                       RET('{');
+                       break;
+               case CHR('('):
+                       RETV('(', 1);
+                       break;
+               case CHR(')'):
+                       RETV(')', c);
+                       break;
+               case CHR('<'):
+                       NOTE(REG_UNONPOSIX);
+                       RET('<');
+                       break;
+               case CHR('>'):
+                       NOTE(REG_UNONPOSIX);
+                       RET('>');
+                       break;
+               case CHR('1'):
+               case CHR('2'):
+               case CHR('3'):
+               case CHR('4'):
+               case CHR('5'):
+               case CHR('6'):
+               case CHR('7'):
+               case CHR('8'):
+               case CHR('9'):
+                       NOTE(REG_UBACKREF);
+                       RETV(BACKREF, (chr) DIGITVAL(c));
+                       break;
+               default:
+                       if (iscalnum(c))
+                       {
+                               NOTE(REG_UBSALNUM);
+                               NOTE(REG_UUNSPEC);
+                       }
+                       RETV(PLAIN, c);
+                       break;
+       }
+
+       assert(NOTREACHED);
+}
+
+/*
+ * skip - skip white space and comments in expanded form
+ */
+static void
+skip(struct vars * v)
+{
+       chr                *start = v->now;
+
+       assert(v->cflags & REG_EXPANDED);
+
+       for (;;)
+       {
+               while (!ATEOS() && iscspace(*v->now))
+                       v->now++;
+               if (ATEOS() || *v->now != CHR('#'))
+                       break;                          /* NOTE BREAK OUT */
+               assert(NEXT1('#'));
+               while (!ATEOS() && *v->now != CHR('\n'))
+                       v->now++;
+               /* leave the newline to be picked up by the iscspace loop */
+       }
+
+       if (v->now != start)
+               NOTE(REG_UNONPOSIX);
+}
+
+/*
+ * newline - return the chr for a newline
+ *
+ * This helps confine use of CHR to this source file.
+ */
+static chr
+newline(void)
+{
+       return CHR('\n');
+}
+
+/*
+ * chrnamed - return the chr known by a given (chr string) name
+ *
+ * The code is a bit clumsy, but this routine gets only such specialized
+ * use that it hardly matters.
+ */
+static chr
+chrnamed(struct vars * v,
+                chr *startp,                   /* start of name */
+                chr *endp,                             /* just past end of name */
+                chr lastresort)                /* what to return if name lookup fails */
+{
+       celt            c;
+       int                     errsave;
+       int                     e;
+       struct cvec *cv;
+
+       errsave = v->err;
+       v->err = 0;
+       c = element(v, startp, endp);
+       e = v->err;
+       v->err = errsave;
+
+       if (e != 0)
+               return (chr) lastresort;
+
+       cv = range(v, c, c, 0);
+       if (cv->nchrs == 0)
+               return (chr) lastresort;
+       return cv->chrs[0];
+}
diff --git a/src/regex/regerrs.h b/src/regex/regerrs.h

new file mode 100644 (file)

index 0000000..f99dbf4
--- /dev/null
+++ b/src/regex/regerrs.h
@@ -0,0 +1,75 @@
+/*
+ * $Id$
+ */
+
+{
+       REG_OKAY, "REG_OKAY", "no errors detected"
+},
+
+{
+       REG_NOMATCH, "REG_NOMATCH", "failed to match"
+},
+
+{
+       REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.8)"
+},
+
+{
+       REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"
+},
+
+{
+       REG_ECTYPE, "REG_ECTYPE", "invalid character class"
+},
+
+{
+       REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence"
+},
+
+{
+       REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"
+},
+
+{
+       REG_EBRACK, "REG_EBRACK", "brackets [] not balanced"
+},
+
+{
+       REG_EPAREN, "REG_EPAREN", "parentheses () not balanced"
+},
+
+{
+       REG_EBRACE, "REG_EBRACE", "braces {} not balanced"
+},
+
+{
+       REG_BADBR, "REG_BADBR", "invalid repetition count(s)"
+},
+
+{
+       REG_ERANGE, "REG_ERANGE", "invalid character range"
+},
+
+{
+       REG_ESPACE, "REG_ESPACE", "out of memory"
+},
+
+{
+       REG_BADRPT, "REG_BADRPT", "quantifier operand invalid"
+},
+
+{
+       REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"
+},
+
+{
+       REG_INVARG, "REG_INVARG", "invalid argument to regex function"
+},
+
+{
+       REG_MIXED, "REG_MIXED", "character widths of regex and string differ"
+},
+
+{
+       REG_BADOPT, "REG_BADOPT", "invalid embedded option"
+},
diff --git a/src/regex/regguts.h b/src/regex/regguts.h

new file mode 100644 (file)

index 0000000..aa12dbf
--- /dev/null
+++ b/src/regex/regguts.h
@@ -0,0 +1,417 @@
+/*
+ * Internal interface definitions, etc., for the reg package
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.     All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+
+
+/*
+ * Environmental customization.  It should not (I hope) be necessary to
+ * alter the file you are now reading -- regcustom.h should handle it all,
+ * given care here and elsewhere.
+ */
+#include "regcustom.h"
+
+
+
+/*
+ * Things that regcustom.h might override.
+ */
+
+/* assertions */
+#ifndef assert
+#ifndef REG_DEBUG
+#      ifndef NDEBUG
+#              define  NDEBUG                                  /* no assertions */
+#      endif
+#endif
+#include <assert.h>
+#endif
+
+/* voids */
+#ifndef DISCARD
+#define DISCARD void                   /* for throwing values away */
+#endif
+#ifndef VS
+#define VS(x)  ((void *)(x))   /* cast something to generic ptr */
+#endif
+
+/* function-pointer declarator */
+#ifndef FUNCPTR
+#define FUNCPTR(name, args) (*name) args
+#endif
+
+/* memory allocation */
+#ifndef MALLOC
+#define MALLOC(n)      malloc(n)
+#endif
+#ifndef REALLOC
+#define REALLOC(p, n)  realloc(VS(p), n)
+#endif
+#ifndef FREE
+#define FREE(p)                free(VS(p))
+#endif
+
+/* want size of a char in bits, and max value in bounded quantifiers */
+#ifndef CHAR_BIT
+#include <limits.h>
+#endif
+#ifndef _POSIX2_RE_DUP_MAX
+#define _POSIX2_RE_DUP_MAX     255 /* normally from <limits.h> */
+#endif
+
+
+
+/*
+ * misc
+ */
+
+#define NOTREACHED     0
+#define xxx            1
+
+#define DUPMAX _POSIX2_RE_DUP_MAX
+#define INFINITY       (DUPMAX+1)
+
+#define REMAGIC 0xfed7                 /* magic number for main struct */
+
+
+
+/*
+ * debugging facilities
+ */
+#ifdef REG_DEBUG
+/* FDEBUG does finite-state tracing */
+#define FDEBUG(arglist) { if (v->eflags&REG_FTRACE) printf arglist; }
+/* MDEBUG does higher-level tracing */
+#define MDEBUG(arglist) { if (v->eflags&REG_MTRACE) printf arglist; }
+#else
+#define FDEBUG(arglist) {}
+#define MDEBUG(arglist) {}
+#endif
+
+
+
+/*
+ * bitmap manipulation
+ */
+#define UBITS  (CHAR_BIT * sizeof(unsigned))
+#define BSET(uv, sn)   ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS))
+#define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
+
+
+
+/*
+ * We dissect a chr into byts for colormap table indexing.     Here we define
+ * a byt, which will be the same as a byte on most machines... The exact
+ * size of a byt is not critical, but about 8 bits is good, and extraction
+ * of 8-bit chunks is sometimes especially fast.
+ */
+#ifndef BYTBITS
+#define BYTBITS 8                              /* bits in a byt */
+#endif
+#define BYTTAB (1<<BYTBITS)    /* size of table with one entry per byt
+                                                                * value */
+#define BYTMASK (BYTTAB-1)             /* bit mask for byt */
+#define NBYTS  ((CHRBITS+BYTBITS-1)/BYTBITS)
+/* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */
+
+
+
+/*
+ * As soon as possible, we map chrs into equivalence classes -- "colors" --
+ * which are of much more manageable number.
+ */
+typedef short color;                   /* colors of characters */
+typedef int pcolor;                            /* what color promotes to */
+
+#define COLORLESS      (-1)            /* impossible color */
+#define WHITE          0                       /* default color, parent of all others */
+
+
+
+/*
+ * A colormap is a tree -- more precisely, a DAG -- indexed at each level
+ * by a byt of the chr, to map the chr to a color efficiently. Because
+ * lower sections of the tree can be shared, it can exploit the usual
+ * sparseness of such a mapping table. The tree is always NBYTS levels
+ * deep (in the past it was shallower during construction but was "filled"
+ * to full depth at the end of that); areas that are unaltered as yet point
+ * to "fill blocks" which are entirely WHITE in color.
+ */
+
+/* the tree itself */
+struct colors
+{
+       color           ccolor[BYTTAB];
+};
+struct ptrs
+{
+       union tree *pptr[BYTTAB];
+};
+union tree
+{
+       struct colors colors;
+       struct ptrs ptrs;
+};
+
+#define tcolor colors.ccolor
+#define tptr   ptrs.pptr
+
+/* internal per-color structure for the color machinery */
+struct colordesc
+{
+       uchr            nchrs;                  /* number of chars of this color */
+       color           sub;                    /* open subcolor (if any); free chain ptr */
+#define  NOSUB  COLORLESS
+       struct arc *arcs;                       /* color chain */
+       int                     flags;
+#define  FREECOL 01                            /* currently free */
+#define  PSEUDO  02                            /* pseudocolor, no real chars */
+#define  UNUSEDCOLOR(cd) ((cd)->flags&FREECOL)
+       union tree *block;                      /* block of solid color, if any */
+};
+
+/* the color map itself */
+struct colormap
+{
+       int                     magic;
+#define  CMMAGIC 0x876
+       struct vars *v;                         /* for compile error reporting */
+       size_t          ncds;                   /* number of colordescs */
+       size_t          max;                    /* highest in use */
+       color           free;                   /* beginning of free chain (if non-0) */
+       struct colordesc *cd;
+#define  CDEND(cm)      (&(cm)->cd[(cm)->max + 1])
+#define  NINLINECDS  ((size_t)10)
+       struct colordesc cdspace[NINLINECDS];
+       union tree      tree[NBYTS];    /* tree top, plus fill blocks */
+};
+
+/* optimization magic to do fast chr->color mapping */
+#define B0(c)  ((c) & BYTMASK)
+#define B1(c)  (((c)>>BYTBITS) & BYTMASK)
+#define B2(c)  (((c)>>(2*BYTBITS)) & BYTMASK)
+#define B3(c)  (((c)>>(3*BYTBITS)) & BYTMASK)
+#if NBYTS == 1
+#define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)])
+#endif
+/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */
+#if NBYTS == 2
+#define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)])
+#endif
+#if NBYTS == 4
+#define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)])
+#endif
+
+
+
+/*
+ * Interface definitions for locale-interface functions in locale.c.
+ * Multi-character collating elements (MCCEs) cause most of the trouble.
+ */
+struct cvec
+{
+       int                     nchrs;                  /* number of chrs */
+       int                     chrspace;               /* number of chrs possible */
+       chr                *chrs;                       /* pointer to vector of chrs */
+       int                     nranges;                /* number of ranges (chr pairs) */
+       int                     rangespace;             /* number of chrs possible */
+       chr                *ranges;                     /* pointer to vector of chr pairs */
+       int                     nmcces;                 /* number of MCCEs */
+       int                     mccespace;              /* number of MCCEs possible */
+       int                     nmccechrs;              /* number of chrs used for MCCEs */
+       chr                *mcces[1];           /* pointers to 0-terminated MCCEs */
+       /* and both batches of chrs are on the end */
+};
+
+/* caution:  this value cannot be changed easily */
+#define MAXMCCE 2                              /* length of longest MCCE */
+
+
+
+/*
+ * definitions for NFA internal representation
+ *
+ * Having a "from" pointer within each arc may seem redundant, but it
+ * saves a lot of hassle.
+ */
+struct state;
+
+struct arc
+{
+       int                     type;
+#define  ARCFREE '\0'
+       color           co;
+       struct state *from;                     /* where it's from (and contained within) */
+       struct state *to;                       /* where it's to */
+       struct arc *outchain;           /* *from's outs chain or free chain */
+#define  freechain      outchain
+       struct arc *inchain;            /* *to's ins chain */
+       struct arc *colorchain;         /* color's arc chain */
+};
+
+struct arcbatch
+{                                                              /* for bulk allocation of arcs */
+       struct arcbatch *next;
+#define  ABSIZE  10
+       struct arc      a[ABSIZE];
+};
+
+struct state
+{
+       int                     no;
+#define  FREESTATE      (-1)
+       char            flag;                   /* marks special states */
+       int                     nins;                   /* number of inarcs */
+       struct arc *ins;                        /* chain of inarcs */
+       int                     nouts;                  /* number of outarcs */
+       struct arc *outs;                       /* chain of outarcs */
+       struct arc *free;                       /* chain of free arcs */
+       struct state *tmp;                      /* temporary for traversal algorithms */
+       struct state *next;                     /* chain for traversing all */
+       struct state *prev;                     /* back chain */
+       struct arcbatch oas;            /* first arcbatch, avoid malloc in easy
+                                                                * case */
+       int                     noas;                   /* number of arcs used in first arcbatch */
+};
+
+struct nfa
+{
+       struct state *pre;                      /* pre-initial state */
+       struct state *init;                     /* initial state */
+       struct state *final;            /* final state */
+       struct state *post;                     /* post-final state */
+       int                     nstates;                /* for numbering states */
+       struct state *states;           /* state-chain header */
+       struct state *slast;            /* tail of the chain */
+       struct state *free;                     /* free list */
+       struct colormap *cm;            /* the color map */
+       color           bos[2];                 /* colors, if any, assigned to BOS and BOL */
+       color           eos[2];                 /* colors, if any, assigned to EOS and EOL */
+       struct vars *v;                         /* simplifies compile error reporting */
+       struct nfa *parent;                     /* parent NFA, if any */
+};
+
+
+
+/*
+ * definitions for compacted NFA
+ */
+struct carc
+{
+       color           co;                             /* COLORLESS is list terminator */
+       int                     to;                             /* state number */
+};
+
+struct cnfa
+{
+       int                     nstates;                /* number of states */
+       int                     ncolors;                /* number of colors */
+       int                     flags;
+#define  HASLACONS      01                     /* uses lookahead constraints */
+       int                     pre;                    /* setup state number */
+       int                     post;                   /* teardown state number */
+       color           bos[2];                 /* colors, if any, assigned to BOS and BOL */
+       color           eos[2];                 /* colors, if any, assigned to EOS and EOL */
+       struct carc **states;           /* vector of pointers to outarc lists */
+       struct carc *arcs;                      /* the area for the lists */
+};
+
+#define ZAPCNFA(cnfa)  ((cnfa).nstates = 0)
+#define NULLCNFA(cnfa) ((cnfa).nstates == 0)
+
+
+
+/*
+ * subexpression tree
+ */
+struct subre
+{
+       char            op;                             /* '|', '.' (concat), 'b' (backref), '(',
+                                                                * '=' */
+       char            flags;
+#define  LONGER  01                            /* prefers longer match */
+#define  SHORTER 02                            /* prefers shorter match */
+#define  MIXED  04                             /* mixed preference below */
+#define  CAP 010                               /* capturing parens below */
+#define  BACKR  020                    /* back reference below */
+#define  INUSE  0100                   /* in use in final tree */
+#define  LOCAL  03                             /* bits which may not propagate up */
+#define  LMIX(f) ((f)<<2)              /* LONGER -> MIXED */
+#define  SMIX(f) ((f)<<1)              /* SHORTER -> MIXED */
+#define  UP(f)  (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED))
+#define  MESSY(f)       ((f)&(MIXED|CAP|BACKR))
+#define  PREF(f) ((f)&LOCAL)
+#define  PREF2(f1, f2)  ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
+#define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
+       short           retry;                  /* index into retry memory */
+       int                     subno;                  /* subexpression number (for 'b' and '(') */
+       short           min;                    /* min repetitions, for backref only */
+       short           max;                    /* max repetitions, for backref only */
+       struct subre *left;                     /* left child, if any (also freelist
+                                                                * chain) */
+       struct subre *right;            /* right child, if any */
+       struct state *begin;            /* outarcs from here... */
+       struct state *end;                      /* ...ending in inarcs here */
+       struct cnfa cnfa;                       /* compacted NFA, if any */
+       struct subre *chain;            /* for bookkeeping and error cleanup */
+};
+
+
+
+/*
+ * table of function pointers for generic manipulation functions
+ * A regex_t's re_fns points to one of these.
+ */
+struct fns
+{
+       void            FUNCPTR(free, (regex_t *));
+};
+
+
+
+/*
+ * the insides of a regex_t, hidden behind a void *
+ */
+struct guts
+{
+       int                     magic;
+#define  GUTSMAGIC      0xfed9
+       int                     cflags;                 /* copy of compile flags */
+       long            info;                   /* copy of re_info */
+       size_t          nsub;                   /* copy of re_nsub */
+       struct subre *tree;
+       struct cnfa search;                     /* for fast preliminary search */
+       int                     ntree;
+       struct colormap cmap;
+       int                     FUNCPTR(compare, (const chr *, const chr *, size_t));
+       struct subre *lacons;           /* lookahead-constraint vector */
+       int                     nlacons;                /* size of lacons */
+};
author	Ryan Norton <wxprojects@comcast.net>
	Wed, 13 Oct 1999 02:22:17 +0000 (02:22 +0000)
committer	Ryan Norton <wxprojects@comcast.net>
	Wed, 13 Oct 1999 02:22:17 +0000 (02:22 +0000)
src/regex/regc_lex.c	[new file with mode: 0644]	patch \| blob
src/regex/regerrs.h	[new file with mode: 0644]	patch \| blob
src/regex/regguts.h	[new file with mode: 0644]	patch \| blob