src/regex/regcomp.c

   1 /*
   2  * re_*comp and friends - compile REs
   3  * This file #includes several others (see the bottom).
   4  *
   5  * Copyright (c) 1998, 1999 Henry Spencer.      All rights reserved.
   6  *
   7  * Development of this software was funded, in part, by Cray Research Inc.,
   8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
   9  * Corporation, none of whom are responsible for the results.  The author
  10  * thanks all of them.
  11  *
  12  * Redistribution and use in source and binary forms -- with or without
  13  * modification -- are permitted for any purpose, provided that
  14  * redistributions in source form retain this entire copyright notice and
  15  * indicate the origin and nature of any modifications.
  16  *
  17  * I'd appreciate being given credit for this package in the documentation
  18  * of software which uses it, but that is not a requirement.
  19  *
  20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
  21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
  22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
  23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $
  32  *
  33  */
  34
  35 #include "regguts.h"
  36
  37 /*
  38  * forward declarations, up here so forward datatypes etc. are defined early
  39  */
  40 /* === regcomp.c === */
  41 static void moresubs(struct vars *, int);
  42 static int      freev(struct vars *, int);
  43 static void makesearch(struct vars *, struct nfa *);
  44 static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
  45 static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
  46 static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
  47 static void nonword(struct vars *, int, struct state *, struct state *);
  48 static void word(struct vars *, int, struct state *, struct state *);
  49 static int      scannum(struct vars *);
  50 static void repeat(struct vars *, struct state *, struct state *, int, int);
  51 static void bracket(struct vars *, struct state *, struct state *);
  52 static void cbracket(struct vars *, struct state *, struct state *);
  53 static void brackpart(struct vars *, struct state *, struct state *);
  54 static chr *scanplain(struct vars *);
  55 static void leaders(struct vars *, struct cvec *);
  56 static void onechr(struct vars *, chr, struct state *, struct state *);
  57 static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
  58 static celt nextleader(struct vars *, chr, chr);
  59 static void wordchrs(struct vars *);
  60 static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
  61 static void freesubre(struct vars *, struct subre *);
  62 static void freesrnode(struct vars *, struct subre *);
  63 static void optst(struct vars *, struct subre *);
  64 static int      numst(struct subre *, int);
  65 static void markst(struct subre *);
  66 static void cleanst(struct vars *);
  67 static long nfatree(struct vars *, struct subre *, FILE *);
  68 static long nfanode(struct vars *, struct subre *, FILE *);
  69 static int      newlacon(struct vars *, struct state *, struct state *, int);
  70 static void freelacons(struct subre *, int);
  71 static void rfree(regex_t *);
  72
  73 #ifdef REG_DEBUG
  74 static void dump(regex_t *, FILE *);
  75 static void dumpst(struct subre *, FILE *, int);
  76 static void stdump(struct subre *, FILE *, int);
  77 static char *stid(struct subre *, char *, size_t);
  78 #endif
  79 /* === regc_lex.c === */
  80 static void lexstart(struct vars *);
  81 static void prefixes(struct vars *);
  82 static void lexnest(struct vars *, chr *, chr *);
  83 static void lexword(struct vars *);
  84 static int      next(struct vars *);
  85 static int      lexescape(struct vars *);
  86 static chr      lexdigits(struct vars *, int, int, int);
  87 static int      brenext(struct vars *, chr);
  88 static void skip(struct vars *);
  89 static chr      newline(void);
  90 static chr      chrnamed(struct vars *, chr *, chr *, chr);
  91
  92 /* === regc_color.c === */
  93 static void initcm(struct vars *, struct colormap *);
  94 static void freecm(struct colormap *);
  95 static void cmtreefree(struct colormap *, union tree *, int);
  96 static color setcolor(struct colormap *, chr, pcolor);
  97 static color maxcolor(struct colormap *);
  98 static color newcolor(struct colormap *);
  99 static void freecolor(struct colormap *, pcolor);
 100 static color pseudocolor(struct colormap *);
 101 static color subcolor(struct colormap *, chr c);
 102 static color newsub(struct colormap *, pcolor);
 103 static void subrange(struct vars *, chr, chr, struct state *, struct state *);
 104 static void subblock(struct vars *, chr, struct state *, struct state *);
 105 static void okcolors(struct nfa *, struct colormap *);
 106 static void colorchain(struct colormap *, struct arc *);
 107 static void uncolorchain(struct colormap *, struct arc *);
 108 static int      singleton(struct colormap *, chr c);
 109 static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
 110 static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
 111
 112 #ifdef REG_DEBUG
 113 static void dumpcolors(struct colormap *, FILE *);
 114 static void fillcheck(struct colormap *, union tree *, int, FILE *);
 115 static void dumpchr(chr, FILE *);
 116 #endif
 117 /* === regc_nfa.c === */
 118 static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
 119 static void freenfa(struct nfa *);
 120 static struct state *newstate(struct nfa *);
 121 static struct state *newfstate(struct nfa *, int flag);
 122 static void dropstate(struct nfa *, struct state *);
 123 static void freestate(struct nfa *, struct state *);
 124 static void destroystate(struct nfa *, struct state *);
 125 static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
 126 static struct arc *allocarc(struct nfa *, struct state *);
 127 static void freearc(struct nfa *, struct arc *);
 128 static struct arc *findarc(struct state *, int, pcolor);
 129 static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
 130 static void moveins(struct nfa *, struct state *, struct state *);
 131 static void copyins(struct nfa *, struct state *, struct state *);
 132 static void moveouts(struct nfa *, struct state *, struct state *);
 133 static void copyouts(struct nfa *, struct state *, struct state *);
 134 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
 135 static void delsub(struct nfa *, struct state *, struct state *);
 136 static void deltraverse(struct nfa *, struct state *, struct state *);
 137 static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
 138 static void duptraverse(struct nfa *, struct state *, struct state *);
 139 static void cleartraverse(struct nfa *, struct state *);
 140 static void specialcolors(struct nfa *);
 141 static long optimize(struct nfa *, FILE *);
 142 static void pullback(struct nfa *, FILE *);
 143 static int      pull(struct nfa *, struct arc *);
 144 static void pushfwd(struct nfa *, FILE *);
 145 static int      push(struct nfa *, struct arc *);
 146
 147 #define INCOMPATIBLE    1               /* destroys arc */
 148 #define SATISFIED       2                       /* constraint satisfied */
 149 #define COMPATIBLE      3                       /* compatible but not satisfied yet */
 150 static int      combine(struct arc *, struct arc *);
 151 static void fixempties(struct nfa *, FILE *);
 152 static int      unempty(struct nfa *, struct arc *);
 153 static void cleanup(struct nfa *);
 154 static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
 155 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
 156 static long analyze(struct nfa *);
 157 static void compact(struct nfa *, struct cnfa *);
 158 static void carcsort(struct carc *, struct carc *);
 159 static void freecnfa(struct cnfa *);
 160 static void dumpnfa(struct nfa *, FILE *);
 161
 162 #ifdef REG_DEBUG
 163 static void dumpstate(struct state *, FILE *);
 164 static void dumparcs(struct state *, FILE *);
 165 static int      dumprarcs(struct arc *, struct state *, FILE *, int);
 166 static void dumparc(struct arc *, struct state *, FILE *);
 167 static void dumpcnfa(struct cnfa *, FILE *);
 168 static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
 169 #endif
 170 /* === regc_cvec.c === */
 171 static struct cvec *newcvec(int, int, int);
 172 static struct cvec *clearcvec(struct cvec *);
 173 static void addchr(struct cvec *, chr);
 174 static void addrange(struct cvec *, chr, chr);
 175 static void addmcce(struct cvec *, chr *, chr *);
 176 static int      haschr(struct cvec *, chr);
 177 static struct cvec *getcvec(struct vars *, int, int, int);
 178 static void freecvec(struct cvec *);
 179
 180 /* === regc_locale.c === */
 181 extern int      wx_isdigit(wx_wchar c);
 182 extern int      wx_isalpha(wx_wchar c);
 183 extern int      wx_isalnum(wx_wchar c);
 184 extern int      wx_isupper(wx_wchar c);
 185 extern int      wx_islower(wx_wchar c);
 186 extern int      wx_isgraph(wx_wchar c);
 187 extern int      wx_ispunct(wx_wchar c);
 188 extern int      wx_isspace(wx_wchar c);
 189 extern wx_wchar wx_toupper(wx_wchar c);
 190 extern wx_wchar wx_tolower(wx_wchar c);
 191 static int      nmcces(struct vars *);
 192 static int      nleaders(struct vars *);
 193 static struct cvec *allmcces(struct vars *, struct cvec *);
 194 static celt element(struct vars *, chr *, chr *);
 195 static struct cvec *range(struct vars *, celt, celt, int);
 196 static int      before(celt, celt);
 197 static struct cvec *eclass(struct vars *, celt, int);
 198 static struct cvec *cclass(struct vars *, chr *, chr *, int);
 199 static struct cvec *allcases(struct vars *, chr);
 200 static int      cmp(const chr *, const chr *, size_t);
 201 static int      casecmp(const chr *, const chr *, size_t);
 202
 203
 204 /* internal variables, bundled for easy passing around */
 205 struct vars
 206 {
 207         regex_t    *re;
 208         chr                *now;                        /* scan pointer into string */
 209         chr                *stop;                       /* end of string */
 210         chr                *savenow;            /* saved now and stop for "subroutine
 211                                                                  * call" */
 212         chr                *savestop;
 213         int                     err;                    /* error code (0 if none) */
 214         int                     cflags;                 /* copy of compile flags */
 215         int                     lasttype;               /* type of previous token */
 216         int                     nexttype;               /* type of next token */
 217         chr                     nextvalue;              /* value (if any) of next token */
 218         int                     lexcon;                 /* lexical context type (see lex.c) */
 219         int                     nsubexp;                /* subexpression count */
 220         struct subre **subs;            /* subRE pointer vector */
 221         size_t          nsubs;                  /* length of vector */
 222         struct subre *sub10[10];        /* initial vector, enough for most */
 223         struct nfa *nfa;                        /* the NFA */
 224         struct colormap *cm;            /* character color map */
 225         color           nlcolor;                /* color of newline */
 226         struct state *wordchrs;         /* state in nfa holding word-char outarcs */
 227         struct subre *tree;                     /* subexpression tree */
 228         struct subre *treechain;        /* all tree nodes allocated */
 229         struct subre *treefree;         /* any free tree nodes */
 230         int                     ntree;                  /* number of tree nodes */
 231         struct cvec *cv;                        /* interface cvec */
 232         struct cvec *cv2;                       /* utility cvec */
 233         struct cvec *mcces;                     /* collating-element information */
 234 #define  ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
 235         struct state *mccepbegin;       /* in nfa, start of MCCE prototypes */
 236         struct state *mccepend;         /* in nfa, end of MCCE prototypes */
 237         struct subre *lacons;           /* lookahead-constraint vector */
 238         int                     nlacons;                /* size of lacons */
 239 };
 240
 241 /* parsing macros; most know that `v' is the struct vars pointer */
 242 #define NEXT()  (next(v))               /* advance by one token */
 243 #define SEE(t)  (v->nexttype == (t))    /* is next token this? */
 244 #define EAT(t)  (SEE(t) && next(v))             /* if next is this, swallow it */
 245 #define VISERR(vv)      ((vv)->err != 0)        /* have we seen an error yet? */
 246 #define ISERR() VISERR(v)
 247 #define VERR(vv,e)      ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
 248                                                         ((vv)->err = (e)))
 249 #define ERR(e)  VERR(v, e)              /* record an error */
 250 #define NOERR() {if (ISERR()) return;}  /* if error seen, return */
 251 #define NOERRN()        {if (ISERR()) return NULL;} /* NOERR with retval */
 252 #define NOERRZ()        {if (ISERR()) return 0;}        /* NOERR with retval */
 253 #define INSIST(c, e)    ((c) ? 0 : ERR(e))              /* if condition false,
 254                                                                                                  * error */
 255 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
 256 #define EMPTYARC(x, y)  newarc(v->nfa, EMPTY, 0, x, y)
 257
 258 /* token type codes, some also used as NFA arc types */
 259 #define EMPTY   'n'                             /* no token present */
 260 #define EOS 'e'                                 /* end of string */
 261 #define PLAIN   'p'                             /* ordinary character */
 262 #define DIGIT   'd'                             /* digit (in bound) */
 263 #define BACKREF 'b'                             /* back reference */
 264 #define COLLEL  'I'                             /* start of [. */
 265 #define ECLASS  'E'                             /* start of [= */
 266 #define CCLASS  'C'                             /* start of [: */
 267 #define END 'X'                                 /* end of [. [= [: */
 268 #define RANGE   'R'                             /* - within [] which might be range delim. */
 269 #define LACON   'L'                             /* lookahead constraint subRE */
 270 #define AHEAD   'a'                             /* color-lookahead arc */
 271 #define BEHIND  'r'                             /* color-lookbehind arc */
 272 #define WBDRY   'w'                             /* word boundary constraint */
 273 #define NWBDRY  'W'                             /* non-word-boundary constraint */
 274 #define SBEGIN  'A'                             /* beginning of string (even if not BOL) */
 275 #define SEND    'Z'                             /* end of string (even if not EOL) */
 276 #define PREFER  'P'                             /* length preference */
 277
 278 /* is an arc colored, and hence on a color chain? */
 279 #define COLORED(a)      ((a)->type == PLAIN || (a)->type == AHEAD || \
 280                                                         (a)->type == BEHIND)
 281
 282
 283
 284 /* static function list */
 285 static struct fns functions = {
 286         rfree,                                          /* regfree insides */
 287 };
 288
 289
 290
 291 /*
 292  * regcomp - compile regular expression
 293  */
 294 int
 295 regcomp(regex_t *re,
 296                    const chr *string,
 297                    int flags)
 298 {
 299         return wx_regcomp(re, string, wx_strlen(string), flags);
 300 }
 301 int
 302 wx_regcomp(regex_t *re,
 303                    const chr *string,
 304                    size_t len,
 305                    int flags)
 306 {
 307         struct vars var;
 308         struct vars *v = &var;
 309         struct guts *g;
 310         int                     i;
 311         size_t          j;
 312
 313 #ifdef REG_DEBUG
 314         FILE       *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
 315
 316 #else
 317         FILE       *debug = (FILE *) NULL;
 318 #endif
 319
 320 #define  CNOERR()        { if (ISERR()) return freev(v, v->err); }
 321
 322         /* sanity checks */
 323
 324         if (re == NULL || string == NULL)
 325                 return REG_INVARG;
 326         if ((flags & REG_QUOTE) &&
 327                 (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
 328                 return REG_INVARG;
 329         if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
 330                 return REG_INVARG;
 331
 332         /* initial setup (after which freev() is callable) */
 333         v->re = re;
 334         v->now = (chr *) string;
 335         v->stop = v->now + len;
 336         v->savenow = v->savestop = NULL;
 337         v->err = 0;
 338         v->cflags = flags;
 339         v->nsubexp = 0;
 340         v->subs = v->sub10;
 341         v->nsubs = 10;
 342         for (j = 0; j < v->nsubs; j++)
 343                 v->subs[j] = NULL;
 344         v->nfa = NULL;
 345         v->cm = NULL;
 346         v->nlcolor = COLORLESS;
 347         v->wordchrs = NULL;
 348         v->tree = NULL;
 349         v->treechain = NULL;
 350         v->treefree = NULL;
 351         v->cv = NULL;
 352         v->cv2 = NULL;
 353         v->mcces = NULL;
 354         v->lacons = NULL;
 355         v->nlacons = 0;
 356         re->re_magic = REMAGIC;
 357         re->re_info = 0;                        /* bits get set during parse */
 358         re->re_csize = sizeof(chr);
 359         re->re_guts = NULL;
 360         re->re_fns = VS(&functions);
 361
 362         /* more complex setup, malloced things */
 363         re->re_guts = VS(MALLOC(sizeof(struct guts)));
 364         if (re->re_guts == NULL)
 365                 return freev(v, REG_ESPACE);
 366         g = (struct guts *) re->re_guts;
 367         g->tree = NULL;
 368         initcm(v, &g->cmap);
 369         v->cm = &g->cmap;
 370         g->lacons = NULL;
 371         g->nlacons = 0;
 372         ZAPCNFA(g->search);
 373         v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
 374         CNOERR();
 375         v->cv = newcvec(100, 20, 10);
 376         if (v->cv == NULL)
 377                 return freev(v, REG_ESPACE);
 378         i = nmcces(v);
 379         if (i > 0)
 380         {
 381                 v->mcces = newcvec(nleaders(v), 0, i);
 382                 CNOERR();
 383                 v->mcces = allmcces(v, v->mcces);
 384                 leaders(v, v->mcces);
 385                 addmcce(v->mcces, (chr *) NULL, (chr *) NULL);  /* dummy */
 386         }
 387         CNOERR();
 388
 389         /* parsing */
 390         lexstart(v);                            /* also handles prefixes */
 391         if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
 392         {
 393                 /* assign newline a unique color */
 394                 v->nlcolor = subcolor(v->cm, newline());
 395                 okcolors(v->nfa, v->cm);
 396         }
 397         CNOERR();
 398         v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
 399         assert(SEE(EOS));                       /* even if error; ISERR() => SEE(EOS) */
 400         CNOERR();
 401         assert(v->tree != NULL);
 402
 403         /* finish setup of nfa and its subre tree */
 404         specialcolors(v->nfa);
 405         CNOERR();
 406 #ifdef REG_DEBUG
 407         if (debug != NULL)
 408         {
 409                 fprintf(debug, "\n\n\n========= RAW ==========\n");
 410                 dumpnfa(v->nfa, debug);
 411                 dumpst(v->tree, debug, 1);
 412                 }
 413 #endif
 414         optst(v, v->tree);
 415         v->ntree = numst(v->tree, 1);
 416         markst(v->tree);
 417         cleanst(v);
 418 #ifdef REG_DEBUG
 419         if (debug != NULL)
 420         {
 421                 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
 422                 dumpst(v->tree, debug, 1);
 423         }
 424 #endif
 425
 426         /* build compacted NFAs for tree and lacons */
 427         re->re_info |= nfatree(v, v->tree, debug);
 428         CNOERR();
 429         assert(v->nlacons == 0 || v->lacons != NULL);
 430         for (i = 1; i < v->nlacons; i++)
 431         {
 432 #ifdef REG_DEBUG
 433                 if (debug != NULL)
 434                         fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
 435 #endif
 436                 nfanode(v, &v->lacons[i], debug);
 437         }
 438         CNOERR();
 439         if (v->tree->flags & SHORTER)
 440                 NOTE(REG_USHORTEST);
 441
 442         /* build compacted NFAs for tree, lacons, fast search */
 443 #ifdef REG_DEBUG
 444         if (debug != NULL)
 445                 fprintf(debug, "\n\n\n========= SEARCH ==========\n");
 446 #endif
 447         /* can sacrifice main NFA now, so use it as work area */
 448         (DISCARD) optimize(v->nfa, debug);
 449         CNOERR();
 450         makesearch(v, v->nfa);
 451         CNOERR();
 452         compact(v->nfa, &g->search);
 453         CNOERR();
 454
 455         /* looks okay, package it up */
 456         re->re_nsub = v->nsubexp;
 457         v->re = NULL;                           /* freev no longer frees re */
 458         g->magic = GUTSMAGIC;
 459         g->cflags = v->cflags;
 460         g->info = re->re_info;
 461         g->nsub = re->re_nsub;
 462         g->tree = v->tree;
 463         v->tree = NULL;
 464         g->ntree = v->ntree;
 465         g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
 466         g->lacons = v->lacons;
 467         v->lacons = NULL;
 468         g->nlacons = v->nlacons;
 469
 470 #ifdef REG_DEBUG
 471         if (flags & REG_DUMP)
 472                 dump(re, stdout);
 473 #endif
 474
 475         assert(v->err == 0);
 476         return freev(v, 0);
 477 }
 478
 479 /*
 480  * moresubs - enlarge subRE vector
 481  */
 482 static void
 483 moresubs(struct vars * v,
 484                  int wanted)                    /* want enough room for this one */
 485 {
 486         struct subre **p;
 487         size_t          n;
 488
 489         assert(wanted > 0 && (size_t) wanted >= v->nsubs);
 490         n = (size_t) wanted *3 / 2 + 1;
 491
 492         if (v->subs == v->sub10)
 493         {
 494                 p = (struct subre **) MALLOC(n * sizeof(struct subre *));
 495                 if (p != NULL)
 496                         memcpy(VS(p), VS(v->subs),
 497                                    v->nsubs * sizeof(struct subre *));
 498                 }
 499                 else
 500                 p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
 501         if (p == NULL)
 502         {
 503                 ERR(REG_ESPACE);
 504                 return;
 505         }
 506         v->subs = p;
 507         for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
 508                 *p = NULL;
 509         assert(v->nsubs == n);
 510         assert((size_t) wanted < v->nsubs);
 511 }
 512
 513 /*
 514  * freev - free vars struct's substructures where necessary
 515  *
 516  * Optionally does error-number setting, and always returns error code
 517  * (if any), to make error-handling code terser.
 518  */
 519 static int
 520 freev(struct vars * v,
 521           int err)
 522 {
 523         if (v->re != NULL)
 524                 rfree(v->re);
 525         if (v->subs != v->sub10)
 526                 FREE(v->subs);
 527         if (v->nfa != NULL)
 528                 freenfa(v->nfa);
 529         if (v->tree != NULL)
 530                 freesubre(v, v->tree);
 531         if (v->treechain != NULL)
 532                 cleanst(v);
 533         if (v->cv != NULL)
 534                 freecvec(v->cv);
 535         if (v->cv2 != NULL)
 536                 freecvec(v->cv2);
 537         if (v->mcces != NULL)
 538                 freecvec(v->mcces);
 539         if (v->lacons != NULL)
 540                 freelacons(v->lacons, v->nlacons);
 541         ERR(err);                                       /* nop if err==0 */
 542
 543         return v->err;
 544 }
 545
 546 /*
 547  * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
 548  * NFA must have been optimize()d already.
 549  */
 550 static void
 551 makesearch(struct vars * v,
 552                    struct nfa * nfa)
 553 {
 554         struct arc *a;
 555         struct arc *b;
 556         struct state *pre = nfa->pre;
 557         struct state *s;
 558         struct state *s2;
 559         struct state *slist;
 560
 561         /* no loops are needed if it's anchored */
 562         for (a = pre->outs; a != NULL; a = a->outchain)
 563         {
 564                 assert(a->type == PLAIN);
 565                 if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
 566                         break;
 567         }
 568         if (a != NULL)
 569         {
 570                 /* add implicit .* in front */
 571                 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
 572
 573                 /* and ^* and \A* too -- not always necessary, but harmless */
 574                 newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
 575                 newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
 576         }
 577
 578         /*
 579          * Now here's the subtle part.  Because many REs have no lookback
 580          * constraints, often knowing when you were in the pre state tells you
 581          * little; it's the next state(s) that are informative.  But some of
 582          * them may have other inarcs, i.e. it may be possible to make actual
 583          * progress and then return to one of them.  We must de-optimize such
 584          * cases, splitting each such state into progress and no-progress
 585          * states.
 586  */
 587
 588         /* first, make a list of the states */
 589         slist = NULL;
 590         for (a = pre->outs; a != NULL; a = a->outchain)
 591         {
 592                 s = a->to;
 593                 for (b = s->ins; b != NULL; b = b->inchain)
 594                         if (b->from != pre)
 595                 break;
 596                 if (b != NULL)
 597                 {                                               /* must be split */
 598                         s->tmp = slist;
 599                         slist = s;
 600         }
 601                 }
 602
 603         /* do the splits */
 604         for (s = slist; s != NULL; s = s2)
 605         {
 606                 s2 = newstate(nfa);
 607                 copyouts(nfa, s, s2);
 608                 for (a = s->ins; a != NULL; a = b)
 609                 {
 610                         b = a->inchain;
 611                         if (a->from != pre)
 612                         {
 613                                 cparc(nfa, a, a->from, s2);
 614                                 freearc(nfa, a);
 615                         }
 616                 }
 617                 s2 = s->tmp;
 618                 s->tmp = NULL;                  /* clean up while we're at it */
 619         }
 620 }
 621
 622 /*
 623  * parse - parse an RE
 624  *
 625  * This is actually just the top level, which parses a bunch of branches
 626  * tied together with '|'.      They appear in the tree as the left children
 627  * of a chain of '|' subres.
 628  */
 629 static struct subre *
 630 parse(struct vars * v,
 631           int stopper,                          /* EOS or ')' */
 632           int type,                                     /* LACON (lookahead subRE) or PLAIN */
 633           struct state * init,          /* initial state */
 634           struct state * final)         /* final state */
 635 {
 636         struct state *left;                     /* scaffolding for branch */
 637         struct state *right;
 638         struct subre *branches;         /* top level */
 639         struct subre *branch;           /* current branch */
 640         struct subre *t;                        /* temporary */
 641         int                     firstbranch;    /* is this the first branch? */
 642
 643         assert(stopper == ')' || stopper == EOS);
 644
 645         branches = subre(v, '|', LONGER, init, final);
 646         NOERRN();
 647         branch = branches;
 648         firstbranch = 1;
 649         do
 650         {                                                       /* a branch */
 651                 if (!firstbranch)
 652                 {
 653                         /* need a place to hang it */
 654                         branch->right = subre(v, '|', LONGER, init, final);
 655                         NOERRN();
 656                         branch = branch->right;
 657                 }
 658                 firstbranch = 0;
 659                 left = newstate(v->nfa);
 660                 right = newstate(v->nfa);
 661                 NOERRN();
 662                 EMPTYARC(init, left);
 663                 EMPTYARC(right, final);
 664                 NOERRN();
 665                 branch->left = parsebranch(v, stopper, type, left, right, 0);
 666                 NOERRN();
 667                 branch->flags |= UP(branch->flags | branch->left->flags);
 668                 if ((branch->flags & ~branches->flags) != 0)    /* new flags */
 669                         for (t = branches; t != branch; t = t->right)
 670                                 t->flags |= branch->flags;
 671         } while (EAT('|'));
 672         assert(SEE(stopper) || SEE(EOS));
 673
 674         if (!SEE(stopper))
 675         {
 676                 assert(stopper == ')' && SEE(EOS));
 677                 ERR(REG_EPAREN);
 678         }
 679
 680         /* optimize out simple cases */
 681         if (branch == branches)
 682         {                                                       /* only one branch */
 683                 assert(branch->right == NULL);
 684                 t = branch->left;
 685                 branch->left = NULL;
 686                 freesubre(v, branches);
 687                 branches = t;
 688         }
 689         else if (!MESSY(branches->flags))
 690         {                                                       /* no interesting innards */
 691                 freesubre(v, branches->left);
 692                 branches->left = NULL;
 693                 freesubre(v, branches->right);
 694                 branches->right = NULL;
 695                 branches->op = '=';
 696         }
 697
 698         return branches;
 699 }
 700
 701 /*
 702  * parsebranch - parse one branch of an RE
 703  *
 704  * This mostly manages concatenation, working closely with parseqatom().
 705  * Concatenated things are bundled up as much as possible, with separate
 706  * ',' nodes introduced only when necessary due to substructure.
 707  */
 708 static struct subre *
 709 parsebranch(struct vars * v,
 710                         int stopper,            /* EOS or ')' */
 711                         int type,                       /* LACON (lookahead subRE) or PLAIN */
 712                         struct state * left,    /* leftmost state */
 713                         struct state * right,           /* rightmost state */
 714                         int partial)            /* is this only part of a branch? */
 715 {
 716         struct state *lp;                       /* left end of current construct */
 717         int                     seencontent;    /* is there anything in this branch yet? */
 718         struct subre *t;
 719
 720         lp = left;
 721         seencontent = 0;
 722         t = subre(v, '=', 0, left, right);      /* op '=' is tentative */
 723         NOERRN();
 724         while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
 725         {
 726                 if (seencontent)
 727                 {                                               /* implicit concat operator */
 728                         lp = newstate(v->nfa);
 729                         NOERRN();
 730                         moveins(v->nfa, right, lp);
 731         }
 732                 seencontent = 1;
 733
 734                 /* NB, recursion in parseqatom() may swallow rest of branch */
 735                 parseqatom(v, stopper, type, lp, right, t);
 736         }
 737
 738         if (!seencontent)
 739         {                                                       /* empty branch */
 740                 if (!partial)
 741                         NOTE(REG_UUNSPEC);
 742                 assert(lp == left);
 743                 EMPTYARC(left, right);
 744         }
 745
 746         return t;
 747 }
 748
 749 /*
 750  * parseqatom - parse one quantified atom or constraint of an RE
 751  *
 752  * The bookkeeping near the end cooperates very closely with parsebranch();
 753  * in particular, it contains a recursion that can involve parsing the rest
 754  * of the branch, making this function's name somewhat inaccurate.
 755  */
 756 static void
 757 parseqatom(struct vars * v,
 758                    int stopper,                 /* EOS or ')' */
 759                    int type,                    /* LACON (lookahead subRE) or PLAIN */
 760                    struct state * lp,   /* left state to hang it on */
 761                    struct state * rp,   /* right state to hang it on */
 762                    struct subre * top)  /* subtree top */
 763 {
 764         struct state *s;                        /* temporaries for new states */
 765         struct state *s2;
 766
 767 #define  ARCV(t, val)    newarc(v->nfa, t, val, lp, rp)
 768         int                     m,
 769                                 n;
 770         struct subre *atom;                     /* atom's subtree */
 771         struct subre *t;
 772         int                     cap;                    /* capturing parens? */
 773         int                     pos;                    /* positive lookahead? */
 774         int                     subno;                  /* capturing-parens or backref number */
 775         int                     atomtype;
 776         int                     qprefer;                /* quantifier short/long preference */
 777         int                     f;
 778         struct subre **atomp;           /* where the pointer to atom is */
 779
 780         /* initial bookkeeping */
 781         atom = NULL;
 782         assert(lp->nouts == 0);         /* must string new code */
 783         assert(rp->nins == 0);          /* between lp and rp */
 784         subno = 0;                                      /* just to shut lint up */
 785
 786         /* an atom or constraint... */
 787         atomtype = v->nexttype;
 788         switch (atomtype)
 789         {
 790                         /* first, constraints, which end by returning */
 791                 case '^':
 792                         ARCV('^', 1);
 793                         if (v->cflags & REG_NLANCH)
 794                                 ARCV(BEHIND, v->nlcolor);
 795                         NEXT();
 796                         return;
 797                         break;
 798                 case '$':
 799                         ARCV('$', 1);
 800                         if (v->cflags & REG_NLANCH)
 801                                 ARCV(AHEAD, v->nlcolor);
 802                         NEXT();
 803                         return;
 804                         break;
 805                 case SBEGIN:
 806                         ARCV('^', 1);           /* BOL */
 807                         ARCV('^', 0);           /* or BOS */
 808                         NEXT();
 809                         return;
 810                         break;
 811                 case SEND:
 812                         ARCV('$', 1);           /* EOL */
 813                         ARCV('$', 0);           /* or EOS */
 814                         NEXT();
 815                         return;
 816                         break;
 817                 case '<':
 818                         wordchrs(v);            /* does NEXT() */
 819                         s = newstate(v->nfa);
 820                         NOERR();
 821                         nonword(v, BEHIND, lp, s);
 822                         word(v, AHEAD, s, rp);
 823                         return;
 824                         break;
 825                 case '>':
 826                         wordchrs(v);            /* does NEXT() */
 827                         s = newstate(v->nfa);
 828                         NOERR();
 829                         word(v, BEHIND, lp, s);
 830                         nonword(v, AHEAD, s, rp);
 831                         return;
 832                 break;
 833                 case WBDRY:
 834                         wordchrs(v);            /* does NEXT() */
 835                         s = newstate(v->nfa);
 836                         NOERR();
 837                         nonword(v, BEHIND, lp, s);
 838                         word(v, AHEAD, s, rp);
 839                         s = newstate(v->nfa);
 840                         NOERR();
 841                         word(v, BEHIND, lp, s);
 842                         nonword(v, AHEAD, s, rp);
 843                         return;
 844                         break;
 845                 case NWBDRY:
 846                         wordchrs(v);            /* does NEXT() */
 847                         s = newstate(v->nfa);
 848                         NOERR();
 849                         word(v, BEHIND, lp, s);
 850                         word(v, AHEAD, s, rp);
 851                         s = newstate(v->nfa);
 852                         NOERR();
 853                         nonword(v, BEHIND, lp, s);
 854                         nonword(v, AHEAD, s, rp);
 855                         return;
 856                 break;
 857                 case LACON:                             /* lookahead constraint */
 858                         pos = v->nextvalue;
 859                         NEXT();
 860                         s = newstate(v->nfa);
 861                         s2 = newstate(v->nfa);
 862                         NOERR();
 863                         t = parse(v, ')', LACON, s, s2);
 864                         freesubre(v, t);        /* internal structure irrelevant */
 865                         assert(SEE(')') || ISERR());
 866                         NEXT();
 867                         n = newlacon(v, s, s2, pos);
 868                         NOERR();
 869                         ARCV(LACON, n);
 870                         return;
 871                         break;
 872                         /* then errors, to get them out of the way */
 873                 case '*':
 874                 case '+':
 875                 case '?':
 876                 case '{':
 877                         ERR(REG_BADRPT);
 878                         return;
 879                         break;
 880         default:
 881                         ERR(REG_ASSERT);
 882                         return;
 883                 break;
 884                         /* then plain characters, and minor variants on that theme */
 885                 case ')':                               /* unbalanced paren */
 886                         if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
 887                         {
 888                                 ERR(REG_EPAREN);
 889                                 return;
 890         }
 891                         /* legal in EREs due to specification botch */
 892                         NOTE(REG_UPBOTCH);
 893                         /* fallthrough into case PLAIN */
 894                 case PLAIN:
 895                         onechr(v, v->nextvalue, lp, rp);
 896                         okcolors(v->nfa, v->cm);
 897                         NOERR();
 898                         NEXT();
 899                 break;
 900                 case '[':
 901                         if (v->nextvalue == 1)
 902                                 bracket(v, lp, rp);
 903                         else
 904                                 cbracket(v, lp, rp);
 905                         assert(SEE(']') || ISERR());
 906                         NEXT();
 907                 break;
 908                 case '.':
 909                         rainbow(v->nfa, v->cm, PLAIN,
 910                                         (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
 911                                         lp, rp);
 912                         NEXT();
 913                         break;
 914                         /* and finally the ugly stuff */
 915                 case '(':                               /* value flags as capturing or non */
 916                         cap = (type == LACON) ? 0 : v->nextvalue;
 917                         if (cap)
 918                         {
 919                                 v->nsubexp++;
 920                                 subno = v->nsubexp;
 921                                 if ((size_t) subno >= v->nsubs)
 922                                         moresubs(v, subno);
 923                                 assert((size_t) subno < v->nsubs);
 924                         }
 925                         else
 926                                 atomtype = PLAIN;               /* something that's not '(' */
 927                         NEXT();
 928                         /* need new endpoints because tree will contain pointers */
 929                         s = newstate(v->nfa);
 930                         s2 = newstate(v->nfa);
 931                         NOERR();
 932                         EMPTYARC(lp, s);
 933                         EMPTYARC(s2, rp);
 934                         NOERR();
 935                         atom = parse(v, ')', PLAIN, s, s2);
 936                         assert(SEE(')') || ISERR());
 937                         NEXT();
 938                         NOERR();
 939                         if (cap)
 940                         {
 941                                 v->subs[subno] = atom;
 942                                 t = subre(v, '(', atom->flags | CAP, lp, rp);
 943                                 NOERR();
 944                                 t->subno = subno;
 945                                 t->left = atom;
 946                                 atom = t;
 947                         }
 948                         /* postpone everything else pending possible {0} */
 949                         break;
 950                 case BACKREF:                   /* the Feature From The Black Lagoon */
 951                         INSIST(type != LACON, REG_ESUBREG);
 952                         INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
 953                         INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
 954                         NOERR();
 955                         assert(v->nextvalue > 0);
 956                         atom = subre(v, 'b', BACKR, lp, rp);
 957                         subno = v->nextvalue;
 958                         atom->subno = subno;
 959                         EMPTYARC(lp, rp);       /* temporarily, so there's something */
 960                         NEXT();
 961                 break;
 962         }
 963
 964         /* ...and an atom may be followed by a quantifier */
 965         switch (v->nexttype)
 966         {
 967                 case '*':
 968                         m = 0;
 969                         n = INFINITY;
 970                         qprefer = (v->nextvalue) ? LONGER : SHORTER;
 971                         NEXT();
 972                         break;
 973                 case '+':
 974                         m = 1;
 975                         n = INFINITY;
 976                         qprefer = (v->nextvalue) ? LONGER : SHORTER;
 977                         NEXT();
 978                         break;
 979                 case '?':
 980                         m = 0;
 981                         n = 1;
 982                         qprefer = (v->nextvalue) ? LONGER : SHORTER;
 983                 NEXT();
 984                         break;
 985                 case '{':
 986                         NEXT();
 987                         m = scannum(v);
 988                         if (EAT(','))
 989                         {
 990                                 if (SEE(DIGIT))
 991                                         n = scannum(v);
 992                                 else
 993                                         n = INFINITY;
 994                                 if (m > n)
 995                                 {
 996                                         ERR(REG_BADBR);
 997                 return;
 998         }
 999                                 /* {m,n} exercises preference, even if it's {m,m} */
1000                                 qprefer = (v->nextvalue) ? LONGER : SHORTER;
1001                         }
1002                         else
1003                         {
1004                                 n = m;
1005                                 /* {m} passes operand's preference through */
1006                                 qprefer = 0;
1007                         }
1008                         if (!SEE('}'))
1009                         {                                       /* catches errors too */
1010                                 ERR(REG_BADBR);
1011                                 return;
1012                         }
1013                         NEXT();
1014                         break;
1015                 default:                                /* no quantifier */
1016                         m = n = 1;
1017                         qprefer = 0;
1018                         break;
1019         }
1020
1021         /* annoying special case:  {0} or {0,0} cancels everything */
1022         if (m == 0 && n == 0)
1023         {
1024                 if (atom != NULL)
1025                         freesubre(v, atom);
1026                 if (atomtype == '(')
1027                         v->subs[subno] = NULL;
1028                 delsub(v->nfa, lp, rp);
1029                 EMPTYARC(lp, rp);
1030                 return;
1031         }
1032
1033         /* if not a messy case, avoid hard part */
1034         assert(!MESSY(top->flags));
1035         f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
1036         if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
1037         {
1038                 if (!(m == 1 && n == 1))
1039                         repeat(v, lp, rp, m, n);
1040                 if (atom != NULL)
1041                         freesubre(v, atom);
1042                 top->flags = f;
1043                 return;
1044         }
1045
1046         /*
1047          * hard part:  something messy That is, capturing parens, back
1048          * reference, short/long clash, or an atom with substructure
1049          * containing one of those.
1050  */
1051
1052         /* now we'll need a subre for the contents even if they're boring */
1053         if (atom == NULL)
1054         {
1055                 atom = subre(v, '=', 0, lp, rp);
1056                 NOERR();
1057         }
1058
1059         /*
1060          * prepare a general-purpose state skeleton
1061          *
1062          * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /
1063          * [lp] ----> [s2] ----bypass---------------------
1064          *
1065          * where bypass is an empty, and prefix is some repetitions of atom
1066  */
1067         s = newstate(v->nfa);           /* first, new endpoints for the atom */
1068         s2 = newstate(v->nfa);
1069         NOERR();
1070         moveouts(v->nfa, lp, s);
1071         moveins(v->nfa, rp, s2);
1072         NOERR();
1073         atom->begin = s;
1074         atom->end = s2;
1075         s = newstate(v->nfa);           /* and spots for prefix and bypass */
1076         s2 = newstate(v->nfa);
1077         NOERR();
1078         EMPTYARC(lp, s);
1079         EMPTYARC(lp, s2);
1080         NOERR();
1081
1082         /* break remaining subRE into x{...} and what follows */
1083         t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1084         t->left = atom;
1085         atomp = &t->left;
1086         /* here we should recurse... but we must postpone that to the end */
1087
1088         /* split top into prefix and remaining */
1089         assert(top->op == '=' && top->left == NULL && top->right == NULL);
1090         top->left = subre(v, '=', top->flags, top->begin, lp);
1091         top->op = '.';
1092         top->right = t;
1093
1094         /* if it's a backref, now is the time to replicate the subNFA */
1095         if (atomtype == BACKREF)
1096         {
1097                 assert(atom->begin->nouts == 1);                /* just the EMPTY */
1098                 delsub(v->nfa, atom->begin, atom->end);
1099                 assert(v->subs[subno] != NULL);
1100                 /* and here's why the recursion got postponed:  it must */
1101                 /* wait until the skeleton is filled in, because it may */
1102                 /* hit a backref that wants to copy the filled-in skeleton */
1103                 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1104                            atom->begin, atom->end);
1105                 NOERR();
1106         }
1107
1108         /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1109         if (m == 0)
1110         {
1111                 EMPTYARC(s2, atom->end);        /* the bypass */
1112                 assert(PREF(qprefer) != 0);
1113                 f = COMBINE(qprefer, atom->flags);
1114                 t = subre(v, '|', f, lp, atom->end);
1115                 NOERR();
1116                 t->left = atom;
1117                 t->right = subre(v, '|', PREF(f), s2, atom->end);
1118                 NOERR();
1119                 t->right->left = subre(v, '=', 0, s2, atom->end);
1120                 NOERR();
1121                 *atomp = t;
1122                 atomp = &t->left;
1123                 m = 1;
1124         }
1125
1126         /* deal with the rest of the quantifier */
1127         if (atomtype == BACKREF)
1128         {
1129                 /* special case:  backrefs have internal quantifiers */
1130                 EMPTYARC(s, atom->begin);               /* empty prefix */
1131                 /* just stuff everything into atom */
1132                 repeat(v, atom->begin, atom->end, m, n);
1133                 atom->min = (short) m;
1134                 atom->max = (short) n;
1135                 atom->flags |= COMBINE(qprefer, atom->flags);
1136         }
1137         else if (m == 1 && n == 1)
1138         {
1139                 /* no/vacuous quantifier:  done */
1140                 EMPTYARC(s, atom->begin);               /* empty prefix */
1141         }
1142         else
1143         {
1144                 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1145                 /* parens in only second x */
1146                 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1147                 assert(m >= 1 && m != INFINITY && n >= 1);
1148                 repeat(v, s, atom->begin, m - 1, (n == INFINITY) ? n : n - 1);
1149                 f = COMBINE(qprefer, atom->flags);
1150                 t = subre(v, '.', f, s, atom->end);             /* prefix and atom */
1151                 NOERR();
1152                 t->left = subre(v, '=', PREF(f), s, atom->begin);
1153                 NOERR();
1154                 t->right = atom;
1155                 *atomp = t;
1156         }
1157
1158         /* and finally, look after that postponed recursion */
1159         t = top->right;
1160         if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1161                 t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1162         else
1163         {
1164                 EMPTYARC(atom->end, rp);
1165                 t->right = subre(v, '=', 0, atom->end, rp);
1166         }
1167         assert(SEE('|') || SEE(stopper) || SEE(EOS));
1168         t->flags |= COMBINE(t->flags, t->right->flags);
1169         top->flags |= COMBINE(top->flags, t->flags);
1170 }
1171
1172 /*
1173  * nonword - generate arcs for non-word-character ahead or behind
1174  */
1175 static void
1176 nonword(struct vars * v,
1177                 int dir,                                /* AHEAD or BEHIND */
1178                 struct state * lp,
1179                 struct state * rp)
1180 {
1181         int                     anchor = (dir == AHEAD) ? '$' : '^';
1182
1183         assert(dir == AHEAD || dir == BEHIND);
1184         newarc(v->nfa, anchor, 1, lp, rp);
1185         newarc(v->nfa, anchor, 0, lp, rp);
1186         colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1187         /* (no need for special attention to \n) */
1188 }
1189
1190 /*
1191  * word - generate arcs for word character ahead or behind
1192  */
1193 static void
1194 word(struct vars * v,
1195          int dir,                                       /* AHEAD or BEHIND */
1196          struct state * lp,
1197          struct state * rp)
1198 {
1199         assert(dir == AHEAD || dir == BEHIND);
1200         cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1201         /* (no need for special attention to \n) */
1202 }
1203
1204 /*
1205  * scannum - scan a number
1206  */
1207 static int                                              /* value, <= DUPMAX */
1208 scannum(struct vars * v)
1209 {
1210         int                     n = 0;
1211
1212         while (SEE(DIGIT) && n < DUPMAX)
1213         {
1214                 n = n * 10 + v->nextvalue;
1215                 NEXT();
1216         }
1217         if (SEE(DIGIT) || n > DUPMAX)
1218         {
1219                 ERR(REG_BADBR);
1220                 return 0;
1221         }
1222         return n;
1223 }
1224
1225 /*
1226  * repeat - replicate subNFA for quantifiers
1227  *
1228  * The duplication sequences used here are chosen carefully so that any
1229  * pointers starting out pointing into the subexpression end up pointing into
1230  * the last occurrence.  (Note that it may not be strung between the same
1231  * left and right end states, however!)  This used to be important for the
1232  * subRE tree, although the important bits are now handled by the in-line
1233  * code in parse(), and when this is called, it doesn't matter any more.
1234  */
1235 static void
1236 repeat(struct vars * v,
1237            struct state * lp,
1238            struct state * rp,
1239            int m,
1240            int n)
1241 {
1242 #define  SOME    2
1243 #define  INF 3
1244 #define  PAIR(x, y)  ((x)*4 + (y))
1245 #define  REDUCE(x)       ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1246         const int       rm = REDUCE(m);
1247         const int       rn = REDUCE(n);
1248         struct state *s;
1249         struct state *s2;
1250
1251         switch (PAIR(rm, rn))
1252         {
1253                 case PAIR(0, 0):                /* empty string */
1254                         delsub(v->nfa, lp, rp);
1255                         EMPTYARC(lp, rp);
1256                         break;
1257                 case PAIR(0, 1):                /* do as x| */
1258                         EMPTYARC(lp, rp);
1259                         break;
1260                 case PAIR(0, SOME):             /* do as x{1,n}| */
1261                         repeat(v, lp, rp, 1, n);
1262                         NOERR();
1263                         EMPTYARC(lp, rp);
1264                         break;
1265                 case PAIR(0, INF):              /* loop x around */
1266                         s = newstate(v->nfa);
1267                         NOERR();
1268                         moveouts(v->nfa, lp, s);
1269                         moveins(v->nfa, rp, s);
1270                         EMPTYARC(lp, s);
1271                         EMPTYARC(s, rp);
1272                         break;
1273                 case PAIR(1, 1):                /* no action required */
1274                         break;
1275                 case PAIR(1, SOME):             /* do as x{0,n-1}x = (x{1,n-1}|)x */
1276                         s = newstate(v->nfa);
1277                         NOERR();
1278                         moveouts(v->nfa, lp, s);
1279                         dupnfa(v->nfa, s, rp, lp, s);
1280                         NOERR();
1281                         repeat(v, lp, s, 1, n - 1);
1282                         NOERR();
1283                         EMPTYARC(lp, s);
1284                         break;
1285                 case PAIR(1, INF):              /* add loopback arc */
1286                         s = newstate(v->nfa);
1287                         s2 = newstate(v->nfa);
1288                         NOERR();
1289                         moveouts(v->nfa, lp, s);
1290                         moveins(v->nfa, rp, s2);
1291                         EMPTYARC(lp, s);
1292                         EMPTYARC(s2, rp);
1293                         EMPTYARC(s2, s);
1294                         break;
1295                 case PAIR(SOME, SOME):  /* do as x{m-1,n-1}x */
1296                         s = newstate(v->nfa);
1297                         NOERR();
1298                         moveouts(v->nfa, lp, s);
1299                         dupnfa(v->nfa, s, rp, lp, s);
1300                         NOERR();
1301                         repeat(v, lp, s, m - 1, n - 1);
1302                         break;
1303                 case PAIR(SOME, INF):   /* do as x{m-1,}x */
1304                         s = newstate(v->nfa);
1305                         NOERR();
1306                         moveouts(v->nfa, lp, s);
1307                         dupnfa(v->nfa, s, rp, lp, s);
1308                         NOERR();
1309                         repeat(v, lp, s, m - 1, n);
1310                         break;
1311                 default:
1312                         ERR(REG_ASSERT);
1313                         break;
1314         }
1315 }
1316
1317 /*
1318  * bracket - handle non-complemented bracket expression
1319  * Also called from cbracket for complemented bracket expressions.
1320  */
1321 static void
1322 bracket(struct vars * v,
1323                 struct state * lp,
1324                 struct state * rp)
1325 {
1326         assert(SEE('['));
1327         NEXT();
1328         while (!SEE(']') && !SEE(EOS))
1329                 brackpart(v, lp, rp);
1330         assert(SEE(']') || ISERR());
1331         okcolors(v->nfa, v->cm);
1332 }
1333
1334 /*
1335  * cbracket - handle complemented bracket expression
1336  * We do it by calling bracket() with dummy endpoints, and then complementing
1337  * the result.  The alternative would be to invoke rainbow(), and then delete
1338  * arcs as the b.e. is seen... but that gets messy.
1339  */
1340 static void
1341 cbracket(struct vars * v,
1342                  struct state * lp,
1343                  struct state * rp)
1344 {
1345         struct state *left = newstate(v->nfa);
1346         struct state *right = newstate(v->nfa);
1347         struct state *s;
1348         struct arc *a;                          /* arc from lp */
1349         struct arc *ba;                         /* arc from left, from bracket() */
1350         struct arc *pa;                         /* MCCE-prototype arc */
1351         color           co;
1352         chr                *p;
1353         int                     i;
1354
1355         NOERR();
1356         bracket(v, left, right);
1357         if (v->cflags & REG_NLSTOP)
1358                 newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1359         NOERR();
1360
1361         assert(lp->nouts == 0);         /* all outarcs will be ours */
1362
1363         /* easy part of complementing */
1364         colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1365         NOERR();
1366         if (v->mcces == NULL)
1367         {                                                       /* no MCCEs -- we're done */
1368                 dropstate(v->nfa, left);
1369                 assert(right->nins == 0);
1370                 freestate(v->nfa, right);
1371                 return;
1372         }
1373
1374         /* but complementing gets messy in the presence of MCCEs... */
1375         NOTE(REG_ULOCALE);
1376         for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--)
1377         {
1378                 co = GETCOLOR(v->cm, *p);
1379                 a = findarc(lp, PLAIN, co);
1380                 ba = findarc(left, PLAIN, co);
1381                 if (ba == NULL)
1382                 {
1383                         assert(a != NULL);
1384                         freearc(v->nfa, a);
1385                 }
1386                 else
1387                         assert(a == NULL);
1388                 s = newstate(v->nfa);
1389                 NOERR();
1390                 newarc(v->nfa, PLAIN, co, lp, s);
1391                 NOERR();
1392                 pa = findarc(v->mccepbegin, PLAIN, co);
1393                 assert(pa != NULL);
1394                 if (ba == NULL)
1395                 {                                               /* easy case, need all of them */
1396                         cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1397                         newarc(v->nfa, '$', 1, s, rp);
1398                         newarc(v->nfa, '$', 0, s, rp);
1399                         colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
1400                 }
1401                 else
1402                 {                                               /* must be selective */
1403                         if (findarc(ba->to, '$', 1) == NULL)
1404                         {
1405                                 newarc(v->nfa, '$', 1, s, rp);
1406                                 newarc(v->nfa, '$', 0, s, rp);
1407                                 colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
1408                                                                 s, rp);
1409                         }
1410                         for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1411                                 if (findarc(ba->to, PLAIN, pa->co) == NULL)
1412                                         newarc(v->nfa, PLAIN, pa->co, s, rp);
1413                         if (s->nouts == 0)      /* limit of selectivity: none */
1414                                 dropstate(v->nfa, s);   /* frees arc too */
1415                 }
1416                 NOERR();
1417         }
1418
1419         delsub(v->nfa, left, right);
1420         assert(left->nouts == 0);
1421         freestate(v->nfa, left);
1422         assert(right->nins == 0);
1423         freestate(v->nfa, right);
1424 }
1425
1426 /*
1427  * brackpart - handle one item (or range) within a bracket expression
1428  */
1429 static void
1430 brackpart(struct vars * v,
1431                   struct state * lp,
1432                   struct state * rp)
1433 {
1434         celt            startc;
1435         celt            endc;
1436         struct cvec *cv;
1437         chr                *startp;
1438         chr                *endp;
1439         chr                     c[1];
1440
1441         /* parse something, get rid of special cases, take shortcuts */
1442         switch (v->nexttype)
1443         {
1444                 case RANGE:                             /* a-b-c or other botch */
1445                         ERR(REG_ERANGE);
1446                         return;
1447                 break;
1448                 case PLAIN:
1449                         c[0] = v->nextvalue;
1450                         NEXT();
1451                         /* shortcut for ordinary chr (not range, not MCCE leader) */
1452                         if (!SEE(RANGE) && !ISCELEADER(v, c[0]))
1453                         {
1454                                 onechr(v, c[0], lp, rp);
1455                                 return;
1456                         }
1457                         startc = element(v, c, c + 1);
1458                         NOERR();
1459                 break;
1460                 case COLLEL:
1461                         startp = v->now;
1462                         endp = scanplain(v);
1463                         INSIST(startp < endp, REG_ECOLLATE);
1464                         NOERR();
1465                         startc = element(v, startp, endp);
1466                         NOERR();
1467                         break;
1468                 case ECLASS:
1469                         startp = v->now;
1470                         endp = scanplain(v);
1471                         INSIST(startp < endp, REG_ECOLLATE);
1472                         NOERR();
1473                         startc = element(v, startp, endp);
1474                         NOERR();
1475                         cv = eclass(v, startc, (v->cflags & REG_ICASE));
1476                         NOERR();
1477                         dovec(v, cv, lp, rp);
1478                         return;
1479                 break;
1480                 case CCLASS:
1481                         startp = v->now;
1482                         endp = scanplain(v);
1483                         INSIST(startp < endp, REG_ECTYPE);
1484                         NOERR();
1485                         cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
1486                         NOERR();
1487                         dovec(v, cv, lp, rp);
1488                         return;
1489                 break;
1490                 default:
1491                         ERR(REG_ASSERT);
1492                         return;
1493                 break;
1494         }
1495
1496         if (SEE(RANGE))
1497         {
1498                 NEXT();
1499                 switch (v->nexttype)
1500                 {
1501                         case PLAIN:
1502                         case RANGE:
1503                                 c[0] = v->nextvalue;
1504                                 NEXT();
1505                                 endc = element(v, c, c + 1);
1506                                 NOERR();
1507                 break;
1508                         case COLLEL:
1509                                 startp = v->now;
1510                                 endp = scanplain(v);
1511                                 INSIST(startp < endp, REG_ECOLLATE);
1512                                 NOERR();
1513                                 endc = element(v, startp, endp);
1514                                 NOERR();
1515                 break;
1516                         default:
1517                                 ERR(REG_ERANGE);
1518                                 return;
1519                 break;
1520         }
1521         }
1522         else
1523                 endc = startc;
1524
1525         /*
1526          * Ranges are unportable.  Actually, standard C does guarantee that
1527          * digits are contiguous, but making that an exception is just too
1528          * complicated.
1529  */
1530         if (startc != endc)
1531                 NOTE(REG_UUNPORT);
1532         cv = range(v, startc, endc, (v->cflags & REG_ICASE));
1533         NOERR();
1534         dovec(v, cv, lp, rp);
1535 }
1536
1537 /*
1538  * scanplain - scan PLAIN contents of [. etc.
1539  *
1540  * Certain bits of trickery in lex.c know that this code does not try
1541  * to look past the final bracket of the [. etc.
1542  */
1543 static chr *                                    /* just after end of sequence */
1544 scanplain(struct vars * v)
1545 {
1546         chr                *endp;
1547
1548         assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1549         NEXT();
1550
1551         endp = v->now;
1552         while (SEE(PLAIN))
1553         {
1554                 endp = v->now;
1555                 NEXT();
1556         }
1557
1558         assert(SEE(END) || ISERR());
1559         NEXT();
1560
1561         return endp;
1562 }
1563
1564 /*
1565  * leaders - process a cvec of collating elements to also include leaders
1566  * Also gives all characters involved their own colors, which is almost
1567  * certainly necessary, and sets up little disconnected subNFA.
1568  */
1569 static void
1570 leaders(struct vars * v,
1571                 struct cvec * cv)
1572 {
1573         int                     mcce;
1574         chr                *p;
1575         chr                     leader;
1576         struct state *s;
1577         struct arc *a;
1578
1579         v->mccepbegin = newstate(v->nfa);
1580         v->mccepend = newstate(v->nfa);
1581         NOERR();
1582
1583         for (mcce = 0; mcce < cv->nmcces; mcce++)
1584         {
1585                 p = cv->mcces[mcce];
1586                 leader = *p;
1587                 if (!haschr(cv, leader))
1588                 {
1589                         addchr(cv, leader);
1590                         s = newstate(v->nfa);
1591                         newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
1592                                    v->mccepbegin, s);
1593                         okcolors(v->nfa, v->cm);
1594                 }
1595                 else
1596                 {
1597                         a = findarc(v->mccepbegin, PLAIN,
1598                                                 GETCOLOR(v->cm, leader));
1599                         assert(a != NULL);
1600                         s = a->to;
1601                         assert(s != v->mccepend);
1602                 }
1603                 p++;
1604                 assert(*p != 0 && *(p + 1) == 0);               /* only 2-char MCCEs for
1605                                                                                                  * now */
1606                 newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1607                 okcolors(v->nfa, v->cm);
1608         }
1609 }
1610
1611 /*
1612  * onechr - fill in arcs for a plain character, and possible case complements
1613  * This is mostly a shortcut for efficient handling of the common case.
1614  */
1615 static void
1616 onechr(struct vars * v,
1617            chr c,
1618            struct state * lp,
1619            struct state * rp)
1620 {
1621         if (!(v->cflags & REG_ICASE))
1622         {
1623                 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1624                 return;
1625         }
1626
1627         /* rats, need general case anyway... */
1628         dovec(v, allcases(v, c), lp, rp);
1629 }
1630
1631 /*
1632  * dovec - fill in arcs for each element of a cvec
1633  * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1634  */
1635 static void
1636 dovec(struct vars * v,
1637           struct cvec * cv,
1638           struct state * lp,
1639           struct state * rp)
1640 {
1641         chr                     ch,
1642                                 from,
1643                                 to;
1644         celt            ce;
1645         chr                *p;
1646         int                     i;
1647         color           co;
1648         struct cvec *leads;
1649         struct arc *a;
1650         struct arc *pa;                         /* arc in prototype */
1651         struct state *s;
1652         struct state *ps;                       /* state in prototype */
1653
1654         /* need a place to store leaders, if any */
1655         if (nmcces(v) > 0)
1656         {
1657                 assert(v->mcces != NULL);
1658                 if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs)
1659                 {
1660                         if (v->cv2 != NULL)
1661                                 free(v->cv2);
1662                         v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1663                         NOERR();
1664                         leads = v->cv2;
1665                 }
1666                 else
1667                         leads = clearcvec(v->cv2);
1668                 }
1669         else
1670                 leads = NULL;
1671
1672         /* first, get the ordinary characters out of the way */
1673         for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--)
1674         {
1675                 ch = *p;
1676                 if (!ISCELEADER(v, ch))
1677                         newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1678                 else
1679                 {
1680                         assert(singleton(v->cm, ch));
1681                         assert(leads != NULL);
1682                         if (!haschr(leads, ch))
1683                                 addchr(leads, ch);
1684         }
1685         }
1686
1687         /* and the ranges */
1688         for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--)
1689         {
1690                 from = *p;
1691                 to = *(p + 1);
1692                 while (from <= to && (ce = nextleader(v, from, to)) != NOCELT)
1693                 {
1694                         if (from < ce)
1695                                 subrange(v, from, ce - 1, lp, rp);
1696                         assert(singleton(v->cm, ce));
1697                         assert(leads != NULL);
1698                         if (!haschr(leads, ce))
1699                                 addchr(leads, ce);
1700                         from = ce + 1;
1701                 }
1702                 if (from <= to)
1703                         subrange(v, from, to, lp, rp);
1704         }
1705
1706         if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1707                 return;
1708
1709         /* deal with the MCCE leaders */
1710         NOTE(REG_ULOCALE);
1711         for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--)
1712         {
1713                 co = GETCOLOR(v->cm, *p);
1714                 a = findarc(lp, PLAIN, co);
1715                 if (a != NULL)
1716                         s = a->to;
1717                 else
1718                 {
1719                         s = newstate(v->nfa);
1720                         NOERR();
1721                         newarc(v->nfa, PLAIN, co, lp, s);
1722                         NOERR();
1723                 }
1724                 pa = findarc(v->mccepbegin, PLAIN, co);
1725                 assert(pa != NULL);
1726                 ps = pa->to;
1727                 newarc(v->nfa, '$', 1, s, rp);
1728                 newarc(v->nfa, '$', 0, s, rp);
1729                 colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1730                 NOERR();
1731         }
1732
1733         /* and the MCCEs */
1734         for (i = 0; i < cv->nmcces; i++)
1735         {
1736                 p = cv->mcces[i];
1737                 assert(singleton(v->cm, *p));
1738                 if (!singleton(v->cm, *p))
1739                 {
1740                         ERR(REG_ASSERT);
1741                         return;
1742                 }
1743                 ch = *p++;
1744                 co = GETCOLOR(v->cm, ch);
1745                 a = findarc(lp, PLAIN, co);
1746                 if (a != NULL)
1747                         s = a->to;
1748                 else
1749                 {
1750                         s = newstate(v->nfa);
1751                         NOERR();
1752                         newarc(v->nfa, PLAIN, co, lp, s);
1753                         NOERR();
1754                 }
1755                 assert(*p != 0);                /* at least two chars */
1756                 assert(singleton(v->cm, *p));
1757                 ch = *p++;
1758                 co = GETCOLOR(v->cm, ch);
1759                 assert(*p == 0);                /* and only two, for now */
1760                 newarc(v->nfa, PLAIN, co, s, rp);
1761                 NOERR();
1762         }
1763 }
1764
1765 /*
1766  * nextleader - find next MCCE leader within range
1767  */
1768 static celt                                             /* NOCELT means none */
1769 nextleader(struct vars * v,
1770                    chr from,
1771                    chr to)
1772 {
1773         int                     i;
1774         chr                *p;
1775         chr                     ch;
1776         celt            it = NOCELT;
1777
1778         if (v->mcces == NULL)
1779                 return it;
1780
1781         for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++)
1782         {
1783                 ch = *p;
1784                 if (from <= ch && ch <= to)
1785                         if (it == NOCELT || ch < it)
1786                                 it = ch;
1787         }
1788         return it;
1789 }
1790
1791 /*
1792  * wordchrs - set up word-chr list for word-boundary stuff, if needed
1793  *
1794  * The list is kept as a bunch of arcs between two dummy states; it's
1795  * disposed of by the unreachable-states sweep in NFA optimization.
1796  * Does NEXT().  Must not be called from any unusual lexical context.
1797  * This should be reconciled with the \w etc. handling in lex.c, and
1798  * should be cleaned up to reduce dependencies on input scanning.
1799  */
1800 static void
1801 wordchrs(struct vars * v)
1802 {
1803         struct state *left;
1804         struct state *right;
1805
1806         if (v->wordchrs != NULL)
1807         {
1808                 NEXT();                                 /* for consistency */
1809                 return;
1810         }
1811
1812         left = newstate(v->nfa);
1813         right = newstate(v->nfa);
1814         NOERR();
1815         /* fine point:  implemented with [::], and lexer will set REG_ULOCALE */
1816         lexword(v);
1817         NEXT();
1818         assert(v->savenow != NULL && SEE('['));
1819         bracket(v, left, right);
1820         assert((v->savenow != NULL && SEE(']')) || ISERR());
1821         NEXT();
1822         NOERR();
1823         v->wordchrs = left;
1824 }
1825
1826 /*
1827  * subre - allocate a subre
1828  */
1829 static struct subre *
1830 subre(struct vars * v,
1831           int op,
1832           int flags,
1833           struct state * begin,
1834           struct state * end)
1835 {
1836         struct subre *ret;
1837
1838         ret = v->treefree;
1839         if (ret != NULL)
1840                 v->treefree = ret->left;
1841         else
1842         {
1843                 ret = (struct subre *) MALLOC(sizeof(struct subre));
1844                 if (ret == NULL)
1845                 {
1846                         ERR(REG_ESPACE);
1847                         return NULL;
1848         }
1849                 ret->chain = v->treechain;
1850                 v->treechain = ret;
1851         }
1852
1853         assert(strchr("|.b(=", op) != NULL);
1854
1855         ret->op = op;
1856         ret->flags = flags;
1857         ret->retry = 0;
1858         ret->subno = 0;
1859         ret->min = ret->max = 1;
1860         ret->left = NULL;
1861         ret->right = NULL;
1862         ret->begin = begin;
1863         ret->end = end;
1864         ZAPCNFA(ret->cnfa);
1865
1866         return ret;
1867 }
1868
1869 /*
1870  * freesubre - free a subRE subtree
1871  */
1872 static void
1873 freesubre(struct vars * v,              /* might be NULL */
1874                   struct subre * sr)
1875 {
1876         if (sr == NULL)
1877                 return;
1878
1879         if (sr->left != NULL)
1880                 freesubre(v, sr->left);
1881         if (sr->right != NULL)
1882                 freesubre(v, sr->right);
1883
1884         freesrnode(v, sr);
1885 }
1886
1887 /*
1888  * freesrnode - free one node in a subRE subtree
1889  */
1890 static void
1891 freesrnode(struct vars * v,             /* might be NULL */
1892                    struct subre * sr)
1893 {
1894         if (sr == NULL)
1895                 return;
1896
1897         if (!NULLCNFA(sr->cnfa))
1898                 freecnfa(&sr->cnfa);
1899         sr->flags = 0;
1900
1901         if (v != NULL)
1902         {
1903                 sr->left = v->treefree;
1904                 v->treefree = sr;
1905         }
1906         else
1907                 FREE(sr);
1908 }
1909
1910 /*
1911  * optst - optimize a subRE subtree
1912  */
1913 static void
1914 optst(struct vars * v,
1915           struct subre * t)
1916 {
1917         if (t == NULL)
1918                 return;
1919
1920         /* recurse through children */
1921         if (t->left != NULL)
1922                 optst(v, t->left);
1923         if (t->right != NULL)
1924                 optst(v, t->right);
1925 }
1926
1927 /*
1928  * numst - number tree nodes (assigning retry indexes)
1929  */
1930 static int                                              /* next number */
1931 numst(struct subre * t,
1932           int start)                            /* starting point for subtree numbers */
1933 {
1934         int                     i;
1935
1936         assert(t != NULL);
1937
1938         i = start;
1939         t->retry = (short) i++;
1940         if (t->left != NULL)
1941                 i = numst(t->left, i);
1942         if (t->right != NULL)
1943                 i = numst(t->right, i);
1944         return i;
1945 }
1946
1947 /*
1948  * markst - mark tree nodes as INUSE
1949  */
1950 static void
1951 markst(struct subre * t)
1952 {
1953         assert(t != NULL);
1954
1955         t->flags |= INUSE;
1956         if (t->left != NULL)
1957                 markst(t->left);
1958         if (t->right != NULL)
1959                 markst(t->right);
1960 }
1961
1962 /*
1963  * cleanst - free any tree nodes not marked INUSE
1964  */
1965 static void
1966 cleanst(struct vars * v)
1967 {
1968         struct subre *t;
1969         struct subre *next;
1970
1971         for (t = v->treechain; t != NULL; t = next)
1972         {
1973                 next = t->chain;
1974                 if (!(t->flags & INUSE))
1975                         FREE(t);
1976                 }
1977         v->treechain = NULL;
1978         v->treefree = NULL;                     /* just on general principles */
1979 }
1980
1981 /*
1982  * nfatree - turn a subRE subtree into a tree of compacted NFAs
1983  */
1984 static long                                             /* optimize results from top node */
1985 nfatree(struct vars * v,
1986                 struct subre * t,
1987                 FILE *f)                                /* for debug output */
1988 {
1989         assert(t != NULL && t->begin != NULL);
1990
1991         if (t->left != NULL)
1992                 (DISCARD) nfatree(v, t->left, f);
1993         if (t->right != NULL)
1994                 (DISCARD) nfatree(v, t->right, f);
1995
1996         return nfanode(v, t, f);
1997 }
1998
1999 /*
2000  * nfanode - do one NFA for nfatree
2001  */
2002 static long                                             /* optimize results */
2003 nfanode(struct vars * v,
2004                 struct subre * t,
2005                 FILE *f)                                /* for debug output */
2006 {
2007         struct nfa *nfa;
2008         long            ret = 0;
2009
2010         assert(t->begin != NULL);
2011
2012 #ifdef REG_DEBUG
2013         if (f != NULL)
2014         {
2015                 char            idbuf[50];
2016
2017                 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
2018                                 stid(t, idbuf, sizeof(idbuf)));
2019         }
2020 #endif
2021         nfa = newnfa(v, v->cm, v->nfa);
2022         NOERRZ();
2023         dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
2024         if (!ISERR())
2025         {
2026                 specialcolors(nfa);
2027                 ret = optimize(nfa, f);
2028         }
2029         if (!ISERR())
2030                 compact(nfa, &t->cnfa);
2031
2032         freenfa(nfa);
2033         return ret;
2034 }
2035
2036 /*
2037  * newlacon - allocate a lookahead-constraint subRE
2038  */
2039 static int                                              /* lacon number */
2040 newlacon(struct vars * v,
2041                  struct state * begin,
2042                  struct state * end,
2043                  int pos)
2044 {
2045         int                     n;
2046         struct subre *sub;
2047
2048         if (v->nlacons == 0)
2049         {
2050                 v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
2051                 n = 1;                                  /* skip 0th */
2052                 v->nlacons = 2;
2053                 }
2054         else
2055         {
2056                 v->lacons = (struct subre *) REALLOC(v->lacons,
2057                                                                 (v->nlacons + 1) * sizeof(struct subre));
2058                 n = v->nlacons++;
2059                 }
2060         if (v->lacons == NULL)
2061         {
2062                 ERR(REG_ESPACE);
2063                 return 0;
2064         }
2065         sub = &v->lacons[n];
2066         sub->begin = begin;
2067         sub->end = end;
2068         sub->subno = pos;
2069         ZAPCNFA(sub->cnfa);
2070         return n;
2071 }
2072
2073 /*
2074  * freelacons - free lookahead-constraint subRE vector
2075  */
2076 static void
2077 freelacons(struct subre * subs,
2078                    int n)
2079 {
2080         struct subre *sub;
2081         int                     i;
2082
2083         assert(n > 0);
2084         for (sub = subs + 1, i = n - 1; i > 0; sub++, i--)      /* no 0th */
2085                 if (!NULLCNFA(sub->cnfa))
2086                         freecnfa(&sub->cnfa);
2087         FREE(subs);
2088 }
2089
2090 /*
2091  * rfree - free a whole RE (insides of regfree)
2092  */
2093 static void
2094 rfree(regex_t *re)
2095 {
2096         struct guts *g;
2097
2098         if (re == NULL || re->re_magic != REMAGIC)
2099                 return;
2100
2101         re->re_magic = 0;                       /* invalidate RE */
2102         g = (struct guts *) re->re_guts;
2103         re->re_guts = NULL;
2104         re->re_fns = NULL;
2105         g->magic = 0;
2106         freecm(&g->cmap);
2107         if (g->tree != NULL)
2108                 freesubre((struct vars *) NULL, g->tree);
2109         if (g->lacons != NULL)
2110                 freelacons(g->lacons, g->nlacons);
2111         if (!NULLCNFA(g->search))
2112                 freecnfa(&g->search);
2113         FREE(g);
2114 }
2115
2116 #ifdef REG_DEBUG
2117
2118 /*
2119  * dump - dump an RE in human-readable form
2120  */
2121 static void
2122 dump(regex_t *re,
2123          FILE *f)
2124 {
2125         struct guts *g;
2126         int                     i;
2127
2128         if (re->re_magic != REMAGIC)
2129                 fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2130                                 REMAGIC);
2131         if (re->re_guts == NULL)
2132         {
2133                 fprintf(f, "NULL guts!!!\n");
2134                 return;
2135         }
2136         g = (struct guts *) re->re_guts;
2137         if (g->magic != GUTSMAGIC)
2138                 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2139                                 GUTSMAGIC);
2140
2141         fprintf(f, "\n\n\n========= DUMP ==========\n");
2142         fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2143                         re->re_nsub, re->re_info, re->re_csize, g->ntree);
2144
2145         dumpcolors(&g->cmap, f);
2146         if (!NULLCNFA(g->search))
2147         {
2148                 printf("\nsearch:\n");
2149                 dumpcnfa(&g->search, f);
2150         }
2151         for (i = 1; i < g->nlacons; i++)
2152         {
2153                 fprintf(f, "\nla%d (%s):\n", i,
2154                                 (g->lacons[i].subno) ? "positive" : "negative");
2155                 dumpcnfa(&g->lacons[i].cnfa, f);
2156         }
2157         fprintf(f, "\n");
2158         dumpst(g->tree, f, 0);
2159 }
2160
2161 /*
2162  * dumpst - dump a subRE tree
2163  */
2164 static void
2165 dumpst(struct subre * t,
2166            FILE *f,
2167            int nfapresent)                      /* is the original NFA still around? */
2168 {
2169         if (t == NULL)
2170                 fprintf(f, "null tree\n");
2171         else
2172                 stdump(t, f, nfapresent);
2173         fflush(f);
2174 }
2175
2176 /*
2177  * stdump - recursive guts of dumpst
2178  */
2179 static void
2180 stdump(struct subre * t,
2181            FILE *f,
2182            int nfapresent)                      /* is the original NFA still around? */
2183 {
2184         char            idbuf[50];
2185
2186         fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2187         if (t->flags & LONGER)
2188                 fprintf(f, " longest");
2189         if (t->flags & SHORTER)
2190                 fprintf(f, " shortest");
2191         if (t->flags & MIXED)
2192                 fprintf(f, " hasmixed");
2193         if (t->flags & CAP)
2194                 fprintf(f, " hascapture");
2195         if (t->flags & BACKR)
2196                 fprintf(f, " hasbackref");
2197         if (!(t->flags & INUSE))
2198                 fprintf(f, " UNUSED");
2199         if (t->subno != 0)
2200                 fprintf(f, " (#%d)", t->subno);
2201         if (t->min != 1 || t->max != 1)
2202         {
2203                 fprintf(f, " {%d,", t->min);
2204                 if (t->max != INFINITY)
2205                         fprintf(f, "%d", t->max);
2206                 fprintf(f, "}");
2207         }
2208         if (nfapresent)
2209                 fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
2210         if (t->left != NULL)
2211                 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2212         if (t->right != NULL)
2213                 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2214         if (!NULLCNFA(t->cnfa))
2215         {
2216                 fprintf(f, "\n");
2217                 dumpcnfa(&t->cnfa, f);
2218                 fprintf(f, "\n");
2219         }
2220         if (t->left != NULL)
2221                 stdump(t->left, f, nfapresent);
2222         if (t->right != NULL)
2223                 stdump(t->right, f, nfapresent);
2224 }
2225
2226 /*
2227  * stid - identify a subtree node for dumping
2228  */
2229 static char *                                   /* points to buf or constant string */
2230 stid(struct subre * t,
2231          char *buf,
2232          size_t bufsize)
2233 {
2234         /* big enough for hex int or decimal t->retry? */
2235         if (bufsize < sizeof(int) * 2 + 3 || bufsize < sizeof(t->retry) * 3 + 1)
2236                 return "unable";
2237         if (t->retry != 0)
2238                 sprintf(buf, "%d", t->retry);
2239         else
2240                 sprintf(buf, "0x%x", (int) t);  /* may lose bits, that's okay */
2241         return buf;
2242 }
2243 #endif   /* REG_DEBUG */
2244
2245
2246 #include "regc_lex.c"
2247 #include "regc_color.c"
2248 #include "regc_nfa.c"
2249 #include "regc_cvec.c"
2250 #include "regc_locale.c"