src/regex/regcomp.c

   1 /*
   2  * re_*comp and friends - compile REs
   3  * This file #includes several others (see the bottom).
   4  *
   5  * Copyright (c) 1998, 1999 Henry Spencer.      All rights reserved.
   6  *
   7  * Development of this software was funded, in part, by Cray Research Inc.,
   8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
   9  * Corporation, none of whom are responsible for the results.  The author
  10  * thanks all of them.
  11  *
  12  * Redistribution and use in source and binary forms -- with or without
  13  * modification -- are permitted for any purpose, provided that
  14  * redistributions in source form retain this entire copyright notice and
  15  * indicate the origin and nature of any modifications.
  16  *
  17  * I'd appreciate being given credit for this package in the documentation
  18  * of software which uses it, but that is not a requirement.
  19  *
  20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
  21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
  22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
  23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $
  32  *
  33  */
  34
  35 #include "regguts.h"
  36
  37 /*
  38  * forward declarations, up here so forward datatypes etc. are defined early
  39  */
  40 /* === regcomp.c === */
  41 static void moresubs(struct vars *, int);
  42 static int      freev(struct vars *, int);
  43 static void makesearch(struct vars *, struct nfa *);
  44 static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
  45 static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
  46 static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
  47 static void nonword(struct vars *, int, struct state *, struct state *);
  48 static void word(struct vars *, int, struct state *, struct state *);
  49 static int      scannum(struct vars *);
  50 static void repeat(struct vars *, struct state *, struct state *, int, int);
  51 static void bracket(struct vars *, struct state *, struct state *);
  52 static void cbracket(struct vars *, struct state *, struct state *);
  53 static void brackpart(struct vars *, struct state *, struct state *);
  54 static chr *scanplain(struct vars *);
  55 static void leaders(struct vars *, struct cvec *);
  56 static void onechr(struct vars *, chr, struct state *, struct state *);
  57 static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
  58 static celt nextleader(struct vars *, chr, chr);
  59 static void wordchrs(struct vars *);
  60 static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
  61 static void freesubre(struct vars *, struct subre *);
  62 static void freesrnode(struct vars *, struct subre *);
  63 static void optst(struct vars *, struct subre *);
  64 static int      numst(struct subre *, int);
  65 static void markst(struct subre *);
  66 static void cleanst(struct vars *);
  67 static long nfatree(struct vars *, struct subre *, FILE *);
  68 static long nfanode(struct vars *, struct subre *, FILE *);
  69 static int      newlacon(struct vars *, struct state *, struct state *, int);
  70 static void freelacons(struct subre *, int);
  71 static void rfree(regex_t *);
  72
  73 #ifdef REG_DEBUG
  74 static void dump(regex_t *, FILE *);
  75 static void dumpst(struct subre *, FILE *, int);
  76 static void stdump(struct subre *, FILE *, int);
  77 static char *stid(struct subre *, char *, size_t);
  78 #endif
  79 /* === regc_lex.c === */
  80 static void lexstart(struct vars *);
  81 static void prefixes(struct vars *);
  82 static void lexnest(struct vars *, chr *, chr *);
  83 static void lexword(struct vars *);
  84 static int      next(struct vars *);
  85 static int      lexescape(struct vars *);
  86 static chr      lexdigits(struct vars *, int, int, int);
  87 static int      brenext(struct vars *, chr);
  88 static void skip(struct vars *);
  89 static chr      newline(void);
  90 static chr      chrnamed(struct vars *, chr *, chr *, chr);
  91
  92 /* === regc_color.c === */
  93 static void initcm(struct vars *, struct colormap *);
  94 static void freecm(struct colormap *);
  95 static void cmtreefree(struct colormap *, union tree *, int);
  96 static color setcolor(struct colormap *, chr, pcolor);
  97 static color maxcolor(struct colormap *);
  98 static color newcolor(struct colormap *);
  99 static void freecolor(struct colormap *, pcolor);
 100 static color pseudocolor(struct colormap *);
 101 static color subcolor(struct colormap *, chr c);
 102 static color newsub(struct colormap *, pcolor);
 103 static void subrange(struct vars *, chr, chr, struct state *, struct state *);
 104 static void subblock(struct vars *, chr, struct state *, struct state *);
 105 static void okcolors(struct nfa *, struct colormap *);
 106 static void colorchain(struct colormap *, struct arc *);
 107 static void uncolorchain(struct colormap *, struct arc *);
 108 static int      singleton(struct colormap *, chr c);
 109 static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
 110 static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
 111
 112 #ifdef REG_DEBUG
 113 static void dumpcolors(struct colormap *, FILE *);
 114 static void fillcheck(struct colormap *, union tree *, int, FILE *);
 115 static void dumpchr(chr, FILE *);
 116 #endif
 117 /* === regc_nfa.c === */
 118 static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
 119 static void freenfa(struct nfa *);
 120 static struct state *newstate(struct nfa *);
 121 static struct state *newfstate(struct nfa *, int flag);
 122 static void dropstate(struct nfa *, struct state *);
 123 static void freestate(struct nfa *, struct state *);
 124 static void destroystate(struct nfa *, struct state *);
 125 static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
 126 static struct arc *allocarc(struct nfa *, struct state *);
 127 static void freearc(struct nfa *, struct arc *);
 128 static struct arc *findarc(struct state *, int, pcolor);
 129 static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
 130 static void moveins(struct nfa *, struct state *, struct state *);
 131 static void copyins(struct nfa *, struct state *, struct state *);
 132 static void moveouts(struct nfa *, struct state *, struct state *);
 133 static void copyouts(struct nfa *, struct state *, struct state *);
 134 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
 135 static void delsub(struct nfa *, struct state *, struct state *);
 136 static void deltraverse(struct nfa *, struct state *, struct state *);
 137 static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
 138 static void duptraverse(struct nfa *, struct state *, struct state *);
 139 static void cleartraverse(struct nfa *, struct state *);
 140 static void specialcolors(struct nfa *);
 141 static long optimize(struct nfa *, FILE *);
 142 static void pullback(struct nfa *, FILE *);
 143 static int      pull(struct nfa *, struct arc *);
 144 static void pushfwd(struct nfa *, FILE *);
 145 static int      push(struct nfa *, struct arc *);
 146
 147 #define INCOMPATIBLE    1               /* destroys arc */
 148 #define SATISFIED       2                       /* constraint satisfied */
 149 #define COMPATIBLE      3                       /* compatible but not satisfied yet */
 150 static int      combine(struct arc *, struct arc *);
 151 static void fixempties(struct nfa *, FILE *);
 152 static int      unempty(struct nfa *, struct arc *);
 153 static void cleanup(struct nfa *);
 154 static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
 155 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
 156 static long analyze(struct nfa *);
 157 static void compact(struct nfa *, struct cnfa *);
 158 static void carcsort(struct carc *, struct carc *);
 159 static void freecnfa(struct cnfa *);
 160 static void dumpnfa(struct nfa *, FILE *);
 161
 162 #ifdef REG_DEBUG
 163 static void dumpstate(struct state *, FILE *);
 164 static void dumparcs(struct state *, FILE *);
 165 static int      dumprarcs(struct arc *, struct state *, FILE *, int);
 166 static void dumparc(struct arc *, struct state *, FILE *);
 167 static void dumpcnfa(struct cnfa *, FILE *);
 168 static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
 169 #endif
 170 /* === regc_cvec.c === */
 171 static struct cvec *newcvec(int, int, int);
 172 static struct cvec *clearcvec(struct cvec *);
 173 static void addchr(struct cvec *, chr);
 174 static void addrange(struct cvec *, chr, chr);
 175 static void addmcce(struct cvec *, chr *, chr *);
 176 static int      haschr(struct cvec *, chr);
 177 static struct cvec *getcvec(struct vars *, int, int, int);
 178 static void freecvec(struct cvec *);
 179
 180 /* === regc_locale.c === */
 181 static int      wx_isdigit(wx_wchar c);
 182 static int      wx_isalpha(wx_wchar c);
 183 static int      wx_isalnum(wx_wchar c);
 184 static int      wx_isupper(wx_wchar c);
 185 static int      wx_islower(wx_wchar c);
 186 static int      wx_isgraph(wx_wchar c);
 187 static int      wx_ispunct(wx_wchar c);
 188 static int      wx_isspace(wx_wchar c);
 189 static wx_wchar wx_toupper(wx_wchar c);
 190 static wx_wchar wx_tolower(wx_wchar c);
 191 static int      nmcces(struct vars *);
 192 static int      nleaders(struct vars *);
 193 static struct cvec *allmcces(struct vars *, struct cvec *);
 194 static celt element(struct vars *, chr *, chr *);
 195 static struct cvec *range(struct vars *, celt, celt, int);
 196 static int      before(celt, celt);
 197 static struct cvec *eclass(struct vars *, celt, int);
 198 static struct cvec *cclass(struct vars *, chr *, chr *, int);
 199 static struct cvec *allcases(struct vars *, chr);
 200 static int      cmp(const chr *, const chr *, size_t);
 201 static int      casecmp(const chr *, const chr *, size_t);
 202
 203
 204 /* internal variables, bundled for easy passing around */
 205 struct vars
 206 {
 207         regex_t    *re;
 208         chr                *now;                        /* scan pointer into string */
 209         chr                *stop;                       /* end of string */
 210         chr                *savenow;            /* saved now and stop for "subroutine
 211                                                                  * call" */
 212         chr                *savestop;
 213         int                     err;                    /* error code (0 if none) */
 214         int                     cflags;                 /* copy of compile flags */
 215         int                     lasttype;               /* type of previous token */
 216         int                     nexttype;               /* type of next token */
 217         chr                     nextvalue;              /* value (if any) of next token */
 218         int                     lexcon;                 /* lexical context type (see lex.c) */
 219         int                     nsubexp;                /* subexpression count */
 220         struct subre **subs;            /* subRE pointer vector */
 221         size_t          nsubs;                  /* length of vector */
 222         struct subre *sub10[10];        /* initial vector, enough for most */
 223         struct nfa *nfa;                        /* the NFA */
 224         struct colormap *cm;            /* character color map */
 225         color           nlcolor;                /* color of newline */
 226         struct state *wordchrs;         /* state in nfa holding word-char outarcs */
 227         struct subre *tree;                     /* subexpression tree */
 228         struct subre *treechain;        /* all tree nodes allocated */
 229         struct subre *treefree;         /* any free tree nodes */
 230         int                     ntree;                  /* number of tree nodes */
 231         struct cvec *cv;                        /* interface cvec */
 232         struct cvec *cv2;                       /* utility cvec */
 233         struct cvec *mcces;                     /* collating-element information */
 234 #define  ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
 235         struct state *mccepbegin;       /* in nfa, start of MCCE prototypes */
 236         struct state *mccepend;         /* in nfa, end of MCCE prototypes */
 237         struct subre *lacons;           /* lookahead-constraint vector */
 238         int                     nlacons;                /* size of lacons */
 239 };
 240
 241 /* parsing macros; most know that `v' is the struct vars pointer */
 242 #define NEXT()  (next(v))               /* advance by one token */
 243 #define SEE(t)  (v->nexttype == (t))    /* is next token this? */
 244 #define EAT(t)  (SEE(t) && next(v))             /* if next is this, swallow it */
 245 #define VISERR(vv)      ((vv)->err != 0)        /* have we seen an error yet? */
 246 #define ISERR() VISERR(v)
 247 #define VERR(vv,e)      ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
 248                                                         ((vv)->err = (e)))
 249 #define ERR(e)  VERR(v, e)              /* record an error */
 250 #define NOERR() {if (ISERR()) return;}  /* if error seen, return */
 251 #define NOERRN()        {if (ISERR()) return NULL;} /* NOERR with retval */
 252 #define NOERRZ()        {if (ISERR()) return 0;}        /* NOERR with retval */
 253 #define INSIST(c, e)    ((c) ? 0 : ERR(e))              /* if condition false,
 254                                                                                                  * error */
 255 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
 256 #define EMPTYARC(x, y)  newarc(v->nfa, EMPTY, 0, x, y)
 257
 258 /* token type codes, some also used as NFA arc types */
 259 #define EMPTY   'n'                             /* no token present */
 260 #define EOS 'e'                                 /* end of string */
 261 #define PLAIN   'p'                             /* ordinary character */
 262 #define DIGIT   'd'                             /* digit (in bound) */
 263 #define BACKREF 'b'                             /* back reference */
 264 #define COLLEL  'I'                             /* start of [. */
 265 #define ECLASS  'E'                             /* start of [= */
 266 #define CCLASS  'C'                             /* start of [: */
 267 #define END 'X'                                 /* end of [. [= [: */
 268 #define RANGE   'R'                             /* - within [] which might be range delim. */
 269 #define LACON   'L'                             /* lookahead constraint subRE */
 270 #define AHEAD   'a'                             /* color-lookahead arc */
 271 #define BEHIND  'r'                             /* color-lookbehind arc */
 272 #define WBDRY   'w'                             /* word boundary constraint */
 273 #define NWBDRY  'W'                             /* non-word-boundary constraint */
 274 #define SBEGIN  'A'                             /* beginning of string (even if not BOL) */
 275 #define SEND    'Z'                             /* end of string (even if not EOL) */
 276 #define PREFER  'P'                             /* length preference */
 277
 278 /* is an arc colored, and hence on a color chain? */
 279 #define COLORED(a)      ((a)->type == PLAIN || (a)->type == AHEAD || \
 280                                                         (a)->type == BEHIND)
 281
 282
 283
 284 /* static function list */
 285 static struct fns functions = {
 286         rfree,                                          /* regfree insides */
 287 };
 288
 289
 290
 291 /*
 292  * regcomp - compile regular expression
 293  */
 294 int
 295 regcomp(regex_t *re,
 296                    const chr *string,
 297                    int flags)
 298 {
 299
 300     size_t nLen = 0;
 301     chr* s2 = (chr*) string;
 302
 303     if (string && *string)
 304     {
 305         while(*++s2);
 306     }
 307
 308     nLen = ((s2 - string) / sizeof(chr));
 309
 310         return wx_regcomp(re, string, nLen, flags);
 311 }
 312 int
 313 wx_regcomp(regex_t *re,
 314                    const chr *string,
 315                    size_t len,
 316                    int flags)
 317 {
 318         struct vars var;
 319         struct vars *v = &var;
 320         struct guts *g;
 321         int                     i;
 322         size_t          j;
 323
 324 #ifdef REG_DEBUG
 325         FILE       *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
 326
 327 #else
 328         FILE       *debug = (FILE *) NULL;
 329 #endif
 330
 331 #define  CNOERR()        { if (ISERR()) return freev(v, v->err); }
 332
 333         /* sanity checks */
 334
 335         if (re == NULL || string == NULL)
 336                 return REG_INVARG;
 337         if ((flags & REG_QUOTE) &&
 338                 (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
 339                 return REG_INVARG;
 340         if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
 341                 return REG_INVARG;
 342
 343         /* initial setup (after which freev() is callable) */
 344         v->re = re;
 345         v->now = (chr *) string;
 346         v->stop = v->now + len;
 347         v->savenow = v->savestop = NULL;
 348         v->err = 0;
 349         v->cflags = flags;
 350         v->nsubexp = 0;
 351         v->subs = v->sub10;
 352         v->nsubs = 10;
 353         for (j = 0; j < v->nsubs; j++)
 354                 v->subs[j] = NULL;
 355         v->nfa = NULL;
 356         v->cm = NULL;
 357         v->nlcolor = COLORLESS;
 358         v->wordchrs = NULL;
 359         v->tree = NULL;
 360         v->treechain = NULL;
 361         v->treefree = NULL;
 362         v->cv = NULL;
 363         v->cv2 = NULL;
 364         v->mcces = NULL;
 365         v->lacons = NULL;
 366         v->nlacons = 0;
 367         re->re_magic = REMAGIC;
 368         re->re_info = 0;                        /* bits get set during parse */
 369         re->re_csize = sizeof(chr);
 370         re->re_guts = NULL;
 371         re->re_fns = VS(&functions);
 372
 373         /* more complex setup, malloced things */
 374         re->re_guts = VS(MALLOC(sizeof(struct guts)));
 375         if (re->re_guts == NULL)
 376                 return freev(v, REG_ESPACE);
 377         g = (struct guts *) re->re_guts;
 378         g->tree = NULL;
 379         initcm(v, &g->cmap);
 380         v->cm = &g->cmap;
 381         g->lacons = NULL;
 382         g->nlacons = 0;
 383         ZAPCNFA(g->search);
 384         v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
 385         CNOERR();
 386         v->cv = newcvec(100, 20, 10);
 387         if (v->cv == NULL)
 388                 return freev(v, REG_ESPACE);
 389         i = nmcces(v);
 390         if (i > 0)
 391         {
 392                 v->mcces = newcvec(nleaders(v), 0, i);
 393                 CNOERR();
 394                 v->mcces = allmcces(v, v->mcces);
 395                 leaders(v, v->mcces);
 396                 addmcce(v->mcces, (chr *) NULL, (chr *) NULL);  /* dummy */
 397         }
 398         CNOERR();
 399
 400         /* parsing */
 401         lexstart(v);                            /* also handles prefixes */
 402         if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
 403         {
 404                 /* assign newline a unique color */
 405                 v->nlcolor = subcolor(v->cm, newline());
 406                 okcolors(v->nfa, v->cm);
 407         }
 408         CNOERR();
 409         v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
 410         assert(SEE(EOS));                       /* even if error; ISERR() => SEE(EOS) */
 411         CNOERR();
 412         assert(v->tree != NULL);
 413
 414         /* finish setup of nfa and its subre tree */
 415         specialcolors(v->nfa);
 416         CNOERR();
 417 #ifdef REG_DEBUG
 418         if (debug != NULL)
 419         {
 420                 fprintf(debug, "\n\n\n========= RAW ==========\n");
 421                 dumpnfa(v->nfa, debug);
 422                 dumpst(v->tree, debug, 1);
 423                 }
 424 #endif
 425         optst(v, v->tree);
 426         v->ntree = numst(v->tree, 1);
 427         markst(v->tree);
 428         cleanst(v);
 429 #ifdef REG_DEBUG
 430         if (debug != NULL)
 431         {
 432                 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
 433                 dumpst(v->tree, debug, 1);
 434         }
 435 #endif
 436
 437         /* build compacted NFAs for tree and lacons */
 438         re->re_info |= nfatree(v, v->tree, debug);
 439         CNOERR();
 440         assert(v->nlacons == 0 || v->lacons != NULL);
 441         for (i = 1; i < v->nlacons; i++)
 442         {
 443 #ifdef REG_DEBUG
 444                 if (debug != NULL)
 445                         fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
 446 #endif
 447                 nfanode(v, &v->lacons[i], debug);
 448         }
 449         CNOERR();
 450         if (v->tree->flags & SHORTER)
 451                 NOTE(REG_USHORTEST);
 452
 453         /* build compacted NFAs for tree, lacons, fast search */
 454 #ifdef REG_DEBUG
 455         if (debug != NULL)
 456                 fprintf(debug, "\n\n\n========= SEARCH ==========\n");
 457 #endif
 458         /* can sacrifice main NFA now, so use it as work area */
 459         (DISCARD) optimize(v->nfa, debug);
 460         CNOERR();
 461         makesearch(v, v->nfa);
 462         CNOERR();
 463         compact(v->nfa, &g->search);
 464         CNOERR();
 465
 466         /* looks okay, package it up */
 467         re->re_nsub = v->nsubexp;
 468         v->re = NULL;                           /* freev no longer frees re */
 469         g->magic = GUTSMAGIC;
 470         g->cflags = v->cflags;
 471         g->info = re->re_info;
 472         g->nsub = re->re_nsub;
 473         g->tree = v->tree;
 474         v->tree = NULL;
 475         g->ntree = v->ntree;
 476         g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
 477         g->lacons = v->lacons;
 478         v->lacons = NULL;
 479         g->nlacons = v->nlacons;
 480
 481 #ifdef REG_DEBUG
 482         if (flags & REG_DUMP)
 483                 dump(re, stdout);
 484 #endif
 485
 486         assert(v->err == 0);
 487         return freev(v, 0);
 488 }
 489
 490 /*
 491  * moresubs - enlarge subRE vector
 492  */
 493 static void
 494 moresubs(struct vars * v,
 495                  int wanted)                    /* want enough room for this one */
 496 {
 497         struct subre **p;
 498         size_t          n;
 499
 500         assert(wanted > 0 && (size_t) wanted >= v->nsubs);
 501         n = (size_t) wanted *3 / 2 + 1;
 502
 503         if (v->subs == v->sub10)
 504         {
 505                 p = (struct subre **) MALLOC(n * sizeof(struct subre *));
 506                 if (p != NULL)
 507                         memcpy(VS(p), VS(v->subs),
 508                                    v->nsubs * sizeof(struct subre *));
 509                 }
 510                 else
 511                 p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
 512         if (p == NULL)
 513         {
 514                 ERR(REG_ESPACE);
 515                 return;
 516         }
 517         v->subs = p;
 518         for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
 519                 *p = NULL;
 520         assert(v->nsubs == n);
 521         assert((size_t) wanted < v->nsubs);
 522 }
 523
 524 /*
 525  * freev - free vars struct's substructures where necessary
 526  *
 527  * Optionally does error-number setting, and always returns error code
 528  * (if any), to make error-handling code terser.
 529  */
 530 static int
 531 freev(struct vars * v,
 532           int err)
 533 {
 534         if (v->re != NULL)
 535                 rfree(v->re);
 536         if (v->subs != v->sub10)
 537                 FREE(v->subs);
 538         if (v->nfa != NULL)
 539                 freenfa(v->nfa);
 540         if (v->tree != NULL)
 541                 freesubre(v, v->tree);
 542         if (v->treechain != NULL)
 543                 cleanst(v);
 544         if (v->cv != NULL)
 545                 freecvec(v->cv);
 546         if (v->cv2 != NULL)
 547                 freecvec(v->cv2);
 548         if (v->mcces != NULL)
 549                 freecvec(v->mcces);
 550         if (v->lacons != NULL)
 551                 freelacons(v->lacons, v->nlacons);
 552         ERR(err);                                       /* nop if err==0 */
 553
 554         return v->err;
 555 }
 556
 557 /*
 558  * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
 559  * NFA must have been optimize()d already.
 560  */
 561 static void
 562 makesearch(struct vars * v,
 563                    struct nfa * nfa)
 564 {
 565         struct arc *a;
 566         struct arc *b;
 567         struct state *pre = nfa->pre;
 568         struct state *s;
 569         struct state *s2;
 570         struct state *slist;
 571
 572         /* no loops are needed if it's anchored */
 573         for (a = pre->outs; a != NULL; a = a->outchain)
 574         {
 575                 assert(a->type == PLAIN);
 576                 if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
 577                         break;
 578         }
 579         if (a != NULL)
 580         {
 581                 /* add implicit .* in front */
 582                 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
 583
 584                 /* and ^* and \A* too -- not always necessary, but harmless */
 585                 newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
 586                 newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
 587         }
 588
 589         /*
 590          * Now here's the subtle part.  Because many REs have no lookback
 591          * constraints, often knowing when you were in the pre state tells you
 592          * little; it's the next state(s) that are informative.  But some of
 593          * them may have other inarcs, i.e. it may be possible to make actual
 594          * progress and then return to one of them.  We must de-optimize such
 595          * cases, splitting each such state into progress and no-progress
 596          * states.
 597  */
 598
 599         /* first, make a list of the states */
 600         slist = NULL;
 601         for (a = pre->outs; a != NULL; a = a->outchain)
 602         {
 603                 s = a->to;
 604                 for (b = s->ins; b != NULL; b = b->inchain)
 605                         if (b->from != pre)
 606                 break;
 607                 if (b != NULL)
 608                 {                                               /* must be split */
 609                         s->tmp = slist;
 610                         slist = s;
 611         }
 612                 }
 613
 614         /* do the splits */
 615         for (s = slist; s != NULL; s = s2)
 616         {
 617                 s2 = newstate(nfa);
 618                 copyouts(nfa, s, s2);
 619                 for (a = s->ins; a != NULL; a = b)
 620                 {
 621                         b = a->inchain;
 622                         if (a->from != pre)
 623                         {
 624                                 cparc(nfa, a, a->from, s2);
 625                                 freearc(nfa, a);
 626                         }
 627                 }
 628                 s2 = s->tmp;
 629                 s->tmp = NULL;                  /* clean up while we're at it */
 630         }
 631 }
 632
 633 /*
 634  * parse - parse an RE
 635  *
 636  * This is actually just the top level, which parses a bunch of branches
 637  * tied together with '|'.      They appear in the tree as the left children
 638  * of a chain of '|' subres.
 639  */
 640 static struct subre *
 641 parse(struct vars * v,
 642           int stopper,                          /* EOS or ')' */
 643           int type,                                     /* LACON (lookahead subRE) or PLAIN */
 644           struct state * init,          /* initial state */
 645           struct state * final)         /* final state */
 646 {
 647         struct state *left;                     /* scaffolding for branch */
 648         struct state *right;
 649         struct subre *branches;         /* top level */
 650         struct subre *branch;           /* current branch */
 651         struct subre *t;                        /* temporary */
 652         int                     firstbranch;    /* is this the first branch? */
 653
 654         assert(stopper == ')' || stopper == EOS);
 655
 656         branches = subre(v, '|', LONGER, init, final);
 657         NOERRN();
 658         branch = branches;
 659         firstbranch = 1;
 660         do
 661         {                                                       /* a branch */
 662                 if (!firstbranch)
 663                 {
 664                         /* need a place to hang it */
 665                         branch->right = subre(v, '|', LONGER, init, final);
 666                         NOERRN();
 667                         branch = branch->right;
 668                 }
 669                 firstbranch = 0;
 670                 left = newstate(v->nfa);
 671                 right = newstate(v->nfa);
 672                 NOERRN();
 673                 EMPTYARC(init, left);
 674                 EMPTYARC(right, final);
 675                 NOERRN();
 676                 branch->left = parsebranch(v, stopper, type, left, right, 0);
 677                 NOERRN();
 678                 branch->flags |= UP(branch->flags | branch->left->flags);
 679                 if ((branch->flags & ~branches->flags) != 0)    /* new flags */
 680                         for (t = branches; t != branch; t = t->right)
 681                                 t->flags |= branch->flags;
 682         } while (EAT('|'));
 683         assert(SEE(stopper) || SEE(EOS));
 684
 685         if (!SEE(stopper))
 686         {
 687                 assert(stopper == ')' && SEE(EOS));
 688                 ERR(REG_EPAREN);
 689         }
 690
 691         /* optimize out simple cases */
 692         if (branch == branches)
 693         {                                                       /* only one branch */
 694                 assert(branch->right == NULL);
 695                 t = branch->left;
 696                 branch->left = NULL;
 697                 freesubre(v, branches);
 698                 branches = t;
 699         }
 700         else if (!MESSY(branches->flags))
 701         {                                                       /* no interesting innards */
 702                 freesubre(v, branches->left);
 703                 branches->left = NULL;
 704                 freesubre(v, branches->right);
 705                 branches->right = NULL;
 706                 branches->op = '=';
 707         }
 708
 709         return branches;
 710 }
 711
 712 /*
 713  * parsebranch - parse one branch of an RE
 714  *
 715  * This mostly manages concatenation, working closely with parseqatom().
 716  * Concatenated things are bundled up as much as possible, with separate
 717  * ',' nodes introduced only when necessary due to substructure.
 718  */
 719 static struct subre *
 720 parsebranch(struct vars * v,
 721                         int stopper,            /* EOS or ')' */
 722                         int type,                       /* LACON (lookahead subRE) or PLAIN */
 723                         struct state * left,    /* leftmost state */
 724                         struct state * right,           /* rightmost state */
 725                         int partial)            /* is this only part of a branch? */
 726 {
 727         struct state *lp;                       /* left end of current construct */
 728         int                     seencontent;    /* is there anything in this branch yet? */
 729         struct subre *t;
 730
 731         lp = left;
 732         seencontent = 0;
 733         t = subre(v, '=', 0, left, right);      /* op '=' is tentative */
 734         NOERRN();
 735         while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
 736         {
 737                 if (seencontent)
 738                 {                                               /* implicit concat operator */
 739                         lp = newstate(v->nfa);
 740                         NOERRN();
 741                         moveins(v->nfa, right, lp);
 742         }
 743                 seencontent = 1;
 744
 745                 /* NB, recursion in parseqatom() may swallow rest of branch */
 746                 parseqatom(v, stopper, type, lp, right, t);
 747         }
 748
 749         if (!seencontent)
 750         {                                                       /* empty branch */
 751                 if (!partial)
 752                         NOTE(REG_UUNSPEC);
 753                 assert(lp == left);
 754                 EMPTYARC(left, right);
 755         }
 756
 757         return t;
 758 }
 759
 760 /*
 761  * parseqatom - parse one quantified atom or constraint of an RE
 762  *
 763  * The bookkeeping near the end cooperates very closely with parsebranch();
 764  * in particular, it contains a recursion that can involve parsing the rest
 765  * of the branch, making this function's name somewhat inaccurate.
 766  */
 767 static void
 768 parseqatom(struct vars * v,
 769                    int stopper,                 /* EOS or ')' */
 770                    int type,                    /* LACON (lookahead subRE) or PLAIN */
 771                    struct state * lp,   /* left state to hang it on */
 772                    struct state * rp,   /* right state to hang it on */
 773                    struct subre * top)  /* subtree top */
 774 {
 775         struct state *s;                        /* temporaries for new states */
 776         struct state *s2;
 777
 778 #define  ARCV(t, val)    newarc(v->nfa, t, val, lp, rp)
 779         int                     m,
 780                                 n;
 781         struct subre *atom;                     /* atom's subtree */
 782         struct subre *t;
 783         int                     cap;                    /* capturing parens? */
 784         int                     pos;                    /* positive lookahead? */
 785         int                     subno;                  /* capturing-parens or backref number */
 786         int                     atomtype;
 787         int                     qprefer;                /* quantifier short/long preference */
 788         int                     f;
 789         struct subre **atomp;           /* where the pointer to atom is */
 790
 791         /* initial bookkeeping */
 792         atom = NULL;
 793         assert(lp->nouts == 0);         /* must string new code */
 794         assert(rp->nins == 0);          /* between lp and rp */
 795         subno = 0;                                      /* just to shut lint up */
 796
 797         /* an atom or constraint... */
 798         atomtype = v->nexttype;
 799         switch (atomtype)
 800         {
 801                         /* first, constraints, which end by returning */
 802                 case '^':
 803                         ARCV('^', 1);
 804                         if (v->cflags & REG_NLANCH)
 805                                 ARCV(BEHIND, v->nlcolor);
 806                         NEXT();
 807                         return;
 808                         break;
 809                 case '$':
 810                         ARCV('$', 1);
 811                         if (v->cflags & REG_NLANCH)
 812                                 ARCV(AHEAD, v->nlcolor);
 813                         NEXT();
 814                         return;
 815                         break;
 816                 case SBEGIN:
 817                         ARCV('^', 1);           /* BOL */
 818                         ARCV('^', 0);           /* or BOS */
 819                         NEXT();
 820                         return;
 821                         break;
 822                 case SEND:
 823                         ARCV('$', 1);           /* EOL */
 824                         ARCV('$', 0);           /* or EOS */
 825                         NEXT();
 826                         return;
 827                         break;
 828                 case '<':
 829                         wordchrs(v);            /* does NEXT() */
 830                         s = newstate(v->nfa);
 831                         NOERR();
 832                         nonword(v, BEHIND, lp, s);
 833                         word(v, AHEAD, s, rp);
 834                         return;
 835                         break;
 836                 case '>':
 837                         wordchrs(v);            /* does NEXT() */
 838                         s = newstate(v->nfa);
 839                         NOERR();
 840                         word(v, BEHIND, lp, s);
 841                         nonword(v, AHEAD, s, rp);
 842                         return;
 843                 break;
 844                 case WBDRY:
 845                         wordchrs(v);            /* does NEXT() */
 846                         s = newstate(v->nfa);
 847                         NOERR();
 848                         nonword(v, BEHIND, lp, s);
 849                         word(v, AHEAD, s, rp);
 850                         s = newstate(v->nfa);
 851                         NOERR();
 852                         word(v, BEHIND, lp, s);
 853                         nonword(v, AHEAD, s, rp);
 854                         return;
 855                         break;
 856                 case NWBDRY:
 857                         wordchrs(v);            /* does NEXT() */
 858                         s = newstate(v->nfa);
 859                         NOERR();
 860                         word(v, BEHIND, lp, s);
 861                         word(v, AHEAD, s, rp);
 862                         s = newstate(v->nfa);
 863                         NOERR();
 864                         nonword(v, BEHIND, lp, s);
 865                         nonword(v, AHEAD, s, rp);
 866                         return;
 867                 break;
 868                 case LACON:                             /* lookahead constraint */
 869                         pos = v->nextvalue;
 870                         NEXT();
 871                         s = newstate(v->nfa);
 872                         s2 = newstate(v->nfa);
 873                         NOERR();
 874                         t = parse(v, ')', LACON, s, s2);
 875                         freesubre(v, t);        /* internal structure irrelevant */
 876                         assert(SEE(')') || ISERR());
 877                         NEXT();
 878                         n = newlacon(v, s, s2, pos);
 879                         NOERR();
 880                         ARCV(LACON, n);
 881                         return;
 882                         break;
 883                         /* then errors, to get them out of the way */
 884                 case '*':
 885                 case '+':
 886                 case '?':
 887                 case '{':
 888                         ERR(REG_BADRPT);
 889                         return;
 890                         break;
 891         default:
 892                         ERR(REG_ASSERT);
 893                         return;
 894                 break;
 895                         /* then plain characters, and minor variants on that theme */
 896                 case ')':                               /* unbalanced paren */
 897                         if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
 898                         {
 899                                 ERR(REG_EPAREN);
 900                                 return;
 901         }
 902                         /* legal in EREs due to specification botch */
 903                         NOTE(REG_UPBOTCH);
 904                         /* fallthrough into case PLAIN */
 905                 case PLAIN:
 906                         onechr(v, v->nextvalue, lp, rp);
 907                         okcolors(v->nfa, v->cm);
 908                         NOERR();
 909                         NEXT();
 910                 break;
 911                 case '[':
 912                         if (v->nextvalue == 1)
 913                                 bracket(v, lp, rp);
 914                         else
 915                                 cbracket(v, lp, rp);
 916                         assert(SEE(']') || ISERR());
 917                         NEXT();
 918                 break;
 919                 case '.':
 920                         rainbow(v->nfa, v->cm, PLAIN,
 921                                         (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
 922                                         lp, rp);
 923                         NEXT();
 924                         break;
 925                         /* and finally the ugly stuff */
 926                 case '(':                               /* value flags as capturing or non */
 927                         cap = (type == LACON) ? 0 : v->nextvalue;
 928                         if (cap)
 929                         {
 930                                 v->nsubexp++;
 931                                 subno = v->nsubexp;
 932                                 if ((size_t) subno >= v->nsubs)
 933                                         moresubs(v, subno);
 934                                 assert((size_t) subno < v->nsubs);
 935                         }
 936                         else
 937                                 atomtype = PLAIN;               /* something that's not '(' */
 938                         NEXT();
 939                         /* need new endpoints because tree will contain pointers */
 940                         s = newstate(v->nfa);
 941                         s2 = newstate(v->nfa);
 942                         NOERR();
 943                         EMPTYARC(lp, s);
 944                         EMPTYARC(s2, rp);
 945                         NOERR();
 946                         atom = parse(v, ')', PLAIN, s, s2);
 947                         assert(SEE(')') || ISERR());
 948                         NEXT();
 949                         NOERR();
 950                         if (cap)
 951                         {
 952                                 v->subs[subno] = atom;
 953                                 t = subre(v, '(', atom->flags | CAP, lp, rp);
 954                                 NOERR();
 955                                 t->subno = subno;
 956                                 t->left = atom;
 957                                 atom = t;
 958                         }
 959                         /* postpone everything else pending possible {0} */
 960                         break;
 961                 case BACKREF:                   /* the Feature From The Black Lagoon */
 962                         INSIST(type != LACON, REG_ESUBREG);
 963                         INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
 964                         INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
 965                         NOERR();
 966                         assert(v->nextvalue > 0);
 967                         atom = subre(v, 'b', BACKR, lp, rp);
 968                         subno = v->nextvalue;
 969                         atom->subno = subno;
 970                         EMPTYARC(lp, rp);       /* temporarily, so there's something */
 971                         NEXT();
 972                 break;
 973         }
 974
 975         /* ...and an atom may be followed by a quantifier */
 976         switch (v->nexttype)
 977         {
 978                 case '*':
 979                         m = 0;
 980                         n = INFINITY;
 981                         qprefer = (v->nextvalue) ? LONGER : SHORTER;
 982                         NEXT();
 983                         break;
 984                 case '+':
 985                         m = 1;
 986                         n = INFINITY;
 987                         qprefer = (v->nextvalue) ? LONGER : SHORTER;
 988                         NEXT();
 989                         break;
 990                 case '?':
 991                         m = 0;
 992                         n = 1;
 993                         qprefer = (v->nextvalue) ? LONGER : SHORTER;
 994                 NEXT();
 995                         break;
 996                 case '{':
 997                         NEXT();
 998                         m = scannum(v);
 999                         if (EAT(','))
1000                         {
1001                                 if (SEE(DIGIT))
1002                                         n = scannum(v);
1003                                 else
1004                                         n = INFINITY;
1005                                 if (m > n)
1006                                 {
1007                                         ERR(REG_BADBR);
1008                 return;
1009         }
1010                                 /* {m,n} exercises preference, even if it's {m,m} */
1011                                 qprefer = (v->nextvalue) ? LONGER : SHORTER;
1012                         }
1013                         else
1014                         {
1015                                 n = m;
1016                                 /* {m} passes operand's preference through */
1017                                 qprefer = 0;
1018                         }
1019                         if (!SEE('}'))
1020                         {                                       /* catches errors too */
1021                                 ERR(REG_BADBR);
1022                                 return;
1023                         }
1024                         NEXT();
1025                         break;
1026                 default:                                /* no quantifier */
1027                         m = n = 1;
1028                         qprefer = 0;
1029                         break;
1030         }
1031
1032         /* annoying special case:  {0} or {0,0} cancels everything */
1033         if (m == 0 && n == 0)
1034         {
1035                 if (atom != NULL)
1036                         freesubre(v, atom);
1037                 if (atomtype == '(')
1038                         v->subs[subno] = NULL;
1039                 delsub(v->nfa, lp, rp);
1040                 EMPTYARC(lp, rp);
1041                 return;
1042         }
1043
1044         /* if not a messy case, avoid hard part */
1045         assert(!MESSY(top->flags));
1046         f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
1047         if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
1048         {
1049                 if (!(m == 1 && n == 1))
1050                         repeat(v, lp, rp, m, n);
1051                 if (atom != NULL)
1052                         freesubre(v, atom);
1053                 top->flags = f;
1054                 return;
1055         }
1056
1057         /*
1058          * hard part:  something messy That is, capturing parens, back
1059          * reference, short/long clash, or an atom with substructure
1060          * containing one of those.
1061  */
1062
1063         /* now we'll need a subre for the contents even if they're boring */
1064         if (atom == NULL)
1065         {
1066                 atom = subre(v, '=', 0, lp, rp);
1067                 NOERR();
1068         }
1069
1070         /*
1071          * prepare a general-purpose state skeleton
1072          *
1073          * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /
1074          * [lp] ----> [s2] ----bypass---------------------
1075          *
1076          * where bypass is an empty, and prefix is some repetitions of atom
1077  */
1078         s = newstate(v->nfa);           /* first, new endpoints for the atom */
1079         s2 = newstate(v->nfa);
1080         NOERR();
1081         moveouts(v->nfa, lp, s);
1082         moveins(v->nfa, rp, s2);
1083         NOERR();
1084         atom->begin = s;
1085         atom->end = s2;
1086         s = newstate(v->nfa);           /* and spots for prefix and bypass */
1087         s2 = newstate(v->nfa);
1088         NOERR();
1089         EMPTYARC(lp, s);
1090         EMPTYARC(lp, s2);
1091         NOERR();
1092
1093         /* break remaining subRE into x{...} and what follows */
1094         t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1095         t->left = atom;
1096         atomp = &t->left;
1097         /* here we should recurse... but we must postpone that to the end */
1098
1099         /* split top into prefix and remaining */
1100         assert(top->op == '=' && top->left == NULL && top->right == NULL);
1101         top->left = subre(v, '=', top->flags, top->begin, lp);
1102         top->op = '.';
1103         top->right = t;
1104
1105         /* if it's a backref, now is the time to replicate the subNFA */
1106         if (atomtype == BACKREF)
1107         {
1108                 assert(atom->begin->nouts == 1);                /* just the EMPTY */
1109                 delsub(v->nfa, atom->begin, atom->end);
1110                 assert(v->subs[subno] != NULL);
1111                 /* and here's why the recursion got postponed:  it must */
1112                 /* wait until the skeleton is filled in, because it may */
1113                 /* hit a backref that wants to copy the filled-in skeleton */
1114                 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1115                            atom->begin, atom->end);
1116                 NOERR();
1117         }
1118
1119         /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1120         if (m == 0)
1121         {
1122                 EMPTYARC(s2, atom->end);        /* the bypass */
1123                 assert(PREF(qprefer) != 0);
1124                 f = COMBINE(qprefer, atom->flags);
1125                 t = subre(v, '|', f, lp, atom->end);
1126                 NOERR();
1127                 t->left = atom;
1128                 t->right = subre(v, '|', PREF(f), s2, atom->end);
1129                 NOERR();
1130                 t->right->left = subre(v, '=', 0, s2, atom->end);
1131                 NOERR();
1132                 *atomp = t;
1133                 atomp = &t->left;
1134                 m = 1;
1135         }
1136
1137         /* deal with the rest of the quantifier */
1138         if (atomtype == BACKREF)
1139         {
1140                 /* special case:  backrefs have internal quantifiers */
1141                 EMPTYARC(s, atom->begin);               /* empty prefix */
1142                 /* just stuff everything into atom */
1143                 repeat(v, atom->begin, atom->end, m, n);
1144                 atom->min = (short) m;
1145                 atom->max = (short) n;
1146                 atom->flags |= COMBINE(qprefer, atom->flags);
1147         }
1148         else if (m == 1 && n == 1)
1149         {
1150                 /* no/vacuous quantifier:  done */
1151                 EMPTYARC(s, atom->begin);               /* empty prefix */
1152         }
1153         else
1154         {
1155                 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1156                 /* parens in only second x */
1157                 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1158                 assert(m >= 1 && m != INFINITY && n >= 1);
1159                 repeat(v, s, atom->begin, m - 1, (n == INFINITY) ? n : n - 1);
1160                 f = COMBINE(qprefer, atom->flags);
1161                 t = subre(v, '.', f, s, atom->end);             /* prefix and atom */
1162                 NOERR();
1163                 t->left = subre(v, '=', PREF(f), s, atom->begin);
1164                 NOERR();
1165                 t->right = atom;
1166                 *atomp = t;
1167         }
1168
1169         /* and finally, look after that postponed recursion */
1170         t = top->right;
1171         if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1172                 t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1173         else
1174         {
1175                 EMPTYARC(atom->end, rp);
1176                 t->right = subre(v, '=', 0, atom->end, rp);
1177         }
1178         assert(SEE('|') || SEE(stopper) || SEE(EOS));
1179         t->flags |= COMBINE(t->flags, t->right->flags);
1180         top->flags |= COMBINE(top->flags, t->flags);
1181 }
1182
1183 /*
1184  * nonword - generate arcs for non-word-character ahead or behind
1185  */
1186 static void
1187 nonword(struct vars * v,
1188                 int dir,                                /* AHEAD or BEHIND */
1189                 struct state * lp,
1190                 struct state * rp)
1191 {
1192         int                     anchor = (dir == AHEAD) ? '$' : '^';
1193
1194         assert(dir == AHEAD || dir == BEHIND);
1195         newarc(v->nfa, anchor, 1, lp, rp);
1196         newarc(v->nfa, anchor, 0, lp, rp);
1197         colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1198         /* (no need for special attention to \n) */
1199 }
1200
1201 /*
1202  * word - generate arcs for word character ahead or behind
1203  */
1204 static void
1205 word(struct vars * v,
1206          int dir,                                       /* AHEAD or BEHIND */
1207          struct state * lp,
1208          struct state * rp)
1209 {
1210         assert(dir == AHEAD || dir == BEHIND);
1211         cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1212         /* (no need for special attention to \n) */
1213 }
1214
1215 /*
1216  * scannum - scan a number
1217  */
1218 static int                                              /* value, <= DUPMAX */
1219 scannum(struct vars * v)
1220 {
1221         int                     n = 0;
1222
1223         while (SEE(DIGIT) && n < DUPMAX)
1224         {
1225                 n = n * 10 + v->nextvalue;
1226                 NEXT();
1227         }
1228         if (SEE(DIGIT) || n > DUPMAX)
1229         {
1230                 ERR(REG_BADBR);
1231                 return 0;
1232         }
1233         return n;
1234 }
1235
1236 /*
1237  * repeat - replicate subNFA for quantifiers
1238  *
1239  * The duplication sequences used here are chosen carefully so that any
1240  * pointers starting out pointing into the subexpression end up pointing into
1241  * the last occurrence.  (Note that it may not be strung between the same
1242  * left and right end states, however!)  This used to be important for the
1243  * subRE tree, although the important bits are now handled by the in-line
1244  * code in parse(), and when this is called, it doesn't matter any more.
1245  */
1246 static void
1247 repeat(struct vars * v,
1248            struct state * lp,
1249            struct state * rp,
1250            int m,
1251            int n)
1252 {
1253 #define  SOME    2
1254 #define  INF 3
1255 #define  PAIR(x, y)  ((x)*4 + (y))
1256 #define  REDUCE(x)       ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1257         const int       rm = REDUCE(m);
1258         const int       rn = REDUCE(n);
1259         struct state *s;
1260         struct state *s2;
1261
1262         switch (PAIR(rm, rn))
1263         {
1264                 case PAIR(0, 0):                /* empty string */
1265                         delsub(v->nfa, lp, rp);
1266                         EMPTYARC(lp, rp);
1267                         break;
1268                 case PAIR(0, 1):                /* do as x| */
1269                         EMPTYARC(lp, rp);
1270                         break;
1271                 case PAIR(0, SOME):             /* do as x{1,n}| */
1272                         repeat(v, lp, rp, 1, n);
1273                         NOERR();
1274                         EMPTYARC(lp, rp);
1275                         break;
1276                 case PAIR(0, INF):              /* loop x around */
1277                         s = newstate(v->nfa);
1278                         NOERR();
1279                         moveouts(v->nfa, lp, s);
1280                         moveins(v->nfa, rp, s);
1281                         EMPTYARC(lp, s);
1282                         EMPTYARC(s, rp);
1283                         break;
1284                 case PAIR(1, 1):                /* no action required */
1285                         break;
1286                 case PAIR(1, SOME):             /* do as x{0,n-1}x = (x{1,n-1}|)x */
1287                         s = newstate(v->nfa);
1288                         NOERR();
1289                         moveouts(v->nfa, lp, s);
1290                         dupnfa(v->nfa, s, rp, lp, s);
1291                         NOERR();
1292                         repeat(v, lp, s, 1, n - 1);
1293                         NOERR();
1294                         EMPTYARC(lp, s);
1295                         break;
1296                 case PAIR(1, INF):              /* add loopback arc */
1297                         s = newstate(v->nfa);
1298                         s2 = newstate(v->nfa);
1299                         NOERR();
1300                         moveouts(v->nfa, lp, s);
1301                         moveins(v->nfa, rp, s2);
1302                         EMPTYARC(lp, s);
1303                         EMPTYARC(s2, rp);
1304                         EMPTYARC(s2, s);
1305                         break;
1306                 case PAIR(SOME, SOME):  /* do as x{m-1,n-1}x */
1307                         s = newstate(v->nfa);
1308                         NOERR();
1309                         moveouts(v->nfa, lp, s);
1310                         dupnfa(v->nfa, s, rp, lp, s);
1311                         NOERR();
1312                         repeat(v, lp, s, m - 1, n - 1);
1313                         break;
1314                 case PAIR(SOME, INF):   /* do as x{m-1,}x */
1315                         s = newstate(v->nfa);
1316                         NOERR();
1317                         moveouts(v->nfa, lp, s);
1318                         dupnfa(v->nfa, s, rp, lp, s);
1319                         NOERR();
1320                         repeat(v, lp, s, m - 1, n);
1321                         break;
1322                 default:
1323                         ERR(REG_ASSERT);
1324                         break;
1325         }
1326 }
1327
1328 /*
1329  * bracket - handle non-complemented bracket expression
1330  * Also called from cbracket for complemented bracket expressions.
1331  */
1332 static void
1333 bracket(struct vars * v,
1334                 struct state * lp,
1335                 struct state * rp)
1336 {
1337         assert(SEE('['));
1338         NEXT();
1339         while (!SEE(']') && !SEE(EOS))
1340                 brackpart(v, lp, rp);
1341         assert(SEE(']') || ISERR());
1342         okcolors(v->nfa, v->cm);
1343 }
1344
1345 /*
1346  * cbracket - handle complemented bracket expression
1347  * We do it by calling bracket() with dummy endpoints, and then complementing
1348  * the result.  The alternative would be to invoke rainbow(), and then delete
1349  * arcs as the b.e. is seen... but that gets messy.
1350  */
1351 static void
1352 cbracket(struct vars * v,
1353                  struct state * lp,
1354                  struct state * rp)
1355 {
1356         struct state *left = newstate(v->nfa);
1357         struct state *right = newstate(v->nfa);
1358         struct state *s;
1359         struct arc *a;                          /* arc from lp */
1360         struct arc *ba;                         /* arc from left, from bracket() */
1361         struct arc *pa;                         /* MCCE-prototype arc */
1362         color           co;
1363         chr                *p;
1364         int                     i;
1365
1366         NOERR();
1367         bracket(v, left, right);
1368         if (v->cflags & REG_NLSTOP)
1369                 newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1370         NOERR();
1371
1372         assert(lp->nouts == 0);         /* all outarcs will be ours */
1373
1374         /* easy part of complementing */
1375         colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1376         NOERR();
1377         if (v->mcces == NULL)
1378         {                                                       /* no MCCEs -- we're done */
1379                 dropstate(v->nfa, left);
1380                 assert(right->nins == 0);
1381                 freestate(v->nfa, right);
1382                 return;
1383         }
1384
1385         /* but complementing gets messy in the presence of MCCEs... */
1386         NOTE(REG_ULOCALE);
1387         for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--)
1388         {
1389                 co = GETCOLOR(v->cm, *p);
1390                 a = findarc(lp, PLAIN, co);
1391                 ba = findarc(left, PLAIN, co);
1392                 if (ba == NULL)
1393                 {
1394                         assert(a != NULL);
1395                         freearc(v->nfa, a);
1396                 }
1397                 else
1398                         assert(a == NULL);
1399                 s = newstate(v->nfa);
1400                 NOERR();
1401                 newarc(v->nfa, PLAIN, co, lp, s);
1402                 NOERR();
1403                 pa = findarc(v->mccepbegin, PLAIN, co);
1404                 assert(pa != NULL);
1405                 if (ba == NULL)
1406                 {                                               /* easy case, need all of them */
1407                         cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1408                         newarc(v->nfa, '$', 1, s, rp);
1409                         newarc(v->nfa, '$', 0, s, rp);
1410                         colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
1411                 }
1412                 else
1413                 {                                               /* must be selective */
1414                         if (findarc(ba->to, '$', 1) == NULL)
1415                         {
1416                                 newarc(v->nfa, '$', 1, s, rp);
1417                                 newarc(v->nfa, '$', 0, s, rp);
1418                                 colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
1419                                                                 s, rp);
1420                         }
1421                         for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1422                                 if (findarc(ba->to, PLAIN, pa->co) == NULL)
1423                                         newarc(v->nfa, PLAIN, pa->co, s, rp);
1424                         if (s->nouts == 0)      /* limit of selectivity: none */
1425                                 dropstate(v->nfa, s);   /* frees arc too */
1426                 }
1427                 NOERR();
1428         }
1429
1430         delsub(v->nfa, left, right);
1431         assert(left->nouts == 0);
1432         freestate(v->nfa, left);
1433         assert(right->nins == 0);
1434         freestate(v->nfa, right);
1435 }
1436
1437 /*
1438  * brackpart - handle one item (or range) within a bracket expression
1439  */
1440 static void
1441 brackpart(struct vars * v,
1442                   struct state * lp,
1443                   struct state * rp)
1444 {
1445         celt            startc;
1446         celt            endc;
1447         struct cvec *cv;
1448         chr                *startp;
1449         chr                *endp;
1450         chr                     c[1];
1451
1452         /* parse something, get rid of special cases, take shortcuts */
1453         switch (v->nexttype)
1454         {
1455                 case RANGE:                             /* a-b-c or other botch */
1456                         ERR(REG_ERANGE);
1457                         return;
1458                 break;
1459                 case PLAIN:
1460                         c[0] = v->nextvalue;
1461                         NEXT();
1462                         /* shortcut for ordinary chr (not range, not MCCE leader) */
1463                         if (!SEE(RANGE) && !ISCELEADER(v, c[0]))
1464                         {
1465                                 onechr(v, c[0], lp, rp);
1466                                 return;
1467                         }
1468                         startc = element(v, c, c + 1);
1469                         NOERR();
1470                 break;
1471                 case COLLEL:
1472                         startp = v->now;
1473                         endp = scanplain(v);
1474                         INSIST(startp < endp, REG_ECOLLATE);
1475                         NOERR();
1476                         startc = element(v, startp, endp);
1477                         NOERR();
1478                         break;
1479                 case ECLASS:
1480                         startp = v->now;
1481                         endp = scanplain(v);
1482                         INSIST(startp < endp, REG_ECOLLATE);
1483                         NOERR();
1484                         startc = element(v, startp, endp);
1485                         NOERR();
1486                         cv = eclass(v, startc, (v->cflags & REG_ICASE));
1487                         NOERR();
1488                         dovec(v, cv, lp, rp);
1489                         return;
1490                 break;
1491                 case CCLASS:
1492                         startp = v->now;
1493                         endp = scanplain(v);
1494                         INSIST(startp < endp, REG_ECTYPE);
1495                         NOERR();
1496                         cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
1497                         NOERR();
1498                         dovec(v, cv, lp, rp);
1499                         return;
1500                 break;
1501                 default:
1502                         ERR(REG_ASSERT);
1503                         return;
1504                 break;
1505         }
1506
1507         if (SEE(RANGE))
1508         {
1509                 NEXT();
1510                 switch (v->nexttype)
1511                 {
1512                         case PLAIN:
1513                         case RANGE:
1514                                 c[0] = v->nextvalue;
1515                                 NEXT();
1516                                 endc = element(v, c, c + 1);
1517                                 NOERR();
1518                 break;
1519                         case COLLEL:
1520                                 startp = v->now;
1521                                 endp = scanplain(v);
1522                                 INSIST(startp < endp, REG_ECOLLATE);
1523                                 NOERR();
1524                                 endc = element(v, startp, endp);
1525                                 NOERR();
1526                 break;
1527                         default:
1528                                 ERR(REG_ERANGE);
1529                                 return;
1530                 break;
1531         }
1532         }
1533         else
1534                 endc = startc;
1535
1536         /*
1537          * Ranges are unportable.  Actually, standard C does guarantee that
1538          * digits are contiguous, but making that an exception is just too
1539          * complicated.
1540  */
1541         if (startc != endc)
1542                 NOTE(REG_UUNPORT);
1543         cv = range(v, startc, endc, (v->cflags & REG_ICASE));
1544         NOERR();
1545         dovec(v, cv, lp, rp);
1546 }
1547
1548 /*
1549  * scanplain - scan PLAIN contents of [. etc.
1550  *
1551  * Certain bits of trickery in lex.c know that this code does not try
1552  * to look past the final bracket of the [. etc.
1553  */
1554 static chr *                                    /* just after end of sequence */
1555 scanplain(struct vars * v)
1556 {
1557         chr                *endp;
1558
1559         assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1560         NEXT();
1561
1562         endp = v->now;
1563         while (SEE(PLAIN))
1564         {
1565                 endp = v->now;
1566                 NEXT();
1567         }
1568
1569         assert(SEE(END) || ISERR());
1570         NEXT();
1571
1572         return endp;
1573 }
1574
1575 /*
1576  * leaders - process a cvec of collating elements to also include leaders
1577  * Also gives all characters involved their own colors, which is almost
1578  * certainly necessary, and sets up little disconnected subNFA.
1579  */
1580 static void
1581 leaders(struct vars * v,
1582                 struct cvec * cv)
1583 {
1584         int                     mcce;
1585         chr                *p;
1586         chr                     leader;
1587         struct state *s;
1588         struct arc *a;
1589
1590         v->mccepbegin = newstate(v->nfa);
1591         v->mccepend = newstate(v->nfa);
1592         NOERR();
1593
1594         for (mcce = 0; mcce < cv->nmcces; mcce++)
1595         {
1596                 p = cv->mcces[mcce];
1597                 leader = *p;
1598                 if (!haschr(cv, leader))
1599                 {
1600                         addchr(cv, leader);
1601                         s = newstate(v->nfa);
1602                         newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
1603                                    v->mccepbegin, s);
1604                         okcolors(v->nfa, v->cm);
1605                 }
1606                 else
1607                 {
1608                         a = findarc(v->mccepbegin, PLAIN,
1609                                                 GETCOLOR(v->cm, leader));
1610                         assert(a != NULL);
1611                         s = a->to;
1612                         assert(s != v->mccepend);
1613                 }
1614                 p++;
1615                 assert(*p != 0 && *(p + 1) == 0);               /* only 2-char MCCEs for
1616                                                                                                  * now */
1617                 newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1618                 okcolors(v->nfa, v->cm);
1619         }
1620 }
1621
1622 /*
1623  * onechr - fill in arcs for a plain character, and possible case complements
1624  * This is mostly a shortcut for efficient handling of the common case.
1625  */
1626 static void
1627 onechr(struct vars * v,
1628            chr c,
1629            struct state * lp,
1630            struct state * rp)
1631 {
1632         if (!(v->cflags & REG_ICASE))
1633         {
1634                 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1635                 return;
1636         }
1637
1638         /* rats, need general case anyway... */
1639         dovec(v, allcases(v, c), lp, rp);
1640 }
1641
1642 /*
1643  * dovec - fill in arcs for each element of a cvec
1644  * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1645  */
1646 static void
1647 dovec(struct vars * v,
1648           struct cvec * cv,
1649           struct state * lp,
1650           struct state * rp)
1651 {
1652         chr                     ch,
1653                                 from,
1654                                 to;
1655         celt            ce;
1656         chr                *p;
1657         int                     i;
1658         color           co;
1659         struct cvec *leads;
1660         struct arc *a;
1661         struct arc *pa;                         /* arc in prototype */
1662         struct state *s;
1663         struct state *ps;                       /* state in prototype */
1664
1665         /* need a place to store leaders, if any */
1666         if (nmcces(v) > 0)
1667         {
1668                 assert(v->mcces != NULL);
1669                 if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs)
1670                 {
1671                         if (v->cv2 != NULL)
1672                                 free(v->cv2);
1673                         v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1674                         NOERR();
1675                         leads = v->cv2;
1676                 }
1677                 else
1678                         leads = clearcvec(v->cv2);
1679                 }
1680         else
1681                 leads = NULL;
1682
1683         /* first, get the ordinary characters out of the way */
1684         for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--)
1685         {
1686                 ch = *p;
1687                 if (!ISCELEADER(v, ch))
1688                         newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1689                 else
1690                 {
1691                         assert(singleton(v->cm, ch));
1692                         assert(leads != NULL);
1693                         if (!haschr(leads, ch))
1694                                 addchr(leads, ch);
1695         }
1696         }
1697
1698         /* and the ranges */
1699         for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--)
1700         {
1701                 from = *p;
1702                 to = *(p + 1);
1703                 while (from <= to && (ce = nextleader(v, from, to)) != NOCELT)
1704                 {
1705                         if (from < ce)
1706                                 subrange(v, from, ce - 1, lp, rp);
1707                         assert(singleton(v->cm, ce));
1708                         assert(leads != NULL);
1709                         if (!haschr(leads, ce))
1710                                 addchr(leads, ce);
1711                         from = ce + 1;
1712                 }
1713                 if (from <= to)
1714                         subrange(v, from, to, lp, rp);
1715         }
1716
1717         if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1718                 return;
1719
1720         /* deal with the MCCE leaders */
1721         NOTE(REG_ULOCALE);
1722         for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--)
1723         {
1724                 co = GETCOLOR(v->cm, *p);
1725                 a = findarc(lp, PLAIN, co);
1726                 if (a != NULL)
1727                         s = a->to;
1728                 else
1729                 {
1730                         s = newstate(v->nfa);
1731                         NOERR();
1732                         newarc(v->nfa, PLAIN, co, lp, s);
1733                         NOERR();
1734                 }
1735                 pa = findarc(v->mccepbegin, PLAIN, co);
1736                 assert(pa != NULL);
1737                 ps = pa->to;
1738                 newarc(v->nfa, '$', 1, s, rp);
1739                 newarc(v->nfa, '$', 0, s, rp);
1740                 colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1741                 NOERR();
1742         }
1743
1744         /* and the MCCEs */
1745         for (i = 0; i < cv->nmcces; i++)
1746         {
1747                 p = cv->mcces[i];
1748                 assert(singleton(v->cm, *p));
1749                 if (!singleton(v->cm, *p))
1750                 {
1751                         ERR(REG_ASSERT);
1752                         return;
1753                 }
1754                 ch = *p++;
1755                 co = GETCOLOR(v->cm, ch);
1756                 a = findarc(lp, PLAIN, co);
1757                 if (a != NULL)
1758                         s = a->to;
1759                 else
1760                 {
1761                         s = newstate(v->nfa);
1762                         NOERR();
1763                         newarc(v->nfa, PLAIN, co, lp, s);
1764                         NOERR();
1765                 }
1766                 assert(*p != 0);                /* at least two chars */
1767                 assert(singleton(v->cm, *p));
1768                 ch = *p++;
1769                 co = GETCOLOR(v->cm, ch);
1770                 assert(*p == 0);                /* and only two, for now */
1771                 newarc(v->nfa, PLAIN, co, s, rp);
1772                 NOERR();
1773         }
1774 }
1775
1776 /*
1777  * nextleader - find next MCCE leader within range
1778  */
1779 static celt                                             /* NOCELT means none */
1780 nextleader(struct vars * v,
1781                    chr from,
1782                    chr to)
1783 {
1784         int                     i;
1785         chr                *p;
1786         chr                     ch;
1787         celt            it = NOCELT;
1788
1789         if (v->mcces == NULL)
1790                 return it;
1791
1792         for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++)
1793         {
1794                 ch = *p;
1795                 if (from <= ch && ch <= to)
1796                         if (it == NOCELT || ch < it)
1797                                 it = ch;
1798         }
1799         return it;
1800 }
1801
1802 /*
1803  * wordchrs - set up word-chr list for word-boundary stuff, if needed
1804  *
1805  * The list is kept as a bunch of arcs between two dummy states; it's
1806  * disposed of by the unreachable-states sweep in NFA optimization.
1807  * Does NEXT().  Must not be called from any unusual lexical context.
1808  * This should be reconciled with the \w etc. handling in lex.c, and
1809  * should be cleaned up to reduce dependencies on input scanning.
1810  */
1811 static void
1812 wordchrs(struct vars * v)
1813 {
1814         struct state *left;
1815         struct state *right;
1816
1817         if (v->wordchrs != NULL)
1818         {
1819                 NEXT();                                 /* for consistency */
1820                 return;
1821         }
1822
1823         left = newstate(v->nfa);
1824         right = newstate(v->nfa);
1825         NOERR();
1826         /* fine point:  implemented with [::], and lexer will set REG_ULOCALE */
1827         lexword(v);
1828         NEXT();
1829         assert(v->savenow != NULL && SEE('['));
1830         bracket(v, left, right);
1831         assert((v->savenow != NULL && SEE(']')) || ISERR());
1832         NEXT();
1833         NOERR();
1834         v->wordchrs = left;
1835 }
1836
1837 /*
1838  * subre - allocate a subre
1839  */
1840 static struct subre *
1841 subre(struct vars * v,
1842           int op,
1843           int flags,
1844           struct state * begin,
1845           struct state * end)
1846 {
1847         struct subre *ret;
1848
1849         ret = v->treefree;
1850         if (ret != NULL)
1851                 v->treefree = ret->left;
1852         else
1853         {
1854                 ret = (struct subre *) MALLOC(sizeof(struct subre));
1855                 if (ret == NULL)
1856                 {
1857                         ERR(REG_ESPACE);
1858                         return NULL;
1859         }
1860                 ret->chain = v->treechain;
1861                 v->treechain = ret;
1862         }
1863
1864         assert(strchr("|.b(=", op) != NULL);
1865
1866         ret->op = op;
1867         ret->flags = flags;
1868         ret->retry = 0;
1869         ret->subno = 0;
1870         ret->min = ret->max = 1;
1871         ret->left = NULL;
1872         ret->right = NULL;
1873         ret->begin = begin;
1874         ret->end = end;
1875         ZAPCNFA(ret->cnfa);
1876
1877         return ret;
1878 }
1879
1880 /*
1881  * freesubre - free a subRE subtree
1882  */
1883 static void
1884 freesubre(struct vars * v,              /* might be NULL */
1885                   struct subre * sr)
1886 {
1887         if (sr == NULL)
1888                 return;
1889
1890         if (sr->left != NULL)
1891                 freesubre(v, sr->left);
1892         if (sr->right != NULL)
1893                 freesubre(v, sr->right);
1894
1895         freesrnode(v, sr);
1896 }
1897
1898 /*
1899  * freesrnode - free one node in a subRE subtree
1900  */
1901 static void
1902 freesrnode(struct vars * v,             /* might be NULL */
1903                    struct subre * sr)
1904 {
1905         if (sr == NULL)
1906                 return;
1907
1908         if (!NULLCNFA(sr->cnfa))
1909                 freecnfa(&sr->cnfa);
1910         sr->flags = 0;
1911
1912         if (v != NULL)
1913         {
1914                 sr->left = v->treefree;
1915                 v->treefree = sr;
1916         }
1917         else
1918                 FREE(sr);
1919 }
1920
1921 /*
1922  * optst - optimize a subRE subtree
1923  */
1924 static void
1925 optst(struct vars * v,
1926           struct subre * t)
1927 {
1928         if (t == NULL)
1929                 return;
1930
1931         /* recurse through children */
1932         if (t->left != NULL)
1933                 optst(v, t->left);
1934         if (t->right != NULL)
1935                 optst(v, t->right);
1936 }
1937
1938 /*
1939  * numst - number tree nodes (assigning retry indexes)
1940  */
1941 static int                                              /* next number */
1942 numst(struct subre * t,
1943           int start)                            /* starting point for subtree numbers */
1944 {
1945         int                     i;
1946
1947         assert(t != NULL);
1948
1949         i = start;
1950         t->retry = (short) i++;
1951         if (t->left != NULL)
1952                 i = numst(t->left, i);
1953         if (t->right != NULL)
1954                 i = numst(t->right, i);
1955         return i;
1956 }
1957
1958 /*
1959  * markst - mark tree nodes as INUSE
1960  */
1961 static void
1962 markst(struct subre * t)
1963 {
1964         assert(t != NULL);
1965
1966         t->flags |= INUSE;
1967         if (t->left != NULL)
1968                 markst(t->left);
1969         if (t->right != NULL)
1970                 markst(t->right);
1971 }
1972
1973 /*
1974  * cleanst - free any tree nodes not marked INUSE
1975  */
1976 static void
1977 cleanst(struct vars * v)
1978 {
1979         struct subre *t;
1980         struct subre *next;
1981
1982         for (t = v->treechain; t != NULL; t = next)
1983         {
1984                 next = t->chain;
1985                 if (!(t->flags & INUSE))
1986                         FREE(t);
1987                 }
1988         v->treechain = NULL;
1989         v->treefree = NULL;                     /* just on general principles */
1990 }
1991
1992 /*
1993  * nfatree - turn a subRE subtree into a tree of compacted NFAs
1994  */
1995 static long                                             /* optimize results from top node */
1996 nfatree(struct vars * v,
1997                 struct subre * t,
1998                 FILE *f)                                /* for debug output */
1999 {
2000         assert(t != NULL && t->begin != NULL);
2001
2002         if (t->left != NULL)
2003                 (DISCARD) nfatree(v, t->left, f);
2004         if (t->right != NULL)
2005                 (DISCARD) nfatree(v, t->right, f);
2006
2007         return nfanode(v, t, f);
2008 }
2009
2010 /*
2011  * nfanode - do one NFA for nfatree
2012  */
2013 static long                                             /* optimize results */
2014 nfanode(struct vars * v,
2015                 struct subre * t,
2016                 FILE *f)                                /* for debug output */
2017 {
2018         struct nfa *nfa;
2019         long            ret = 0;
2020
2021         assert(t->begin != NULL);
2022
2023 #ifdef REG_DEBUG
2024         if (f != NULL)
2025         {
2026                 char            idbuf[50];
2027
2028                 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
2029                                 stid(t, idbuf, sizeof(idbuf)));
2030         }
2031 #endif
2032         nfa = newnfa(v, v->cm, v->nfa);
2033         NOERRZ();
2034         dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
2035         if (!ISERR())
2036         {
2037                 specialcolors(nfa);
2038                 ret = optimize(nfa, f);
2039         }
2040         if (!ISERR())
2041                 compact(nfa, &t->cnfa);
2042
2043         freenfa(nfa);
2044         return ret;
2045 }
2046
2047 /*
2048  * newlacon - allocate a lookahead-constraint subRE
2049  */
2050 static int                                              /* lacon number */
2051 newlacon(struct vars * v,
2052                  struct state * begin,
2053                  struct state * end,
2054                  int pos)
2055 {
2056         int                     n;
2057         struct subre *sub;
2058
2059         if (v->nlacons == 0)
2060         {
2061                 v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
2062                 n = 1;                                  /* skip 0th */
2063                 v->nlacons = 2;
2064                 }
2065         else
2066         {
2067                 v->lacons = (struct subre *) REALLOC(v->lacons,
2068                                                                 (v->nlacons + 1) * sizeof(struct subre));
2069                 n = v->nlacons++;
2070                 }
2071         if (v->lacons == NULL)
2072         {
2073                 ERR(REG_ESPACE);
2074                 return 0;
2075         }
2076         sub = &v->lacons[n];
2077         sub->begin = begin;
2078         sub->end = end;
2079         sub->subno = pos;
2080         ZAPCNFA(sub->cnfa);
2081         return n;
2082 }
2083
2084 /*
2085  * freelacons - free lookahead-constraint subRE vector
2086  */
2087 static void
2088 freelacons(struct subre * subs,
2089                    int n)
2090 {
2091         struct subre *sub;
2092         int                     i;
2093
2094         assert(n > 0);
2095         for (sub = subs + 1, i = n - 1; i > 0; sub++, i--)      /* no 0th */
2096                 if (!NULLCNFA(sub->cnfa))
2097                         freecnfa(&sub->cnfa);
2098         FREE(subs);
2099 }
2100
2101 /*
2102  * rfree - free a whole RE (insides of regfree)
2103  */
2104 static void
2105 rfree(regex_t *re)
2106 {
2107         struct guts *g;
2108
2109         if (re == NULL || re->re_magic != REMAGIC)
2110                 return;
2111
2112         re->re_magic = 0;                       /* invalidate RE */
2113         g = (struct guts *) re->re_guts;
2114         re->re_guts = NULL;
2115         re->re_fns = NULL;
2116         g->magic = 0;
2117         freecm(&g->cmap);
2118         if (g->tree != NULL)
2119                 freesubre((struct vars *) NULL, g->tree);
2120         if (g->lacons != NULL)
2121                 freelacons(g->lacons, g->nlacons);
2122         if (!NULLCNFA(g->search))
2123                 freecnfa(&g->search);
2124         FREE(g);
2125 }
2126
2127 #ifdef REG_DEBUG
2128
2129 /*
2130  * dump - dump an RE in human-readable form
2131  */
2132 static void
2133 dump(regex_t *re,
2134          FILE *f)
2135 {
2136         struct guts *g;
2137         int                     i;
2138
2139         if (re->re_magic != REMAGIC)
2140                 fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2141                                 REMAGIC);
2142         if (re->re_guts == NULL)
2143         {
2144                 fprintf(f, "NULL guts!!!\n");
2145                 return;
2146         }
2147         g = (struct guts *) re->re_guts;
2148         if (g->magic != GUTSMAGIC)
2149                 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2150                                 GUTSMAGIC);
2151
2152         fprintf(f, "\n\n\n========= DUMP ==========\n");
2153         fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2154                         re->re_nsub, re->re_info, re->re_csize, g->ntree);
2155
2156         dumpcolors(&g->cmap, f);
2157         if (!NULLCNFA(g->search))
2158         {
2159                 printf("\nsearch:\n");
2160                 dumpcnfa(&g->search, f);
2161         }
2162         for (i = 1; i < g->nlacons; i++)
2163         {
2164                 fprintf(f, "\nla%d (%s):\n", i,
2165                                 (g->lacons[i].subno) ? "positive" : "negative");
2166                 dumpcnfa(&g->lacons[i].cnfa, f);
2167         }
2168         fprintf(f, "\n");
2169         dumpst(g->tree, f, 0);
2170 }
2171
2172 /*
2173  * dumpst - dump a subRE tree
2174  */
2175 static void
2176 dumpst(struct subre * t,
2177            FILE *f,
2178            int nfapresent)                      /* is the original NFA still around? */
2179 {
2180         if (t == NULL)
2181                 fprintf(f, "null tree\n");
2182         else
2183                 stdump(t, f, nfapresent);
2184         fflush(f);
2185 }
2186
2187 /*
2188  * stdump - recursive guts of dumpst
2189  */
2190 static void
2191 stdump(struct subre * t,
2192            FILE *f,
2193            int nfapresent)                      /* is the original NFA still around? */
2194 {
2195         char            idbuf[50];
2196
2197         fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2198         if (t->flags & LONGER)
2199                 fprintf(f, " longest");
2200         if (t->flags & SHORTER)
2201                 fprintf(f, " shortest");
2202         if (t->flags & MIXED)
2203                 fprintf(f, " hasmixed");
2204         if (t->flags & CAP)
2205                 fprintf(f, " hascapture");
2206         if (t->flags & BACKR)
2207                 fprintf(f, " hasbackref");
2208         if (!(t->flags & INUSE))
2209                 fprintf(f, " UNUSED");
2210         if (t->subno != 0)
2211                 fprintf(f, " (#%d)", t->subno);
2212         if (t->min != 1 || t->max != 1)
2213         {
2214                 fprintf(f, " {%d,", t->min);
2215                 if (t->max != INFINITY)
2216                         fprintf(f, "%d", t->max);
2217                 fprintf(f, "}");
2218         }
2219         if (nfapresent)
2220                 fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
2221         if (t->left != NULL)
2222                 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2223         if (t->right != NULL)
2224                 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2225         if (!NULLCNFA(t->cnfa))
2226         {
2227                 fprintf(f, "\n");
2228                 dumpcnfa(&t->cnfa, f);
2229                 fprintf(f, "\n");
2230         }
2231         if (t->left != NULL)
2232                 stdump(t->left, f, nfapresent);
2233         if (t->right != NULL)
2234                 stdump(t->right, f, nfapresent);
2235 }
2236
2237 /*
2238  * stid - identify a subtree node for dumping
2239  */
2240 static char *                                   /* points to buf or constant string */
2241 stid(struct subre * t,
2242          char *buf,
2243          size_t bufsize)
2244 {
2245         /* big enough for hex int or decimal t->retry? */
2246         if (bufsize < sizeof(int) * 2 + 3 || bufsize < sizeof(t->retry) * 3 + 1)
2247                 return "unable";
2248         if (t->retry != 0)
2249                 sprintf(buf, "%d", t->retry);
2250         else
2251                 sprintf(buf, "0x%x", (int) t);  /* may lose bits, that's okay */
2252         return buf;
2253 }
2254 #endif   /* REG_DEBUG */
2255
2256
2257 #include "regc_lex.c"
2258 #include "regc_color.c"
2259 #include "regc_nfa.c"
2260 #include "regc_cvec.c"
2261 #include "regc_locale.c"