]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regcomp.c
   2  * re_*comp and friends - compile REs 
   3  * This file #includes several others (see the bottom). 
   5  * Copyright (c) 1998, 1999 Henry Spencer.      All rights reserved. 
   7  * Development of this software was funded, in part, by Cray Research Inc., 
   8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 
   9  * Corporation, none of whom are responsible for the results.  The author 
  12  * Redistribution and use in source and binary forms -- with or without 
  13  * modification -- are permitted for any purpose, provided that 
  14  * redistributions in source form retain this entire copyright notice and 
  15  * indicate the origin and nature of any modifications. 
  17  * I'd appreciate being given credit for this package in the documentation 
  18  * of software which uses it, but that is not a requirement. 
  20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 
  21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
  22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL 
  23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
  26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
  27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
  28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
  29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  31  * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $ 
  38  * forward declarations, up here so forward datatypes etc. are defined early 
  40 /* === regcomp.c === */ 
  41 static void moresubs(struct vars 
*, int); 
  42 static int      freev(struct vars 
*, int); 
  43 static void makesearch(struct vars 
*, struct nfa 
*); 
  44 static struct subre 
*parse(struct vars 
*, int, int, struct state 
*, struct state 
*); 
  45 static struct subre 
*parsebranch(struct vars 
*, int, int, struct state 
*, struct state 
*, int); 
  46 static void parseqatom(struct vars 
*, int, int, struct state 
*, struct state 
*, struct subre 
*); 
  47 static void nonword(struct vars 
*, int, struct state 
*, struct state 
*); 
  48 static void word(struct vars 
*, int, struct state 
*, struct state 
*); 
  49 static int      scannum(struct vars 
*); 
  50 static void repeat(struct vars 
*, struct state 
*, struct state 
*, int, int); 
  51 static void bracket(struct vars 
*, struct state 
*, struct state 
*); 
  52 static void cbracket(struct vars 
*, struct state 
*, struct state 
*); 
  53 static void brackpart(struct vars 
*, struct state 
*, struct state 
*); 
  54 static chr 
*scanplain(struct vars 
*); 
  55 static void leaders(struct vars 
*, struct cvec 
*); 
  56 static void onechr(struct vars 
*, chr
, struct state 
*, struct state 
*); 
  57 static void dovec(struct vars 
*, struct cvec 
*, struct state 
*, struct state 
*); 
  58 static celt 
nextleader(struct vars 
*, chr
, chr
); 
  59 static void wordchrs(struct vars 
*); 
  60 static struct subre 
*subre(struct vars 
*, int, int, struct state 
*, struct state 
*); 
  61 static void freesubre(struct vars 
*, struct subre 
*); 
  62 static void freesrnode(struct vars 
*, struct subre 
*); 
  63 static void optst(struct vars 
*, struct subre 
*); 
  64 static int      numst(struct subre 
*, int); 
  65 static void markst(struct subre 
*); 
  66 static void cleanst(struct vars 
*); 
  67 static long nfatree(struct vars 
*, struct subre 
*, FILE *); 
  68 static long nfanode(struct vars 
*, struct subre 
*, FILE *); 
  69 static int      newlacon(struct vars 
*, struct state 
*, struct state 
*, int); 
  70 static void freelacons(struct subre 
*, int); 
  71 static void rfree(regex_t 
*); 
  74 static void dump(regex_t 
*, FILE *); 
  75 static void dumpst(struct subre 
*, FILE *, int); 
  76 static void stdump(struct subre 
*, FILE *, int); 
  77 static char *stid(struct subre 
*, char *, size_t); 
  79 /* === regc_lex.c === */ 
  80 static void lexstart(struct vars 
*); 
  81 static void prefixes(struct vars 
*); 
  82 static void lexnest(struct vars 
*, chr 
*, chr 
*); 
  83 static void lexword(struct vars 
*); 
  84 static int      next(struct vars 
*); 
  85 static int      lexescape(struct vars 
*); 
  86 static chr      
lexdigits(struct vars 
*, int, int, int); 
  87 static int      brenext(struct vars 
*, chr
); 
  88 static void skip(struct vars 
*); 
  89 static chr      
newline(void); 
  90 static chr      
chrnamed(struct vars 
*, chr 
*, chr 
*, chr
); 
  92 /* === regc_color.c === */ 
  93 static void initcm(struct vars 
*, struct colormap 
*); 
  94 static void freecm(struct colormap 
*); 
  95 static void cmtreefree(struct colormap 
*, union tree 
*, int); 
  96 static color 
setcolor(struct colormap 
*, chr
, pcolor
); 
  97 static color 
maxcolor(struct colormap 
*); 
  98 static color 
newcolor(struct colormap 
*); 
  99 static void freecolor(struct colormap 
*, pcolor
); 
 100 static color 
pseudocolor(struct colormap 
*); 
 101 static color 
subcolor(struct colormap 
*, chr c
); 
 102 static color 
newsub(struct colormap 
*, pcolor
); 
 103 static void subrange(struct vars 
*, chr
, chr
, struct state 
*, struct state 
*); 
 104 static void subblock(struct vars 
*, chr
, struct state 
*, struct state 
*); 
 105 static void okcolors(struct nfa 
*, struct colormap 
*); 
 106 static void colorchain(struct colormap 
*, struct arc 
*); 
 107 static void uncolorchain(struct colormap 
*, struct arc 
*); 
 108 static int      singleton(struct colormap 
*, chr c
); 
 109 static void rainbow(struct nfa 
*, struct colormap 
*, int, pcolor
, struct state 
*, struct state 
*); 
 110 static void colorcomplement(struct nfa 
*, struct colormap 
*, int, struct state 
*, struct state 
*, struct state 
*); 
 113 static void dumpcolors(struct colormap 
*, FILE *); 
 114 static void fillcheck(struct colormap 
*, union tree 
*, int, FILE *); 
 115 static void dumpchr(chr
, FILE *); 
 117 /* === regc_nfa.c === */ 
 118 static struct nfa 
*newnfa(struct vars 
*, struct colormap 
*, struct nfa 
*); 
 119 static void freenfa(struct nfa 
*); 
 120 static struct state 
*newstate(struct nfa 
*); 
 121 static struct state 
*newfstate(struct nfa 
*, int flag
); 
 122 static void dropstate(struct nfa 
*, struct state 
*); 
 123 static void freestate(struct nfa 
*, struct state 
*); 
 124 static void destroystate(struct nfa 
*, struct state 
*); 
 125 static void newarc(struct nfa 
*, int, pcolor
, struct state 
*, struct state 
*); 
 126 static struct arc 
*allocarc(struct nfa 
*, struct state 
*); 
 127 static void freearc(struct nfa 
*, struct arc 
*); 
 128 static struct arc 
*findarc(struct state 
*, int, pcolor
); 
 129 static void cparc(struct nfa 
*, struct arc 
*, struct state 
*, struct state 
*); 
 130 static void moveins(struct nfa 
*, struct state 
*, struct state 
*); 
 131 static void copyins(struct nfa 
*, struct state 
*, struct state 
*); 
 132 static void moveouts(struct nfa 
*, struct state 
*, struct state 
*); 
 133 static void copyouts(struct nfa 
*, struct state 
*, struct state 
*); 
 134 static void cloneouts(struct nfa 
*, struct state 
*, struct state 
*, struct state 
*, int); 
 135 static void delsub(struct nfa 
*, struct state 
*, struct state 
*); 
 136 static void deltraverse(struct nfa 
*, struct state 
*, struct state 
*); 
 137 static void dupnfa(struct nfa 
*, struct state 
*, struct state 
*, struct state 
*, struct state 
*); 
 138 static void duptraverse(struct nfa 
*, struct state 
*, struct state 
*); 
 139 static void cleartraverse(struct nfa 
*, struct state 
*); 
 140 static void specialcolors(struct nfa 
*); 
 141 static long optimize(struct nfa 
*, FILE *); 
 142 static void pullback(struct nfa 
*, FILE *); 
 143 static int      pull(struct nfa 
*, struct arc 
*); 
 144 static void pushfwd(struct nfa 
*, FILE *); 
 145 static int      push(struct nfa 
*, struct arc 
*); 
 147 #define INCOMPATIBLE    1               /* destroys arc */ 
 148 #define SATISFIED       2                       /* constraint satisfied */ 
 149 #define COMPATIBLE      3                       /* compatible but not satisfied yet */ 
 150 static int      combine(struct arc 
*, struct arc 
*); 
 151 static void fixempties(struct nfa 
*, FILE *); 
 152 static int      unempty(struct nfa 
*, struct arc 
*); 
 153 static void cleanup(struct nfa 
*); 
 154 static void markreachable(struct nfa 
*, struct state 
*, struct state 
*, struct state 
*); 
 155 static void markcanreach(struct nfa 
*, struct state 
*, struct state 
*, struct state 
*); 
 156 static long analyze(struct nfa 
*); 
 157 static void compact(struct nfa 
*, struct cnfa 
*); 
 158 static void carcsort(struct carc 
*, struct carc 
*); 
 159 static void freecnfa(struct cnfa 
*); 
 160 static void dumpnfa(struct nfa 
*, FILE *); 
 163 static void dumpstate(struct state 
*, FILE *); 
 164 static void dumparcs(struct state 
*, FILE *); 
 165 static int      dumprarcs(struct arc 
*, struct state 
*, FILE *, int); 
 166 static void dumparc(struct arc 
*, struct state 
*, FILE *); 
 167 static void dumpcnfa(struct cnfa 
*, FILE *); 
 168 static void dumpcstate(int, struct carc 
*, struct cnfa 
*, FILE *); 
 170 /* === regc_cvec.c === */ 
 171 static struct cvec 
*newcvec(int, int, int); 
 172 static struct cvec 
*clearcvec(struct cvec 
*); 
 173 static void addchr(struct cvec 
*, chr
); 
 174 static void addrange(struct cvec 
*, chr
, chr
); 
 175 static void addmcce(struct cvec 
*, chr 
*, chr 
*); 
 176 static int      haschr(struct cvec 
*, chr
); 
 177 static struct cvec 
*getcvec(struct vars 
*, int, int, int); 
 178 static void freecvec(struct cvec 
*); 
 180 /* === regc_locale.c === */ 
 181 extern int      wx_isdigit(wx_wchar c
); 
 182 extern int      wx_isalpha(wx_wchar c
); 
 183 extern int      wx_isalnum(wx_wchar c
); 
 184 extern int      wx_isupper(wx_wchar c
); 
 185 extern int      wx_islower(wx_wchar c
); 
 186 extern int      wx_isgraph(wx_wchar c
); 
 187 extern int      wx_ispunct(wx_wchar c
); 
 188 extern int      wx_isspace(wx_wchar c
); 
 189 extern wx_wchar 
wx_toupper(wx_wchar c
); 
 190 extern wx_wchar 
wx_tolower(wx_wchar c
); 
 191 static int      nmcces(struct vars 
*); 
 192 static int      nleaders(struct vars 
*); 
 193 static struct cvec 
*allmcces(struct vars 
*, struct cvec 
*); 
 194 static celt 
element(struct vars 
*, chr 
*, chr 
*); 
 195 static struct cvec 
*range(struct vars 
*, celt
, celt
, int); 
 196 static int      before(celt
, celt
); 
 197 static struct cvec 
*eclass(struct vars 
*, celt
, int); 
 198 static struct cvec 
*cclass(struct vars 
*, chr 
*, chr 
*, int); 
 199 static struct cvec 
*allcases(struct vars 
*, chr
); 
 200 static int      cmp(const chr 
*, const chr 
*, size_t); 
 201 static int      casecmp(const chr 
*, const chr 
*, size_t); 
 204 /* internal variables, bundled for easy passing around */ 
 208         chr                
*now
;                        /* scan pointer into string */ 
 209         chr                
*stop
;                       /* end of string */ 
 210         chr                
*savenow
;            /* saved now and stop for "subroutine 
 213         int                     err
;                    /* error code (0 if none) */ 
 214         int                     cflags
;                 /* copy of compile flags */ 
 215         int                     lasttype
;               /* type of previous token */ 
 216         int                     nexttype
;               /* type of next token */ 
 217         chr                     nextvalue
;              /* value (if any) of next token */ 
 218         int                     lexcon
;                 /* lexical context type (see lex.c) */ 
 219         int                     nsubexp
;                /* subexpression count */ 
 220         struct subre 
**subs
;            /* subRE pointer vector */ 
 221         size_t          nsubs
;                  /* length of vector */ 
 222         struct subre 
*sub10
[10];        /* initial vector, enough for most */ 
 223         struct nfa 
*nfa
;                        /* the NFA */ 
 224         struct colormap 
*cm
;            /* character color map */ 
 225         color           nlcolor
;                /* color of newline */ 
 226         struct state 
*wordchrs
;         /* state in nfa holding word-char outarcs */ 
 227         struct subre 
*tree
;                     /* subexpression tree */ 
 228         struct subre 
*treechain
;        /* all tree nodes allocated */ 
 229         struct subre 
*treefree
;         /* any free tree nodes */ 
 230         int                     ntree
;                  /* number of tree nodes */ 
 231         struct cvec 
*cv
;                        /* interface cvec */ 
 232         struct cvec 
*cv2
;                       /* utility cvec */ 
 233         struct cvec 
*mcces
;                     /* collating-element information */ 
 234 #define  ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c))) 
 235         struct state 
*mccepbegin
;       /* in nfa, start of MCCE prototypes */ 
 236         struct state 
*mccepend
;         /* in nfa, end of MCCE prototypes */ 
 237         struct subre 
*lacons
;           /* lookahead-constraint vector */ 
 238         int                     nlacons
;                /* size of lacons */ 
 241 /* parsing macros; most know that `v' is the struct vars pointer */ 
 242 #define NEXT()  (next(v))               /* advance by one token */ 
 243 #define SEE(t)  (v->nexttype == (t))    /* is next token this? */ 
 244 #define EAT(t)  (SEE(t) && next(v))             /* if next is this, swallow it */ 
 245 #define VISERR(vv)      ((vv)->err != 0)        /* have we seen an error yet? */ 
 246 #define ISERR() VISERR(v) 
 247 #define VERR(vv,e)      ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\ 
 249 #define ERR(e)  VERR(v, e)              /* record an error */ 
 250 #define NOERR() {if (ISERR()) return;}  /* if error seen, return */ 
 251 #define NOERRN()        {if (ISERR()) return NULL;} /* NOERR with retval */ 
 252 #define NOERRZ()        {if (ISERR()) return 0;}        /* NOERR with retval */ 
 253 #define INSIST(c, e)    ((c) ? 0 : ERR(e))              /* if condition false, 
 255 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ 
 256 #define EMPTYARC(x, y)  newarc(v->nfa, EMPTY, 0, x, y) 
 258 /* token type codes, some also used as NFA arc types */ 
 259 #define EMPTY   'n'                             /* no token present */ 
 260 #define EOS 'e'                                 /* end of string */ 
 261 #define PLAIN   'p'                             /* ordinary character */ 
 262 #define DIGIT   'd'                             /* digit (in bound) */ 
 263 #define BACKREF 'b'                             /* back reference */ 
 264 #define COLLEL  'I'                             /* start of [. */ 
 265 #define ECLASS  'E'                             /* start of [= */ 
 266 #define CCLASS  'C'                             /* start of [: */ 
 267 #define END 'X'                                 /* end of [. [= [: */ 
 268 #define RANGE   'R'                             /* - within [] which might be range delim. */ 
 269 #define LACON   'L'                             /* lookahead constraint subRE */ 
 270 #define AHEAD   'a'                             /* color-lookahead arc */ 
 271 #define BEHIND  'r'                             /* color-lookbehind arc */ 
 272 #define WBDRY   'w'                             /* word boundary constraint */ 
 273 #define NWBDRY  'W'                             /* non-word-boundary constraint */ 
 274 #define SBEGIN  'A'                             /* beginning of string (even if not BOL) */ 
 275 #define SEND    'Z'                             /* end of string (even if not EOL) */ 
 276 #define PREFER  'P'                             /* length preference */ 
 278 /* is an arc colored, and hence on a color chain? */ 
 279 #define COLORED(a)      ((a)->type == PLAIN || (a)->type == AHEAD || \ 
 284 /* static function list */ 
 285 static struct fns functions 
= { 
 286         rfree
,                                          /* regfree insides */ 
 292  * regcomp - compile regular expression 
 299         return wx_regcomp(re
, string
, wx_strlen(string
), flags
); 
 302 wx_regcomp(regex_t 
*re
, 
 308         struct vars 
*v 
= &var
; 
 314         FILE       *debug 
= (flags 
& REG_PROGRESS
) ? stdout 
: (FILE *) NULL
; 
 317         FILE       *debug 
= (FILE *) NULL
; 
 320 #define  CNOERR()        { if (ISERR()) return freev(v, v->err); } 
 324         if (re 
== NULL 
|| string 
== NULL
) 
 326         if ((flags 
& REG_QUOTE
) && 
 327                 (flags 
& (REG_ADVANCED 
| REG_EXPANDED 
| REG_NEWLINE
))) 
 329         if (!(flags 
& REG_EXTENDED
) && (flags 
& REG_ADVF
)) 
 332         /* initial setup (after which freev() is callable) */ 
 334         v
->now 
= (chr 
*) string
; 
 335         v
->stop 
= v
->now 
+ len
; 
 336         v
->savenow 
= v
->savestop 
= NULL
; 
 342         for (j 
= 0; j 
< v
->nsubs
; j
++) 
 346         v
->nlcolor 
= COLORLESS
; 
 356         re
->re_magic 
= REMAGIC
; 
 357         re
->re_info 
= 0;                        /* bits get set during parse */ 
 358         re
->re_csize 
= sizeof(chr
); 
 360         re
->re_fns 
= VS(&functions
); 
 362         /* more complex setup, malloced things */ 
 363         re
->re_guts 
= VS(MALLOC(sizeof(struct guts
))); 
 364         if (re
->re_guts 
== NULL
) 
 365                 return freev(v
, REG_ESPACE
); 
 366         g 
= (struct guts 
*) re
->re_guts
; 
 373         v
->nfa 
= newnfa(v
, v
->cm
, (struct nfa 
*) NULL
); 
 375         v
->cv 
= newcvec(100, 20, 10); 
 377                 return freev(v
, REG_ESPACE
); 
 381                 v
->mcces 
= newcvec(nleaders(v
), 0, i
); 
 383                 v
->mcces 
= allmcces(v
, v
->mcces
); 
 384                 leaders(v
, v
->mcces
); 
 385                 addmcce(v
->mcces
, (chr 
*) NULL
, (chr 
*) NULL
);  /* dummy */ 
 390         lexstart(v
);                            /* also handles prefixes */ 
 391         if ((v
->cflags 
& REG_NLSTOP
) || (v
->cflags 
& REG_NLANCH
)) 
 393                 /* assign newline a unique color */ 
 394                 v
->nlcolor 
= subcolor(v
->cm
, newline()); 
 395                 okcolors(v
->nfa
, v
->cm
); 
 398         v
->tree 
= parse(v
, EOS
, PLAIN
, v
->nfa
->init
, v
->nfa
->final
); 
 399         assert(SEE(EOS
));                       /* even if error; ISERR() => SEE(EOS) */ 
 401         assert(v
->tree 
!= NULL
); 
 403         /* finish setup of nfa and its subre tree */ 
 404         specialcolors(v
->nfa
); 
 409                 fprintf(debug
, "\n\n\n========= RAW ==========\n"); 
 410                 dumpnfa(v
->nfa
, debug
); 
 411                 dumpst(v
->tree
, debug
, 1); 
 415         v
->ntree 
= numst(v
->tree
, 1); 
 421                 fprintf(debug
, "\n\n\n========= TREE FIXED ==========\n"); 
 422                 dumpst(v
->tree
, debug
, 1); 
 426         /* build compacted NFAs for tree and lacons */ 
 427         re
->re_info 
|= nfatree(v
, v
->tree
, debug
); 
 429         assert(v
->nlacons 
== 0 || v
->lacons 
!= NULL
); 
 430         for (i 
= 1; i 
< v
->nlacons
; i
++) 
 434                         fprintf(debug
, "\n\n\n========= LA%d ==========\n", i
); 
 436                 nfanode(v
, &v
->lacons
[i
], debug
); 
 439         if (v
->tree
->flags 
& SHORTER
) 
 442         /* build compacted NFAs for tree, lacons, fast search */ 
 445                 fprintf(debug
, "\n\n\n========= SEARCH ==========\n"); 
 447         /* can sacrifice main NFA now, so use it as work area */ 
 448         (DISCARD
) optimize(v
->nfa
, debug
); 
 450         makesearch(v
, v
->nfa
); 
 452         compact(v
->nfa
, &g
->search
); 
 455         /* looks okay, package it up */ 
 456         re
->re_nsub 
= v
->nsubexp
; 
 457         v
->re 
= NULL
;                           /* freev no longer frees re */ 
 458         g
->magic 
= GUTSMAGIC
; 
 459         g
->cflags 
= v
->cflags
; 
 460         g
->info 
= re
->re_info
; 
 461         g
->nsub 
= re
->re_nsub
; 
 465         g
->compare 
= (v
->cflags 
& REG_ICASE
) ? casecmp 
: cmp
; 
 466         g
->lacons 
= v
->lacons
; 
 468         g
->nlacons 
= v
->nlacons
; 
 471         if (flags 
& REG_DUMP
) 
 480  * moresubs - enlarge subRE vector 
 483 moresubs(struct vars 
* v
, 
 484                  int wanted
)                    /* want enough room for this one */ 
 489         assert(wanted 
> 0 && (size_t) wanted 
>= v
->nsubs
); 
 490         n 
= (size_t) wanted 
*3 / 2 + 1; 
 492         if (v
->subs 
== v
->sub10
) 
 494                 p 
= (struct subre 
**) MALLOC(n 
* sizeof(struct subre 
*)); 
 496                         memcpy(VS(p
), VS(v
->subs
), 
 497                                    v
->nsubs 
* sizeof(struct subre 
*)); 
 500                 p 
= (struct subre 
**) REALLOC(v
->subs
, n 
* sizeof(struct subre 
*)); 
 507         for (p 
= &v
->subs
[v
->nsubs
]; v
->nsubs 
< n
; p
++, v
->nsubs
++) 
 509         assert(v
->nsubs 
== n
); 
 510         assert((size_t) wanted 
< v
->nsubs
); 
 514  * freev - free vars struct's substructures where necessary 
 516  * Optionally does error-number setting, and always returns error code 
 517  * (if any), to make error-handling code terser. 
 520 freev(struct vars 
* v
, 
 525         if (v
->subs 
!= v
->sub10
) 
 530                 freesubre(v
, v
->tree
); 
 531         if (v
->treechain 
!= NULL
) 
 537         if (v
->mcces 
!= NULL
) 
 539         if (v
->lacons 
!= NULL
) 
 540                 freelacons(v
->lacons
, v
->nlacons
); 
 541         ERR(err
);                                       /* nop if err==0 */ 
 547  * makesearch - turn an NFA into a search NFA (implicit prepend of .*?) 
 548  * NFA must have been optimize()d already. 
 551 makesearch(struct vars 
* v
, 
 556         struct state 
*pre 
= nfa
->pre
; 
 561         /* no loops are needed if it's anchored */ 
 562         for (a 
= pre
->outs
; a 
!= NULL
; a 
= a
->outchain
) 
 564                 assert(a
->type 
== PLAIN
); 
 565                 if (a
->co 
!= nfa
->bos
[0] && a
->co 
!= nfa
->bos
[1]) 
 570                 /* add implicit .* in front */ 
 571                 rainbow(nfa
, v
->cm
, PLAIN
, COLORLESS
, pre
, pre
); 
 573                 /* and ^* and \A* too -- not always necessary, but harmless */ 
 574                 newarc(nfa
, PLAIN
, nfa
->bos
[0], pre
, pre
); 
 575                 newarc(nfa
, PLAIN
, nfa
->bos
[1], pre
, pre
); 
 579          * Now here's the subtle part.  Because many REs have no lookback 
 580          * constraints, often knowing when you were in the pre state tells you 
 581          * little; it's the next state(s) that are informative.  But some of 
 582          * them may have other inarcs, i.e. it may be possible to make actual 
 583          * progress and then return to one of them.  We must de-optimize such 
 584          * cases, splitting each such state into progress and no-progress 
 588         /* first, make a list of the states */ 
 590         for (a 
= pre
->outs
; a 
!= NULL
; a 
= a
->outchain
) 
 593                 for (b 
= s
->ins
; b 
!= NULL
; b 
= b
->inchain
) 
 597                 {                                               /* must be split */ 
 604         for (s 
= slist
; s 
!= NULL
; s 
= s2
) 
 607                 copyouts(nfa
, s
, s2
); 
 608                 for (a 
= s
->ins
; a 
!= NULL
; a 
= b
) 
 613                                 cparc(nfa
, a
, a
->from
, s2
); 
 618                 s
->tmp 
= NULL
;                  /* clean up while we're at it */ 
 623  * parse - parse an RE 
 625  * This is actually just the top level, which parses a bunch of branches 
 626  * tied together with '|'.      They appear in the tree as the left children 
 627  * of a chain of '|' subres. 
 629 static struct subre 
* 
 630 parse(struct vars 
* v
, 
 631           int stopper
,                          /* EOS or ')' */ 
 632           int type
,                                     /* LACON (lookahead subRE) or PLAIN */ 
 633           struct state 
* init
,          /* initial state */ 
 634           struct state 
* final
)         /* final state */ 
 636         struct state 
*left
;                     /* scaffolding for branch */ 
 638         struct subre 
*branches
;         /* top level */ 
 639         struct subre 
*branch
;           /* current branch */ 
 640         struct subre 
*t
;                        /* temporary */ 
 641         int                     firstbranch
;    /* is this the first branch? */ 
 643         assert(stopper 
== ')' || stopper 
== EOS
); 
 645         branches 
= subre(v
, '|', LONGER
, init
, final
); 
 653                         /* need a place to hang it */ 
 654                         branch
->right 
= subre(v
, '|', LONGER
, init
, final
); 
 656                         branch 
= branch
->right
; 
 659                 left 
= newstate(v
->nfa
); 
 660                 right 
= newstate(v
->nfa
); 
 662                 EMPTYARC(init
, left
); 
 663                 EMPTYARC(right
, final
); 
 665                 branch
->left 
= parsebranch(v
, stopper
, type
, left
, right
, 0); 
 667                 branch
->flags 
|= UP(branch
->flags 
| branch
->left
->flags
); 
 668                 if ((branch
->flags 
& ~branches
->flags
) != 0)    /* new flags */ 
 669                         for (t 
= branches
; t 
!= branch
; t 
= t
->right
) 
 670                                 t
->flags 
|= branch
->flags
; 
 672         assert(SEE(stopper
) || SEE(EOS
)); 
 676                 assert(stopper 
== ')' && SEE(EOS
)); 
 680         /* optimize out simple cases */ 
 681         if (branch 
== branches
) 
 682         {                                                       /* only one branch */ 
 683                 assert(branch
->right 
== NULL
); 
 686                 freesubre(v
, branches
); 
 689         else if (!MESSY(branches
->flags
)) 
 690         {                                                       /* no interesting innards */ 
 691                 freesubre(v
, branches
->left
); 
 692                 branches
->left 
= NULL
; 
 693                 freesubre(v
, branches
->right
); 
 694                 branches
->right 
= NULL
; 
 702  * parsebranch - parse one branch of an RE 
 704  * This mostly manages concatenation, working closely with parseqatom(). 
 705  * Concatenated things are bundled up as much as possible, with separate 
 706  * ',' nodes introduced only when necessary due to substructure. 
 708 static struct subre 
* 
 709 parsebranch(struct vars 
* v
, 
 710                         int stopper
,            /* EOS or ')' */ 
 711                         int type
,                       /* LACON (lookahead subRE) or PLAIN */ 
 712                         struct state 
* left
,    /* leftmost state */ 
 713                         struct state 
* right
,           /* rightmost state */ 
 714                         int partial
)            /* is this only part of a branch? */ 
 716         struct state 
*lp
;                       /* left end of current construct */ 
 717         int                     seencontent
;    /* is there anything in this branch yet? */ 
 722         t 
= subre(v
, '=', 0, left
, right
);      /* op '=' is tentative */ 
 724         while (!SEE('|') && !SEE(stopper
) && !SEE(EOS
)) 
 727                 {                                               /* implicit concat operator */ 
 728                         lp 
= newstate(v
->nfa
); 
 730                         moveins(v
->nfa
, right
, lp
); 
 734                 /* NB, recursion in parseqatom() may swallow rest of branch */ 
 735                 parseqatom(v
, stopper
, type
, lp
, right
, t
); 
 743                 EMPTYARC(left
, right
); 
 750  * parseqatom - parse one quantified atom or constraint of an RE 
 752  * The bookkeeping near the end cooperates very closely with parsebranch(); 
 753  * in particular, it contains a recursion that can involve parsing the rest 
 754  * of the branch, making this function's name somewhat inaccurate. 
 757 parseqatom(struct vars 
* v
, 
 758                    int stopper
,                 /* EOS or ')' */ 
 759                    int type
,                    /* LACON (lookahead subRE) or PLAIN */ 
 760                    struct state 
* lp
,   /* left state to hang it on */ 
 761                    struct state 
* rp
,   /* right state to hang it on */ 
 762                    struct subre 
* top
)  /* subtree top */ 
 764         struct state 
*s
;                        /* temporaries for new states */ 
 767 #define  ARCV(t, val)    newarc(v->nfa, t, val, lp, rp) 
 770         struct subre 
*atom
;                     /* atom's subtree */ 
 772         int                     cap
;                    /* capturing parens? */ 
 773         int                     pos
;                    /* positive lookahead? */ 
 774         int                     subno
;                  /* capturing-parens or backref number */ 
 776         int                     qprefer
;                /* quantifier short/long preference */ 
 778         struct subre 
**atomp
;           /* where the pointer to atom is */ 
 780         /* initial bookkeeping */ 
 782         assert(lp
->nouts 
== 0);         /* must string new code */ 
 783         assert(rp
->nins 
== 0);          /* between lp and rp */ 
 784         subno 
= 0;                                      /* just to shut lint up */ 
 786         /* an atom or constraint... */ 
 787         atomtype 
= v
->nexttype
; 
 790                         /* first, constraints, which end by returning */ 
 793                         if (v
->cflags 
& REG_NLANCH
) 
 794                                 ARCV(BEHIND
, v
->nlcolor
); 
 800                         if (v
->cflags 
& REG_NLANCH
) 
 801                                 ARCV(AHEAD
, v
->nlcolor
); 
 806                         ARCV('^', 1);           /* BOL */ 
 807                         ARCV('^', 0);           /* or BOS */ 
 812                         ARCV('$', 1);           /* EOL */ 
 813                         ARCV('$', 0);           /* or EOS */ 
 818                         wordchrs(v
);            /* does NEXT() */ 
 819                         s 
= newstate(v
->nfa
); 
 821                         nonword(v
, BEHIND
, lp
, s
); 
 822                         word(v
, AHEAD
, s
, rp
); 
 826                         wordchrs(v
);            /* does NEXT() */ 
 827                         s 
= newstate(v
->nfa
); 
 829                         word(v
, BEHIND
, lp
, s
); 
 830                         nonword(v
, AHEAD
, s
, rp
); 
 834                         wordchrs(v
);            /* does NEXT() */ 
 835                         s 
= newstate(v
->nfa
); 
 837                         nonword(v
, BEHIND
, lp
, s
); 
 838                         word(v
, AHEAD
, s
, rp
); 
 839                         s 
= newstate(v
->nfa
); 
 841                         word(v
, BEHIND
, lp
, s
); 
 842                         nonword(v
, AHEAD
, s
, rp
); 
 846                         wordchrs(v
);            /* does NEXT() */ 
 847                         s 
= newstate(v
->nfa
); 
 849                         word(v
, BEHIND
, lp
, s
); 
 850                         word(v
, AHEAD
, s
, rp
); 
 851                         s 
= newstate(v
->nfa
); 
 853                         nonword(v
, BEHIND
, lp
, s
); 
 854                         nonword(v
, AHEAD
, s
, rp
); 
 857                 case LACON
:                             /* lookahead constraint */ 
 860                         s 
= newstate(v
->nfa
); 
 861                         s2 
= newstate(v
->nfa
); 
 863                         t 
= parse(v
, ')', LACON
, s
, s2
); 
 864                         freesubre(v
, t
);        /* internal structure irrelevant */ 
 865                         assert(SEE(')') || ISERR()); 
 867                         n 
= newlacon(v
, s
, s2
, pos
); 
 872                         /* then errors, to get them out of the way */ 
 884                         /* then plain characters, and minor variants on that theme */ 
 885                 case ')':                               /* unbalanced paren */ 
 886                         if ((v
->cflags 
& REG_ADVANCED
) != REG_EXTENDED
) 
 891                         /* legal in EREs due to specification botch */ 
 893                         /* fallthrough into case PLAIN */ 
 895                         onechr(v
, v
->nextvalue
, lp
, rp
); 
 896                         okcolors(v
->nfa
, v
->cm
); 
 901                         if (v
->nextvalue 
== 1) 
 905                         assert(SEE(']') || ISERR()); 
 909                         rainbow(v
->nfa
, v
->cm
, PLAIN
, 
 910                                         (v
->cflags 
& REG_NLSTOP
) ? v
->nlcolor 
: COLORLESS
, 
 914                         /* and finally the ugly stuff */ 
 915                 case '(':                               /* value flags as capturing or non */ 
 916                         cap 
= (type 
== LACON
) ? 0 : v
->nextvalue
; 
 921                                 if ((size_t) subno 
>= v
->nsubs
) 
 923                                 assert((size_t) subno 
< v
->nsubs
); 
 926                                 atomtype 
= PLAIN
;               /* something that's not '(' */ 
 928                         /* need new endpoints because tree will contain pointers */ 
 929                         s 
= newstate(v
->nfa
); 
 930                         s2 
= newstate(v
->nfa
); 
 935                         atom 
= parse(v
, ')', PLAIN
, s
, s2
); 
 936                         assert(SEE(')') || ISERR()); 
 941                                 v
->subs
[subno
] = atom
; 
 942                                 t 
= subre(v
, '(', atom
->flags 
| CAP
, lp
, rp
); 
 948                         /* postpone everything else pending possible {0} */ 
 950                 case BACKREF
:                   /* the Feature From The Black Lagoon */ 
 951                         INSIST(type 
!= LACON
, REG_ESUBREG
); 
 952                         INSIST(v
->nextvalue 
< v
->nsubs
, REG_ESUBREG
); 
 953                         INSIST(v
->subs
[v
->nextvalue
] != NULL
, REG_ESUBREG
); 
 955                         assert(v
->nextvalue 
> 0); 
 956                         atom 
= subre(v
, 'b', BACKR
, lp
, rp
); 
 957                         subno 
= v
->nextvalue
; 
 959                         EMPTYARC(lp
, rp
);       /* temporarily, so there's something */ 
 964         /* ...and an atom may be followed by a quantifier */ 
 970                         qprefer 
= (v
->nextvalue
) ? LONGER 
: SHORTER
; 
 976                         qprefer 
= (v
->nextvalue
) ? LONGER 
: SHORTER
; 
 982                         qprefer 
= (v
->nextvalue
) ? LONGER 
: SHORTER
; 
 999                                 /* {m,n} exercises preference, even if it's {m,m} */ 
1000                                 qprefer 
= (v
->nextvalue
) ? LONGER 
: SHORTER
; 
1005                                 /* {m} passes operand's preference through */ 
1009                         {                                       /* catches errors too */ 
1015                 default:                                /* no quantifier */ 
1021         /* annoying special case:  {0} or {0,0} cancels everything */ 
1022         if (m 
== 0 && n 
== 0) 
1026                 if (atomtype 
== '(') 
1027                         v
->subs
[subno
] = NULL
; 
1028                 delsub(v
->nfa
, lp
, rp
); 
1033         /* if not a messy case, avoid hard part */ 
1034         assert(!MESSY(top
->flags
)); 
1035         f 
= top
->flags 
| qprefer 
| ((atom 
!= NULL
) ? atom
->flags 
: 0); 
1036         if (atomtype 
!= '(' && atomtype 
!= BACKREF 
&& !MESSY(UP(f
))) 
1038                 if (!(m 
== 1 && n 
== 1)) 
1039                         repeat(v
, lp
, rp
, m
, n
); 
1047          * hard part:  something messy That is, capturing parens, back 
1048          * reference, short/long clash, or an atom with substructure 
1049          * containing one of those. 
1052         /* now we'll need a subre for the contents even if they're boring */ 
1055                 atom 
= subre(v
, '=', 0, lp
, rp
); 
1060          * prepare a general-purpose state skeleton 
1062          * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / / 
1063          * [lp] ----> [s2] ----bypass--------------------- 
1065          * where bypass is an empty, and prefix is some repetitions of atom 
1067         s 
= newstate(v
->nfa
);           /* first, new endpoints for the atom */ 
1068         s2 
= newstate(v
->nfa
); 
1070         moveouts(v
->nfa
, lp
, s
); 
1071         moveins(v
->nfa
, rp
, s2
); 
1075         s 
= newstate(v
->nfa
);           /* and spots for prefix and bypass */ 
1076         s2 
= newstate(v
->nfa
); 
1082         /* break remaining subRE into x{...} and what follows */ 
1083         t 
= subre(v
, '.', COMBINE(qprefer
, atom
->flags
), lp
, rp
); 
1086         /* here we should recurse... but we must postpone that to the end */ 
1088         /* split top into prefix and remaining */ 
1089         assert(top
->op 
== '=' && top
->left 
== NULL 
&& top
->right 
== NULL
); 
1090         top
->left 
= subre(v
, '=', top
->flags
, top
->begin
, lp
); 
1094         /* if it's a backref, now is the time to replicate the subNFA */ 
1095         if (atomtype 
== BACKREF
) 
1097                 assert(atom
->begin
->nouts 
== 1);                /* just the EMPTY */ 
1098                 delsub(v
->nfa
, atom
->begin
, atom
->end
); 
1099                 assert(v
->subs
[subno
] != NULL
); 
1100                 /* and here's why the recursion got postponed:  it must */ 
1101                 /* wait until the skeleton is filled in, because it may */ 
1102                 /* hit a backref that wants to copy the filled-in skeleton */ 
1103                 dupnfa(v
->nfa
, v
->subs
[subno
]->begin
, v
->subs
[subno
]->end
, 
1104                            atom
->begin
, atom
->end
); 
1108         /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */ 
1111                 EMPTYARC(s2
, atom
->end
);        /* the bypass */ 
1112                 assert(PREF(qprefer
) != 0); 
1113                 f 
= COMBINE(qprefer
, atom
->flags
); 
1114                 t 
= subre(v
, '|', f
, lp
, atom
->end
); 
1117                 t
->right 
= subre(v
, '|', PREF(f
), s2
, atom
->end
); 
1119                 t
->right
->left 
= subre(v
, '=', 0, s2
, atom
->end
); 
1126         /* deal with the rest of the quantifier */ 
1127         if (atomtype 
== BACKREF
) 
1129                 /* special case:  backrefs have internal quantifiers */ 
1130                 EMPTYARC(s
, atom
->begin
);               /* empty prefix */ 
1131                 /* just stuff everything into atom */ 
1132                 repeat(v
, atom
->begin
, atom
->end
, m
, n
); 
1133                 atom
->min 
= (short) m
; 
1134                 atom
->max 
= (short) n
; 
1135                 atom
->flags 
|= COMBINE(qprefer
, atom
->flags
); 
1137         else if (m 
== 1 && n 
== 1) 
1139                 /* no/vacuous quantifier:  done */ 
1140                 EMPTYARC(s
, atom
->begin
);               /* empty prefix */ 
1144                 /* turn x{m,n} into x{m-1,n-1}x, with capturing */ 
1145                 /* parens in only second x */ 
1146                 dupnfa(v
->nfa
, atom
->begin
, atom
->end
, s
, atom
->begin
); 
1147                 assert(m 
>= 1 && m 
!= INFINITY 
&& n 
>= 1); 
1148                 repeat(v
, s
, atom
->begin
, m 
- 1, (n 
== INFINITY
) ? n 
: n 
- 1); 
1149                 f 
= COMBINE(qprefer
, atom
->flags
); 
1150                 t 
= subre(v
, '.', f
, s
, atom
->end
);             /* prefix and atom */ 
1152                 t
->left 
= subre(v
, '=', PREF(f
), s
, atom
->begin
); 
1158         /* and finally, look after that postponed recursion */ 
1160         if (!(SEE('|') || SEE(stopper
) || SEE(EOS
))) 
1161                 t
->right 
= parsebranch(v
, stopper
, type
, atom
->end
, rp
, 1); 
1164                 EMPTYARC(atom
->end
, rp
); 
1165                 t
->right 
= subre(v
, '=', 0, atom
->end
, rp
); 
1167         assert(SEE('|') || SEE(stopper
) || SEE(EOS
)); 
1168         t
->flags 
|= COMBINE(t
->flags
, t
->right
->flags
); 
1169         top
->flags 
|= COMBINE(top
->flags
, t
->flags
); 
1173  * nonword - generate arcs for non-word-character ahead or behind 
1176 nonword(struct vars 
* v
, 
1177                 int dir
,                                /* AHEAD or BEHIND */ 
1181         int                     anchor 
= (dir 
== AHEAD
) ? '$' : '^'; 
1183         assert(dir 
== AHEAD 
|| dir 
== BEHIND
); 
1184         newarc(v
->nfa
, anchor
, 1, lp
, rp
); 
1185         newarc(v
->nfa
, anchor
, 0, lp
, rp
); 
1186         colorcomplement(v
->nfa
, v
->cm
, dir
, v
->wordchrs
, lp
, rp
); 
1187         /* (no need for special attention to \n) */ 
1191  * word - generate arcs for word character ahead or behind 
1194 word(struct vars 
* v
, 
1195          int dir
,                                       /* AHEAD or BEHIND */ 
1199         assert(dir 
== AHEAD 
|| dir 
== BEHIND
); 
1200         cloneouts(v
->nfa
, v
->wordchrs
, lp
, rp
, dir
); 
1201         /* (no need for special attention to \n) */ 
1205  * scannum - scan a number 
1207 static int                                              /* value, <= DUPMAX */ 
1208 scannum(struct vars 
* v
) 
1212         while (SEE(DIGIT
) && n 
< DUPMAX
) 
1214                 n 
= n 
* 10 + v
->nextvalue
; 
1217         if (SEE(DIGIT
) || n 
> DUPMAX
) 
1226  * repeat - replicate subNFA for quantifiers 
1228  * The duplication sequences used here are chosen carefully so that any 
1229  * pointers starting out pointing into the subexpression end up pointing into 
1230  * the last occurrence.  (Note that it may not be strung between the same 
1231  * left and right end states, however!)  This used to be important for the 
1232  * subRE tree, although the important bits are now handled by the in-line 
1233  * code in parse(), and when this is called, it doesn't matter any more. 
1236 repeat(struct vars 
* v
, 
1244 #define  PAIR(x, y)  ((x)*4 + (y)) 
1245 #define  REDUCE(x)       ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) 
1246         const int       rm 
= REDUCE(m
); 
1247         const int       rn 
= REDUCE(n
); 
1251         switch (PAIR(rm
, rn
)) 
1253                 case PAIR(0, 0):                /* empty string */ 
1254                         delsub(v
->nfa
, lp
, rp
); 
1257                 case PAIR(0, 1):                /* do as x| */ 
1260                 case PAIR(0, SOME
):             /* do as x{1,n}| */ 
1261                         repeat(v
, lp
, rp
, 1, n
); 
1265                 case PAIR(0, INF
):              /* loop x around */ 
1266                         s 
= newstate(v
->nfa
); 
1268                         moveouts(v
->nfa
, lp
, s
); 
1269                         moveins(v
->nfa
, rp
, s
); 
1273                 case PAIR(1, 1):                /* no action required */ 
1275                 case PAIR(1, SOME
):             /* do as x{0,n-1}x = (x{1,n-1}|)x */ 
1276                         s 
= newstate(v
->nfa
); 
1278                         moveouts(v
->nfa
, lp
, s
); 
1279                         dupnfa(v
->nfa
, s
, rp
, lp
, s
); 
1281                         repeat(v
, lp
, s
, 1, n 
- 1); 
1285                 case PAIR(1, INF
):              /* add loopback arc */ 
1286                         s 
= newstate(v
->nfa
); 
1287                         s2 
= newstate(v
->nfa
); 
1289                         moveouts(v
->nfa
, lp
, s
); 
1290                         moveins(v
->nfa
, rp
, s2
); 
1295                 case PAIR(SOME
, SOME
):  /* do as x{m-1,n-1}x */ 
1296                         s 
= newstate(v
->nfa
); 
1298                         moveouts(v
->nfa
, lp
, s
); 
1299                         dupnfa(v
->nfa
, s
, rp
, lp
, s
); 
1301                         repeat(v
, lp
, s
, m 
- 1, n 
- 1); 
1303                 case PAIR(SOME
, INF
):   /* do as x{m-1,}x */ 
1304                         s 
= newstate(v
->nfa
); 
1306                         moveouts(v
->nfa
, lp
, s
); 
1307                         dupnfa(v
->nfa
, s
, rp
, lp
, s
); 
1309                         repeat(v
, lp
, s
, m 
- 1, n
); 
1318  * bracket - handle non-complemented bracket expression 
1319  * Also called from cbracket for complemented bracket expressions. 
1322 bracket(struct vars 
* v
, 
1328         while (!SEE(']') && !SEE(EOS
)) 
1329                 brackpart(v
, lp
, rp
); 
1330         assert(SEE(']') || ISERR()); 
1331         okcolors(v
->nfa
, v
->cm
); 
1335  * cbracket - handle complemented bracket expression 
1336  * We do it by calling bracket() with dummy endpoints, and then complementing 
1337  * the result.  The alternative would be to invoke rainbow(), and then delete 
1338  * arcs as the b.e. is seen... but that gets messy. 
1341 cbracket(struct vars 
* v
, 
1345         struct state 
*left 
= newstate(v
->nfa
); 
1346         struct state 
*right 
= newstate(v
->nfa
); 
1348         struct arc 
*a
;                          /* arc from lp */ 
1349         struct arc 
*ba
;                         /* arc from left, from bracket() */ 
1350         struct arc 
*pa
;                         /* MCCE-prototype arc */ 
1356         bracket(v
, left
, right
); 
1357         if (v
->cflags 
& REG_NLSTOP
) 
1358                 newarc(v
->nfa
, PLAIN
, v
->nlcolor
, left
, right
); 
1361         assert(lp
->nouts 
== 0);         /* all outarcs will be ours */ 
1363         /* easy part of complementing */ 
1364         colorcomplement(v
->nfa
, v
->cm
, PLAIN
, left
, lp
, rp
); 
1366         if (v
->mcces 
== NULL
) 
1367         {                                                       /* no MCCEs -- we're done */ 
1368                 dropstate(v
->nfa
, left
); 
1369                 assert(right
->nins 
== 0); 
1370                 freestate(v
->nfa
, right
); 
1374         /* but complementing gets messy in the presence of MCCEs... */ 
1376         for (p 
= v
->mcces
->chrs
, i 
= v
->mcces
->nchrs
; i 
> 0; p
++, i
--) 
1378                 co 
= GETCOLOR(v
->cm
, *p
); 
1379                 a 
= findarc(lp
, PLAIN
, co
); 
1380                 ba 
= findarc(left
, PLAIN
, co
); 
1388                 s 
= newstate(v
->nfa
); 
1390                 newarc(v
->nfa
, PLAIN
, co
, lp
, s
); 
1392                 pa 
= findarc(v
->mccepbegin
, PLAIN
, co
); 
1395                 {                                               /* easy case, need all of them */ 
1396                         cloneouts(v
->nfa
, pa
->to
, s
, rp
, PLAIN
); 
1397                         newarc(v
->nfa
, '$', 1, s
, rp
); 
1398                         newarc(v
->nfa
, '$', 0, s
, rp
); 
1399                         colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
, s
, rp
); 
1402                 {                                               /* must be selective */ 
1403                         if (findarc(ba
->to
, '$', 1) == NULL
) 
1405                                 newarc(v
->nfa
, '$', 1, s
, rp
); 
1406                                 newarc(v
->nfa
, '$', 0, s
, rp
); 
1407                                 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
, 
1410                         for (pa 
= pa
->to
->outs
; pa 
!= NULL
; pa 
= pa
->outchain
) 
1411                                 if (findarc(ba
->to
, PLAIN
, pa
->co
) == NULL
) 
1412                                         newarc(v
->nfa
, PLAIN
, pa
->co
, s
, rp
); 
1413                         if (s
->nouts 
== 0)      /* limit of selectivity: none */ 
1414                                 dropstate(v
->nfa
, s
);   /* frees arc too */ 
1419         delsub(v
->nfa
, left
, right
); 
1420         assert(left
->nouts 
== 0); 
1421         freestate(v
->nfa
, left
); 
1422         assert(right
->nins 
== 0); 
1423         freestate(v
->nfa
, right
); 
1427  * brackpart - handle one item (or range) within a bracket expression 
1430 brackpart(struct vars 
* v
, 
1441         /* parse something, get rid of special cases, take shortcuts */ 
1442         switch (v
->nexttype
) 
1444                 case RANGE
:                             /* a-b-c or other botch */ 
1449                         c
[0] = v
->nextvalue
; 
1451                         /* shortcut for ordinary chr (not range, not MCCE leader) */ 
1452                         if (!SEE(RANGE
) && !ISCELEADER(v
, c
[0])) 
1454                                 onechr(v
, c
[0], lp
, rp
); 
1457                         startc 
= element(v
, c
, c 
+ 1); 
1462                         endp 
= scanplain(v
); 
1463                         INSIST(startp 
< endp
, REG_ECOLLATE
); 
1465                         startc 
= element(v
, startp
, endp
); 
1470                         endp 
= scanplain(v
); 
1471                         INSIST(startp 
< endp
, REG_ECOLLATE
); 
1473                         startc 
= element(v
, startp
, endp
); 
1475                         cv 
= eclass(v
, startc
, (v
->cflags 
& REG_ICASE
)); 
1477                         dovec(v
, cv
, lp
, rp
); 
1482                         endp 
= scanplain(v
); 
1483                         INSIST(startp 
< endp
, REG_ECTYPE
); 
1485                         cv 
= cclass(v
, startp
, endp
, (v
->cflags 
& REG_ICASE
)); 
1487                         dovec(v
, cv
, lp
, rp
); 
1499                 switch (v
->nexttype
) 
1503                                 c
[0] = v
->nextvalue
; 
1505                                 endc 
= element(v
, c
, c 
+ 1); 
1510                                 endp 
= scanplain(v
); 
1511                                 INSIST(startp 
< endp
, REG_ECOLLATE
); 
1513                                 endc 
= element(v
, startp
, endp
); 
1526          * Ranges are unportable.  Actually, standard C does guarantee that 
1527          * digits are contiguous, but making that an exception is just too 
1532         cv 
= range(v
, startc
, endc
, (v
->cflags 
& REG_ICASE
)); 
1534         dovec(v
, cv
, lp
, rp
); 
1538  * scanplain - scan PLAIN contents of [. etc. 
1540  * Certain bits of trickery in lex.c know that this code does not try 
1541  * to look past the final bracket of the [. etc. 
1543 static chr 
*                                    /* just after end of sequence */ 
1544 scanplain(struct vars 
* v
) 
1548         assert(SEE(COLLEL
) || SEE(ECLASS
) || SEE(CCLASS
)); 
1558         assert(SEE(END
) || ISERR()); 
1565  * leaders - process a cvec of collating elements to also include leaders 
1566  * Also gives all characters involved their own colors, which is almost 
1567  * certainly necessary, and sets up little disconnected subNFA. 
1570 leaders(struct vars 
* v
, 
1579         v
->mccepbegin 
= newstate(v
->nfa
); 
1580         v
->mccepend 
= newstate(v
->nfa
); 
1583         for (mcce 
= 0; mcce 
< cv
->nmcces
; mcce
++) 
1585                 p 
= cv
->mcces
[mcce
]; 
1587                 if (!haschr(cv
, leader
)) 
1590                         s 
= newstate(v
->nfa
); 
1591                         newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, leader
), 
1593                         okcolors(v
->nfa
, v
->cm
); 
1597                         a 
= findarc(v
->mccepbegin
, PLAIN
, 
1598                                                 GETCOLOR(v
->cm
, leader
)); 
1601                         assert(s 
!= v
->mccepend
); 
1604                 assert(*p 
!= 0 && *(p 
+ 1) == 0);               /* only 2-char MCCEs for 
1606                 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, *p
), s
, v
->mccepend
); 
1607                 okcolors(v
->nfa
, v
->cm
); 
1612  * onechr - fill in arcs for a plain character, and possible case complements 
1613  * This is mostly a shortcut for efficient handling of the common case. 
1616 onechr(struct vars 
* v
, 
1621         if (!(v
->cflags 
& REG_ICASE
)) 
1623                 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, c
), lp
, rp
); 
1627         /* rats, need general case anyway... */ 
1628         dovec(v
, allcases(v
, c
), lp
, rp
); 
1632  * dovec - fill in arcs for each element of a cvec 
1633  * This one has to handle the messy cases, like MCCEs and MCCE leaders. 
1636 dovec(struct vars 
* v
, 
1650         struct arc 
*pa
;                         /* arc in prototype */ 
1652         struct state 
*ps
;                       /* state in prototype */ 
1654         /* need a place to store leaders, if any */ 
1657                 assert(v
->mcces 
!= NULL
); 
1658                 if (v
->cv2 
== NULL 
|| v
->cv2
->nchrs 
< v
->mcces
->nchrs
) 
1662                         v
->cv2 
= newcvec(v
->mcces
->nchrs
, 0, v
->mcces
->nmcces
); 
1667                         leads 
= clearcvec(v
->cv2
); 
1672         /* first, get the ordinary characters out of the way */ 
1673         for (p 
= cv
->chrs
, i 
= cv
->nchrs
; i 
> 0; p
++, i
--) 
1676                 if (!ISCELEADER(v
, ch
)) 
1677                         newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, ch
), lp
, rp
); 
1680                         assert(singleton(v
->cm
, ch
)); 
1681                         assert(leads 
!= NULL
); 
1682                         if (!haschr(leads
, ch
)) 
1687         /* and the ranges */ 
1688         for (p 
= cv
->ranges
, i 
= cv
->nranges
; i 
> 0; p 
+= 2, i
--) 
1692                 while (from 
<= to 
&& (ce 
= nextleader(v
, from
, to
)) != NOCELT
) 
1695                                 subrange(v
, from
, ce 
- 1, lp
, rp
); 
1696                         assert(singleton(v
->cm
, ce
)); 
1697                         assert(leads 
!= NULL
); 
1698                         if (!haschr(leads
, ce
)) 
1703                         subrange(v
, from
, to
, lp
, rp
); 
1706         if ((leads 
== NULL 
|| leads
->nchrs 
== 0) && cv
->nmcces 
== 0) 
1709         /* deal with the MCCE leaders */ 
1711         for (p 
= leads
->chrs
, i 
= leads
->nchrs
; i 
> 0; p
++, i
--) 
1713                 co 
= GETCOLOR(v
->cm
, *p
); 
1714                 a 
= findarc(lp
, PLAIN
, co
); 
1719                         s 
= newstate(v
->nfa
); 
1721                         newarc(v
->nfa
, PLAIN
, co
, lp
, s
); 
1724                 pa 
= findarc(v
->mccepbegin
, PLAIN
, co
); 
1727                 newarc(v
->nfa
, '$', 1, s
, rp
); 
1728                 newarc(v
->nfa
, '$', 0, s
, rp
); 
1729                 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, ps
, s
, rp
); 
1734         for (i 
= 0; i 
< cv
->nmcces
; i
++) 
1737                 assert(singleton(v
->cm
, *p
)); 
1738                 if (!singleton(v
->cm
, *p
)) 
1744                 co 
= GETCOLOR(v
->cm
, ch
); 
1745                 a 
= findarc(lp
, PLAIN
, co
); 
1750                         s 
= newstate(v
->nfa
); 
1752                         newarc(v
->nfa
, PLAIN
, co
, lp
, s
); 
1755                 assert(*p 
!= 0);                /* at least two chars */ 
1756                 assert(singleton(v
->cm
, *p
)); 
1758                 co 
= GETCOLOR(v
->cm
, ch
); 
1759                 assert(*p 
== 0);                /* and only two, for now */ 
1760                 newarc(v
->nfa
, PLAIN
, co
, s
, rp
); 
1766  * nextleader - find next MCCE leader within range 
1768 static celt                                             
/* NOCELT means none */ 
1769 nextleader(struct vars 
* v
, 
1778         if (v
->mcces 
== NULL
) 
1781         for (i 
= v
->mcces
->nchrs
, p 
= v
->mcces
->chrs
; i 
> 0; i
--, p
++) 
1784                 if (from 
<= ch 
&& ch 
<= to
) 
1785                         if (it 
== NOCELT 
|| ch 
< it
) 
1792  * wordchrs - set up word-chr list for word-boundary stuff, if needed 
1794  * The list is kept as a bunch of arcs between two dummy states; it's 
1795  * disposed of by the unreachable-states sweep in NFA optimization. 
1796  * Does NEXT().  Must not be called from any unusual lexical context. 
1797  * This should be reconciled with the \w etc. handling in lex.c, and 
1798  * should be cleaned up to reduce dependencies on input scanning. 
1801 wordchrs(struct vars 
* v
) 
1804         struct state 
*right
; 
1806         if (v
->wordchrs 
!= NULL
) 
1808                 NEXT();                                 /* for consistency */ 
1812         left 
= newstate(v
->nfa
); 
1813         right 
= newstate(v
->nfa
); 
1815         /* fine point:  implemented with [::], and lexer will set REG_ULOCALE */ 
1818         assert(v
->savenow 
!= NULL 
&& SEE('[')); 
1819         bracket(v
, left
, right
); 
1820         assert((v
->savenow 
!= NULL 
&& SEE(']')) || ISERR()); 
1827  * subre - allocate a subre 
1829 static struct subre 
* 
1830 subre(struct vars 
* v
, 
1833           struct state 
* begin
, 
1840                 v
->treefree 
= ret
->left
; 
1843                 ret 
= (struct subre 
*) MALLOC(sizeof(struct subre
)); 
1849                 ret
->chain 
= v
->treechain
; 
1853         assert(strchr("|.b(=", op
) != NULL
); 
1859         ret
->min 
= ret
->max 
= 1; 
1870  * freesubre - free a subRE subtree 
1873 freesubre(struct vars 
* v
,              /* might be NULL */ 
1879         if (sr
->left 
!= NULL
) 
1880                 freesubre(v
, sr
->left
); 
1881         if (sr
->right 
!= NULL
) 
1882                 freesubre(v
, sr
->right
); 
1888  * freesrnode - free one node in a subRE subtree 
1891 freesrnode(struct vars 
* v
,             /* might be NULL */ 
1897         if (!NULLCNFA(sr
->cnfa
)) 
1898                 freecnfa(&sr
->cnfa
); 
1903                 sr
->left 
= v
->treefree
; 
1911  * optst - optimize a subRE subtree 
1914 optst(struct vars 
* v
, 
1920         /* recurse through children */ 
1921         if (t
->left 
!= NULL
) 
1923         if (t
->right 
!= NULL
) 
1928  * numst - number tree nodes (assigning retry indexes) 
1930 static int                                              /* next number */ 
1931 numst(struct subre 
* t
, 
1932           int start
)                            /* starting point for subtree numbers */ 
1939         t
->retry 
= (short) i
++; 
1940         if (t
->left 
!= NULL
) 
1941                 i 
= numst(t
->left
, i
); 
1942         if (t
->right 
!= NULL
) 
1943                 i 
= numst(t
->right
, i
); 
1948  * markst - mark tree nodes as INUSE 
1951 markst(struct subre 
* t
) 
1956         if (t
->left 
!= NULL
) 
1958         if (t
->right 
!= NULL
) 
1963  * cleanst - free any tree nodes not marked INUSE 
1966 cleanst(struct vars 
* v
) 
1971         for (t 
= v
->treechain
; t 
!= NULL
; t 
= next
) 
1974                 if (!(t
->flags 
& INUSE
)) 
1977         v
->treechain 
= NULL
; 
1978         v
->treefree 
= NULL
;                     /* just on general principles */ 
1982  * nfatree - turn a subRE subtree into a tree of compacted NFAs 
1984 static long                                             /* optimize results from top node */ 
1985 nfatree(struct vars 
* v
, 
1987                 FILE *f
)                                /* for debug output */ 
1989         assert(t 
!= NULL 
&& t
->begin 
!= NULL
); 
1991         if (t
->left 
!= NULL
) 
1992                 (DISCARD
) nfatree(v
, t
->left
, f
); 
1993         if (t
->right 
!= NULL
) 
1994                 (DISCARD
) nfatree(v
, t
->right
, f
); 
1996         return nfanode(v
, t
, f
); 
2000  * nfanode - do one NFA for nfatree 
2002 static long                                             /* optimize results */ 
2003 nfanode(struct vars 
* v
, 
2005                 FILE *f
)                                /* for debug output */ 
2010         assert(t
->begin 
!= NULL
); 
2017                 fprintf(f
, "\n\n\n========= TREE NODE %s ==========\n", 
2018                                 stid(t
, idbuf
, sizeof(idbuf
))); 
2021         nfa 
= newnfa(v
, v
->cm
, v
->nfa
); 
2023         dupnfa(nfa
, t
->begin
, t
->end
, nfa
->init
, nfa
->final
); 
2027                 ret 
= optimize(nfa
, f
); 
2030                 compact(nfa
, &t
->cnfa
); 
2037  * newlacon - allocate a lookahead-constraint subRE 
2039 static int                                              /* lacon number */ 
2040 newlacon(struct vars 
* v
, 
2041                  struct state 
* begin
, 
2048         if (v
->nlacons 
== 0) 
2050                 v
->lacons 
= (struct subre 
*) MALLOC(2 * sizeof(struct subre
)); 
2051                 n 
= 1;                                  /* skip 0th */ 
2056                 v
->lacons 
= (struct subre 
*) REALLOC(v
->lacons
, 
2057                                                                 (v
->nlacons 
+ 1) * sizeof(struct subre
)); 
2060         if (v
->lacons 
== NULL
) 
2065         sub 
= &v
->lacons
[n
]; 
2074  * freelacons - free lookahead-constraint subRE vector 
2077 freelacons(struct subre 
* subs
, 
2084         for (sub 
= subs 
+ 1, i 
= n 
- 1; i 
> 0; sub
++, i
--)      /* no 0th */ 
2085                 if (!NULLCNFA(sub
->cnfa
)) 
2086                         freecnfa(&sub
->cnfa
); 
2091  * rfree - free a whole RE (insides of regfree) 
2098         if (re 
== NULL 
|| re
->re_magic 
!= REMAGIC
) 
2101         re
->re_magic 
= 0;                       /* invalidate RE */ 
2102         g 
= (struct guts 
*) re
->re_guts
; 
2107         if (g
->tree 
!= NULL
) 
2108                 freesubre((struct vars 
*) NULL
, g
->tree
); 
2109         if (g
->lacons 
!= NULL
) 
2110                 freelacons(g
->lacons
, g
->nlacons
); 
2111         if (!NULLCNFA(g
->search
)) 
2112                 freecnfa(&g
->search
); 
2119  * dump - dump an RE in human-readable form 
2128         if (re
->re_magic 
!= REMAGIC
) 
2129                 fprintf(f
, "bad magic number (0x%x not 0x%x)\n", re
->re_magic
, 
2131         if (re
->re_guts 
== NULL
) 
2133                 fprintf(f
, "NULL guts!!!\n"); 
2136         g 
= (struct guts 
*) re
->re_guts
; 
2137         if (g
->magic 
!= GUTSMAGIC
) 
2138                 fprintf(f
, "bad guts magic number (0x%x not 0x%x)\n", g
->magic
, 
2141         fprintf(f
, "\n\n\n========= DUMP ==========\n"); 
2142         fprintf(f
, "nsub %d, info 0%lo, csize %d, ntree %d\n", 
2143                         re
->re_nsub
, re
->re_info
, re
->re_csize
, g
->ntree
); 
2145         dumpcolors(&g
->cmap
, f
); 
2146         if (!NULLCNFA(g
->search
)) 
2148                 printf("\nsearch:\n"); 
2149                 dumpcnfa(&g
->search
, f
); 
2151         for (i 
= 1; i 
< g
->nlacons
; i
++) 
2153                 fprintf(f
, "\nla%d (%s):\n", i
, 
2154                                 (g
->lacons
[i
].subno
) ? "positive" : "negative"); 
2155                 dumpcnfa(&g
->lacons
[i
].cnfa
, f
); 
2158         dumpst(g
->tree
, f
, 0); 
2162  * dumpst - dump a subRE tree 
2165 dumpst(struct subre 
* t
, 
2167            int nfapresent
)                      /* is the original NFA still around? */ 
2170                 fprintf(f
, "null tree\n"); 
2172                 stdump(t
, f
, nfapresent
); 
2177  * stdump - recursive guts of dumpst 
2180 stdump(struct subre 
* t
, 
2182            int nfapresent
)                      /* is the original NFA still around? */ 
2186         fprintf(f
, "%s. `%c'", stid(t
, idbuf
, sizeof(idbuf
)), t
->op
); 
2187         if (t
->flags 
& LONGER
) 
2188                 fprintf(f
, " longest"); 
2189         if (t
->flags 
& SHORTER
) 
2190                 fprintf(f
, " shortest"); 
2191         if (t
->flags 
& MIXED
) 
2192                 fprintf(f
, " hasmixed"); 
2194                 fprintf(f
, " hascapture"); 
2195         if (t
->flags 
& BACKR
) 
2196                 fprintf(f
, " hasbackref"); 
2197         if (!(t
->flags 
& INUSE
)) 
2198                 fprintf(f
, " UNUSED"); 
2200                 fprintf(f
, " (#%d)", t
->subno
); 
2201         if (t
->min 
!= 1 || t
->max 
!= 1) 
2203                 fprintf(f
, " {%d,", t
->min
); 
2204                 if (t
->max 
!= INFINITY
) 
2205                         fprintf(f
, "%d", t
->max
); 
2209                 fprintf(f
, " %ld-%ld", (long) t
->begin
->no
, (long) t
->end
->no
); 
2210         if (t
->left 
!= NULL
) 
2211                 fprintf(f
, " L:%s", stid(t
->left
, idbuf
, sizeof(idbuf
))); 
2212         if (t
->right 
!= NULL
) 
2213                 fprintf(f
, " R:%s", stid(t
->right
, idbuf
, sizeof(idbuf
))); 
2214         if (!NULLCNFA(t
->cnfa
)) 
2217                 dumpcnfa(&t
->cnfa
, f
); 
2220         if (t
->left 
!= NULL
) 
2221                 stdump(t
->left
, f
, nfapresent
); 
2222         if (t
->right 
!= NULL
) 
2223                 stdump(t
->right
, f
, nfapresent
); 
2227  * stid - identify a subtree node for dumping 
2229 static char *                                   /* points to buf or constant string */ 
2230 stid(struct subre 
* t
, 
2234         /* big enough for hex int or decimal t->retry? */ 
2235         if (bufsize 
< sizeof(int) * 2 + 3 || bufsize 
< sizeof(t
->retry
) * 3 + 1) 
2238                 sprintf(buf
, "%d", t
->retry
); 
2240                 sprintf(buf
, "0x%x", (int) t
);  /* may lose bits, that's okay */ 
2243 #endif   /* REG_DEBUG */ 
2246 #include "regc_lex.c" 
2247 #include "regc_color.c" 
2248 #include "regc_nfa.c" 
2249 #include "regc_cvec.c" 
2250 #include "regc_locale.c"