2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 * forward declarations, up here so forward datatypes etc. are defined early
38 /* =====^!^===== begin forwards =====^!^===== */
39 /* automatically gathered by fwd; do not hand-edit */
40 /* === regcomp.c === */
41 int compile
_ANSI_ARGS_((regex_t
*, CONST chr
*, size_t, int));
42 static VOID moresubs
_ANSI_ARGS_((struct vars
*, int));
43 static int freev
_ANSI_ARGS_((struct vars
*, int));
44 static VOID makesearch
_ANSI_ARGS_((struct vars
*, struct nfa
*));
45 static struct subre
*parse
_ANSI_ARGS_((struct vars
*, int, int, struct state
*, struct state
*));
46 static struct subre
*parsebranch
_ANSI_ARGS_((struct vars
*, int, int, struct state
*, struct state
*, int));
47 static VOID parseqatom
_ANSI_ARGS_((struct vars
*, int, int, struct state
*, struct state
*, struct subre
*));
48 static VOID nonword
_ANSI_ARGS_((struct vars
*, int, struct state
*, struct state
*));
49 static VOID word
_ANSI_ARGS_((struct vars
*, int, struct state
*, struct state
*));
50 static int scannum
_ANSI_ARGS_((struct vars
*));
51 static VOID repeat
_ANSI_ARGS_((struct vars
*, struct state
*, struct state
*, int, int));
52 static VOID bracket
_ANSI_ARGS_((struct vars
*, struct state
*, struct state
*));
53 static VOID cbracket
_ANSI_ARGS_((struct vars
*, struct state
*, struct state
*));
54 static VOID brackpart
_ANSI_ARGS_((struct vars
*, struct state
*, struct state
*));
55 static chr
*scanplain
_ANSI_ARGS_((struct vars
*));
56 static VOID leaders
_ANSI_ARGS_((struct vars
*, struct cvec
*));
57 static VOID onechr
_ANSI_ARGS_((struct vars
*, pchr
, struct state
*, struct state
*));
58 static VOID dovec
_ANSI_ARGS_((struct vars
*, struct cvec
*, struct state
*, struct state
*));
59 static celt nextleader
_ANSI_ARGS_((struct vars
*, pchr
, pchr
));
60 static VOID wordchrs
_ANSI_ARGS_((struct vars
*));
61 static struct subre
*subre
_ANSI_ARGS_((struct vars
*, int, int, struct state
*, struct state
*));
62 static VOID freesubre
_ANSI_ARGS_((struct vars
*, struct subre
*));
63 static VOID freesrnode
_ANSI_ARGS_((struct vars
*, struct subre
*));
64 static VOID optst
_ANSI_ARGS_((struct vars
*, struct subre
*));
65 static int numst
_ANSI_ARGS_((struct subre
*, int));
66 static VOID markst
_ANSI_ARGS_((struct subre
*));
67 static VOID cleanst
_ANSI_ARGS_((struct vars
*));
68 static long nfatree
_ANSI_ARGS_((struct vars
*, struct subre
*, FILE *));
69 static long nfanode
_ANSI_ARGS_((struct vars
*, struct subre
*, FILE *));
70 static int newlacon
_ANSI_ARGS_((struct vars
*, struct state
*, struct state
*, int));
71 static VOID freelacons
_ANSI_ARGS_((struct subre
*, int));
72 static VOID rfree
_ANSI_ARGS_((regex_t
*));
73 static VOID dump
_ANSI_ARGS_((regex_t
*, FILE *));
74 static VOID dumpst
_ANSI_ARGS_((struct subre
*, FILE *, int));
75 static VOID stdump
_ANSI_ARGS_((struct subre
*, FILE *, int));
76 static char *stid
_ANSI_ARGS_((struct subre
*, char *, size_t));
77 /* === regc_lex.c === */
78 static VOID lexstart
_ANSI_ARGS_((struct vars
*));
79 static VOID prefixes
_ANSI_ARGS_((struct vars
*));
80 static VOID lexnest
_ANSI_ARGS_((struct vars
*, chr
*, chr
*));
81 static VOID lexword
_ANSI_ARGS_((struct vars
*));
82 static int next
_ANSI_ARGS_((struct vars
*));
83 static int lexescape
_ANSI_ARGS_((struct vars
*));
84 static chr lexdigits
_ANSI_ARGS_((struct vars
*, int, int, int));
85 static int brenext
_ANSI_ARGS_((struct vars
*, pchr
));
86 static VOID skip
_ANSI_ARGS_((struct vars
*));
87 static chr newline
_ANSI_ARGS_((NOPARMS
));
89 static chr
*ch
_ANSI_ARGS_((NOPARMS
));
91 static chr chrnamed
_ANSI_ARGS_((struct vars
*, chr
*, chr
*, pchr
));
92 /* === regc_color.c === */
93 static VOID initcm
_ANSI_ARGS_((struct vars
*, struct colormap
*));
94 static VOID freecm
_ANSI_ARGS_((struct colormap
*));
95 static VOID cmtreefree
_ANSI_ARGS_((struct colormap
*, union tree
*, int));
96 static color setcolor
_ANSI_ARGS_((struct colormap
*, pchr
, pcolor
));
97 static color maxcolor
_ANSI_ARGS_((struct colormap
*));
98 static color newcolor
_ANSI_ARGS_((struct colormap
*));
99 static VOID freecolor
_ANSI_ARGS_((struct colormap
*, pcolor
));
100 static color pseudocolor
_ANSI_ARGS_((struct colormap
*));
101 static color subcolor
_ANSI_ARGS_((struct colormap
*, pchr c
));
102 static color newsub
_ANSI_ARGS_((struct colormap
*, pcolor
));
103 static VOID subrange
_ANSI_ARGS_((struct vars
*, pchr
, pchr
, struct state
*, struct state
*));
104 static VOID subblock
_ANSI_ARGS_((struct vars
*, pchr
, struct state
*, struct state
*));
105 static VOID okcolors
_ANSI_ARGS_((struct nfa
*, struct colormap
*));
106 static VOID colorchain
_ANSI_ARGS_((struct colormap
*, struct arc
*));
107 static VOID uncolorchain
_ANSI_ARGS_((struct colormap
*, struct arc
*));
108 static int singleton
_ANSI_ARGS_((struct colormap
*, pchr c
));
109 static VOID rainbow
_ANSI_ARGS_((struct nfa
*, struct colormap
*, int, pcolor
, struct state
*, struct state
*));
110 static VOID colorcomplement
_ANSI_ARGS_((struct nfa
*, struct colormap
*, int, struct state
*, struct state
*, struct state
*));
112 static VOID dumpcolors
_ANSI_ARGS_((struct colormap
*, FILE *));
113 static VOID fillcheck
_ANSI_ARGS_((struct colormap
*, union tree
*, int, FILE *));
114 static VOID dumpchr
_ANSI_ARGS_((pchr
, FILE *));
116 /* === regc_nfa.c === */
117 static struct nfa
*newnfa
_ANSI_ARGS_((struct vars
*, struct colormap
*, struct nfa
*));
118 static VOID freenfa
_ANSI_ARGS_((struct nfa
*));
119 static struct state
*newstate
_ANSI_ARGS_((struct nfa
*));
120 static struct state
*newfstate
_ANSI_ARGS_((struct nfa
*, int flag
));
121 static VOID dropstate
_ANSI_ARGS_((struct nfa
*, struct state
*));
122 static VOID freestate
_ANSI_ARGS_((struct nfa
*, struct state
*));
123 static VOID destroystate
_ANSI_ARGS_((struct nfa
*, struct state
*));
124 static VOID newarc
_ANSI_ARGS_((struct nfa
*, int, pcolor
, struct state
*, struct state
*));
125 static struct arc
*allocarc
_ANSI_ARGS_((struct nfa
*, struct state
*));
126 static VOID freearc
_ANSI_ARGS_((struct nfa
*, struct arc
*));
127 static struct arc
*findarc
_ANSI_ARGS_((struct state
*, int, pcolor
));
128 static VOID cparc
_ANSI_ARGS_((struct nfa
*, struct arc
*, struct state
*, struct state
*));
129 static VOID moveins
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*));
130 static VOID copyins
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*));
131 static VOID moveouts
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*));
132 static VOID copyouts
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*));
133 static VOID cloneouts
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*, struct state
*, int));
134 static VOID delsub
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*));
135 static VOID deltraverse
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*));
136 static VOID dupnfa
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*, struct state
*, struct state
*));
137 static VOID duptraverse
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*));
138 static VOID cleartraverse
_ANSI_ARGS_((struct nfa
*, struct state
*));
139 static VOID specialcolors
_ANSI_ARGS_((struct nfa
*));
140 static long optimize
_ANSI_ARGS_((struct nfa
*, FILE *));
141 static VOID pullback
_ANSI_ARGS_((struct nfa
*, FILE *));
142 static int pull
_ANSI_ARGS_((struct nfa
*, struct arc
*));
143 static VOID pushfwd
_ANSI_ARGS_((struct nfa
*, FILE *));
144 static int push
_ANSI_ARGS_((struct nfa
*, struct arc
*));
145 #define INCOMPATIBLE 1 /* destroys arc */
146 #define SATISFIED 2 /* constraint satisfied */
147 #define COMPATIBLE 3 /* compatible but not satisfied yet */
148 static int combine
_ANSI_ARGS_((struct arc
*, struct arc
*));
149 static VOID fixempties
_ANSI_ARGS_((struct nfa
*, FILE *));
150 static int unempty
_ANSI_ARGS_((struct nfa
*, struct arc
*));
151 static VOID cleanup
_ANSI_ARGS_((struct nfa
*));
152 static VOID markreachable
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*, struct state
*));
153 static VOID markcanreach
_ANSI_ARGS_((struct nfa
*, struct state
*, struct state
*, struct state
*));
154 static long analyze
_ANSI_ARGS_((struct nfa
*));
155 static VOID compact
_ANSI_ARGS_((struct nfa
*, struct cnfa
*));
156 static VOID carcsort
_ANSI_ARGS_((struct carc
*, struct carc
*));
157 static VOID freecnfa
_ANSI_ARGS_((struct cnfa
*));
158 static VOID dumpnfa
_ANSI_ARGS_((struct nfa
*, FILE *));
160 static VOID dumpstate
_ANSI_ARGS_((struct state
*, FILE *));
161 static VOID dumparcs
_ANSI_ARGS_((struct state
*, FILE *));
162 static int dumprarcs
_ANSI_ARGS_((struct arc
*, struct state
*, FILE *, int));
163 static VOID dumparc
_ANSI_ARGS_((struct arc
*, struct state
*, FILE *));
165 static VOID dumpcnfa
_ANSI_ARGS_((struct cnfa
*, FILE *));
167 static VOID dumpcstate
_ANSI_ARGS_((int, struct carc
*, struct cnfa
*, FILE *));
169 /* === regc_cvec.c === */
170 static struct cvec
*newcvec
_ANSI_ARGS_((int, int, int));
171 static struct cvec
*clearcvec
_ANSI_ARGS_((struct cvec
*));
172 static VOID addchr
_ANSI_ARGS_((struct cvec
*, pchr
));
173 static VOID addrange
_ANSI_ARGS_((struct cvec
*, pchr
, pchr
));
174 static VOID addmcce
_ANSI_ARGS_((struct cvec
*, chr
*, chr
*));
175 static int haschr
_ANSI_ARGS_((struct cvec
*, pchr
));
176 static struct cvec
*getcvec
_ANSI_ARGS_((struct vars
*, int, int, int));
177 static VOID freecvec
_ANSI_ARGS_((struct cvec
*));
178 /* === regc_locale.c === */
179 static int nmcces
_ANSI_ARGS_((struct vars
*));
180 static int nleaders
_ANSI_ARGS_((struct vars
*));
181 static struct cvec
*allmcces
_ANSI_ARGS_((struct vars
*, struct cvec
*));
182 static celt element
_ANSI_ARGS_((struct vars
*, chr
*, chr
*));
183 static struct cvec
*range
_ANSI_ARGS_((struct vars
*, celt
, celt
, int));
184 static int before
_ANSI_ARGS_((celt
, celt
));
185 static struct cvec
*eclass
_ANSI_ARGS_((struct vars
*, celt
, int));
186 static struct cvec
*cclass
_ANSI_ARGS_((struct vars
*, chr
*, chr
*, int));
187 static struct cvec
*allcases
_ANSI_ARGS_((struct vars
*, pchr
));
188 static int cmp
_ANSI_ARGS_((CONST chr
*, CONST chr
*, size_t));
189 static int casecmp
_ANSI_ARGS_((CONST chr
*, CONST chr
*, size_t));
190 /* automatically gathered by fwd; do not hand-edit */
191 /* =====^!^===== end forwards =====^!^===== */
195 /* internal variables, bundled for easy passing around */
198 chr
*now
; /* scan pointer into string */
199 chr
*stop
; /* end of string */
200 chr
*savenow
; /* saved now and stop for "subroutine call" */
202 int err
; /* error code (0 if none) */
203 int cflags
; /* copy of compile flags */
204 int lasttype
; /* type of previous token */
205 int nexttype
; /* type of next token */
206 chr nextvalue
; /* value (if any) of next token */
207 int lexcon
; /* lexical context type (see lex.c) */
208 int nsubexp
; /* subexpression count */
209 struct subre
**subs
; /* subRE pointer vector */
210 size_t nsubs
; /* length of vector */
211 struct subre
*sub10
[10]; /* initial vector, enough for most */
212 struct nfa
*nfa
; /* the NFA */
213 struct colormap
*cm
; /* character color map */
214 color nlcolor
; /* color of newline */
215 struct state
*wordchrs
; /* state in nfa holding word-char outarcs */
216 struct subre
*tree
; /* subexpression tree */
217 struct subre
*treechain
; /* all tree nodes allocated */
218 struct subre
*treefree
; /* any free tree nodes */
219 int ntree
; /* number of tree nodes */
220 struct cvec
*cv
; /* interface cvec */
221 struct cvec
*cv2
; /* utility cvec */
222 struct cvec
*mcces
; /* collating-element information */
223 # define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
224 struct state
*mccepbegin
; /* in nfa, start of MCCE prototypes */
225 struct state
*mccepend
; /* in nfa, end of MCCE prototypes */
226 struct subre
*lacons
; /* lookahead-constraint vector */
227 int nlacons
; /* size of lacons */
230 /* parsing macros; most know that `v' is the struct vars pointer */
231 #define NEXT() (next(v)) /* advance by one token */
232 #define SEE(t) (v->nexttype == (t)) /* is next token this? */
233 #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
234 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
235 #define ISERR() VISERR(v)
236 #define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
238 #define ERR(e) VERR(v, e) /* record an error */
239 #define NOERR() {if (ISERR()) return;} /* if error seen, return */
240 #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
241 #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
242 #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
243 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
244 #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
246 /* token type codes, some also used as NFA arc types */
247 #define EMPTY 'n' /* no token present */
248 #define EOS 'e' /* end of string */
249 #define PLAIN 'p' /* ordinary character */
250 #define DIGIT 'd' /* digit (in bound) */
251 #define BACKREF 'b' /* back reference */
252 #define COLLEL 'I' /* start of [. */
253 #define ECLASS 'E' /* start of [= */
254 #define CCLASS 'C' /* start of [: */
255 #define END 'X' /* end of [. [= [: */
256 #define RANGE 'R' /* - within [] which might be range delim. */
257 #define LACON 'L' /* lookahead constraint subRE */
258 #define AHEAD 'a' /* color-lookahead arc */
259 #define BEHIND 'r' /* color-lookbehind arc */
260 #define WBDRY 'w' /* word boundary constraint */
261 #define NWBDRY 'W' /* non-word-boundary constraint */
262 #define SBEGIN 'A' /* beginning of string (even if not BOL) */
263 #define SEND 'Z' /* end of string (even if not EOL) */
264 #define PREFER 'P' /* length preference */
266 /* is an arc colored, and hence on a color chain? */
267 #define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
272 /* static function list */
273 static struct fns functions
= {
274 rfree
, /* regfree insides */
280 - compile - compile regular expression
281 ^ int compile(regex_t *, CONST chr *, size_t, int);
284 compile(re
, string
, len
, flags
)
291 struct vars
*v
= &var
;
295 FILE *debug
= (flags
®_PROGRESS
) ? stdout
: (FILE *)NULL
;
296 # define CNOERR() { if (ISERR()) return freev(v, v->err); }
300 if (re
== NULL
|| string
== NULL
)
302 if ((flags
®_QUOTE
) &&
303 (flags
&(REG_ADVANCED
|REG_EXPANDED
|REG_NEWLINE
)))
305 if (!(flags
®_EXTENDED
) && (flags
®_ADVF
))
308 /* initial setup (after which freev() is callable) */
310 v
->now
= (chr
*)string
;
311 v
->stop
= v
->now
+ len
;
312 v
->savenow
= v
->savestop
= NULL
;
318 for (j
= 0; j
< v
->nsubs
; j
++)
322 v
->nlcolor
= COLORLESS
;
332 re
->re_magic
= REMAGIC
;
333 re
->re_info
= 0; /* bits get set during parse */
334 re
->re_csize
= sizeof(chr
);
336 re
->re_fns
= VS(&functions
);
338 /* more complex setup, malloced things */
339 re
->re_guts
= VS(MALLOC(sizeof(struct guts
)));
340 if (re
->re_guts
== NULL
)
341 return freev(v
, REG_ESPACE
);
342 g
= (struct guts
*)re
->re_guts
;
349 v
->nfa
= newnfa(v
, v
->cm
, (struct nfa
*)NULL
);
351 v
->cv
= newcvec(100, 20, 10);
353 return freev(v
, REG_ESPACE
);
356 v
->mcces
= newcvec(nleaders(v
), 0, i
);
358 v
->mcces
= allmcces(v
, v
->mcces
);
359 leaders(v
, v
->mcces
);
360 addmcce(v
->mcces
, (chr
*)NULL
, (chr
*)NULL
); /* dummy */
365 lexstart(v
); /* also handles prefixes */
366 if ((v
->cflags
®_NLSTOP
) || (v
->cflags
®_NLANCH
)) {
367 /* assign newline a unique color */
368 v
->nlcolor
= subcolor(v
->cm
, newline());
369 okcolors(v
->nfa
, v
->cm
);
372 v
->tree
= parse(v
, EOS
, PLAIN
, v
->nfa
->init
, v
->nfa
->final
);
373 assert(SEE(EOS
)); /* even if error; ISERR() => SEE(EOS) */
375 assert(v
->tree
!= NULL
);
377 /* finish setup of nfa and its subre tree */
378 specialcolors(v
->nfa
);
381 fprintf(debug
, "\n\n\n========= RAW ==========\n");
382 dumpnfa(v
->nfa
, debug
);
383 dumpst(v
->tree
, debug
, 1);
386 v
->ntree
= numst(v
->tree
, 1);
390 fprintf(debug
, "\n\n\n========= TREE FIXED ==========\n");
391 dumpst(v
->tree
, debug
, 1);
394 /* build compacted NFAs for tree and lacons */
395 re
->re_info
|= nfatree(v
, v
->tree
, debug
);
397 assert(v
->nlacons
== 0 || v
->lacons
!= NULL
);
398 for (i
= 1; i
< v
->nlacons
; i
++) {
400 fprintf(debug
, "\n\n\n========= LA%d ==========\n", i
);
401 nfanode(v
, &v
->lacons
[i
], debug
);
404 if (v
->tree
->flags
&SHORTER
)
407 /* build compacted NFAs for tree, lacons, fast search */
409 fprintf(debug
, "\n\n\n========= SEARCH ==========\n");
410 /* can sacrifice main NFA now, so use it as work area */
411 (DISCARD
)optimize(v
->nfa
, debug
);
413 makesearch(v
, v
->nfa
);
415 compact(v
->nfa
, &g
->search
);
418 /* looks okay, package it up */
419 re
->re_nsub
= v
->nsubexp
;
420 v
->re
= NULL
; /* freev no longer frees re */
421 g
->magic
= GUTSMAGIC
;
422 g
->cflags
= v
->cflags
;
423 g
->info
= re
->re_info
;
424 g
->nsub
= re
->re_nsub
;
428 g
->compare
= (v
->cflags
®_ICASE
) ? casecmp
: cmp
;
429 g
->lacons
= v
->lacons
;
431 g
->nlacons
= v
->nlacons
;
441 - moresubs - enlarge subRE vector
442 ^ static VOID moresubs(struct vars *, int);
447 int wanted
; /* want enough room for this one */
452 assert(wanted
> 0 && (size_t)wanted
>= v
->nsubs
);
453 n
= (size_t)wanted
* 3 / 2 + 1;
454 if (v
->subs
== v
->sub10
) {
455 p
= (struct subre
**)MALLOC(n
* sizeof(struct subre
*));
457 memcpy(VS(p
), VS(v
->subs
),
458 v
->nsubs
* sizeof(struct subre
*));
460 p
= (struct subre
**)REALLOC(v
->subs
, n
*sizeof(struct subre
*));
466 for (p
= &v
->subs
[v
->nsubs
]; v
->nsubs
< n
; p
++, v
->nsubs
++)
468 assert(v
->nsubs
== n
);
469 assert((size_t)wanted
< v
->nsubs
);
473 - freev - free vars struct's substructures where necessary
474 * Optionally does error-number setting, and always returns error code
475 * (if any), to make error-handling code terser.
476 ^ static int freev(struct vars *, int);
485 if (v
->subs
!= v
->sub10
)
490 freesubre(v
, v
->tree
);
491 if (v
->treechain
!= NULL
)
497 if (v
->mcces
!= NULL
)
499 if (v
->lacons
!= NULL
)
500 freelacons(v
->lacons
, v
->nlacons
);
501 ERR(err
); /* nop if err==0 */
507 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
508 * NFA must have been optimize()d already.
509 ^ static VOID makesearch(struct vars *, struct nfa *);
518 struct state
*pre
= nfa
->pre
;
523 /* no loops are needed if it's anchored */
524 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
) {
525 assert(a
->type
== PLAIN
);
526 if (a
->co
!= nfa
->bos
[0] && a
->co
!= nfa
->bos
[1])
530 /* add implicit .* in front */
531 rainbow(nfa
, v
->cm
, PLAIN
, COLORLESS
, pre
, pre
);
533 /* and ^* and \A* too -- not always necessary, but harmless */
534 newarc(nfa
, PLAIN
, nfa
->bos
[0], pre
, pre
);
535 newarc(nfa
, PLAIN
, nfa
->bos
[1], pre
, pre
);
539 * Now here's the subtle part. Because many REs have no lookback
540 * constraints, often knowing when you were in the pre state tells
541 * you little; it's the next state(s) that are informative. But
542 * some of them may have other inarcs, i.e. it may be possible to
543 * make actual progress and then return to one of them. We must
544 * de-optimize such cases, splitting each such state into progress
545 * and no-progress states.
548 /* first, make a list of the states */
550 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
) {
552 for (b
= s
->ins
; b
!= NULL
; b
= b
->inchain
)
555 if (b
!= NULL
) { /* must be split */
556 if (s
->tmp
== NULL
) { /* if not already in the list */
557 /* (fixes bugs 505048, 230589, */
558 /* 840258, 504785) */
566 for (s
= slist
; s
!= NULL
; s
= s2
) {
568 copyouts(nfa
, s
, s2
);
569 for (a
= s
->ins
; a
!= NULL
; a
= b
) {
571 if (a
->from
!= pre
) {
572 cparc(nfa
, a
, a
->from
, s2
);
577 s
->tmp
= NULL
; /* clean up while we're at it */
582 - parse - parse an RE
583 * This is actually just the top level, which parses a bunch of branches
584 * tied together with '|'. They appear in the tree as the left children
585 * of a chain of '|' subres.
586 ^ static struct subre *parse(struct vars *, int, int, struct state *,
589 static struct subre
*
590 parse(v
, stopper
, type
, init
, final
)
592 int stopper
; /* EOS or ')' */
593 int type
; /* LACON (lookahead subRE) or PLAIN */
594 struct state
*init
; /* initial state */
595 struct state
*final
; /* final state */
597 struct state
*left
; /* scaffolding for branch */
599 struct subre
*branches
; /* top level */
600 struct subre
*branch
; /* current branch */
601 struct subre
*t
; /* temporary */
602 int firstbranch
; /* is this the first branch? */
604 assert(stopper
== ')' || stopper
== EOS
);
606 branches
= subre(v
, '|', LONGER
, init
, final
);
612 /* need a place to hang it */
613 branch
->right
= subre(v
, '|', LONGER
, init
, final
);
615 branch
= branch
->right
;
618 left
= newstate(v
->nfa
);
619 right
= newstate(v
->nfa
);
621 EMPTYARC(init
, left
);
622 EMPTYARC(right
, final
);
624 branch
->left
= parsebranch(v
, stopper
, type
, left
, right
, 0);
626 branch
->flags
|= UP(branch
->flags
| branch
->left
->flags
);
627 if ((branch
->flags
&~ branches
->flags
) != 0) /* new flags */
628 for (t
= branches
; t
!= branch
; t
= t
->right
)
629 t
->flags
|= branch
->flags
;
631 assert(SEE(stopper
) || SEE(EOS
));
634 assert(stopper
== ')' && SEE(EOS
));
638 /* optimize out simple cases */
639 if (branch
== branches
) { /* only one branch */
640 assert(branch
->right
== NULL
);
643 freesubre(v
, branches
);
645 } else if (!MESSY(branches
->flags
)) { /* no interesting innards */
646 freesubre(v
, branches
->left
);
647 branches
->left
= NULL
;
648 freesubre(v
, branches
->right
);
649 branches
->right
= NULL
;
657 - parsebranch - parse one branch of an RE
658 * This mostly manages concatenation, working closely with parseqatom().
659 * Concatenated things are bundled up as much as possible, with separate
660 * ',' nodes introduced only when necessary due to substructure.
661 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
662 ^ struct state *, int);
664 static struct subre
*
665 parsebranch(v
, stopper
, type
, left
, right
, partial
)
667 int stopper
; /* EOS or ')' */
668 int type
; /* LACON (lookahead subRE) or PLAIN */
669 struct state
*left
; /* leftmost state */
670 struct state
*right
; /* rightmost state */
671 int partial
; /* is this only part of a branch? */
673 struct state
*lp
; /* left end of current construct */
674 int seencontent
; /* is there anything in this branch yet? */
679 t
= subre(v
, '=', 0, left
, right
); /* op '=' is tentative */
681 while (!SEE('|') && !SEE(stopper
) && !SEE(EOS
)) {
682 if (seencontent
) { /* implicit concat operator */
683 lp
= newstate(v
->nfa
);
685 moveins(v
->nfa
, right
, lp
);
689 /* NB, recursion in parseqatom() may swallow rest of branch */
690 parseqatom(v
, stopper
, type
, lp
, right
, t
);
693 if (!seencontent
) { /* empty branch */
697 EMPTYARC(left
, right
);
704 - parseqatom - parse one quantified atom or constraint of an RE
705 * The bookkeeping near the end cooperates very closely with parsebranch();
706 * in particular, it contains a recursion that can involve parsing the rest
707 * of the branch, making this function's name somewhat inaccurate.
708 ^ static VOID parseqatom(struct vars *, int, int, struct state *,
709 ^ struct state *, struct subre *);
712 parseqatom(v
, stopper
, type
, lp
, rp
, top
)
714 int stopper
; /* EOS or ')' */
715 int type
; /* LACON (lookahead subRE) or PLAIN */
716 struct state
*lp
; /* left state to hang it on */
717 struct state
*rp
; /* right state to hang it on */
718 struct subre
*top
; /* subtree top */
720 struct state
*s
; /* temporaries for new states */
722 # define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
724 struct subre
*atom
; /* atom's subtree */
726 int cap
; /* capturing parens? */
727 int pos
; /* positive lookahead? */
728 int subno
; /* capturing-parens or backref number */
730 int qprefer
; /* quantifier short/long preference */
732 struct subre
**atomp
; /* where the pointer to atom is */
734 /* initial bookkeeping */
736 assert(lp
->nouts
== 0); /* must string new code */
737 assert(rp
->nins
== 0); /* between lp and rp */
738 subno
= 0; /* just to shut lint up */
740 /* an atom or constraint... */
741 atomtype
= v
->nexttype
;
743 /* first, constraints, which end by returning */
746 if (v
->cflags
®_NLANCH
)
747 ARCV(BEHIND
, v
->nlcolor
);
753 if (v
->cflags
®_NLANCH
)
754 ARCV(AHEAD
, v
->nlcolor
);
759 ARCV('^', 1); /* BOL */
760 ARCV('^', 0); /* or BOS */
765 ARCV('$', 1); /* EOL */
766 ARCV('$', 0); /* or EOS */
771 wordchrs(v
); /* does NEXT() */
772 s
= newstate(v
->nfa
);
774 nonword(v
, BEHIND
, lp
, s
);
775 word(v
, AHEAD
, s
, rp
);
779 wordchrs(v
); /* does NEXT() */
780 s
= newstate(v
->nfa
);
782 word(v
, BEHIND
, lp
, s
);
783 nonword(v
, AHEAD
, s
, rp
);
787 wordchrs(v
); /* does NEXT() */
788 s
= newstate(v
->nfa
);
790 nonword(v
, BEHIND
, lp
, s
);
791 word(v
, AHEAD
, s
, rp
);
792 s
= newstate(v
->nfa
);
794 word(v
, BEHIND
, lp
, s
);
795 nonword(v
, AHEAD
, s
, rp
);
799 wordchrs(v
); /* does NEXT() */
800 s
= newstate(v
->nfa
);
802 word(v
, BEHIND
, lp
, s
);
803 word(v
, AHEAD
, s
, rp
);
804 s
= newstate(v
->nfa
);
806 nonword(v
, BEHIND
, lp
, s
);
807 nonword(v
, AHEAD
, s
, rp
);
810 case LACON
: /* lookahead constraint */
813 s
= newstate(v
->nfa
);
814 s2
= newstate(v
->nfa
);
816 t
= parse(v
, ')', LACON
, s
, s2
);
817 freesubre(v
, t
); /* internal structure irrelevant */
818 assert(SEE(')') || ISERR());
820 n
= newlacon(v
, s
, s2
, pos
);
825 /* then errors, to get them out of the way */
837 /* then plain characters, and minor variants on that theme */
838 case ')': /* unbalanced paren */
839 if ((v
->cflags
®_ADVANCED
) != REG_EXTENDED
) {
843 /* legal in EREs due to specification botch */
845 /* fallthrough into case PLAIN */
847 onechr(v
, v
->nextvalue
, lp
, rp
);
848 okcolors(v
->nfa
, v
->cm
);
853 if (v
->nextvalue
== 1)
857 assert(SEE(']') || ISERR());
861 rainbow(v
->nfa
, v
->cm
, PLAIN
,
862 (v
->cflags
®_NLSTOP
) ? v
->nlcolor
: COLORLESS
,
866 /* and finally the ugly stuff */
867 case '(': /* value flags as capturing or non */
868 cap
= (type
== LACON
) ? 0 : v
->nextvalue
;
872 if ((size_t)subno
>= v
->nsubs
)
874 assert((size_t)subno
< v
->nsubs
);
876 atomtype
= PLAIN
; /* something that's not '(' */
878 /* need new endpoints because tree will contain pointers */
879 s
= newstate(v
->nfa
);
880 s2
= newstate(v
->nfa
);
885 atom
= parse(v
, ')', PLAIN
, s
, s2
);
886 assert(SEE(')') || ISERR());
890 v
->subs
[subno
] = atom
;
891 t
= subre(v
, '(', atom
->flags
|CAP
, lp
, rp
);
897 /* postpone everything else pending possible {0} */
899 case BACKREF
: /* the Feature From The Black Lagoon */
900 INSIST(type
!= LACON
, REG_ESUBREG
);
901 INSIST(v
->nextvalue
< v
->nsubs
, REG_ESUBREG
);
902 INSIST(v
->subs
[(int)v
->nextvalue
] != NULL
, REG_ESUBREG
);
904 assert(v
->nextvalue
> 0);
905 atom
= subre(v
, 'b', BACKR
, lp
, rp
);
906 subno
= v
->nextvalue
;
908 EMPTYARC(lp
, rp
); /* temporarily, so there's something */
913 /* ...and an atom may be followed by a quantifier */
914 switch (v
->nexttype
) {
918 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
924 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
930 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
945 /* {m,n} exercises preference, even if it's {m,m} */
946 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
949 /* {m} passes operand's preference through */
952 if (!SEE('}')) { /* catches errors too */
958 default: /* no quantifier */
964 /* annoying special case: {0} or {0,0} cancels everything */
965 if (m
== 0 && n
== 0) {
969 v
->subs
[subno
] = NULL
;
970 delsub(v
->nfa
, lp
, rp
);
975 /* if not a messy case, avoid hard part */
976 assert(!MESSY(top
->flags
));
977 f
= top
->flags
| qprefer
| ((atom
!= NULL
) ? atom
->flags
: 0);
978 if (atomtype
!= '(' && atomtype
!= BACKREF
&& !MESSY(UP(f
))) {
979 if (!(m
== 1 && n
== 1))
980 repeat(v
, lp
, rp
, m
, n
);
988 * hard part: something messy
989 * That is, capturing parens, back reference, short/long clash, or
990 * an atom with substructure containing one of those.
993 /* now we'll need a subre for the contents even if they're boring */
995 atom
= subre(v
, '=', 0, lp
, rp
);
1000 * prepare a general-purpose state skeleton
1002 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1004 * [lp] ----> [s2] ----bypass---------------------
1006 * where bypass is an empty, and prefix is some repetitions of atom
1008 s
= newstate(v
->nfa
); /* first, new endpoints for the atom */
1009 s2
= newstate(v
->nfa
);
1011 moveouts(v
->nfa
, lp
, s
);
1012 moveins(v
->nfa
, rp
, s2
);
1016 s
= newstate(v
->nfa
); /* and spots for prefix and bypass */
1017 s2
= newstate(v
->nfa
);
1023 /* break remaining subRE into x{...} and what follows */
1024 t
= subre(v
, '.', COMBINE(qprefer
, atom
->flags
), lp
, rp
);
1027 /* here we should recurse... but we must postpone that to the end */
1029 /* split top into prefix and remaining */
1030 assert(top
->op
== '=' && top
->left
== NULL
&& top
->right
== NULL
);
1031 top
->left
= subre(v
, '=', top
->flags
, top
->begin
, lp
);
1035 /* if it's a backref, now is the time to replicate the subNFA */
1036 if (atomtype
== BACKREF
) {
1037 assert(atom
->begin
->nouts
== 1); /* just the EMPTY */
1038 delsub(v
->nfa
, atom
->begin
, atom
->end
);
1039 assert(v
->subs
[subno
] != NULL
);
1040 /* and here's why the recursion got postponed: it must */
1041 /* wait until the skeleton is filled in, because it may */
1042 /* hit a backref that wants to copy the filled-in skeleton */
1043 dupnfa(v
->nfa
, v
->subs
[subno
]->begin
, v
->subs
[subno
]->end
,
1044 atom
->begin
, atom
->end
);
1048 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1050 EMPTYARC(s2
, atom
->end
); /* the bypass */
1051 assert(PREF(qprefer
) != 0);
1052 f
= COMBINE(qprefer
, atom
->flags
);
1053 t
= subre(v
, '|', f
, lp
, atom
->end
);
1056 t
->right
= subre(v
, '|', PREF(f
), s2
, atom
->end
);
1058 t
->right
->left
= subre(v
, '=', 0, s2
, atom
->end
);
1065 /* deal with the rest of the quantifier */
1066 if (atomtype
== BACKREF
) {
1067 /* special case: backrefs have internal quantifiers */
1068 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1069 /* just stuff everything into atom */
1070 repeat(v
, atom
->begin
, atom
->end
, m
, n
);
1071 atom
->min
= (short)m
;
1072 atom
->max
= (short)n
;
1073 atom
->flags
|= COMBINE(qprefer
, atom
->flags
);
1074 } else if (m
== 1 && n
== 1) {
1075 /* no/vacuous quantifier: done */
1076 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1078 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1079 /* parens in only second x */
1080 dupnfa(v
->nfa
, atom
->begin
, atom
->end
, s
, atom
->begin
);
1081 assert(m
>= 1 && m
!= INFINITY
&& n
>= 1);
1082 repeat(v
, s
, atom
->begin
, m
-1, (n
== INFINITY
) ? n
: n
-1);
1083 f
= COMBINE(qprefer
, atom
->flags
);
1084 t
= subre(v
, '.', f
, s
, atom
->end
); /* prefix and atom */
1086 t
->left
= subre(v
, '=', PREF(f
), s
, atom
->begin
);
1092 /* and finally, look after that postponed recursion */
1094 if (!(SEE('|') || SEE(stopper
) || SEE(EOS
)))
1095 t
->right
= parsebranch(v
, stopper
, type
, atom
->end
, rp
, 1);
1097 EMPTYARC(atom
->end
, rp
);
1098 t
->right
= subre(v
, '=', 0, atom
->end
, rp
);
1100 assert(SEE('|') || SEE(stopper
) || SEE(EOS
));
1101 t
->flags
|= COMBINE(t
->flags
, t
->right
->flags
);
1102 top
->flags
|= COMBINE(top
->flags
, t
->flags
);
1106 - nonword - generate arcs for non-word-character ahead or behind
1107 ^ static VOID nonword(struct vars *, int, struct state *, struct state *);
1110 nonword(v
, dir
, lp
, rp
)
1112 int dir
; /* AHEAD or BEHIND */
1116 int anchor
= (dir
== AHEAD
) ? '$' : '^';
1118 assert(dir
== AHEAD
|| dir
== BEHIND
);
1119 newarc(v
->nfa
, anchor
, 1, lp
, rp
);
1120 newarc(v
->nfa
, anchor
, 0, lp
, rp
);
1121 colorcomplement(v
->nfa
, v
->cm
, dir
, v
->wordchrs
, lp
, rp
);
1122 /* (no need for special attention to \n) */
1126 - word - generate arcs for word character ahead or behind
1127 ^ static VOID word(struct vars *, int, struct state *, struct state *);
1130 word(v
, dir
, lp
, rp
)
1132 int dir
; /* AHEAD or BEHIND */
1136 assert(dir
== AHEAD
|| dir
== BEHIND
);
1137 cloneouts(v
->nfa
, v
->wordchrs
, lp
, rp
, dir
);
1138 /* (no need for special attention to \n) */
1142 - scannum - scan a number
1143 ^ static int scannum(struct vars *);
1145 static int /* value, <= DUPMAX */
1151 while (SEE(DIGIT
) && n
< DUPMAX
) {
1152 n
= n
*10 + v
->nextvalue
;
1155 if (SEE(DIGIT
) || n
> DUPMAX
) {
1163 - repeat - replicate subNFA for quantifiers
1164 * The duplication sequences used here are chosen carefully so that any
1165 * pointers starting out pointing into the subexpression end up pointing into
1166 * the last occurrence. (Note that it may not be strung between the same
1167 * left and right end states, however!) This used to be important for the
1168 * subRE tree, although the important bits are now handled by the in-line
1169 * code in parse(), and when this is called, it doesn't matter any more.
1170 ^ static VOID repeat(struct vars *, struct state *, struct state *, int, int);
1173 repeat(v
, lp
, rp
, m
, n
)
1182 # define PAIR(x, y) ((x)*4 + (y))
1183 # define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1184 CONST
int rm
= REDUCE(m
);
1185 CONST
int rn
= REDUCE(n
);
1189 switch (PAIR(rm
, rn
)) {
1190 case PAIR(0, 0): /* empty string */
1191 delsub(v
->nfa
, lp
, rp
);
1194 case PAIR(0, 1): /* do as x| */
1197 case PAIR(0, SOME
): /* do as x{1,n}| */
1198 repeat(v
, lp
, rp
, 1, n
);
1202 case PAIR(0, INF
): /* loop x around */
1203 s
= newstate(v
->nfa
);
1205 moveouts(v
->nfa
, lp
, s
);
1206 moveins(v
->nfa
, rp
, s
);
1210 case PAIR(1, 1): /* no action required */
1212 case PAIR(1, SOME
): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1213 s
= newstate(v
->nfa
);
1215 moveouts(v
->nfa
, lp
, s
);
1216 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1218 repeat(v
, lp
, s
, 1, n
-1);
1222 case PAIR(1, INF
): /* add loopback arc */
1223 s
= newstate(v
->nfa
);
1224 s2
= newstate(v
->nfa
);
1226 moveouts(v
->nfa
, lp
, s
);
1227 moveins(v
->nfa
, rp
, s2
);
1232 case PAIR(SOME
, SOME
): /* do as x{m-1,n-1}x */
1233 s
= newstate(v
->nfa
);
1235 moveouts(v
->nfa
, lp
, s
);
1236 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1238 repeat(v
, lp
, s
, m
-1, n
-1);
1240 case PAIR(SOME
, INF
): /* do as x{m-1,}x */
1241 s
= newstate(v
->nfa
);
1243 moveouts(v
->nfa
, lp
, s
);
1244 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1246 repeat(v
, lp
, s
, m
-1, n
);
1255 - bracket - handle non-complemented bracket expression
1256 * Also called from cbracket for complemented bracket expressions.
1257 ^ static VOID bracket(struct vars *, struct state *, struct state *);
1267 while (!SEE(']') && !SEE(EOS
))
1268 brackpart(v
, lp
, rp
);
1269 assert(SEE(']') || ISERR());
1270 okcolors(v
->nfa
, v
->cm
);
1274 - cbracket - handle complemented bracket expression
1275 * We do it by calling bracket() with dummy endpoints, and then complementing
1276 * the result. The alternative would be to invoke rainbow(), and then delete
1277 * arcs as the b.e. is seen... but that gets messy.
1278 ^ static VOID cbracket(struct vars *, struct state *, struct state *);
1286 struct state
*left
= newstate(v
->nfa
);
1287 struct state
*right
= newstate(v
->nfa
);
1289 struct arc
*a
; /* arc from lp */
1290 struct arc
*ba
; /* arc from left, from bracket() */
1291 struct arc
*pa
; /* MCCE-prototype arc */
1297 bracket(v
, left
, right
);
1298 if (v
->cflags
®_NLSTOP
)
1299 newarc(v
->nfa
, PLAIN
, v
->nlcolor
, left
, right
);
1302 assert(lp
->nouts
== 0); /* all outarcs will be ours */
1304 /* easy part of complementing */
1305 colorcomplement(v
->nfa
, v
->cm
, PLAIN
, left
, lp
, rp
);
1307 if (v
->mcces
== NULL
) { /* no MCCEs -- we're done */
1308 dropstate(v
->nfa
, left
);
1309 assert(right
->nins
== 0);
1310 freestate(v
->nfa
, right
);
1314 /* but complementing gets messy in the presence of MCCEs... */
1316 for (p
= v
->mcces
->chrs
, i
= v
->mcces
->nchrs
; i
> 0; p
++, i
--) {
1317 co
= GETCOLOR(v
->cm
, *p
);
1318 a
= findarc(lp
, PLAIN
, co
);
1319 ba
= findarc(left
, PLAIN
, co
);
1326 s
= newstate(v
->nfa
);
1328 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1330 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1332 if (ba
== NULL
) { /* easy case, need all of them */
1333 cloneouts(v
->nfa
, pa
->to
, s
, rp
, PLAIN
);
1334 newarc(v
->nfa
, '$', 1, s
, rp
);
1335 newarc(v
->nfa
, '$', 0, s
, rp
);
1336 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
, s
, rp
);
1337 } else { /* must be selective */
1338 if (findarc(ba
->to
, '$', 1) == NULL
) {
1339 newarc(v
->nfa
, '$', 1, s
, rp
);
1340 newarc(v
->nfa
, '$', 0, s
, rp
);
1341 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
,
1344 for (pa
= pa
->to
->outs
; pa
!= NULL
; pa
= pa
->outchain
)
1345 if (findarc(ba
->to
, PLAIN
, pa
->co
) == NULL
)
1346 newarc(v
->nfa
, PLAIN
, pa
->co
, s
, rp
);
1347 if (s
->nouts
== 0) /* limit of selectivity: none */
1348 dropstate(v
->nfa
, s
); /* frees arc too */
1353 delsub(v
->nfa
, left
, right
);
1354 assert(left
->nouts
== 0);
1355 freestate(v
->nfa
, left
);
1356 assert(right
->nins
== 0);
1357 freestate(v
->nfa
, right
);
1361 - brackpart - handle one item (or range) within a bracket expression
1362 ^ static VOID brackpart(struct vars *, struct state *, struct state *);
1365 brackpart(v
, lp
, rp
)
1377 /* parse something, get rid of special cases, take shortcuts */
1378 switch (v
->nexttype
) {
1379 case RANGE
: /* a-b-c or other botch */
1384 c
[0] = v
->nextvalue
;
1386 /* shortcut for ordinary chr (not range, not MCCE leader) */
1387 if (!SEE(RANGE
) && !ISCELEADER(v
, c
[0])) {
1388 onechr(v
, c
[0], lp
, rp
);
1391 startc
= element(v
, c
, c
+1);
1396 endp
= scanplain(v
);
1397 INSIST(startp
< endp
, REG_ECOLLATE
);
1399 startc
= element(v
, startp
, endp
);
1404 endp
= scanplain(v
);
1405 INSIST(startp
< endp
, REG_ECOLLATE
);
1407 startc
= element(v
, startp
, endp
);
1409 cv
= eclass(v
, startc
, (v
->cflags
®_ICASE
));
1411 dovec(v
, cv
, lp
, rp
);
1416 endp
= scanplain(v
);
1417 INSIST(startp
< endp
, REG_ECTYPE
);
1419 cv
= cclass(v
, startp
, endp
, (v
->cflags
®_ICASE
));
1421 dovec(v
, cv
, lp
, rp
);
1432 switch (v
->nexttype
) {
1435 c
[0] = v
->nextvalue
;
1437 endc
= element(v
, c
, c
+1);
1442 endp
= scanplain(v
);
1443 INSIST(startp
< endp
, REG_ECOLLATE
);
1445 endc
= element(v
, startp
, endp
);
1457 * Ranges are unportable. Actually, standard C does
1458 * guarantee that digits are contiguous, but making
1459 * that an exception is just too complicated.
1463 cv
= range(v
, startc
, endc
, (v
->cflags
®_ICASE
));
1465 dovec(v
, cv
, lp
, rp
);
1469 - scanplain - scan PLAIN contents of [. etc.
1470 * Certain bits of trickery in lex.c know that this code does not try
1471 * to look past the final bracket of the [. etc.
1472 ^ static chr *scanplain(struct vars *);
1474 static chr
* /* just after end of sequence */
1480 assert(SEE(COLLEL
) || SEE(ECLASS
) || SEE(CCLASS
));
1484 while (SEE(PLAIN
)) {
1489 assert(SEE(END
) || ISERR());
1496 - leaders - process a cvec of collating elements to also include leaders
1497 * Also gives all characters involved their own colors, which is almost
1498 * certainly necessary, and sets up little disconnected subNFA.
1499 ^ static VOID leaders(struct vars *, struct cvec *);
1512 v
->mccepbegin
= newstate(v
->nfa
);
1513 v
->mccepend
= newstate(v
->nfa
);
1516 for (mcce
= 0; mcce
< cv
->nmcces
; mcce
++) {
1517 p
= cv
->mcces
[mcce
];
1519 if (!haschr(cv
, leader
)) {
1521 s
= newstate(v
->nfa
);
1522 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, leader
),
1524 okcolors(v
->nfa
, v
->cm
);
1526 a
= findarc(v
->mccepbegin
, PLAIN
,
1527 GETCOLOR(v
->cm
, leader
));
1530 assert(s
!= v
->mccepend
);
1533 assert(*p
!= 0 && *(p
+1) == 0); /* only 2-char MCCEs for now */
1534 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, *p
), s
, v
->mccepend
);
1535 okcolors(v
->nfa
, v
->cm
);
1540 - onechr - fill in arcs for a plain character, and possible case complements
1541 * This is mostly a shortcut for efficient handling of the common case.
1542 ^ static VOID onechr(struct vars *, pchr, struct state *, struct state *);
1545 onechr(v
, c
, lp
, rp
)
1551 if (!(v
->cflags
®_ICASE
)) {
1552 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, c
), lp
, rp
);
1556 /* rats, need general case anyway... */
1557 dovec(v
, allcases(v
, c
), lp
, rp
);
1561 - dovec - fill in arcs for each element of a cvec
1562 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1563 ^ static VOID dovec(struct vars *, struct cvec *, struct state *,
1567 dovec(v
, cv
, lp
, rp
)
1580 struct arc
*pa
; /* arc in prototype */
1582 struct state
*ps
; /* state in prototype */
1584 /* need a place to store leaders, if any */
1585 if (nmcces(v
) > 0) {
1586 assert(v
->mcces
!= NULL
);
1587 if (v
->cv2
== NULL
|| v
->cv2
->nchrs
< v
->mcces
->nchrs
) {
1590 v
->cv2
= newcvec(v
->mcces
->nchrs
, 0, v
->mcces
->nmcces
);
1594 leads
= clearcvec(v
->cv2
);
1598 /* first, get the ordinary characters out of the way */
1599 for (p
= cv
->chrs
, i
= cv
->nchrs
; i
> 0; p
++, i
--) {
1601 if (!ISCELEADER(v
, ch
))
1602 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, ch
), lp
, rp
);
1604 assert(singleton(v
->cm
, ch
));
1605 assert(leads
!= NULL
);
1606 if (!haschr(leads
, ch
))
1611 /* and the ranges */
1612 for (p
= cv
->ranges
, i
= cv
->nranges
; i
> 0; p
+= 2, i
--) {
1615 while (from
<= to
&& (ce
= nextleader(v
, from
, to
)) != NOCELT
) {
1617 subrange(v
, from
, ce
- 1, lp
, rp
);
1618 assert(singleton(v
->cm
, ce
));
1619 assert(leads
!= NULL
);
1620 if (!haschr(leads
, ce
))
1625 subrange(v
, from
, to
, lp
, rp
);
1628 if ((leads
== NULL
|| leads
->nchrs
== 0) && cv
->nmcces
== 0)
1631 /* deal with the MCCE leaders */
1633 for (p
= leads
->chrs
, i
= leads
->nchrs
; i
> 0; p
++, i
--) {
1634 co
= GETCOLOR(v
->cm
, *p
);
1635 a
= findarc(lp
, PLAIN
, co
);
1639 s
= newstate(v
->nfa
);
1641 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1644 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1647 newarc(v
->nfa
, '$', 1, s
, rp
);
1648 newarc(v
->nfa
, '$', 0, s
, rp
);
1649 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, ps
, s
, rp
);
1654 for (i
= 0; i
< cv
->nmcces
; i
++) {
1656 assert(singleton(v
->cm
, *p
));
1657 if (!singleton(v
->cm
, *p
)) {
1662 co
= GETCOLOR(v
->cm
, ch
);
1663 a
= findarc(lp
, PLAIN
, co
);
1667 s
= newstate(v
->nfa
);
1669 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1672 assert(*p
!= 0); /* at least two chars */
1673 assert(singleton(v
->cm
, *p
));
1675 co
= GETCOLOR(v
->cm
, ch
);
1676 assert(*p
== 0); /* and only two, for now */
1677 newarc(v
->nfa
, PLAIN
, co
, s
, rp
);
1683 - nextleader - find next MCCE leader within range
1684 ^ static celt nextleader(struct vars *, pchr, pchr);
1686 static celt
/* NOCELT means none */
1687 nextleader(v
, from
, to
)
1697 if (v
->mcces
== NULL
)
1700 for (i
= v
->mcces
->nchrs
, p
= v
->mcces
->chrs
; i
> 0; i
--, p
++) {
1702 if (from
<= ch
&& ch
<= to
)
1703 if (it
== NOCELT
|| ch
< it
)
1710 - wordchrs - set up word-chr list for word-boundary stuff, if needed
1711 * The list is kept as a bunch of arcs between two dummy states; it's
1712 * disposed of by the unreachable-states sweep in NFA optimization.
1713 * Does NEXT(). Must not be called from any unusual lexical context.
1714 * This should be reconciled with the \w etc. handling in lex.c, and
1715 * should be cleaned up to reduce dependencies on input scanning.
1716 ^ static VOID wordchrs(struct vars *);
1723 struct state
*right
;
1725 if (v
->wordchrs
!= NULL
) {
1726 NEXT(); /* for consistency */
1730 left
= newstate(v
->nfa
);
1731 right
= newstate(v
->nfa
);
1733 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1736 assert(v
->savenow
!= NULL
&& SEE('['));
1737 bracket(v
, left
, right
);
1738 assert((v
->savenow
!= NULL
&& SEE(']')) || ISERR());
1745 - subre - allocate a subre
1746 ^ static struct subre *subre(struct vars *, int, int, struct state *,
1749 static struct subre
*
1750 subre(v
, op
, flags
, begin
, end
)
1754 struct state
*begin
;
1761 v
->treefree
= ret
->left
;
1763 ret
= (struct subre
*)MALLOC(sizeof(struct subre
));
1768 ret
->chain
= v
->treechain
;
1772 assert(strchr("|.b(=", op
) != NULL
);
1778 ret
->min
= ret
->max
= 1;
1789 - freesubre - free a subRE subtree
1790 ^ static VOID freesubre(struct vars *, struct subre *);
1794 struct vars
*v
; /* might be NULL */
1800 if (sr
->left
!= NULL
)
1801 freesubre(v
, sr
->left
);
1802 if (sr
->right
!= NULL
)
1803 freesubre(v
, sr
->right
);
1809 - freesrnode - free one node in a subRE subtree
1810 ^ static VOID freesrnode(struct vars *, struct subre *);
1814 struct vars
*v
; /* might be NULL */
1820 if (!NULLCNFA(sr
->cnfa
))
1821 freecnfa(&sr
->cnfa
);
1825 sr
->left
= v
->treefree
;
1832 - optst - optimize a subRE subtree
1833 ^ static VOID optst(struct vars *, struct subre *);
1843 /* recurse through children */
1844 if (t
->left
!= NULL
)
1846 if (t
->right
!= NULL
)
1851 - numst - number tree nodes (assigning retry indexes)
1852 ^ static int numst(struct subre *, int);
1854 static int /* next number */
1857 int start
; /* starting point for subtree numbers */
1864 t
->retry
= (short)i
++;
1865 if (t
->left
!= NULL
)
1866 i
= numst(t
->left
, i
);
1867 if (t
->right
!= NULL
)
1868 i
= numst(t
->right
, i
);
1873 - markst - mark tree nodes as INUSE
1874 ^ static VOID markst(struct subre *);
1883 if (t
->left
!= NULL
)
1885 if (t
->right
!= NULL
)
1890 - cleanst - free any tree nodes not marked INUSE
1891 ^ static VOID cleanst(struct vars *);
1900 for (t
= v
->treechain
; t
!= NULL
; t
= next
) {
1902 if (!(t
->flags
&INUSE
))
1905 v
->treechain
= NULL
;
1906 v
->treefree
= NULL
; /* just on general principles */
1910 - nfatree - turn a subRE subtree into a tree of compacted NFAs
1911 ^ static long nfatree(struct vars *, struct subre *, FILE *);
1913 static long /* optimize results from top node */
1917 FILE *f
; /* for debug output */
1919 assert(t
!= NULL
&& t
->begin
!= NULL
);
1921 if (t
->left
!= NULL
)
1922 (DISCARD
)nfatree(v
, t
->left
, f
);
1923 if (t
->right
!= NULL
)
1924 (DISCARD
)nfatree(v
, t
->right
, f
);
1926 return nfanode(v
, t
, f
);
1930 - nfanode - do one NFA for nfatree
1931 ^ static long nfanode(struct vars *, struct subre *, FILE *);
1933 static long /* optimize results */
1937 FILE *f
; /* for debug output */
1943 assert(t
->begin
!= NULL
);
1946 fprintf(f
, "\n\n\n========= TREE NODE %s ==========\n",
1947 stid(t
, idbuf
, sizeof(idbuf
)));
1948 nfa
= newnfa(v
, v
->cm
, v
->nfa
);
1950 dupnfa(nfa
, t
->begin
, t
->end
, nfa
->init
, nfa
->final
);
1953 ret
= optimize(nfa
, f
);
1956 compact(nfa
, &t
->cnfa
);
1963 - newlacon - allocate a lookahead-constraint subRE
1964 ^ static int newlacon(struct vars *, struct state *, struct state *, int);
1966 static int /* lacon number */
1967 newlacon(v
, begin
, end
, pos
)
1969 struct state
*begin
;
1976 if (v
->nlacons
== 0) {
1977 v
->lacons
= (struct subre
*)MALLOC(2 * sizeof(struct subre
));
1978 n
= 1; /* skip 0th */
1981 v
->lacons
= (struct subre
*)REALLOC(v
->lacons
,
1982 (v
->nlacons
+1)*sizeof(struct subre
));
1985 if (v
->lacons
== NULL
) {
1989 sub
= &v
->lacons
[n
];
1998 - freelacons - free lookahead-constraint subRE vector
1999 ^ static VOID freelacons(struct subre *, int);
2010 for (sub
= subs
+ 1, i
= n
- 1; i
> 0; sub
++, i
--) /* no 0th */
2011 if (!NULLCNFA(sub
->cnfa
))
2012 freecnfa(&sub
->cnfa
);
2017 - rfree - free a whole RE (insides of regfree)
2018 ^ static VOID rfree(regex_t *);
2026 if (re
== NULL
|| re
->re_magic
!= REMAGIC
)
2029 re
->re_magic
= 0; /* invalidate RE */
2030 g
= (struct guts
*)re
->re_guts
;
2035 if (g
->tree
!= NULL
)
2036 freesubre((struct vars
*)NULL
, g
->tree
);
2037 if (g
->lacons
!= NULL
)
2038 freelacons(g
->lacons
, g
->nlacons
);
2039 if (!NULLCNFA(g
->search
))
2040 freecnfa(&g
->search
);
2045 - dump - dump an RE in human-readable form
2046 ^ static VOID dump(regex_t *, FILE *);
2057 if (re
->re_magic
!= REMAGIC
)
2058 fprintf(f
, "bad magic number (0x%x not 0x%x)\n", re
->re_magic
,
2060 if (re
->re_guts
== NULL
) {
2061 fprintf(f
, "NULL guts!!!\n");
2064 g
= (struct guts
*)re
->re_guts
;
2065 if (g
->magic
!= GUTSMAGIC
)
2066 fprintf(f
, "bad guts magic number (0x%x not 0x%x)\n", g
->magic
,
2069 fprintf(f
, "\n\n\n========= DUMP ==========\n");
2070 fprintf(f
, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2071 re
->re_nsub
, re
->re_info
, re
->re_csize
, g
->ntree
);
2073 dumpcolors(&g
->cmap
, f
);
2074 if (!NULLCNFA(g
->search
)) {
2075 printf("\nsearch:\n");
2076 dumpcnfa(&g
->search
, f
);
2078 for (i
= 1; i
< g
->nlacons
; i
++) {
2079 fprintf(f
, "\nla%d (%s):\n", i
,
2080 (g
->lacons
[i
].subno
) ? "positive" : "negative");
2081 dumpcnfa(&g
->lacons
[i
].cnfa
, f
);
2084 dumpst(g
->tree
, f
, 0);
2089 - dumpst - dump a subRE tree
2090 ^ static VOID dumpst(struct subre *, FILE *, int);
2093 dumpst(t
, f
, nfapresent
)
2096 int nfapresent
; /* is the original NFA still around? */
2099 fprintf(f
, "null tree\n");
2101 stdump(t
, f
, nfapresent
);
2106 - stdump - recursive guts of dumpst
2107 ^ static VOID stdump(struct subre *, FILE *, int);
2110 stdump(t
, f
, nfapresent
)
2113 int nfapresent
; /* is the original NFA still around? */
2117 fprintf(f
, "%s. `%c'", stid(t
, idbuf
, sizeof(idbuf
)), t
->op
);
2118 if (t
->flags
&LONGER
)
2119 fprintf(f
, " longest");
2120 if (t
->flags
&SHORTER
)
2121 fprintf(f
, " shortest");
2123 fprintf(f
, " hasmixed");
2125 fprintf(f
, " hascapture");
2127 fprintf(f
, " hasbackref");
2128 if (!(t
->flags
&INUSE
))
2129 fprintf(f
, " UNUSED");
2131 fprintf(f
, " (#%d)", t
->subno
);
2132 if (t
->min
!= 1 || t
->max
!= 1) {
2133 fprintf(f
, " {%d,", t
->min
);
2134 if (t
->max
!= INFINITY
)
2135 fprintf(f
, "%d", t
->max
);
2139 fprintf(f
, " %ld-%ld", (long)t
->begin
->no
, (long)t
->end
->no
);
2140 if (t
->left
!= NULL
)
2141 fprintf(f
, " L:%s", stid(t
->left
, idbuf
, sizeof(idbuf
)));
2142 if (t
->right
!= NULL
)
2143 fprintf(f
, " R:%s", stid(t
->right
, idbuf
, sizeof(idbuf
)));
2144 if (!NULLCNFA(t
->cnfa
)) {
2146 dumpcnfa(&t
->cnfa
, f
);
2149 if (t
->left
!= NULL
)
2150 stdump(t
->left
, f
, nfapresent
);
2151 if (t
->right
!= NULL
)
2152 stdump(t
->right
, f
, nfapresent
);
2156 - stid - identify a subtree node for dumping
2157 ^ static char *stid(struct subre *, char *, size_t);
2159 static char * /* points to buf or constant string */
2160 stid(t
, buf
, bufsize
)
2165 /* big enough for hex int or decimal t->retry? */
2166 if (bufsize
< sizeof(int)*2 + 3 || bufsize
< sizeof(t
->retry
)*3 + 1)
2169 sprintf(buf
, "%d", t
->retry
);
2171 sprintf(buf
, "0x%x", (int)(wxUIntPtr
)(t
)); /* may lose bits, that's okay */
2175 #include "regc_lex.c"
2176 #include "regc_color.c"
2177 #include "regc_nfa.c"
2178 #include "regc_cvec.c"
2179 #include "regc_locale.c"