]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regcomp.c
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $
38 * forward declarations, up here so forward datatypes etc. are defined early
40 /* === regcomp.c === */
41 static void moresubs(struct vars
*, int);
42 static int freev(struct vars
*, int);
43 static void makesearch(struct vars
*, struct nfa
*);
44 static struct subre
*parse(struct vars
*, int, int, struct state
*, struct state
*);
45 static struct subre
*parsebranch(struct vars
*, int, int, struct state
*, struct state
*, int);
46 static void parseqatom(struct vars
*, int, int, struct state
*, struct state
*, struct subre
*);
47 static void nonword(struct vars
*, int, struct state
*, struct state
*);
48 static void word(struct vars
*, int, struct state
*, struct state
*);
49 static int scannum(struct vars
*);
50 static void repeat(struct vars
*, struct state
*, struct state
*, int, int);
51 static void bracket(struct vars
*, struct state
*, struct state
*);
52 static void cbracket(struct vars
*, struct state
*, struct state
*);
53 static void brackpart(struct vars
*, struct state
*, struct state
*);
54 static chr
*scanplain(struct vars
*);
55 static void leaders(struct vars
*, struct cvec
*);
56 static void onechr(struct vars
*, chr
, struct state
*, struct state
*);
57 static void dovec(struct vars
*, struct cvec
*, struct state
*, struct state
*);
58 static celt
nextleader(struct vars
*, chr
, chr
);
59 static void wordchrs(struct vars
*);
60 static struct subre
*subre(struct vars
*, int, int, struct state
*, struct state
*);
61 static void freesubre(struct vars
*, struct subre
*);
62 static void freesrnode(struct vars
*, struct subre
*);
63 static void optst(struct vars
*, struct subre
*);
64 static int numst(struct subre
*, int);
65 static void markst(struct subre
*);
66 static void cleanst(struct vars
*);
67 static long nfatree(struct vars
*, struct subre
*, FILE *);
68 static long nfanode(struct vars
*, struct subre
*, FILE *);
69 static int newlacon(struct vars
*, struct state
*, struct state
*, int);
70 static void freelacons(struct subre
*, int);
71 static void rfree(regex_t
*);
74 static void dump(regex_t
*, FILE *);
75 static void dumpst(struct subre
*, FILE *, int);
76 static void stdump(struct subre
*, FILE *, int);
77 static char *stid(struct subre
*, char *, size_t);
79 /* === regc_lex.c === */
80 static void lexstart(struct vars
*);
81 static void prefixes(struct vars
*);
82 static void lexnest(struct vars
*, chr
*, chr
*);
83 static void lexword(struct vars
*);
84 static int next(struct vars
*);
85 static int lexescape(struct vars
*);
86 static chr
lexdigits(struct vars
*, int, int, int);
87 static int brenext(struct vars
*, chr
);
88 static void skip(struct vars
*);
89 static chr
newline(void);
90 static chr
chrnamed(struct vars
*, chr
*, chr
*, chr
);
92 /* === regc_color.c === */
93 static void initcm(struct vars
*, struct colormap
*);
94 static void freecm(struct colormap
*);
95 static void cmtreefree(struct colormap
*, union tree
*, int);
96 static color
setcolor(struct colormap
*, chr
, pcolor
);
97 static color
maxcolor(struct colormap
*);
98 static color
newcolor(struct colormap
*);
99 static void freecolor(struct colormap
*, pcolor
);
100 static color
pseudocolor(struct colormap
*);
101 static color
subcolor(struct colormap
*, chr c
);
102 static color
newsub(struct colormap
*, pcolor
);
103 static void subrange(struct vars
*, chr
, chr
, struct state
*, struct state
*);
104 static void subblock(struct vars
*, chr
, struct state
*, struct state
*);
105 static void okcolors(struct nfa
*, struct colormap
*);
106 static void colorchain(struct colormap
*, struct arc
*);
107 static void uncolorchain(struct colormap
*, struct arc
*);
108 static int singleton(struct colormap
*, chr c
);
109 static void rainbow(struct nfa
*, struct colormap
*, int, pcolor
, struct state
*, struct state
*);
110 static void colorcomplement(struct nfa
*, struct colormap
*, int, struct state
*, struct state
*, struct state
*);
113 static void dumpcolors(struct colormap
*, FILE *);
114 static void fillcheck(struct colormap
*, union tree
*, int, FILE *);
115 static void dumpchr(chr
, FILE *);
117 /* === regc_nfa.c === */
118 static struct nfa
*newnfa(struct vars
*, struct colormap
*, struct nfa
*);
119 static void freenfa(struct nfa
*);
120 static struct state
*newstate(struct nfa
*);
121 static struct state
*newfstate(struct nfa
*, int flag
);
122 static void dropstate(struct nfa
*, struct state
*);
123 static void freestate(struct nfa
*, struct state
*);
124 static void destroystate(struct nfa
*, struct state
*);
125 static void newarc(struct nfa
*, int, pcolor
, struct state
*, struct state
*);
126 static struct arc
*allocarc(struct nfa
*, struct state
*);
127 static void freearc(struct nfa
*, struct arc
*);
128 static struct arc
*findarc(struct state
*, int, pcolor
);
129 static void cparc(struct nfa
*, struct arc
*, struct state
*, struct state
*);
130 static void moveins(struct nfa
*, struct state
*, struct state
*);
131 static void copyins(struct nfa
*, struct state
*, struct state
*);
132 static void moveouts(struct nfa
*, struct state
*, struct state
*);
133 static void copyouts(struct nfa
*, struct state
*, struct state
*);
134 static void cloneouts(struct nfa
*, struct state
*, struct state
*, struct state
*, int);
135 static void delsub(struct nfa
*, struct state
*, struct state
*);
136 static void deltraverse(struct nfa
*, struct state
*, struct state
*);
137 static void dupnfa(struct nfa
*, struct state
*, struct state
*, struct state
*, struct state
*);
138 static void duptraverse(struct nfa
*, struct state
*, struct state
*);
139 static void cleartraverse(struct nfa
*, struct state
*);
140 static void specialcolors(struct nfa
*);
141 static long optimize(struct nfa
*, FILE *);
142 static void pullback(struct nfa
*, FILE *);
143 static int pull(struct nfa
*, struct arc
*);
144 static void pushfwd(struct nfa
*, FILE *);
145 static int push(struct nfa
*, struct arc
*);
147 #define INCOMPATIBLE 1 /* destroys arc */
148 #define SATISFIED 2 /* constraint satisfied */
149 #define COMPATIBLE 3 /* compatible but not satisfied yet */
150 static int combine(struct arc
*, struct arc
*);
151 static void fixempties(struct nfa
*, FILE *);
152 static int unempty(struct nfa
*, struct arc
*);
153 static void cleanup(struct nfa
*);
154 static void markreachable(struct nfa
*, struct state
*, struct state
*, struct state
*);
155 static void markcanreach(struct nfa
*, struct state
*, struct state
*, struct state
*);
156 static long analyze(struct nfa
*);
157 static void compact(struct nfa
*, struct cnfa
*);
158 static void carcsort(struct carc
*, struct carc
*);
159 static void freecnfa(struct cnfa
*);
160 static void dumpnfa(struct nfa
*, FILE *);
163 static void dumpstate(struct state
*, FILE *);
164 static void dumparcs(struct state
*, FILE *);
165 static int dumprarcs(struct arc
*, struct state
*, FILE *, int);
166 static void dumparc(struct arc
*, struct state
*, FILE *);
167 static void dumpcnfa(struct cnfa
*, FILE *);
168 static void dumpcstate(int, struct carc
*, struct cnfa
*, FILE *);
170 /* === regc_cvec.c === */
171 static struct cvec
*newcvec(int, int, int);
172 static struct cvec
*clearcvec(struct cvec
*);
173 static void addchr(struct cvec
*, chr
);
174 static void addrange(struct cvec
*, chr
, chr
);
175 static void addmcce(struct cvec
*, chr
*, chr
*);
176 static int haschr(struct cvec
*, chr
);
177 static struct cvec
*getcvec(struct vars
*, int, int, int);
178 static void freecvec(struct cvec
*);
180 /* === regc_locale.c === */
181 extern int wx_isdigit(wx_wchar c
);
182 extern int wx_isalpha(wx_wchar c
);
183 extern int wx_isalnum(wx_wchar c
);
184 extern int wx_isupper(wx_wchar c
);
185 extern int wx_islower(wx_wchar c
);
186 extern int wx_isgraph(wx_wchar c
);
187 extern int wx_ispunct(wx_wchar c
);
188 extern int wx_isspace(wx_wchar c
);
189 extern wx_wchar
wx_toupper(wx_wchar c
);
190 extern wx_wchar
wx_tolower(wx_wchar c
);
191 extern int wx_strlen(const wx_wchar
* szString
);
192 static int nmcces(struct vars
*);
193 static int nleaders(struct vars
*);
194 static struct cvec
*allmcces(struct vars
*, struct cvec
*);
195 static celt
element(struct vars
*, chr
*, chr
*);
196 static struct cvec
*range(struct vars
*, celt
, celt
, int);
197 static int before(celt
, celt
);
198 static struct cvec
*eclass(struct vars
*, celt
, int);
199 static struct cvec
*cclass(struct vars
*, chr
*, chr
*, int);
200 static struct cvec
*allcases(struct vars
*, chr
);
201 static int cmp(const chr
*, const chr
*, size_t);
202 static int casecmp(const chr
*, const chr
*, size_t);
205 /* internal variables, bundled for easy passing around */
209 chr
*now
; /* scan pointer into string */
210 chr
*stop
; /* end of string */
211 chr
*savenow
; /* saved now and stop for "subroutine
214 int err
; /* error code (0 if none) */
215 int cflags
; /* copy of compile flags */
216 int lasttype
; /* type of previous token */
217 int nexttype
; /* type of next token */
218 chr nextvalue
; /* value (if any) of next token */
219 int lexcon
; /* lexical context type (see lex.c) */
220 int nsubexp
; /* subexpression count */
221 struct subre
**subs
; /* subRE pointer vector */
222 size_t nsubs
; /* length of vector */
223 struct subre
*sub10
[10]; /* initial vector, enough for most */
224 struct nfa
*nfa
; /* the NFA */
225 struct colormap
*cm
; /* character color map */
226 color nlcolor
; /* color of newline */
227 struct state
*wordchrs
; /* state in nfa holding word-char outarcs */
228 struct subre
*tree
; /* subexpression tree */
229 struct subre
*treechain
; /* all tree nodes allocated */
230 struct subre
*treefree
; /* any free tree nodes */
231 int ntree
; /* number of tree nodes */
232 struct cvec
*cv
; /* interface cvec */
233 struct cvec
*cv2
; /* utility cvec */
234 struct cvec
*mcces
; /* collating-element information */
235 #define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
236 struct state
*mccepbegin
; /* in nfa, start of MCCE prototypes */
237 struct state
*mccepend
; /* in nfa, end of MCCE prototypes */
238 struct subre
*lacons
; /* lookahead-constraint vector */
239 int nlacons
; /* size of lacons */
242 /* parsing macros; most know that `v' is the struct vars pointer */
243 #define NEXT() (next(v)) /* advance by one token */
244 #define SEE(t) (v->nexttype == (t)) /* is next token this? */
245 #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
246 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
247 #define ISERR() VISERR(v)
248 #define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
250 #define ERR(e) VERR(v, e) /* record an error */
251 #define NOERR() {if (ISERR()) return;} /* if error seen, return */
252 #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
253 #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
254 #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false,
256 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
257 #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
259 /* token type codes, some also used as NFA arc types */
260 #define EMPTY 'n' /* no token present */
261 #define EOS 'e' /* end of string */
262 #define PLAIN 'p' /* ordinary character */
263 #define DIGIT 'd' /* digit (in bound) */
264 #define BACKREF 'b' /* back reference */
265 #define COLLEL 'I' /* start of [. */
266 #define ECLASS 'E' /* start of [= */
267 #define CCLASS 'C' /* start of [: */
268 #define END 'X' /* end of [. [= [: */
269 #define RANGE 'R' /* - within [] which might be range delim. */
270 #define LACON 'L' /* lookahead constraint subRE */
271 #define AHEAD 'a' /* color-lookahead arc */
272 #define BEHIND 'r' /* color-lookbehind arc */
273 #define WBDRY 'w' /* word boundary constraint */
274 #define NWBDRY 'W' /* non-word-boundary constraint */
275 #define SBEGIN 'A' /* beginning of string (even if not BOL) */
276 #define SEND 'Z' /* end of string (even if not EOL) */
277 #define PREFER 'P' /* length preference */
279 /* is an arc colored, and hence on a color chain? */
280 #define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
285 /* static function list */
286 static struct fns functions
= {
287 rfree
, /* regfree insides */
293 * regcomp - compile regular expression
300 return wx_regcomp(re
, string
, wx_strlen(string
), flags
);
303 wx_regcomp(regex_t
*re
,
309 struct vars
*v
= &var
;
315 FILE *debug
= (flags
& REG_PROGRESS
) ? stdout
: (FILE *) NULL
;
318 FILE *debug
= (FILE *) NULL
;
321 #define CNOERR() { if (ISERR()) return freev(v, v->err); }
325 if (re
== NULL
|| string
== NULL
)
327 if ((flags
& REG_QUOTE
) &&
328 (flags
& (REG_ADVANCED
| REG_EXPANDED
| REG_NEWLINE
)))
330 if (!(flags
& REG_EXTENDED
) && (flags
& REG_ADVF
))
333 /* initial setup (after which freev() is callable) */
335 v
->now
= (chr
*) string
;
336 v
->stop
= v
->now
+ len
;
337 v
->savenow
= v
->savestop
= NULL
;
343 for (j
= 0; j
< v
->nsubs
; j
++)
347 v
->nlcolor
= COLORLESS
;
357 re
->re_magic
= REMAGIC
;
358 re
->re_info
= 0; /* bits get set during parse */
359 re
->re_csize
= sizeof(chr
);
361 re
->re_fns
= VS(&functions
);
363 /* more complex setup, malloced things */
364 re
->re_guts
= VS(MALLOC(sizeof(struct guts
)));
365 if (re
->re_guts
== NULL
)
366 return freev(v
, REG_ESPACE
);
367 g
= (struct guts
*) re
->re_guts
;
374 v
->nfa
= newnfa(v
, v
->cm
, (struct nfa
*) NULL
);
376 v
->cv
= newcvec(100, 20, 10);
378 return freev(v
, REG_ESPACE
);
382 v
->mcces
= newcvec(nleaders(v
), 0, i
);
384 v
->mcces
= allmcces(v
, v
->mcces
);
385 leaders(v
, v
->mcces
);
386 addmcce(v
->mcces
, (chr
*) NULL
, (chr
*) NULL
); /* dummy */
391 lexstart(v
); /* also handles prefixes */
392 if ((v
->cflags
& REG_NLSTOP
) || (v
->cflags
& REG_NLANCH
))
394 /* assign newline a unique color */
395 v
->nlcolor
= subcolor(v
->cm
, newline());
396 okcolors(v
->nfa
, v
->cm
);
399 v
->tree
= parse(v
, EOS
, PLAIN
, v
->nfa
->init
, v
->nfa
->final
);
400 assert(SEE(EOS
)); /* even if error; ISERR() => SEE(EOS) */
402 assert(v
->tree
!= NULL
);
404 /* finish setup of nfa and its subre tree */
405 specialcolors(v
->nfa
);
410 fprintf(debug
, "\n\n\n========= RAW ==========\n");
411 dumpnfa(v
->nfa
, debug
);
412 dumpst(v
->tree
, debug
, 1);
416 v
->ntree
= numst(v
->tree
, 1);
422 fprintf(debug
, "\n\n\n========= TREE FIXED ==========\n");
423 dumpst(v
->tree
, debug
, 1);
427 /* build compacted NFAs for tree and lacons */
428 re
->re_info
|= nfatree(v
, v
->tree
, debug
);
430 assert(v
->nlacons
== 0 || v
->lacons
!= NULL
);
431 for (i
= 1; i
< v
->nlacons
; i
++)
435 fprintf(debug
, "\n\n\n========= LA%d ==========\n", i
);
437 nfanode(v
, &v
->lacons
[i
], debug
);
440 if (v
->tree
->flags
& SHORTER
)
443 /* build compacted NFAs for tree, lacons, fast search */
446 fprintf(debug
, "\n\n\n========= SEARCH ==========\n");
448 /* can sacrifice main NFA now, so use it as work area */
449 (DISCARD
) optimize(v
->nfa
, debug
);
451 makesearch(v
, v
->nfa
);
453 compact(v
->nfa
, &g
->search
);
456 /* looks okay, package it up */
457 re
->re_nsub
= v
->nsubexp
;
458 v
->re
= NULL
; /* freev no longer frees re */
459 g
->magic
= GUTSMAGIC
;
460 g
->cflags
= v
->cflags
;
461 g
->info
= re
->re_info
;
462 g
->nsub
= re
->re_nsub
;
466 g
->compare
= (v
->cflags
& REG_ICASE
) ? casecmp
: cmp
;
467 g
->lacons
= v
->lacons
;
469 g
->nlacons
= v
->nlacons
;
472 if (flags
& REG_DUMP
)
481 * moresubs - enlarge subRE vector
484 moresubs(struct vars
* v
,
485 int wanted
) /* want enough room for this one */
490 assert(wanted
> 0 && (size_t) wanted
>= v
->nsubs
);
491 n
= (size_t) wanted
*3 / 2 + 1;
493 if (v
->subs
== v
->sub10
)
495 p
= (struct subre
**) MALLOC(n
* sizeof(struct subre
*));
497 memcpy(VS(p
), VS(v
->subs
),
498 v
->nsubs
* sizeof(struct subre
*));
501 p
= (struct subre
**) REALLOC(v
->subs
, n
* sizeof(struct subre
*));
508 for (p
= &v
->subs
[v
->nsubs
]; v
->nsubs
< n
; p
++, v
->nsubs
++)
510 assert(v
->nsubs
== n
);
511 assert((size_t) wanted
< v
->nsubs
);
515 * freev - free vars struct's substructures where necessary
517 * Optionally does error-number setting, and always returns error code
518 * (if any), to make error-handling code terser.
521 freev(struct vars
* v
,
526 if (v
->subs
!= v
->sub10
)
531 freesubre(v
, v
->tree
);
532 if (v
->treechain
!= NULL
)
538 if (v
->mcces
!= NULL
)
540 if (v
->lacons
!= NULL
)
541 freelacons(v
->lacons
, v
->nlacons
);
542 ERR(err
); /* nop if err==0 */
548 * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
549 * NFA must have been optimize()d already.
552 makesearch(struct vars
* v
,
557 struct state
*pre
= nfa
->pre
;
562 /* no loops are needed if it's anchored */
563 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
)
565 assert(a
->type
== PLAIN
);
566 if (a
->co
!= nfa
->bos
[0] && a
->co
!= nfa
->bos
[1])
571 /* add implicit .* in front */
572 rainbow(nfa
, v
->cm
, PLAIN
, COLORLESS
, pre
, pre
);
574 /* and ^* and \A* too -- not always necessary, but harmless */
575 newarc(nfa
, PLAIN
, nfa
->bos
[0], pre
, pre
);
576 newarc(nfa
, PLAIN
, nfa
->bos
[1], pre
, pre
);
580 * Now here's the subtle part. Because many REs have no lookback
581 * constraints, often knowing when you were in the pre state tells you
582 * little; it's the next state(s) that are informative. But some of
583 * them may have other inarcs, i.e. it may be possible to make actual
584 * progress and then return to one of them. We must de-optimize such
585 * cases, splitting each such state into progress and no-progress
589 /* first, make a list of the states */
591 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
)
594 for (b
= s
->ins
; b
!= NULL
; b
= b
->inchain
)
598 { /* must be split */
605 for (s
= slist
; s
!= NULL
; s
= s2
)
608 copyouts(nfa
, s
, s2
);
609 for (a
= s
->ins
; a
!= NULL
; a
= b
)
614 cparc(nfa
, a
, a
->from
, s2
);
619 s
->tmp
= NULL
; /* clean up while we're at it */
624 * parse - parse an RE
626 * This is actually just the top level, which parses a bunch of branches
627 * tied together with '|'. They appear in the tree as the left children
628 * of a chain of '|' subres.
630 static struct subre
*
631 parse(struct vars
* v
,
632 int stopper
, /* EOS or ')' */
633 int type
, /* LACON (lookahead subRE) or PLAIN */
634 struct state
* init
, /* initial state */
635 struct state
* final
) /* final state */
637 struct state
*left
; /* scaffolding for branch */
639 struct subre
*branches
; /* top level */
640 struct subre
*branch
; /* current branch */
641 struct subre
*t
; /* temporary */
642 int firstbranch
; /* is this the first branch? */
644 assert(stopper
== ')' || stopper
== EOS
);
646 branches
= subre(v
, '|', LONGER
, init
, final
);
654 /* need a place to hang it */
655 branch
->right
= subre(v
, '|', LONGER
, init
, final
);
657 branch
= branch
->right
;
660 left
= newstate(v
->nfa
);
661 right
= newstate(v
->nfa
);
663 EMPTYARC(init
, left
);
664 EMPTYARC(right
, final
);
666 branch
->left
= parsebranch(v
, stopper
, type
, left
, right
, 0);
668 branch
->flags
|= UP(branch
->flags
| branch
->left
->flags
);
669 if ((branch
->flags
& ~branches
->flags
) != 0) /* new flags */
670 for (t
= branches
; t
!= branch
; t
= t
->right
)
671 t
->flags
|= branch
->flags
;
673 assert(SEE(stopper
) || SEE(EOS
));
677 assert(stopper
== ')' && SEE(EOS
));
681 /* optimize out simple cases */
682 if (branch
== branches
)
683 { /* only one branch */
684 assert(branch
->right
== NULL
);
687 freesubre(v
, branches
);
690 else if (!MESSY(branches
->flags
))
691 { /* no interesting innards */
692 freesubre(v
, branches
->left
);
693 branches
->left
= NULL
;
694 freesubre(v
, branches
->right
);
695 branches
->right
= NULL
;
703 * parsebranch - parse one branch of an RE
705 * This mostly manages concatenation, working closely with parseqatom().
706 * Concatenated things are bundled up as much as possible, with separate
707 * ',' nodes introduced only when necessary due to substructure.
709 static struct subre
*
710 parsebranch(struct vars
* v
,
711 int stopper
, /* EOS or ')' */
712 int type
, /* LACON (lookahead subRE) or PLAIN */
713 struct state
* left
, /* leftmost state */
714 struct state
* right
, /* rightmost state */
715 int partial
) /* is this only part of a branch? */
717 struct state
*lp
; /* left end of current construct */
718 int seencontent
; /* is there anything in this branch yet? */
723 t
= subre(v
, '=', 0, left
, right
); /* op '=' is tentative */
725 while (!SEE('|') && !SEE(stopper
) && !SEE(EOS
))
728 { /* implicit concat operator */
729 lp
= newstate(v
->nfa
);
731 moveins(v
->nfa
, right
, lp
);
735 /* NB, recursion in parseqatom() may swallow rest of branch */
736 parseqatom(v
, stopper
, type
, lp
, right
, t
);
744 EMPTYARC(left
, right
);
751 * parseqatom - parse one quantified atom or constraint of an RE
753 * The bookkeeping near the end cooperates very closely with parsebranch();
754 * in particular, it contains a recursion that can involve parsing the rest
755 * of the branch, making this function's name somewhat inaccurate.
758 parseqatom(struct vars
* v
,
759 int stopper
, /* EOS or ')' */
760 int type
, /* LACON (lookahead subRE) or PLAIN */
761 struct state
* lp
, /* left state to hang it on */
762 struct state
* rp
, /* right state to hang it on */
763 struct subre
* top
) /* subtree top */
765 struct state
*s
; /* temporaries for new states */
768 #define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
771 struct subre
*atom
; /* atom's subtree */
773 int cap
; /* capturing parens? */
774 int pos
; /* positive lookahead? */
775 int subno
; /* capturing-parens or backref number */
777 int qprefer
; /* quantifier short/long preference */
779 struct subre
**atomp
; /* where the pointer to atom is */
781 /* initial bookkeeping */
783 assert(lp
->nouts
== 0); /* must string new code */
784 assert(rp
->nins
== 0); /* between lp and rp */
785 subno
= 0; /* just to shut lint up */
787 /* an atom or constraint... */
788 atomtype
= v
->nexttype
;
791 /* first, constraints, which end by returning */
794 if (v
->cflags
& REG_NLANCH
)
795 ARCV(BEHIND
, v
->nlcolor
);
801 if (v
->cflags
& REG_NLANCH
)
802 ARCV(AHEAD
, v
->nlcolor
);
807 ARCV('^', 1); /* BOL */
808 ARCV('^', 0); /* or BOS */
813 ARCV('$', 1); /* EOL */
814 ARCV('$', 0); /* or EOS */
819 wordchrs(v
); /* does NEXT() */
820 s
= newstate(v
->nfa
);
822 nonword(v
, BEHIND
, lp
, s
);
823 word(v
, AHEAD
, s
, rp
);
827 wordchrs(v
); /* does NEXT() */
828 s
= newstate(v
->nfa
);
830 word(v
, BEHIND
, lp
, s
);
831 nonword(v
, AHEAD
, s
, rp
);
835 wordchrs(v
); /* does NEXT() */
836 s
= newstate(v
->nfa
);
838 nonword(v
, BEHIND
, lp
, s
);
839 word(v
, AHEAD
, s
, rp
);
840 s
= newstate(v
->nfa
);
842 word(v
, BEHIND
, lp
, s
);
843 nonword(v
, AHEAD
, s
, rp
);
847 wordchrs(v
); /* does NEXT() */
848 s
= newstate(v
->nfa
);
850 word(v
, BEHIND
, lp
, s
);
851 word(v
, AHEAD
, s
, rp
);
852 s
= newstate(v
->nfa
);
854 nonword(v
, BEHIND
, lp
, s
);
855 nonword(v
, AHEAD
, s
, rp
);
858 case LACON
: /* lookahead constraint */
861 s
= newstate(v
->nfa
);
862 s2
= newstate(v
->nfa
);
864 t
= parse(v
, ')', LACON
, s
, s2
);
865 freesubre(v
, t
); /* internal structure irrelevant */
866 assert(SEE(')') || ISERR());
868 n
= newlacon(v
, s
, s2
, pos
);
873 /* then errors, to get them out of the way */
885 /* then plain characters, and minor variants on that theme */
886 case ')': /* unbalanced paren */
887 if ((v
->cflags
& REG_ADVANCED
) != REG_EXTENDED
)
892 /* legal in EREs due to specification botch */
894 /* fallthrough into case PLAIN */
896 onechr(v
, v
->nextvalue
, lp
, rp
);
897 okcolors(v
->nfa
, v
->cm
);
902 if (v
->nextvalue
== 1)
906 assert(SEE(']') || ISERR());
910 rainbow(v
->nfa
, v
->cm
, PLAIN
,
911 (v
->cflags
& REG_NLSTOP
) ? v
->nlcolor
: COLORLESS
,
915 /* and finally the ugly stuff */
916 case '(': /* value flags as capturing or non */
917 cap
= (type
== LACON
) ? 0 : v
->nextvalue
;
922 if ((size_t) subno
>= v
->nsubs
)
924 assert((size_t) subno
< v
->nsubs
);
927 atomtype
= PLAIN
; /* something that's not '(' */
929 /* need new endpoints because tree will contain pointers */
930 s
= newstate(v
->nfa
);
931 s2
= newstate(v
->nfa
);
936 atom
= parse(v
, ')', PLAIN
, s
, s2
);
937 assert(SEE(')') || ISERR());
942 v
->subs
[subno
] = atom
;
943 t
= subre(v
, '(', atom
->flags
| CAP
, lp
, rp
);
949 /* postpone everything else pending possible {0} */
951 case BACKREF
: /* the Feature From The Black Lagoon */
952 INSIST(type
!= LACON
, REG_ESUBREG
);
953 INSIST(v
->nextvalue
< v
->nsubs
, REG_ESUBREG
);
954 INSIST(v
->subs
[v
->nextvalue
] != NULL
, REG_ESUBREG
);
956 assert(v
->nextvalue
> 0);
957 atom
= subre(v
, 'b', BACKR
, lp
, rp
);
958 subno
= v
->nextvalue
;
960 EMPTYARC(lp
, rp
); /* temporarily, so there's something */
965 /* ...and an atom may be followed by a quantifier */
971 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
977 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
983 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
1000 /* {m,n} exercises preference, even if it's {m,m} */
1001 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
1006 /* {m} passes operand's preference through */
1010 { /* catches errors too */
1016 default: /* no quantifier */
1022 /* annoying special case: {0} or {0,0} cancels everything */
1023 if (m
== 0 && n
== 0)
1027 if (atomtype
== '(')
1028 v
->subs
[subno
] = NULL
;
1029 delsub(v
->nfa
, lp
, rp
);
1034 /* if not a messy case, avoid hard part */
1035 assert(!MESSY(top
->flags
));
1036 f
= top
->flags
| qprefer
| ((atom
!= NULL
) ? atom
->flags
: 0);
1037 if (atomtype
!= '(' && atomtype
!= BACKREF
&& !MESSY(UP(f
)))
1039 if (!(m
== 1 && n
== 1))
1040 repeat(v
, lp
, rp
, m
, n
);
1048 * hard part: something messy That is, capturing parens, back
1049 * reference, short/long clash, or an atom with substructure
1050 * containing one of those.
1053 /* now we'll need a subre for the contents even if they're boring */
1056 atom
= subre(v
, '=', 0, lp
, rp
);
1061 * prepare a general-purpose state skeleton
1063 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /
1064 * [lp] ----> [s2] ----bypass---------------------
1066 * where bypass is an empty, and prefix is some repetitions of atom
1068 s
= newstate(v
->nfa
); /* first, new endpoints for the atom */
1069 s2
= newstate(v
->nfa
);
1071 moveouts(v
->nfa
, lp
, s
);
1072 moveins(v
->nfa
, rp
, s2
);
1076 s
= newstate(v
->nfa
); /* and spots for prefix and bypass */
1077 s2
= newstate(v
->nfa
);
1083 /* break remaining subRE into x{...} and what follows */
1084 t
= subre(v
, '.', COMBINE(qprefer
, atom
->flags
), lp
, rp
);
1087 /* here we should recurse... but we must postpone that to the end */
1089 /* split top into prefix and remaining */
1090 assert(top
->op
== '=' && top
->left
== NULL
&& top
->right
== NULL
);
1091 top
->left
= subre(v
, '=', top
->flags
, top
->begin
, lp
);
1095 /* if it's a backref, now is the time to replicate the subNFA */
1096 if (atomtype
== BACKREF
)
1098 assert(atom
->begin
->nouts
== 1); /* just the EMPTY */
1099 delsub(v
->nfa
, atom
->begin
, atom
->end
);
1100 assert(v
->subs
[subno
] != NULL
);
1101 /* and here's why the recursion got postponed: it must */
1102 /* wait until the skeleton is filled in, because it may */
1103 /* hit a backref that wants to copy the filled-in skeleton */
1104 dupnfa(v
->nfa
, v
->subs
[subno
]->begin
, v
->subs
[subno
]->end
,
1105 atom
->begin
, atom
->end
);
1109 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1112 EMPTYARC(s2
, atom
->end
); /* the bypass */
1113 assert(PREF(qprefer
) != 0);
1114 f
= COMBINE(qprefer
, atom
->flags
);
1115 t
= subre(v
, '|', f
, lp
, atom
->end
);
1118 t
->right
= subre(v
, '|', PREF(f
), s2
, atom
->end
);
1120 t
->right
->left
= subre(v
, '=', 0, s2
, atom
->end
);
1127 /* deal with the rest of the quantifier */
1128 if (atomtype
== BACKREF
)
1130 /* special case: backrefs have internal quantifiers */
1131 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1132 /* just stuff everything into atom */
1133 repeat(v
, atom
->begin
, atom
->end
, m
, n
);
1134 atom
->min
= (short) m
;
1135 atom
->max
= (short) n
;
1136 atom
->flags
|= COMBINE(qprefer
, atom
->flags
);
1138 else if (m
== 1 && n
== 1)
1140 /* no/vacuous quantifier: done */
1141 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1145 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1146 /* parens in only second x */
1147 dupnfa(v
->nfa
, atom
->begin
, atom
->end
, s
, atom
->begin
);
1148 assert(m
>= 1 && m
!= INFINITY
&& n
>= 1);
1149 repeat(v
, s
, atom
->begin
, m
- 1, (n
== INFINITY
) ? n
: n
- 1);
1150 f
= COMBINE(qprefer
, atom
->flags
);
1151 t
= subre(v
, '.', f
, s
, atom
->end
); /* prefix and atom */
1153 t
->left
= subre(v
, '=', PREF(f
), s
, atom
->begin
);
1159 /* and finally, look after that postponed recursion */
1161 if (!(SEE('|') || SEE(stopper
) || SEE(EOS
)))
1162 t
->right
= parsebranch(v
, stopper
, type
, atom
->end
, rp
, 1);
1165 EMPTYARC(atom
->end
, rp
);
1166 t
->right
= subre(v
, '=', 0, atom
->end
, rp
);
1168 assert(SEE('|') || SEE(stopper
) || SEE(EOS
));
1169 t
->flags
|= COMBINE(t
->flags
, t
->right
->flags
);
1170 top
->flags
|= COMBINE(top
->flags
, t
->flags
);
1174 * nonword - generate arcs for non-word-character ahead or behind
1177 nonword(struct vars
* v
,
1178 int dir
, /* AHEAD or BEHIND */
1182 int anchor
= (dir
== AHEAD
) ? '$' : '^';
1184 assert(dir
== AHEAD
|| dir
== BEHIND
);
1185 newarc(v
->nfa
, anchor
, 1, lp
, rp
);
1186 newarc(v
->nfa
, anchor
, 0, lp
, rp
);
1187 colorcomplement(v
->nfa
, v
->cm
, dir
, v
->wordchrs
, lp
, rp
);
1188 /* (no need for special attention to \n) */
1192 * word - generate arcs for word character ahead or behind
1195 word(struct vars
* v
,
1196 int dir
, /* AHEAD or BEHIND */
1200 assert(dir
== AHEAD
|| dir
== BEHIND
);
1201 cloneouts(v
->nfa
, v
->wordchrs
, lp
, rp
, dir
);
1202 /* (no need for special attention to \n) */
1206 * scannum - scan a number
1208 static int /* value, <= DUPMAX */
1209 scannum(struct vars
* v
)
1213 while (SEE(DIGIT
) && n
< DUPMAX
)
1215 n
= n
* 10 + v
->nextvalue
;
1218 if (SEE(DIGIT
) || n
> DUPMAX
)
1227 * repeat - replicate subNFA for quantifiers
1229 * The duplication sequences used here are chosen carefully so that any
1230 * pointers starting out pointing into the subexpression end up pointing into
1231 * the last occurrence. (Note that it may not be strung between the same
1232 * left and right end states, however!) This used to be important for the
1233 * subRE tree, although the important bits are now handled by the in-line
1234 * code in parse(), and when this is called, it doesn't matter any more.
1237 repeat(struct vars
* v
,
1245 #define PAIR(x, y) ((x)*4 + (y))
1246 #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1247 const int rm
= REDUCE(m
);
1248 const int rn
= REDUCE(n
);
1252 switch (PAIR(rm
, rn
))
1254 case PAIR(0, 0): /* empty string */
1255 delsub(v
->nfa
, lp
, rp
);
1258 case PAIR(0, 1): /* do as x| */
1261 case PAIR(0, SOME
): /* do as x{1,n}| */
1262 repeat(v
, lp
, rp
, 1, n
);
1266 case PAIR(0, INF
): /* loop x around */
1267 s
= newstate(v
->nfa
);
1269 moveouts(v
->nfa
, lp
, s
);
1270 moveins(v
->nfa
, rp
, s
);
1274 case PAIR(1, 1): /* no action required */
1276 case PAIR(1, SOME
): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1277 s
= newstate(v
->nfa
);
1279 moveouts(v
->nfa
, lp
, s
);
1280 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1282 repeat(v
, lp
, s
, 1, n
- 1);
1286 case PAIR(1, INF
): /* add loopback arc */
1287 s
= newstate(v
->nfa
);
1288 s2
= newstate(v
->nfa
);
1290 moveouts(v
->nfa
, lp
, s
);
1291 moveins(v
->nfa
, rp
, s2
);
1296 case PAIR(SOME
, SOME
): /* do as x{m-1,n-1}x */
1297 s
= newstate(v
->nfa
);
1299 moveouts(v
->nfa
, lp
, s
);
1300 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1302 repeat(v
, lp
, s
, m
- 1, n
- 1);
1304 case PAIR(SOME
, INF
): /* do as x{m-1,}x */
1305 s
= newstate(v
->nfa
);
1307 moveouts(v
->nfa
, lp
, s
);
1308 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1310 repeat(v
, lp
, s
, m
- 1, n
);
1319 * bracket - handle non-complemented bracket expression
1320 * Also called from cbracket for complemented bracket expressions.
1323 bracket(struct vars
* v
,
1329 while (!SEE(']') && !SEE(EOS
))
1330 brackpart(v
, lp
, rp
);
1331 assert(SEE(']') || ISERR());
1332 okcolors(v
->nfa
, v
->cm
);
1336 * cbracket - handle complemented bracket expression
1337 * We do it by calling bracket() with dummy endpoints, and then complementing
1338 * the result. The alternative would be to invoke rainbow(), and then delete
1339 * arcs as the b.e. is seen... but that gets messy.
1342 cbracket(struct vars
* v
,
1346 struct state
*left
= newstate(v
->nfa
);
1347 struct state
*right
= newstate(v
->nfa
);
1349 struct arc
*a
; /* arc from lp */
1350 struct arc
*ba
; /* arc from left, from bracket() */
1351 struct arc
*pa
; /* MCCE-prototype arc */
1357 bracket(v
, left
, right
);
1358 if (v
->cflags
& REG_NLSTOP
)
1359 newarc(v
->nfa
, PLAIN
, v
->nlcolor
, left
, right
);
1362 assert(lp
->nouts
== 0); /* all outarcs will be ours */
1364 /* easy part of complementing */
1365 colorcomplement(v
->nfa
, v
->cm
, PLAIN
, left
, lp
, rp
);
1367 if (v
->mcces
== NULL
)
1368 { /* no MCCEs -- we're done */
1369 dropstate(v
->nfa
, left
);
1370 assert(right
->nins
== 0);
1371 freestate(v
->nfa
, right
);
1375 /* but complementing gets messy in the presence of MCCEs... */
1377 for (p
= v
->mcces
->chrs
, i
= v
->mcces
->nchrs
; i
> 0; p
++, i
--)
1379 co
= GETCOLOR(v
->cm
, *p
);
1380 a
= findarc(lp
, PLAIN
, co
);
1381 ba
= findarc(left
, PLAIN
, co
);
1389 s
= newstate(v
->nfa
);
1391 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1393 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1396 { /* easy case, need all of them */
1397 cloneouts(v
->nfa
, pa
->to
, s
, rp
, PLAIN
);
1398 newarc(v
->nfa
, '$', 1, s
, rp
);
1399 newarc(v
->nfa
, '$', 0, s
, rp
);
1400 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
, s
, rp
);
1403 { /* must be selective */
1404 if (findarc(ba
->to
, '$', 1) == NULL
)
1406 newarc(v
->nfa
, '$', 1, s
, rp
);
1407 newarc(v
->nfa
, '$', 0, s
, rp
);
1408 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
,
1411 for (pa
= pa
->to
->outs
; pa
!= NULL
; pa
= pa
->outchain
)
1412 if (findarc(ba
->to
, PLAIN
, pa
->co
) == NULL
)
1413 newarc(v
->nfa
, PLAIN
, pa
->co
, s
, rp
);
1414 if (s
->nouts
== 0) /* limit of selectivity: none */
1415 dropstate(v
->nfa
, s
); /* frees arc too */
1420 delsub(v
->nfa
, left
, right
);
1421 assert(left
->nouts
== 0);
1422 freestate(v
->nfa
, left
);
1423 assert(right
->nins
== 0);
1424 freestate(v
->nfa
, right
);
1428 * brackpart - handle one item (or range) within a bracket expression
1431 brackpart(struct vars
* v
,
1442 /* parse something, get rid of special cases, take shortcuts */
1443 switch (v
->nexttype
)
1445 case RANGE
: /* a-b-c or other botch */
1450 c
[0] = v
->nextvalue
;
1452 /* shortcut for ordinary chr (not range, not MCCE leader) */
1453 if (!SEE(RANGE
) && !ISCELEADER(v
, c
[0]))
1455 onechr(v
, c
[0], lp
, rp
);
1458 startc
= element(v
, c
, c
+ 1);
1463 endp
= scanplain(v
);
1464 INSIST(startp
< endp
, REG_ECOLLATE
);
1466 startc
= element(v
, startp
, endp
);
1471 endp
= scanplain(v
);
1472 INSIST(startp
< endp
, REG_ECOLLATE
);
1474 startc
= element(v
, startp
, endp
);
1476 cv
= eclass(v
, startc
, (v
->cflags
& REG_ICASE
));
1478 dovec(v
, cv
, lp
, rp
);
1483 endp
= scanplain(v
);
1484 INSIST(startp
< endp
, REG_ECTYPE
);
1486 cv
= cclass(v
, startp
, endp
, (v
->cflags
& REG_ICASE
));
1488 dovec(v
, cv
, lp
, rp
);
1500 switch (v
->nexttype
)
1504 c
[0] = v
->nextvalue
;
1506 endc
= element(v
, c
, c
+ 1);
1511 endp
= scanplain(v
);
1512 INSIST(startp
< endp
, REG_ECOLLATE
);
1514 endc
= element(v
, startp
, endp
);
1527 * Ranges are unportable. Actually, standard C does guarantee that
1528 * digits are contiguous, but making that an exception is just too
1533 cv
= range(v
, startc
, endc
, (v
->cflags
& REG_ICASE
));
1535 dovec(v
, cv
, lp
, rp
);
1539 * scanplain - scan PLAIN contents of [. etc.
1541 * Certain bits of trickery in lex.c know that this code does not try
1542 * to look past the final bracket of the [. etc.
1544 static chr
* /* just after end of sequence */
1545 scanplain(struct vars
* v
)
1549 assert(SEE(COLLEL
) || SEE(ECLASS
) || SEE(CCLASS
));
1559 assert(SEE(END
) || ISERR());
1566 * leaders - process a cvec of collating elements to also include leaders
1567 * Also gives all characters involved their own colors, which is almost
1568 * certainly necessary, and sets up little disconnected subNFA.
1571 leaders(struct vars
* v
,
1580 v
->mccepbegin
= newstate(v
->nfa
);
1581 v
->mccepend
= newstate(v
->nfa
);
1584 for (mcce
= 0; mcce
< cv
->nmcces
; mcce
++)
1586 p
= cv
->mcces
[mcce
];
1588 if (!haschr(cv
, leader
))
1591 s
= newstate(v
->nfa
);
1592 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, leader
),
1594 okcolors(v
->nfa
, v
->cm
);
1598 a
= findarc(v
->mccepbegin
, PLAIN
,
1599 GETCOLOR(v
->cm
, leader
));
1602 assert(s
!= v
->mccepend
);
1605 assert(*p
!= 0 && *(p
+ 1) == 0); /* only 2-char MCCEs for
1607 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, *p
), s
, v
->mccepend
);
1608 okcolors(v
->nfa
, v
->cm
);
1613 * onechr - fill in arcs for a plain character, and possible case complements
1614 * This is mostly a shortcut for efficient handling of the common case.
1617 onechr(struct vars
* v
,
1622 if (!(v
->cflags
& REG_ICASE
))
1624 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, c
), lp
, rp
);
1628 /* rats, need general case anyway... */
1629 dovec(v
, allcases(v
, c
), lp
, rp
);
1633 * dovec - fill in arcs for each element of a cvec
1634 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1637 dovec(struct vars
* v
,
1651 struct arc
*pa
; /* arc in prototype */
1653 struct state
*ps
; /* state in prototype */
1655 /* need a place to store leaders, if any */
1658 assert(v
->mcces
!= NULL
);
1659 if (v
->cv2
== NULL
|| v
->cv2
->nchrs
< v
->mcces
->nchrs
)
1663 v
->cv2
= newcvec(v
->mcces
->nchrs
, 0, v
->mcces
->nmcces
);
1668 leads
= clearcvec(v
->cv2
);
1673 /* first, get the ordinary characters out of the way */
1674 for (p
= cv
->chrs
, i
= cv
->nchrs
; i
> 0; p
++, i
--)
1677 if (!ISCELEADER(v
, ch
))
1678 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, ch
), lp
, rp
);
1681 assert(singleton(v
->cm
, ch
));
1682 assert(leads
!= NULL
);
1683 if (!haschr(leads
, ch
))
1688 /* and the ranges */
1689 for (p
= cv
->ranges
, i
= cv
->nranges
; i
> 0; p
+= 2, i
--)
1693 while (from
<= to
&& (ce
= nextleader(v
, from
, to
)) != NOCELT
)
1696 subrange(v
, from
, ce
- 1, lp
, rp
);
1697 assert(singleton(v
->cm
, ce
));
1698 assert(leads
!= NULL
);
1699 if (!haschr(leads
, ce
))
1704 subrange(v
, from
, to
, lp
, rp
);
1707 if ((leads
== NULL
|| leads
->nchrs
== 0) && cv
->nmcces
== 0)
1710 /* deal with the MCCE leaders */
1712 for (p
= leads
->chrs
, i
= leads
->nchrs
; i
> 0; p
++, i
--)
1714 co
= GETCOLOR(v
->cm
, *p
);
1715 a
= findarc(lp
, PLAIN
, co
);
1720 s
= newstate(v
->nfa
);
1722 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1725 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1728 newarc(v
->nfa
, '$', 1, s
, rp
);
1729 newarc(v
->nfa
, '$', 0, s
, rp
);
1730 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, ps
, s
, rp
);
1735 for (i
= 0; i
< cv
->nmcces
; i
++)
1738 assert(singleton(v
->cm
, *p
));
1739 if (!singleton(v
->cm
, *p
))
1745 co
= GETCOLOR(v
->cm
, ch
);
1746 a
= findarc(lp
, PLAIN
, co
);
1751 s
= newstate(v
->nfa
);
1753 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1756 assert(*p
!= 0); /* at least two chars */
1757 assert(singleton(v
->cm
, *p
));
1759 co
= GETCOLOR(v
->cm
, ch
);
1760 assert(*p
== 0); /* and only two, for now */
1761 newarc(v
->nfa
, PLAIN
, co
, s
, rp
);
1767 * nextleader - find next MCCE leader within range
1769 static celt
/* NOCELT means none */
1770 nextleader(struct vars
* v
,
1779 if (v
->mcces
== NULL
)
1782 for (i
= v
->mcces
->nchrs
, p
= v
->mcces
->chrs
; i
> 0; i
--, p
++)
1785 if (from
<= ch
&& ch
<= to
)
1786 if (it
== NOCELT
|| ch
< it
)
1793 * wordchrs - set up word-chr list for word-boundary stuff, if needed
1795 * The list is kept as a bunch of arcs between two dummy states; it's
1796 * disposed of by the unreachable-states sweep in NFA optimization.
1797 * Does NEXT(). Must not be called from any unusual lexical context.
1798 * This should be reconciled with the \w etc. handling in lex.c, and
1799 * should be cleaned up to reduce dependencies on input scanning.
1802 wordchrs(struct vars
* v
)
1805 struct state
*right
;
1807 if (v
->wordchrs
!= NULL
)
1809 NEXT(); /* for consistency */
1813 left
= newstate(v
->nfa
);
1814 right
= newstate(v
->nfa
);
1816 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1819 assert(v
->savenow
!= NULL
&& SEE('['));
1820 bracket(v
, left
, right
);
1821 assert((v
->savenow
!= NULL
&& SEE(']')) || ISERR());
1828 * subre - allocate a subre
1830 static struct subre
*
1831 subre(struct vars
* v
,
1834 struct state
* begin
,
1841 v
->treefree
= ret
->left
;
1844 ret
= (struct subre
*) MALLOC(sizeof(struct subre
));
1850 ret
->chain
= v
->treechain
;
1854 assert(strchr("|.b(=", op
) != NULL
);
1860 ret
->min
= ret
->max
= 1;
1871 * freesubre - free a subRE subtree
1874 freesubre(struct vars
* v
, /* might be NULL */
1880 if (sr
->left
!= NULL
)
1881 freesubre(v
, sr
->left
);
1882 if (sr
->right
!= NULL
)
1883 freesubre(v
, sr
->right
);
1889 * freesrnode - free one node in a subRE subtree
1892 freesrnode(struct vars
* v
, /* might be NULL */
1898 if (!NULLCNFA(sr
->cnfa
))
1899 freecnfa(&sr
->cnfa
);
1904 sr
->left
= v
->treefree
;
1912 * optst - optimize a subRE subtree
1915 optst(struct vars
* v
,
1921 /* recurse through children */
1922 if (t
->left
!= NULL
)
1924 if (t
->right
!= NULL
)
1929 * numst - number tree nodes (assigning retry indexes)
1931 static int /* next number */
1932 numst(struct subre
* t
,
1933 int start
) /* starting point for subtree numbers */
1940 t
->retry
= (short) i
++;
1941 if (t
->left
!= NULL
)
1942 i
= numst(t
->left
, i
);
1943 if (t
->right
!= NULL
)
1944 i
= numst(t
->right
, i
);
1949 * markst - mark tree nodes as INUSE
1952 markst(struct subre
* t
)
1957 if (t
->left
!= NULL
)
1959 if (t
->right
!= NULL
)
1964 * cleanst - free any tree nodes not marked INUSE
1967 cleanst(struct vars
* v
)
1972 for (t
= v
->treechain
; t
!= NULL
; t
= next
)
1975 if (!(t
->flags
& INUSE
))
1978 v
->treechain
= NULL
;
1979 v
->treefree
= NULL
; /* just on general principles */
1983 * nfatree - turn a subRE subtree into a tree of compacted NFAs
1985 static long /* optimize results from top node */
1986 nfatree(struct vars
* v
,
1988 FILE *f
) /* for debug output */
1990 assert(t
!= NULL
&& t
->begin
!= NULL
);
1992 if (t
->left
!= NULL
)
1993 (DISCARD
) nfatree(v
, t
->left
, f
);
1994 if (t
->right
!= NULL
)
1995 (DISCARD
) nfatree(v
, t
->right
, f
);
1997 return nfanode(v
, t
, f
);
2001 * nfanode - do one NFA for nfatree
2003 static long /* optimize results */
2004 nfanode(struct vars
* v
,
2006 FILE *f
) /* for debug output */
2011 assert(t
->begin
!= NULL
);
2018 fprintf(f
, "\n\n\n========= TREE NODE %s ==========\n",
2019 stid(t
, idbuf
, sizeof(idbuf
)));
2022 nfa
= newnfa(v
, v
->cm
, v
->nfa
);
2024 dupnfa(nfa
, t
->begin
, t
->end
, nfa
->init
, nfa
->final
);
2028 ret
= optimize(nfa
, f
);
2031 compact(nfa
, &t
->cnfa
);
2038 * newlacon - allocate a lookahead-constraint subRE
2040 static int /* lacon number */
2041 newlacon(struct vars
* v
,
2042 struct state
* begin
,
2049 if (v
->nlacons
== 0)
2051 v
->lacons
= (struct subre
*) MALLOC(2 * sizeof(struct subre
));
2052 n
= 1; /* skip 0th */
2057 v
->lacons
= (struct subre
*) REALLOC(v
->lacons
,
2058 (v
->nlacons
+ 1) * sizeof(struct subre
));
2061 if (v
->lacons
== NULL
)
2066 sub
= &v
->lacons
[n
];
2075 * freelacons - free lookahead-constraint subRE vector
2078 freelacons(struct subre
* subs
,
2085 for (sub
= subs
+ 1, i
= n
- 1; i
> 0; sub
++, i
--) /* no 0th */
2086 if (!NULLCNFA(sub
->cnfa
))
2087 freecnfa(&sub
->cnfa
);
2092 * rfree - free a whole RE (insides of regfree)
2099 if (re
== NULL
|| re
->re_magic
!= REMAGIC
)
2102 re
->re_magic
= 0; /* invalidate RE */
2103 g
= (struct guts
*) re
->re_guts
;
2108 if (g
->tree
!= NULL
)
2109 freesubre((struct vars
*) NULL
, g
->tree
);
2110 if (g
->lacons
!= NULL
)
2111 freelacons(g
->lacons
, g
->nlacons
);
2112 if (!NULLCNFA(g
->search
))
2113 freecnfa(&g
->search
);
2120 * dump - dump an RE in human-readable form
2129 if (re
->re_magic
!= REMAGIC
)
2130 fprintf(f
, "bad magic number (0x%x not 0x%x)\n", re
->re_magic
,
2132 if (re
->re_guts
== NULL
)
2134 fprintf(f
, "NULL guts!!!\n");
2137 g
= (struct guts
*) re
->re_guts
;
2138 if (g
->magic
!= GUTSMAGIC
)
2139 fprintf(f
, "bad guts magic number (0x%x not 0x%x)\n", g
->magic
,
2142 fprintf(f
, "\n\n\n========= DUMP ==========\n");
2143 fprintf(f
, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2144 re
->re_nsub
, re
->re_info
, re
->re_csize
, g
->ntree
);
2146 dumpcolors(&g
->cmap
, f
);
2147 if (!NULLCNFA(g
->search
))
2149 printf("\nsearch:\n");
2150 dumpcnfa(&g
->search
, f
);
2152 for (i
= 1; i
< g
->nlacons
; i
++)
2154 fprintf(f
, "\nla%d (%s):\n", i
,
2155 (g
->lacons
[i
].subno
) ? "positive" : "negative");
2156 dumpcnfa(&g
->lacons
[i
].cnfa
, f
);
2159 dumpst(g
->tree
, f
, 0);
2163 * dumpst - dump a subRE tree
2166 dumpst(struct subre
* t
,
2168 int nfapresent
) /* is the original NFA still around? */
2171 fprintf(f
, "null tree\n");
2173 stdump(t
, f
, nfapresent
);
2178 * stdump - recursive guts of dumpst
2181 stdump(struct subre
* t
,
2183 int nfapresent
) /* is the original NFA still around? */
2187 fprintf(f
, "%s. `%c'", stid(t
, idbuf
, sizeof(idbuf
)), t
->op
);
2188 if (t
->flags
& LONGER
)
2189 fprintf(f
, " longest");
2190 if (t
->flags
& SHORTER
)
2191 fprintf(f
, " shortest");
2192 if (t
->flags
& MIXED
)
2193 fprintf(f
, " hasmixed");
2195 fprintf(f
, " hascapture");
2196 if (t
->flags
& BACKR
)
2197 fprintf(f
, " hasbackref");
2198 if (!(t
->flags
& INUSE
))
2199 fprintf(f
, " UNUSED");
2201 fprintf(f
, " (#%d)", t
->subno
);
2202 if (t
->min
!= 1 || t
->max
!= 1)
2204 fprintf(f
, " {%d,", t
->min
);
2205 if (t
->max
!= INFINITY
)
2206 fprintf(f
, "%d", t
->max
);
2210 fprintf(f
, " %ld-%ld", (long) t
->begin
->no
, (long) t
->end
->no
);
2211 if (t
->left
!= NULL
)
2212 fprintf(f
, " L:%s", stid(t
->left
, idbuf
, sizeof(idbuf
)));
2213 if (t
->right
!= NULL
)
2214 fprintf(f
, " R:%s", stid(t
->right
, idbuf
, sizeof(idbuf
)));
2215 if (!NULLCNFA(t
->cnfa
))
2218 dumpcnfa(&t
->cnfa
, f
);
2221 if (t
->left
!= NULL
)
2222 stdump(t
->left
, f
, nfapresent
);
2223 if (t
->right
!= NULL
)
2224 stdump(t
->right
, f
, nfapresent
);
2228 * stid - identify a subtree node for dumping
2230 static char * /* points to buf or constant string */
2231 stid(struct subre
* t
,
2235 /* big enough for hex int or decimal t->retry? */
2236 if (bufsize
< sizeof(int) * 2 + 3 || bufsize
< sizeof(t
->retry
) * 3 + 1)
2239 sprintf(buf
, "%d", t
->retry
);
2241 sprintf(buf
, "0x%x", (int) t
); /* may lose bits, that's okay */
2244 #endif /* REG_DEBUG */
2247 #include "regc_lex.c"
2248 #include "regc_color.c"
2249 #include "regc_nfa.c"
2250 #include "regc_cvec.c"
2251 #include "regc_locale.c"