]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regcomp.c
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $
38 * forward declarations, up here so forward datatypes etc. are defined early
40 /* === regcomp.c === */
41 static void moresubs(struct vars
*, int);
42 static int freev(struct vars
*, int);
43 static void makesearch(struct vars
*, struct nfa
*);
44 static struct subre
*parse(struct vars
*, int, int, struct state
*, struct state
*);
45 static struct subre
*parsebranch(struct vars
*, int, int, struct state
*, struct state
*, int);
46 static void parseqatom(struct vars
*, int, int, struct state
*, struct state
*, struct subre
*);
47 static void nonword(struct vars
*, int, struct state
*, struct state
*);
48 static void word(struct vars
*, int, struct state
*, struct state
*);
49 static int scannum(struct vars
*);
50 static void repeat(struct vars
*, struct state
*, struct state
*, int, int);
51 static void bracket(struct vars
*, struct state
*, struct state
*);
52 static void cbracket(struct vars
*, struct state
*, struct state
*);
53 static void brackpart(struct vars
*, struct state
*, struct state
*);
54 static chr
*scanplain(struct vars
*);
55 static void leaders(struct vars
*, struct cvec
*);
56 static void onechr(struct vars
*, chr
, struct state
*, struct state
*);
57 static void dovec(struct vars
*, struct cvec
*, struct state
*, struct state
*);
58 static celt
nextleader(struct vars
*, chr
, chr
);
59 static void wordchrs(struct vars
*);
60 static struct subre
*subre(struct vars
*, int, int, struct state
*, struct state
*);
61 static void freesubre(struct vars
*, struct subre
*);
62 static void freesrnode(struct vars
*, struct subre
*);
63 static void optst(struct vars
*, struct subre
*);
64 static int numst(struct subre
*, int);
65 static void markst(struct subre
*);
66 static void cleanst(struct vars
*);
67 static long nfatree(struct vars
*, struct subre
*, FILE *);
68 static long nfanode(struct vars
*, struct subre
*, FILE *);
69 static int newlacon(struct vars
*, struct state
*, struct state
*, int);
70 static void freelacons(struct subre
*, int);
71 static void rfree(regex_t
*);
74 static void dump(regex_t
*, FILE *);
75 static void dumpst(struct subre
*, FILE *, int);
76 static void stdump(struct subre
*, FILE *, int);
77 static char *stid(struct subre
*, char *, size_t);
79 /* === regc_lex.c === */
80 static void lexstart(struct vars
*);
81 static void prefixes(struct vars
*);
82 static void lexnest(struct vars
*, chr
*, chr
*);
83 static void lexword(struct vars
*);
84 static int next(struct vars
*);
85 static int lexescape(struct vars
*);
86 static chr
lexdigits(struct vars
*, int, int, int);
87 static int brenext(struct vars
*, chr
);
88 static void skip(struct vars
*);
89 static chr
newline(void);
90 static chr
chrnamed(struct vars
*, chr
*, chr
*, chr
);
92 /* === regc_color.c === */
93 static void initcm(struct vars
*, struct colormap
*);
94 static void freecm(struct colormap
*);
95 static void cmtreefree(struct colormap
*, union tree
*, int);
96 static color
setcolor(struct colormap
*, chr
, pcolor
);
97 static color
maxcolor(struct colormap
*);
98 static color
newcolor(struct colormap
*);
99 static void freecolor(struct colormap
*, pcolor
);
100 static color
pseudocolor(struct colormap
*);
101 static color
subcolor(struct colormap
*, chr c
);
102 static color
newsub(struct colormap
*, pcolor
);
103 static void subrange(struct vars
*, chr
, chr
, struct state
*, struct state
*);
104 static void subblock(struct vars
*, chr
, struct state
*, struct state
*);
105 static void okcolors(struct nfa
*, struct colormap
*);
106 static void colorchain(struct colormap
*, struct arc
*);
107 static void uncolorchain(struct colormap
*, struct arc
*);
108 static int singleton(struct colormap
*, chr c
);
109 static void rainbow(struct nfa
*, struct colormap
*, int, pcolor
, struct state
*, struct state
*);
110 static void colorcomplement(struct nfa
*, struct colormap
*, int, struct state
*, struct state
*, struct state
*);
113 static void dumpcolors(struct colormap
*, FILE *);
114 static void fillcheck(struct colormap
*, union tree
*, int, FILE *);
115 static void dumpchr(chr
, FILE *);
117 /* === regc_nfa.c === */
118 static struct nfa
*newnfa(struct vars
*, struct colormap
*, struct nfa
*);
119 static void freenfa(struct nfa
*);
120 static struct state
*newstate(struct nfa
*);
121 static struct state
*newfstate(struct nfa
*, int flag
);
122 static void dropstate(struct nfa
*, struct state
*);
123 static void freestate(struct nfa
*, struct state
*);
124 static void destroystate(struct nfa
*, struct state
*);
125 static void newarc(struct nfa
*, int, pcolor
, struct state
*, struct state
*);
126 static struct arc
*allocarc(struct nfa
*, struct state
*);
127 static void freearc(struct nfa
*, struct arc
*);
128 static struct arc
*findarc(struct state
*, int, pcolor
);
129 static void cparc(struct nfa
*, struct arc
*, struct state
*, struct state
*);
130 static void moveins(struct nfa
*, struct state
*, struct state
*);
131 static void copyins(struct nfa
*, struct state
*, struct state
*);
132 static void moveouts(struct nfa
*, struct state
*, struct state
*);
133 static void copyouts(struct nfa
*, struct state
*, struct state
*);
134 static void cloneouts(struct nfa
*, struct state
*, struct state
*, struct state
*, int);
135 static void delsub(struct nfa
*, struct state
*, struct state
*);
136 static void deltraverse(struct nfa
*, struct state
*, struct state
*);
137 static void dupnfa(struct nfa
*, struct state
*, struct state
*, struct state
*, struct state
*);
138 static void duptraverse(struct nfa
*, struct state
*, struct state
*);
139 static void cleartraverse(struct nfa
*, struct state
*);
140 static void specialcolors(struct nfa
*);
141 static long optimize(struct nfa
*, FILE *);
142 static void pullback(struct nfa
*, FILE *);
143 static int pull(struct nfa
*, struct arc
*);
144 static void pushfwd(struct nfa
*, FILE *);
145 static int push(struct nfa
*, struct arc
*);
147 #define INCOMPATIBLE 1 /* destroys arc */
148 #define SATISFIED 2 /* constraint satisfied */
149 #define COMPATIBLE 3 /* compatible but not satisfied yet */
150 static int combine(struct arc
*, struct arc
*);
151 static void fixempties(struct nfa
*, FILE *);
152 static int unempty(struct nfa
*, struct arc
*);
153 static void cleanup(struct nfa
*);
154 static void markreachable(struct nfa
*, struct state
*, struct state
*, struct state
*);
155 static void markcanreach(struct nfa
*, struct state
*, struct state
*, struct state
*);
156 static long analyze(struct nfa
*);
157 static void compact(struct nfa
*, struct cnfa
*);
158 static void carcsort(struct carc
*, struct carc
*);
159 static void freecnfa(struct cnfa
*);
160 static void dumpnfa(struct nfa
*, FILE *);
163 static void dumpstate(struct state
*, FILE *);
164 static void dumparcs(struct state
*, FILE *);
165 static int dumprarcs(struct arc
*, struct state
*, FILE *, int);
166 static void dumparc(struct arc
*, struct state
*, FILE *);
167 static void dumpcnfa(struct cnfa
*, FILE *);
168 static void dumpcstate(int, struct carc
*, struct cnfa
*, FILE *);
170 /* === regc_cvec.c === */
171 static struct cvec
*newcvec(int, int, int);
172 static struct cvec
*clearcvec(struct cvec
*);
173 static void addchr(struct cvec
*, chr
);
174 static void addrange(struct cvec
*, chr
, chr
);
175 static void addmcce(struct cvec
*, chr
*, chr
*);
176 static int haschr(struct cvec
*, chr
);
177 static struct cvec
*getcvec(struct vars
*, int, int, int);
178 static void freecvec(struct cvec
*);
180 /* === regc_locale.c === */
181 static int wx_isdigit(wx_wchar c
);
182 static int wx_isalpha(wx_wchar c
);
183 static int wx_isalnum(wx_wchar c
);
184 static int wx_isupper(wx_wchar c
);
185 static int wx_islower(wx_wchar c
);
186 static int wx_isgraph(wx_wchar c
);
187 static int wx_ispunct(wx_wchar c
);
188 static int wx_isspace(wx_wchar c
);
189 static wx_wchar
wx_toupper(wx_wchar c
);
190 static wx_wchar
wx_tolower(wx_wchar c
);
191 static int nmcces(struct vars
*);
192 static int nleaders(struct vars
*);
193 static struct cvec
*allmcces(struct vars
*, struct cvec
*);
194 static celt
element(struct vars
*, chr
*, chr
*);
195 static struct cvec
*range(struct vars
*, celt
, celt
, int);
196 static int before(celt
, celt
);
197 static struct cvec
*eclass(struct vars
*, celt
, int);
198 static struct cvec
*cclass(struct vars
*, chr
*, chr
*, int);
199 static struct cvec
*allcases(struct vars
*, chr
);
200 static int cmp(const chr
*, const chr
*, size_t);
201 static int casecmp(const chr
*, const chr
*, size_t);
204 /* internal variables, bundled for easy passing around */
208 chr
*now
; /* scan pointer into string */
209 chr
*stop
; /* end of string */
210 chr
*savenow
; /* saved now and stop for "subroutine
213 int err
; /* error code (0 if none) */
214 int cflags
; /* copy of compile flags */
215 int lasttype
; /* type of previous token */
216 int nexttype
; /* type of next token */
217 chr nextvalue
; /* value (if any) of next token */
218 int lexcon
; /* lexical context type (see lex.c) */
219 int nsubexp
; /* subexpression count */
220 struct subre
**subs
; /* subRE pointer vector */
221 size_t nsubs
; /* length of vector */
222 struct subre
*sub10
[10]; /* initial vector, enough for most */
223 struct nfa
*nfa
; /* the NFA */
224 struct colormap
*cm
; /* character color map */
225 color nlcolor
; /* color of newline */
226 struct state
*wordchrs
; /* state in nfa holding word-char outarcs */
227 struct subre
*tree
; /* subexpression tree */
228 struct subre
*treechain
; /* all tree nodes allocated */
229 struct subre
*treefree
; /* any free tree nodes */
230 int ntree
; /* number of tree nodes */
231 struct cvec
*cv
; /* interface cvec */
232 struct cvec
*cv2
; /* utility cvec */
233 struct cvec
*mcces
; /* collating-element information */
234 #define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
235 struct state
*mccepbegin
; /* in nfa, start of MCCE prototypes */
236 struct state
*mccepend
; /* in nfa, end of MCCE prototypes */
237 struct subre
*lacons
; /* lookahead-constraint vector */
238 int nlacons
; /* size of lacons */
241 /* parsing macros; most know that `v' is the struct vars pointer */
242 #define NEXT() (next(v)) /* advance by one token */
243 #define SEE(t) (v->nexttype == (t)) /* is next token this? */
244 #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
245 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
246 #define ISERR() VISERR(v)
247 #define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
249 #define ERR(e) VERR(v, e) /* record an error */
250 #define NOERR() {if (ISERR()) return;} /* if error seen, return */
251 #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
252 #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
253 #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false,
255 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
256 #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
258 /* token type codes, some also used as NFA arc types */
259 #define EMPTY 'n' /* no token present */
260 #define EOS 'e' /* end of string */
261 #define PLAIN 'p' /* ordinary character */
262 #define DIGIT 'd' /* digit (in bound) */
263 #define BACKREF 'b' /* back reference */
264 #define COLLEL 'I' /* start of [. */
265 #define ECLASS 'E' /* start of [= */
266 #define CCLASS 'C' /* start of [: */
267 #define END 'X' /* end of [. [= [: */
268 #define RANGE 'R' /* - within [] which might be range delim. */
269 #define LACON 'L' /* lookahead constraint subRE */
270 #define AHEAD 'a' /* color-lookahead arc */
271 #define BEHIND 'r' /* color-lookbehind arc */
272 #define WBDRY 'w' /* word boundary constraint */
273 #define NWBDRY 'W' /* non-word-boundary constraint */
274 #define SBEGIN 'A' /* beginning of string (even if not BOL) */
275 #define SEND 'Z' /* end of string (even if not EOL) */
276 #define PREFER 'P' /* length preference */
278 /* is an arc colored, and hence on a color chain? */
279 #define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
284 /* static function list */
285 static struct fns functions
= {
286 rfree
, /* regfree insides */
292 * regcomp - compile regular expression
301 chr
* s2
= (chr
*) string
;
303 if (string
&& *string
)
308 nLen
= ((s2
- string
) / sizeof(chr
));
310 return wx_regcomp(re
, string
, nLen
, flags
);
313 wx_regcomp(regex_t
*re
,
319 struct vars
*v
= &var
;
325 FILE *debug
= (flags
& REG_PROGRESS
) ? stdout
: (FILE *) NULL
;
328 FILE *debug
= (FILE *) NULL
;
331 #define CNOERR() { if (ISERR()) return freev(v, v->err); }
335 if (re
== NULL
|| string
== NULL
)
337 if ((flags
& REG_QUOTE
) &&
338 (flags
& (REG_ADVANCED
| REG_EXPANDED
| REG_NEWLINE
)))
340 if (!(flags
& REG_EXTENDED
) && (flags
& REG_ADVF
))
343 /* initial setup (after which freev() is callable) */
345 v
->now
= (chr
*) string
;
346 v
->stop
= v
->now
+ len
;
347 v
->savenow
= v
->savestop
= NULL
;
353 for (j
= 0; j
< v
->nsubs
; j
++)
357 v
->nlcolor
= COLORLESS
;
367 re
->re_magic
= REMAGIC
;
368 re
->re_info
= 0; /* bits get set during parse */
369 re
->re_csize
= sizeof(chr
);
371 re
->re_fns
= VS(&functions
);
373 /* more complex setup, malloced things */
374 re
->re_guts
= VS(MALLOC(sizeof(struct guts
)));
375 if (re
->re_guts
== NULL
)
376 return freev(v
, REG_ESPACE
);
377 g
= (struct guts
*) re
->re_guts
;
384 v
->nfa
= newnfa(v
, v
->cm
, (struct nfa
*) NULL
);
386 v
->cv
= newcvec(100, 20, 10);
388 return freev(v
, REG_ESPACE
);
392 v
->mcces
= newcvec(nleaders(v
), 0, i
);
394 v
->mcces
= allmcces(v
, v
->mcces
);
395 leaders(v
, v
->mcces
);
396 addmcce(v
->mcces
, (chr
*) NULL
, (chr
*) NULL
); /* dummy */
401 lexstart(v
); /* also handles prefixes */
402 if ((v
->cflags
& REG_NLSTOP
) || (v
->cflags
& REG_NLANCH
))
404 /* assign newline a unique color */
405 v
->nlcolor
= subcolor(v
->cm
, newline());
406 okcolors(v
->nfa
, v
->cm
);
409 v
->tree
= parse(v
, EOS
, PLAIN
, v
->nfa
->init
, v
->nfa
->final
);
410 assert(SEE(EOS
)); /* even if error; ISERR() => SEE(EOS) */
412 assert(v
->tree
!= NULL
);
414 /* finish setup of nfa and its subre tree */
415 specialcolors(v
->nfa
);
420 fprintf(debug
, "\n\n\n========= RAW ==========\n");
421 dumpnfa(v
->nfa
, debug
);
422 dumpst(v
->tree
, debug
, 1);
426 v
->ntree
= numst(v
->tree
, 1);
432 fprintf(debug
, "\n\n\n========= TREE FIXED ==========\n");
433 dumpst(v
->tree
, debug
, 1);
437 /* build compacted NFAs for tree and lacons */
438 re
->re_info
|= nfatree(v
, v
->tree
, debug
);
440 assert(v
->nlacons
== 0 || v
->lacons
!= NULL
);
441 for (i
= 1; i
< v
->nlacons
; i
++)
445 fprintf(debug
, "\n\n\n========= LA%d ==========\n", i
);
447 nfanode(v
, &v
->lacons
[i
], debug
);
450 if (v
->tree
->flags
& SHORTER
)
453 /* build compacted NFAs for tree, lacons, fast search */
456 fprintf(debug
, "\n\n\n========= SEARCH ==========\n");
458 /* can sacrifice main NFA now, so use it as work area */
459 (DISCARD
) optimize(v
->nfa
, debug
);
461 makesearch(v
, v
->nfa
);
463 compact(v
->nfa
, &g
->search
);
466 /* looks okay, package it up */
467 re
->re_nsub
= v
->nsubexp
;
468 v
->re
= NULL
; /* freev no longer frees re */
469 g
->magic
= GUTSMAGIC
;
470 g
->cflags
= v
->cflags
;
471 g
->info
= re
->re_info
;
472 g
->nsub
= re
->re_nsub
;
476 g
->compare
= (v
->cflags
& REG_ICASE
) ? casecmp
: cmp
;
477 g
->lacons
= v
->lacons
;
479 g
->nlacons
= v
->nlacons
;
482 if (flags
& REG_DUMP
)
491 * moresubs - enlarge subRE vector
494 moresubs(struct vars
* v
,
495 int wanted
) /* want enough room for this one */
500 assert(wanted
> 0 && (size_t) wanted
>= v
->nsubs
);
501 n
= (size_t) wanted
*3 / 2 + 1;
503 if (v
->subs
== v
->sub10
)
505 p
= (struct subre
**) MALLOC(n
* sizeof(struct subre
*));
507 memcpy(VS(p
), VS(v
->subs
),
508 v
->nsubs
* sizeof(struct subre
*));
511 p
= (struct subre
**) REALLOC(v
->subs
, n
* sizeof(struct subre
*));
518 for (p
= &v
->subs
[v
->nsubs
]; v
->nsubs
< n
; p
++, v
->nsubs
++)
520 assert(v
->nsubs
== n
);
521 assert((size_t) wanted
< v
->nsubs
);
525 * freev - free vars struct's substructures where necessary
527 * Optionally does error-number setting, and always returns error code
528 * (if any), to make error-handling code terser.
531 freev(struct vars
* v
,
536 if (v
->subs
!= v
->sub10
)
541 freesubre(v
, v
->tree
);
542 if (v
->treechain
!= NULL
)
548 if (v
->mcces
!= NULL
)
550 if (v
->lacons
!= NULL
)
551 freelacons(v
->lacons
, v
->nlacons
);
552 ERR(err
); /* nop if err==0 */
558 * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
559 * NFA must have been optimize()d already.
562 makesearch(struct vars
* v
,
567 struct state
*pre
= nfa
->pre
;
572 /* no loops are needed if it's anchored */
573 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
)
575 assert(a
->type
== PLAIN
);
576 if (a
->co
!= nfa
->bos
[0] && a
->co
!= nfa
->bos
[1])
581 /* add implicit .* in front */
582 rainbow(nfa
, v
->cm
, PLAIN
, COLORLESS
, pre
, pre
);
584 /* and ^* and \A* too -- not always necessary, but harmless */
585 newarc(nfa
, PLAIN
, nfa
->bos
[0], pre
, pre
);
586 newarc(nfa
, PLAIN
, nfa
->bos
[1], pre
, pre
);
590 * Now here's the subtle part. Because many REs have no lookback
591 * constraints, often knowing when you were in the pre state tells you
592 * little; it's the next state(s) that are informative. But some of
593 * them may have other inarcs, i.e. it may be possible to make actual
594 * progress and then return to one of them. We must de-optimize such
595 * cases, splitting each such state into progress and no-progress
599 /* first, make a list of the states */
601 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
)
604 for (b
= s
->ins
; b
!= NULL
; b
= b
->inchain
)
608 { /* must be split */
615 for (s
= slist
; s
!= NULL
; s
= s2
)
618 copyouts(nfa
, s
, s2
);
619 for (a
= s
->ins
; a
!= NULL
; a
= b
)
624 cparc(nfa
, a
, a
->from
, s2
);
629 s
->tmp
= NULL
; /* clean up while we're at it */
634 * parse - parse an RE
636 * This is actually just the top level, which parses a bunch of branches
637 * tied together with '|'. They appear in the tree as the left children
638 * of a chain of '|' subres.
640 static struct subre
*
641 parse(struct vars
* v
,
642 int stopper
, /* EOS or ')' */
643 int type
, /* LACON (lookahead subRE) or PLAIN */
644 struct state
* init
, /* initial state */
645 struct state
* final
) /* final state */
647 struct state
*left
; /* scaffolding for branch */
649 struct subre
*branches
; /* top level */
650 struct subre
*branch
; /* current branch */
651 struct subre
*t
; /* temporary */
652 int firstbranch
; /* is this the first branch? */
654 assert(stopper
== ')' || stopper
== EOS
);
656 branches
= subre(v
, '|', LONGER
, init
, final
);
664 /* need a place to hang it */
665 branch
->right
= subre(v
, '|', LONGER
, init
, final
);
667 branch
= branch
->right
;
670 left
= newstate(v
->nfa
);
671 right
= newstate(v
->nfa
);
673 EMPTYARC(init
, left
);
674 EMPTYARC(right
, final
);
676 branch
->left
= parsebranch(v
, stopper
, type
, left
, right
, 0);
678 branch
->flags
|= UP(branch
->flags
| branch
->left
->flags
);
679 if ((branch
->flags
& ~branches
->flags
) != 0) /* new flags */
680 for (t
= branches
; t
!= branch
; t
= t
->right
)
681 t
->flags
|= branch
->flags
;
683 assert(SEE(stopper
) || SEE(EOS
));
687 assert(stopper
== ')' && SEE(EOS
));
691 /* optimize out simple cases */
692 if (branch
== branches
)
693 { /* only one branch */
694 assert(branch
->right
== NULL
);
697 freesubre(v
, branches
);
700 else if (!MESSY(branches
->flags
))
701 { /* no interesting innards */
702 freesubre(v
, branches
->left
);
703 branches
->left
= NULL
;
704 freesubre(v
, branches
->right
);
705 branches
->right
= NULL
;
713 * parsebranch - parse one branch of an RE
715 * This mostly manages concatenation, working closely with parseqatom().
716 * Concatenated things are bundled up as much as possible, with separate
717 * ',' nodes introduced only when necessary due to substructure.
719 static struct subre
*
720 parsebranch(struct vars
* v
,
721 int stopper
, /* EOS or ')' */
722 int type
, /* LACON (lookahead subRE) or PLAIN */
723 struct state
* left
, /* leftmost state */
724 struct state
* right
, /* rightmost state */
725 int partial
) /* is this only part of a branch? */
727 struct state
*lp
; /* left end of current construct */
728 int seencontent
; /* is there anything in this branch yet? */
733 t
= subre(v
, '=', 0, left
, right
); /* op '=' is tentative */
735 while (!SEE('|') && !SEE(stopper
) && !SEE(EOS
))
738 { /* implicit concat operator */
739 lp
= newstate(v
->nfa
);
741 moveins(v
->nfa
, right
, lp
);
745 /* NB, recursion in parseqatom() may swallow rest of branch */
746 parseqatom(v
, stopper
, type
, lp
, right
, t
);
754 EMPTYARC(left
, right
);
761 * parseqatom - parse one quantified atom or constraint of an RE
763 * The bookkeeping near the end cooperates very closely with parsebranch();
764 * in particular, it contains a recursion that can involve parsing the rest
765 * of the branch, making this function's name somewhat inaccurate.
768 parseqatom(struct vars
* v
,
769 int stopper
, /* EOS or ')' */
770 int type
, /* LACON (lookahead subRE) or PLAIN */
771 struct state
* lp
, /* left state to hang it on */
772 struct state
* rp
, /* right state to hang it on */
773 struct subre
* top
) /* subtree top */
775 struct state
*s
; /* temporaries for new states */
778 #define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
781 struct subre
*atom
; /* atom's subtree */
783 int cap
; /* capturing parens? */
784 int pos
; /* positive lookahead? */
785 int subno
; /* capturing-parens or backref number */
787 int qprefer
; /* quantifier short/long preference */
789 struct subre
**atomp
; /* where the pointer to atom is */
791 /* initial bookkeeping */
793 assert(lp
->nouts
== 0); /* must string new code */
794 assert(rp
->nins
== 0); /* between lp and rp */
795 subno
= 0; /* just to shut lint up */
797 /* an atom or constraint... */
798 atomtype
= v
->nexttype
;
801 /* first, constraints, which end by returning */
804 if (v
->cflags
& REG_NLANCH
)
805 ARCV(BEHIND
, v
->nlcolor
);
811 if (v
->cflags
& REG_NLANCH
)
812 ARCV(AHEAD
, v
->nlcolor
);
817 ARCV('^', 1); /* BOL */
818 ARCV('^', 0); /* or BOS */
823 ARCV('$', 1); /* EOL */
824 ARCV('$', 0); /* or EOS */
829 wordchrs(v
); /* does NEXT() */
830 s
= newstate(v
->nfa
);
832 nonword(v
, BEHIND
, lp
, s
);
833 word(v
, AHEAD
, s
, rp
);
837 wordchrs(v
); /* does NEXT() */
838 s
= newstate(v
->nfa
);
840 word(v
, BEHIND
, lp
, s
);
841 nonword(v
, AHEAD
, s
, rp
);
845 wordchrs(v
); /* does NEXT() */
846 s
= newstate(v
->nfa
);
848 nonword(v
, BEHIND
, lp
, s
);
849 word(v
, AHEAD
, s
, rp
);
850 s
= newstate(v
->nfa
);
852 word(v
, BEHIND
, lp
, s
);
853 nonword(v
, AHEAD
, s
, rp
);
857 wordchrs(v
); /* does NEXT() */
858 s
= newstate(v
->nfa
);
860 word(v
, BEHIND
, lp
, s
);
861 word(v
, AHEAD
, s
, rp
);
862 s
= newstate(v
->nfa
);
864 nonword(v
, BEHIND
, lp
, s
);
865 nonword(v
, AHEAD
, s
, rp
);
868 case LACON
: /* lookahead constraint */
871 s
= newstate(v
->nfa
);
872 s2
= newstate(v
->nfa
);
874 t
= parse(v
, ')', LACON
, s
, s2
);
875 freesubre(v
, t
); /* internal structure irrelevant */
876 assert(SEE(')') || ISERR());
878 n
= newlacon(v
, s
, s2
, pos
);
883 /* then errors, to get them out of the way */
895 /* then plain characters, and minor variants on that theme */
896 case ')': /* unbalanced paren */
897 if ((v
->cflags
& REG_ADVANCED
) != REG_EXTENDED
)
902 /* legal in EREs due to specification botch */
904 /* fallthrough into case PLAIN */
906 onechr(v
, v
->nextvalue
, lp
, rp
);
907 okcolors(v
->nfa
, v
->cm
);
912 if (v
->nextvalue
== 1)
916 assert(SEE(']') || ISERR());
920 rainbow(v
->nfa
, v
->cm
, PLAIN
,
921 (v
->cflags
& REG_NLSTOP
) ? v
->nlcolor
: COLORLESS
,
925 /* and finally the ugly stuff */
926 case '(': /* value flags as capturing or non */
927 cap
= (type
== LACON
) ? 0 : v
->nextvalue
;
932 if ((size_t) subno
>= v
->nsubs
)
934 assert((size_t) subno
< v
->nsubs
);
937 atomtype
= PLAIN
; /* something that's not '(' */
939 /* need new endpoints because tree will contain pointers */
940 s
= newstate(v
->nfa
);
941 s2
= newstate(v
->nfa
);
946 atom
= parse(v
, ')', PLAIN
, s
, s2
);
947 assert(SEE(')') || ISERR());
952 v
->subs
[subno
] = atom
;
953 t
= subre(v
, '(', atom
->flags
| CAP
, lp
, rp
);
959 /* postpone everything else pending possible {0} */
961 case BACKREF
: /* the Feature From The Black Lagoon */
962 INSIST(type
!= LACON
, REG_ESUBREG
);
963 INSIST(v
->nextvalue
< v
->nsubs
, REG_ESUBREG
);
964 INSIST(v
->subs
[v
->nextvalue
] != NULL
, REG_ESUBREG
);
966 assert(v
->nextvalue
> 0);
967 atom
= subre(v
, 'b', BACKR
, lp
, rp
);
968 subno
= v
->nextvalue
;
970 EMPTYARC(lp
, rp
); /* temporarily, so there's something */
975 /* ...and an atom may be followed by a quantifier */
981 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
987 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
993 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
1010 /* {m,n} exercises preference, even if it's {m,m} */
1011 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
1016 /* {m} passes operand's preference through */
1020 { /* catches errors too */
1026 default: /* no quantifier */
1032 /* annoying special case: {0} or {0,0} cancels everything */
1033 if (m
== 0 && n
== 0)
1037 if (atomtype
== '(')
1038 v
->subs
[subno
] = NULL
;
1039 delsub(v
->nfa
, lp
, rp
);
1044 /* if not a messy case, avoid hard part */
1045 assert(!MESSY(top
->flags
));
1046 f
= top
->flags
| qprefer
| ((atom
!= NULL
) ? atom
->flags
: 0);
1047 if (atomtype
!= '(' && atomtype
!= BACKREF
&& !MESSY(UP(f
)))
1049 if (!(m
== 1 && n
== 1))
1050 repeat(v
, lp
, rp
, m
, n
);
1058 * hard part: something messy That is, capturing parens, back
1059 * reference, short/long clash, or an atom with substructure
1060 * containing one of those.
1063 /* now we'll need a subre for the contents even if they're boring */
1066 atom
= subre(v
, '=', 0, lp
, rp
);
1071 * prepare a general-purpose state skeleton
1073 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /
1074 * [lp] ----> [s2] ----bypass---------------------
1076 * where bypass is an empty, and prefix is some repetitions of atom
1078 s
= newstate(v
->nfa
); /* first, new endpoints for the atom */
1079 s2
= newstate(v
->nfa
);
1081 moveouts(v
->nfa
, lp
, s
);
1082 moveins(v
->nfa
, rp
, s2
);
1086 s
= newstate(v
->nfa
); /* and spots for prefix and bypass */
1087 s2
= newstate(v
->nfa
);
1093 /* break remaining subRE into x{...} and what follows */
1094 t
= subre(v
, '.', COMBINE(qprefer
, atom
->flags
), lp
, rp
);
1097 /* here we should recurse... but we must postpone that to the end */
1099 /* split top into prefix and remaining */
1100 assert(top
->op
== '=' && top
->left
== NULL
&& top
->right
== NULL
);
1101 top
->left
= subre(v
, '=', top
->flags
, top
->begin
, lp
);
1105 /* if it's a backref, now is the time to replicate the subNFA */
1106 if (atomtype
== BACKREF
)
1108 assert(atom
->begin
->nouts
== 1); /* just the EMPTY */
1109 delsub(v
->nfa
, atom
->begin
, atom
->end
);
1110 assert(v
->subs
[subno
] != NULL
);
1111 /* and here's why the recursion got postponed: it must */
1112 /* wait until the skeleton is filled in, because it may */
1113 /* hit a backref that wants to copy the filled-in skeleton */
1114 dupnfa(v
->nfa
, v
->subs
[subno
]->begin
, v
->subs
[subno
]->end
,
1115 atom
->begin
, atom
->end
);
1119 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1122 EMPTYARC(s2
, atom
->end
); /* the bypass */
1123 assert(PREF(qprefer
) != 0);
1124 f
= COMBINE(qprefer
, atom
->flags
);
1125 t
= subre(v
, '|', f
, lp
, atom
->end
);
1128 t
->right
= subre(v
, '|', PREF(f
), s2
, atom
->end
);
1130 t
->right
->left
= subre(v
, '=', 0, s2
, atom
->end
);
1137 /* deal with the rest of the quantifier */
1138 if (atomtype
== BACKREF
)
1140 /* special case: backrefs have internal quantifiers */
1141 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1142 /* just stuff everything into atom */
1143 repeat(v
, atom
->begin
, atom
->end
, m
, n
);
1144 atom
->min
= (short) m
;
1145 atom
->max
= (short) n
;
1146 atom
->flags
|= COMBINE(qprefer
, atom
->flags
);
1148 else if (m
== 1 && n
== 1)
1150 /* no/vacuous quantifier: done */
1151 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1155 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1156 /* parens in only second x */
1157 dupnfa(v
->nfa
, atom
->begin
, atom
->end
, s
, atom
->begin
);
1158 assert(m
>= 1 && m
!= INFINITY
&& n
>= 1);
1159 repeat(v
, s
, atom
->begin
, m
- 1, (n
== INFINITY
) ? n
: n
- 1);
1160 f
= COMBINE(qprefer
, atom
->flags
);
1161 t
= subre(v
, '.', f
, s
, atom
->end
); /* prefix and atom */
1163 t
->left
= subre(v
, '=', PREF(f
), s
, atom
->begin
);
1169 /* and finally, look after that postponed recursion */
1171 if (!(SEE('|') || SEE(stopper
) || SEE(EOS
)))
1172 t
->right
= parsebranch(v
, stopper
, type
, atom
->end
, rp
, 1);
1175 EMPTYARC(atom
->end
, rp
);
1176 t
->right
= subre(v
, '=', 0, atom
->end
, rp
);
1178 assert(SEE('|') || SEE(stopper
) || SEE(EOS
));
1179 t
->flags
|= COMBINE(t
->flags
, t
->right
->flags
);
1180 top
->flags
|= COMBINE(top
->flags
, t
->flags
);
1184 * nonword - generate arcs for non-word-character ahead or behind
1187 nonword(struct vars
* v
,
1188 int dir
, /* AHEAD or BEHIND */
1192 int anchor
= (dir
== AHEAD
) ? '$' : '^';
1194 assert(dir
== AHEAD
|| dir
== BEHIND
);
1195 newarc(v
->nfa
, anchor
, 1, lp
, rp
);
1196 newarc(v
->nfa
, anchor
, 0, lp
, rp
);
1197 colorcomplement(v
->nfa
, v
->cm
, dir
, v
->wordchrs
, lp
, rp
);
1198 /* (no need for special attention to \n) */
1202 * word - generate arcs for word character ahead or behind
1205 word(struct vars
* v
,
1206 int dir
, /* AHEAD or BEHIND */
1210 assert(dir
== AHEAD
|| dir
== BEHIND
);
1211 cloneouts(v
->nfa
, v
->wordchrs
, lp
, rp
, dir
);
1212 /* (no need for special attention to \n) */
1216 * scannum - scan a number
1218 static int /* value, <= DUPMAX */
1219 scannum(struct vars
* v
)
1223 while (SEE(DIGIT
) && n
< DUPMAX
)
1225 n
= n
* 10 + v
->nextvalue
;
1228 if (SEE(DIGIT
) || n
> DUPMAX
)
1237 * repeat - replicate subNFA for quantifiers
1239 * The duplication sequences used here are chosen carefully so that any
1240 * pointers starting out pointing into the subexpression end up pointing into
1241 * the last occurrence. (Note that it may not be strung between the same
1242 * left and right end states, however!) This used to be important for the
1243 * subRE tree, although the important bits are now handled by the in-line
1244 * code in parse(), and when this is called, it doesn't matter any more.
1247 repeat(struct vars
* v
,
1255 #define PAIR(x, y) ((x)*4 + (y))
1256 #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1257 const int rm
= REDUCE(m
);
1258 const int rn
= REDUCE(n
);
1262 switch (PAIR(rm
, rn
))
1264 case PAIR(0, 0): /* empty string */
1265 delsub(v
->nfa
, lp
, rp
);
1268 case PAIR(0, 1): /* do as x| */
1271 case PAIR(0, SOME
): /* do as x{1,n}| */
1272 repeat(v
, lp
, rp
, 1, n
);
1276 case PAIR(0, INF
): /* loop x around */
1277 s
= newstate(v
->nfa
);
1279 moveouts(v
->nfa
, lp
, s
);
1280 moveins(v
->nfa
, rp
, s
);
1284 case PAIR(1, 1): /* no action required */
1286 case PAIR(1, SOME
): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1287 s
= newstate(v
->nfa
);
1289 moveouts(v
->nfa
, lp
, s
);
1290 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1292 repeat(v
, lp
, s
, 1, n
- 1);
1296 case PAIR(1, INF
): /* add loopback arc */
1297 s
= newstate(v
->nfa
);
1298 s2
= newstate(v
->nfa
);
1300 moveouts(v
->nfa
, lp
, s
);
1301 moveins(v
->nfa
, rp
, s2
);
1306 case PAIR(SOME
, SOME
): /* do as x{m-1,n-1}x */
1307 s
= newstate(v
->nfa
);
1309 moveouts(v
->nfa
, lp
, s
);
1310 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1312 repeat(v
, lp
, s
, m
- 1, n
- 1);
1314 case PAIR(SOME
, INF
): /* do as x{m-1,}x */
1315 s
= newstate(v
->nfa
);
1317 moveouts(v
->nfa
, lp
, s
);
1318 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1320 repeat(v
, lp
, s
, m
- 1, n
);
1329 * bracket - handle non-complemented bracket expression
1330 * Also called from cbracket for complemented bracket expressions.
1333 bracket(struct vars
* v
,
1339 while (!SEE(']') && !SEE(EOS
))
1340 brackpart(v
, lp
, rp
);
1341 assert(SEE(']') || ISERR());
1342 okcolors(v
->nfa
, v
->cm
);
1346 * cbracket - handle complemented bracket expression
1347 * We do it by calling bracket() with dummy endpoints, and then complementing
1348 * the result. The alternative would be to invoke rainbow(), and then delete
1349 * arcs as the b.e. is seen... but that gets messy.
1352 cbracket(struct vars
* v
,
1356 struct state
*left
= newstate(v
->nfa
);
1357 struct state
*right
= newstate(v
->nfa
);
1359 struct arc
*a
; /* arc from lp */
1360 struct arc
*ba
; /* arc from left, from bracket() */
1361 struct arc
*pa
; /* MCCE-prototype arc */
1367 bracket(v
, left
, right
);
1368 if (v
->cflags
& REG_NLSTOP
)
1369 newarc(v
->nfa
, PLAIN
, v
->nlcolor
, left
, right
);
1372 assert(lp
->nouts
== 0); /* all outarcs will be ours */
1374 /* easy part of complementing */
1375 colorcomplement(v
->nfa
, v
->cm
, PLAIN
, left
, lp
, rp
);
1377 if (v
->mcces
== NULL
)
1378 { /* no MCCEs -- we're done */
1379 dropstate(v
->nfa
, left
);
1380 assert(right
->nins
== 0);
1381 freestate(v
->nfa
, right
);
1385 /* but complementing gets messy in the presence of MCCEs... */
1387 for (p
= v
->mcces
->chrs
, i
= v
->mcces
->nchrs
; i
> 0; p
++, i
--)
1389 co
= GETCOLOR(v
->cm
, *p
);
1390 a
= findarc(lp
, PLAIN
, co
);
1391 ba
= findarc(left
, PLAIN
, co
);
1399 s
= newstate(v
->nfa
);
1401 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1403 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1406 { /* easy case, need all of them */
1407 cloneouts(v
->nfa
, pa
->to
, s
, rp
, PLAIN
);
1408 newarc(v
->nfa
, '$', 1, s
, rp
);
1409 newarc(v
->nfa
, '$', 0, s
, rp
);
1410 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
, s
, rp
);
1413 { /* must be selective */
1414 if (findarc(ba
->to
, '$', 1) == NULL
)
1416 newarc(v
->nfa
, '$', 1, s
, rp
);
1417 newarc(v
->nfa
, '$', 0, s
, rp
);
1418 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
,
1421 for (pa
= pa
->to
->outs
; pa
!= NULL
; pa
= pa
->outchain
)
1422 if (findarc(ba
->to
, PLAIN
, pa
->co
) == NULL
)
1423 newarc(v
->nfa
, PLAIN
, pa
->co
, s
, rp
);
1424 if (s
->nouts
== 0) /* limit of selectivity: none */
1425 dropstate(v
->nfa
, s
); /* frees arc too */
1430 delsub(v
->nfa
, left
, right
);
1431 assert(left
->nouts
== 0);
1432 freestate(v
->nfa
, left
);
1433 assert(right
->nins
== 0);
1434 freestate(v
->nfa
, right
);
1438 * brackpart - handle one item (or range) within a bracket expression
1441 brackpart(struct vars
* v
,
1452 /* parse something, get rid of special cases, take shortcuts */
1453 switch (v
->nexttype
)
1455 case RANGE
: /* a-b-c or other botch */
1460 c
[0] = v
->nextvalue
;
1462 /* shortcut for ordinary chr (not range, not MCCE leader) */
1463 if (!SEE(RANGE
) && !ISCELEADER(v
, c
[0]))
1465 onechr(v
, c
[0], lp
, rp
);
1468 startc
= element(v
, c
, c
+ 1);
1473 endp
= scanplain(v
);
1474 INSIST(startp
< endp
, REG_ECOLLATE
);
1476 startc
= element(v
, startp
, endp
);
1481 endp
= scanplain(v
);
1482 INSIST(startp
< endp
, REG_ECOLLATE
);
1484 startc
= element(v
, startp
, endp
);
1486 cv
= eclass(v
, startc
, (v
->cflags
& REG_ICASE
));
1488 dovec(v
, cv
, lp
, rp
);
1493 endp
= scanplain(v
);
1494 INSIST(startp
< endp
, REG_ECTYPE
);
1496 cv
= cclass(v
, startp
, endp
, (v
->cflags
& REG_ICASE
));
1498 dovec(v
, cv
, lp
, rp
);
1510 switch (v
->nexttype
)
1514 c
[0] = v
->nextvalue
;
1516 endc
= element(v
, c
, c
+ 1);
1521 endp
= scanplain(v
);
1522 INSIST(startp
< endp
, REG_ECOLLATE
);
1524 endc
= element(v
, startp
, endp
);
1537 * Ranges are unportable. Actually, standard C does guarantee that
1538 * digits are contiguous, but making that an exception is just too
1543 cv
= range(v
, startc
, endc
, (v
->cflags
& REG_ICASE
));
1545 dovec(v
, cv
, lp
, rp
);
1549 * scanplain - scan PLAIN contents of [. etc.
1551 * Certain bits of trickery in lex.c know that this code does not try
1552 * to look past the final bracket of the [. etc.
1554 static chr
* /* just after end of sequence */
1555 scanplain(struct vars
* v
)
1559 assert(SEE(COLLEL
) || SEE(ECLASS
) || SEE(CCLASS
));
1569 assert(SEE(END
) || ISERR());
1576 * leaders - process a cvec of collating elements to also include leaders
1577 * Also gives all characters involved their own colors, which is almost
1578 * certainly necessary, and sets up little disconnected subNFA.
1581 leaders(struct vars
* v
,
1590 v
->mccepbegin
= newstate(v
->nfa
);
1591 v
->mccepend
= newstate(v
->nfa
);
1594 for (mcce
= 0; mcce
< cv
->nmcces
; mcce
++)
1596 p
= cv
->mcces
[mcce
];
1598 if (!haschr(cv
, leader
))
1601 s
= newstate(v
->nfa
);
1602 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, leader
),
1604 okcolors(v
->nfa
, v
->cm
);
1608 a
= findarc(v
->mccepbegin
, PLAIN
,
1609 GETCOLOR(v
->cm
, leader
));
1612 assert(s
!= v
->mccepend
);
1615 assert(*p
!= 0 && *(p
+ 1) == 0); /* only 2-char MCCEs for
1617 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, *p
), s
, v
->mccepend
);
1618 okcolors(v
->nfa
, v
->cm
);
1623 * onechr - fill in arcs for a plain character, and possible case complements
1624 * This is mostly a shortcut for efficient handling of the common case.
1627 onechr(struct vars
* v
,
1632 if (!(v
->cflags
& REG_ICASE
))
1634 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, c
), lp
, rp
);
1638 /* rats, need general case anyway... */
1639 dovec(v
, allcases(v
, c
), lp
, rp
);
1643 * dovec - fill in arcs for each element of a cvec
1644 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1647 dovec(struct vars
* v
,
1661 struct arc
*pa
; /* arc in prototype */
1663 struct state
*ps
; /* state in prototype */
1665 /* need a place to store leaders, if any */
1668 assert(v
->mcces
!= NULL
);
1669 if (v
->cv2
== NULL
|| v
->cv2
->nchrs
< v
->mcces
->nchrs
)
1673 v
->cv2
= newcvec(v
->mcces
->nchrs
, 0, v
->mcces
->nmcces
);
1678 leads
= clearcvec(v
->cv2
);
1683 /* first, get the ordinary characters out of the way */
1684 for (p
= cv
->chrs
, i
= cv
->nchrs
; i
> 0; p
++, i
--)
1687 if (!ISCELEADER(v
, ch
))
1688 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, ch
), lp
, rp
);
1691 assert(singleton(v
->cm
, ch
));
1692 assert(leads
!= NULL
);
1693 if (!haschr(leads
, ch
))
1698 /* and the ranges */
1699 for (p
= cv
->ranges
, i
= cv
->nranges
; i
> 0; p
+= 2, i
--)
1703 while (from
<= to
&& (ce
= nextleader(v
, from
, to
)) != NOCELT
)
1706 subrange(v
, from
, ce
- 1, lp
, rp
);
1707 assert(singleton(v
->cm
, ce
));
1708 assert(leads
!= NULL
);
1709 if (!haschr(leads
, ce
))
1714 subrange(v
, from
, to
, lp
, rp
);
1717 if ((leads
== NULL
|| leads
->nchrs
== 0) && cv
->nmcces
== 0)
1720 /* deal with the MCCE leaders */
1722 for (p
= leads
->chrs
, i
= leads
->nchrs
; i
> 0; p
++, i
--)
1724 co
= GETCOLOR(v
->cm
, *p
);
1725 a
= findarc(lp
, PLAIN
, co
);
1730 s
= newstate(v
->nfa
);
1732 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1735 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1738 newarc(v
->nfa
, '$', 1, s
, rp
);
1739 newarc(v
->nfa
, '$', 0, s
, rp
);
1740 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, ps
, s
, rp
);
1745 for (i
= 0; i
< cv
->nmcces
; i
++)
1748 assert(singleton(v
->cm
, *p
));
1749 if (!singleton(v
->cm
, *p
))
1755 co
= GETCOLOR(v
->cm
, ch
);
1756 a
= findarc(lp
, PLAIN
, co
);
1761 s
= newstate(v
->nfa
);
1763 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1766 assert(*p
!= 0); /* at least two chars */
1767 assert(singleton(v
->cm
, *p
));
1769 co
= GETCOLOR(v
->cm
, ch
);
1770 assert(*p
== 0); /* and only two, for now */
1771 newarc(v
->nfa
, PLAIN
, co
, s
, rp
);
1777 * nextleader - find next MCCE leader within range
1779 static celt
/* NOCELT means none */
1780 nextleader(struct vars
* v
,
1789 if (v
->mcces
== NULL
)
1792 for (i
= v
->mcces
->nchrs
, p
= v
->mcces
->chrs
; i
> 0; i
--, p
++)
1795 if (from
<= ch
&& ch
<= to
)
1796 if (it
== NOCELT
|| ch
< it
)
1803 * wordchrs - set up word-chr list for word-boundary stuff, if needed
1805 * The list is kept as a bunch of arcs between two dummy states; it's
1806 * disposed of by the unreachable-states sweep in NFA optimization.
1807 * Does NEXT(). Must not be called from any unusual lexical context.
1808 * This should be reconciled with the \w etc. handling in lex.c, and
1809 * should be cleaned up to reduce dependencies on input scanning.
1812 wordchrs(struct vars
* v
)
1815 struct state
*right
;
1817 if (v
->wordchrs
!= NULL
)
1819 NEXT(); /* for consistency */
1823 left
= newstate(v
->nfa
);
1824 right
= newstate(v
->nfa
);
1826 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1829 assert(v
->savenow
!= NULL
&& SEE('['));
1830 bracket(v
, left
, right
);
1831 assert((v
->savenow
!= NULL
&& SEE(']')) || ISERR());
1838 * subre - allocate a subre
1840 static struct subre
*
1841 subre(struct vars
* v
,
1844 struct state
* begin
,
1851 v
->treefree
= ret
->left
;
1854 ret
= (struct subre
*) MALLOC(sizeof(struct subre
));
1860 ret
->chain
= v
->treechain
;
1864 assert(strchr("|.b(=", op
) != NULL
);
1870 ret
->min
= ret
->max
= 1;
1881 * freesubre - free a subRE subtree
1884 freesubre(struct vars
* v
, /* might be NULL */
1890 if (sr
->left
!= NULL
)
1891 freesubre(v
, sr
->left
);
1892 if (sr
->right
!= NULL
)
1893 freesubre(v
, sr
->right
);
1899 * freesrnode - free one node in a subRE subtree
1902 freesrnode(struct vars
* v
, /* might be NULL */
1908 if (!NULLCNFA(sr
->cnfa
))
1909 freecnfa(&sr
->cnfa
);
1914 sr
->left
= v
->treefree
;
1922 * optst - optimize a subRE subtree
1925 optst(struct vars
* v
,
1931 /* recurse through children */
1932 if (t
->left
!= NULL
)
1934 if (t
->right
!= NULL
)
1939 * numst - number tree nodes (assigning retry indexes)
1941 static int /* next number */
1942 numst(struct subre
* t
,
1943 int start
) /* starting point for subtree numbers */
1950 t
->retry
= (short) i
++;
1951 if (t
->left
!= NULL
)
1952 i
= numst(t
->left
, i
);
1953 if (t
->right
!= NULL
)
1954 i
= numst(t
->right
, i
);
1959 * markst - mark tree nodes as INUSE
1962 markst(struct subre
* t
)
1967 if (t
->left
!= NULL
)
1969 if (t
->right
!= NULL
)
1974 * cleanst - free any tree nodes not marked INUSE
1977 cleanst(struct vars
* v
)
1982 for (t
= v
->treechain
; t
!= NULL
; t
= next
)
1985 if (!(t
->flags
& INUSE
))
1988 v
->treechain
= NULL
;
1989 v
->treefree
= NULL
; /* just on general principles */
1993 * nfatree - turn a subRE subtree into a tree of compacted NFAs
1995 static long /* optimize results from top node */
1996 nfatree(struct vars
* v
,
1998 FILE *f
) /* for debug output */
2000 assert(t
!= NULL
&& t
->begin
!= NULL
);
2002 if (t
->left
!= NULL
)
2003 (DISCARD
) nfatree(v
, t
->left
, f
);
2004 if (t
->right
!= NULL
)
2005 (DISCARD
) nfatree(v
, t
->right
, f
);
2007 return nfanode(v
, t
, f
);
2011 * nfanode - do one NFA for nfatree
2013 static long /* optimize results */
2014 nfanode(struct vars
* v
,
2016 FILE *f
) /* for debug output */
2021 assert(t
->begin
!= NULL
);
2028 fprintf(f
, "\n\n\n========= TREE NODE %s ==========\n",
2029 stid(t
, idbuf
, sizeof(idbuf
)));
2032 nfa
= newnfa(v
, v
->cm
, v
->nfa
);
2034 dupnfa(nfa
, t
->begin
, t
->end
, nfa
->init
, nfa
->final
);
2038 ret
= optimize(nfa
, f
);
2041 compact(nfa
, &t
->cnfa
);
2048 * newlacon - allocate a lookahead-constraint subRE
2050 static int /* lacon number */
2051 newlacon(struct vars
* v
,
2052 struct state
* begin
,
2059 if (v
->nlacons
== 0)
2061 v
->lacons
= (struct subre
*) MALLOC(2 * sizeof(struct subre
));
2062 n
= 1; /* skip 0th */
2067 v
->lacons
= (struct subre
*) REALLOC(v
->lacons
,
2068 (v
->nlacons
+ 1) * sizeof(struct subre
));
2071 if (v
->lacons
== NULL
)
2076 sub
= &v
->lacons
[n
];
2085 * freelacons - free lookahead-constraint subRE vector
2088 freelacons(struct subre
* subs
,
2095 for (sub
= subs
+ 1, i
= n
- 1; i
> 0; sub
++, i
--) /* no 0th */
2096 if (!NULLCNFA(sub
->cnfa
))
2097 freecnfa(&sub
->cnfa
);
2102 * rfree - free a whole RE (insides of regfree)
2109 if (re
== NULL
|| re
->re_magic
!= REMAGIC
)
2112 re
->re_magic
= 0; /* invalidate RE */
2113 g
= (struct guts
*) re
->re_guts
;
2118 if (g
->tree
!= NULL
)
2119 freesubre((struct vars
*) NULL
, g
->tree
);
2120 if (g
->lacons
!= NULL
)
2121 freelacons(g
->lacons
, g
->nlacons
);
2122 if (!NULLCNFA(g
->search
))
2123 freecnfa(&g
->search
);
2130 * dump - dump an RE in human-readable form
2139 if (re
->re_magic
!= REMAGIC
)
2140 fprintf(f
, "bad magic number (0x%x not 0x%x)\n", re
->re_magic
,
2142 if (re
->re_guts
== NULL
)
2144 fprintf(f
, "NULL guts!!!\n");
2147 g
= (struct guts
*) re
->re_guts
;
2148 if (g
->magic
!= GUTSMAGIC
)
2149 fprintf(f
, "bad guts magic number (0x%x not 0x%x)\n", g
->magic
,
2152 fprintf(f
, "\n\n\n========= DUMP ==========\n");
2153 fprintf(f
, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2154 re
->re_nsub
, re
->re_info
, re
->re_csize
, g
->ntree
);
2156 dumpcolors(&g
->cmap
, f
);
2157 if (!NULLCNFA(g
->search
))
2159 printf("\nsearch:\n");
2160 dumpcnfa(&g
->search
, f
);
2162 for (i
= 1; i
< g
->nlacons
; i
++)
2164 fprintf(f
, "\nla%d (%s):\n", i
,
2165 (g
->lacons
[i
].subno
) ? "positive" : "negative");
2166 dumpcnfa(&g
->lacons
[i
].cnfa
, f
);
2169 dumpst(g
->tree
, f
, 0);
2173 * dumpst - dump a subRE tree
2176 dumpst(struct subre
* t
,
2178 int nfapresent
) /* is the original NFA still around? */
2181 fprintf(f
, "null tree\n");
2183 stdump(t
, f
, nfapresent
);
2188 * stdump - recursive guts of dumpst
2191 stdump(struct subre
* t
,
2193 int nfapresent
) /* is the original NFA still around? */
2197 fprintf(f
, "%s. `%c'", stid(t
, idbuf
, sizeof(idbuf
)), t
->op
);
2198 if (t
->flags
& LONGER
)
2199 fprintf(f
, " longest");
2200 if (t
->flags
& SHORTER
)
2201 fprintf(f
, " shortest");
2202 if (t
->flags
& MIXED
)
2203 fprintf(f
, " hasmixed");
2205 fprintf(f
, " hascapture");
2206 if (t
->flags
& BACKR
)
2207 fprintf(f
, " hasbackref");
2208 if (!(t
->flags
& INUSE
))
2209 fprintf(f
, " UNUSED");
2211 fprintf(f
, " (#%d)", t
->subno
);
2212 if (t
->min
!= 1 || t
->max
!= 1)
2214 fprintf(f
, " {%d,", t
->min
);
2215 if (t
->max
!= INFINITY
)
2216 fprintf(f
, "%d", t
->max
);
2220 fprintf(f
, " %ld-%ld", (long) t
->begin
->no
, (long) t
->end
->no
);
2221 if (t
->left
!= NULL
)
2222 fprintf(f
, " L:%s", stid(t
->left
, idbuf
, sizeof(idbuf
)));
2223 if (t
->right
!= NULL
)
2224 fprintf(f
, " R:%s", stid(t
->right
, idbuf
, sizeof(idbuf
)));
2225 if (!NULLCNFA(t
->cnfa
))
2228 dumpcnfa(&t
->cnfa
, f
);
2231 if (t
->left
!= NULL
)
2232 stdump(t
->left
, f
, nfapresent
);
2233 if (t
->right
!= NULL
)
2234 stdump(t
->right
, f
, nfapresent
);
2238 * stid - identify a subtree node for dumping
2240 static char * /* points to buf or constant string */
2241 stid(struct subre
* t
,
2245 /* big enough for hex int or decimal t->retry? */
2246 if (bufsize
< sizeof(int) * 2 + 3 || bufsize
< sizeof(t
->retry
) * 3 + 1)
2249 sprintf(buf
, "%d", t
->retry
);
2251 sprintf(buf
, "0x%x", (int) t
); /* may lose bits, that's okay */
2254 #endif /* REG_DEBUG */
2257 #include "regc_lex.c"
2258 #include "regc_color.c"
2259 #include "regc_nfa.c"
2260 #include "regc_cvec.c"
2261 #include "regc_locale.c"