]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regcomp.c
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $
38 * forward declarations, up here so forward datatypes etc. are defined early
40 /* === regcomp.c === */
41 static void moresubs(struct vars
*, int);
42 static int freev(struct vars
*, int);
43 static void makesearch(struct vars
*, struct nfa
*);
44 static struct subre
*parse(struct vars
*, int, int, struct state
*, struct state
*);
45 static struct subre
*parsebranch(struct vars
*, int, int, struct state
*, struct state
*, int);
46 static void parseqatom(struct vars
*, int, int, struct state
*, struct state
*, struct subre
*);
47 static void nonword(struct vars
*, int, struct state
*, struct state
*);
48 static void word(struct vars
*, int, struct state
*, struct state
*);
49 static int scannum(struct vars
*);
50 static void repeat(struct vars
*, struct state
*, struct state
*, int, int);
51 static void bracket(struct vars
*, struct state
*, struct state
*);
52 static void cbracket(struct vars
*, struct state
*, struct state
*);
53 static void brackpart(struct vars
*, struct state
*, struct state
*);
54 static chr
*scanplain(struct vars
*);
55 static void leaders(struct vars
*, struct cvec
*);
56 static void onechr(struct vars
*, chr
, struct state
*, struct state
*);
57 static void dovec(struct vars
*, struct cvec
*, struct state
*, struct state
*);
58 static celt
nextleader(struct vars
*, chr
, chr
);
59 static void wordchrs(struct vars
*);
60 static struct subre
*subre(struct vars
*, int, int, struct state
*, struct state
*);
61 static void freesubre(struct vars
*, struct subre
*);
62 static void freesrnode(struct vars
*, struct subre
*);
63 static void optst(struct vars
*, struct subre
*);
64 static int numst(struct subre
*, int);
65 static void markst(struct subre
*);
66 static void cleanst(struct vars
*);
67 static long nfatree(struct vars
*, struct subre
*, FILE *);
68 static long nfanode(struct vars
*, struct subre
*, FILE *);
69 static int newlacon(struct vars
*, struct state
*, struct state
*, int);
70 static void freelacons(struct subre
*, int);
71 static void rfree(regex_t
*);
74 static void dump(regex_t
*, FILE *);
75 static void dumpst(struct subre
*, FILE *, int);
76 static void stdump(struct subre
*, FILE *, int);
77 static char *stid(struct subre
*, char *, size_t);
79 /* === regc_lex.c === */
80 static void lexstart(struct vars
*);
81 static void prefixes(struct vars
*);
82 static void lexnest(struct vars
*, chr
*, chr
*);
83 static void lexword(struct vars
*);
84 static int next(struct vars
*);
85 static int lexescape(struct vars
*);
86 static chr
lexdigits(struct vars
*, int, int, int);
87 static int brenext(struct vars
*, chr
);
88 static void skip(struct vars
*);
89 static chr
newline(void);
90 static chr
chrnamed(struct vars
*, chr
*, chr
*, chr
);
92 /* === regc_color.c === */
93 static void initcm(struct vars
*, struct colormap
*);
94 static void freecm(struct colormap
*);
95 static void cmtreefree(struct colormap
*, union tree
*, int);
96 static color
setcolor(struct colormap
*, chr
, pcolor
);
97 static color
maxcolor(struct colormap
*);
98 static color
newcolor(struct colormap
*);
99 static void freecolor(struct colormap
*, pcolor
);
100 static color
pseudocolor(struct colormap
*);
101 static color
subcolor(struct colormap
*, chr c
);
102 static color
newsub(struct colormap
*, pcolor
);
103 static void subrange(struct vars
*, chr
, chr
, struct state
*, struct state
*);
104 static void subblock(struct vars
*, chr
, struct state
*, struct state
*);
105 static void okcolors(struct nfa
*, struct colormap
*);
106 static void colorchain(struct colormap
*, struct arc
*);
107 static void uncolorchain(struct colormap
*, struct arc
*);
108 static int singleton(struct colormap
*, chr c
);
109 static void rainbow(struct nfa
*, struct colormap
*, int, pcolor
, struct state
*, struct state
*);
110 static void colorcomplement(struct nfa
*, struct colormap
*, int, struct state
*, struct state
*, struct state
*);
113 static void dumpcolors(struct colormap
*, FILE *);
114 static void fillcheck(struct colormap
*, union tree
*, int, FILE *);
115 static void dumpchr(chr
, FILE *);
117 /* === regc_nfa.c === */
118 static struct nfa
*newnfa(struct vars
*, struct colormap
*, struct nfa
*);
119 static void freenfa(struct nfa
*);
120 static struct state
*newstate(struct nfa
*);
121 static struct state
*newfstate(struct nfa
*, int flag
);
122 static void dropstate(struct nfa
*, struct state
*);
123 static void freestate(struct nfa
*, struct state
*);
124 static void destroystate(struct nfa
*, struct state
*);
125 static void newarc(struct nfa
*, int, pcolor
, struct state
*, struct state
*);
126 static struct arc
*allocarc(struct nfa
*, struct state
*);
127 static void freearc(struct nfa
*, struct arc
*);
128 static struct arc
*findarc(struct state
*, int, pcolor
);
129 static void cparc(struct nfa
*, struct arc
*, struct state
*, struct state
*);
130 static void moveins(struct nfa
*, struct state
*, struct state
*);
131 static void copyins(struct nfa
*, struct state
*, struct state
*);
132 static void moveouts(struct nfa
*, struct state
*, struct state
*);
133 static void copyouts(struct nfa
*, struct state
*, struct state
*);
134 static void cloneouts(struct nfa
*, struct state
*, struct state
*, struct state
*, int);
135 static void delsub(struct nfa
*, struct state
*, struct state
*);
136 static void deltraverse(struct nfa
*, struct state
*, struct state
*);
137 static void dupnfa(struct nfa
*, struct state
*, struct state
*, struct state
*, struct state
*);
138 static void duptraverse(struct nfa
*, struct state
*, struct state
*);
139 static void cleartraverse(struct nfa
*, struct state
*);
140 static void specialcolors(struct nfa
*);
141 static long optimize(struct nfa
*, FILE *);
142 static void pullback(struct nfa
*, FILE *);
143 static int pull(struct nfa
*, struct arc
*);
144 static void pushfwd(struct nfa
*, FILE *);
145 static int push(struct nfa
*, struct arc
*);
147 #define INCOMPATIBLE 1 /* destroys arc */
148 #define SATISFIED 2 /* constraint satisfied */
149 #define COMPATIBLE 3 /* compatible but not satisfied yet */
150 static int combine(struct arc
*, struct arc
*);
151 static void fixempties(struct nfa
*, FILE *);
152 static int unempty(struct nfa
*, struct arc
*);
153 static void cleanup(struct nfa
*);
154 static void markreachable(struct nfa
*, struct state
*, struct state
*, struct state
*);
155 static void markcanreach(struct nfa
*, struct state
*, struct state
*, struct state
*);
156 static long analyze(struct nfa
*);
157 static void compact(struct nfa
*, struct cnfa
*);
158 static void carcsort(struct carc
*, struct carc
*);
159 static void freecnfa(struct cnfa
*);
160 static void dumpnfa(struct nfa
*, FILE *);
163 static void dumpstate(struct state
*, FILE *);
164 static void dumparcs(struct state
*, FILE *);
165 static int dumprarcs(struct arc
*, struct state
*, FILE *, int);
166 static void dumparc(struct arc
*, struct state
*, FILE *);
167 static void dumpcnfa(struct cnfa
*, FILE *);
168 static void dumpcstate(int, struct carc
*, struct cnfa
*, FILE *);
170 /* === regc_cvec.c === */
171 static struct cvec
*newcvec(int, int, int);
172 static struct cvec
*clearcvec(struct cvec
*);
173 static void addchr(struct cvec
*, chr
);
174 static void addrange(struct cvec
*, chr
, chr
);
175 static void addmcce(struct cvec
*, chr
*, chr
*);
176 static int haschr(struct cvec
*, chr
);
177 static struct cvec
*getcvec(struct vars
*, int, int, int);
178 static void freecvec(struct cvec
*);
180 /* === regc_locale.c === */
181 extern int wx_isdigit(wx_wchar c
);
182 extern int wx_isalpha(wx_wchar c
);
183 extern int wx_isalnum(wx_wchar c
);
184 extern int wx_isupper(wx_wchar c
);
185 extern int wx_islower(wx_wchar c
);
186 extern int wx_isgraph(wx_wchar c
);
187 extern int wx_ispunct(wx_wchar c
);
188 extern int wx_isspace(wx_wchar c
);
189 extern wx_wchar
wx_toupper(wx_wchar c
);
190 extern wx_wchar
wx_tolower(wx_wchar c
);
191 static int nmcces(struct vars
*);
192 static int nleaders(struct vars
*);
193 static struct cvec
*allmcces(struct vars
*, struct cvec
*);
194 static celt
element(struct vars
*, chr
*, chr
*);
195 static struct cvec
*range(struct vars
*, celt
, celt
, int);
196 static int before(celt
, celt
);
197 static struct cvec
*eclass(struct vars
*, celt
, int);
198 static struct cvec
*cclass(struct vars
*, chr
*, chr
*, int);
199 static struct cvec
*allcases(struct vars
*, chr
);
200 static int cmp(const chr
*, const chr
*, size_t);
201 static int casecmp(const chr
*, const chr
*, size_t);
204 /* internal variables, bundled for easy passing around */
208 chr
*now
; /* scan pointer into string */
209 chr
*stop
; /* end of string */
210 chr
*savenow
; /* saved now and stop for "subroutine
213 int err
; /* error code (0 if none) */
214 int cflags
; /* copy of compile flags */
215 int lasttype
; /* type of previous token */
216 int nexttype
; /* type of next token */
217 chr nextvalue
; /* value (if any) of next token */
218 int lexcon
; /* lexical context type (see lex.c) */
219 int nsubexp
; /* subexpression count */
220 struct subre
**subs
; /* subRE pointer vector */
221 size_t nsubs
; /* length of vector */
222 struct subre
*sub10
[10]; /* initial vector, enough for most */
223 struct nfa
*nfa
; /* the NFA */
224 struct colormap
*cm
; /* character color map */
225 color nlcolor
; /* color of newline */
226 struct state
*wordchrs
; /* state in nfa holding word-char outarcs */
227 struct subre
*tree
; /* subexpression tree */
228 struct subre
*treechain
; /* all tree nodes allocated */
229 struct subre
*treefree
; /* any free tree nodes */
230 int ntree
; /* number of tree nodes */
231 struct cvec
*cv
; /* interface cvec */
232 struct cvec
*cv2
; /* utility cvec */
233 struct cvec
*mcces
; /* collating-element information */
234 #define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
235 struct state
*mccepbegin
; /* in nfa, start of MCCE prototypes */
236 struct state
*mccepend
; /* in nfa, end of MCCE prototypes */
237 struct subre
*lacons
; /* lookahead-constraint vector */
238 int nlacons
; /* size of lacons */
241 /* parsing macros; most know that `v' is the struct vars pointer */
242 #define NEXT() (next(v)) /* advance by one token */
243 #define SEE(t) (v->nexttype == (t)) /* is next token this? */
244 #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
245 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
246 #define ISERR() VISERR(v)
247 #define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
249 #define ERR(e) VERR(v, e) /* record an error */
250 #define NOERR() {if (ISERR()) return;} /* if error seen, return */
251 #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
252 #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
253 #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false,
255 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
256 #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
258 /* token type codes, some also used as NFA arc types */
259 #define EMPTY 'n' /* no token present */
260 #define EOS 'e' /* end of string */
261 #define PLAIN 'p' /* ordinary character */
262 #define DIGIT 'd' /* digit (in bound) */
263 #define BACKREF 'b' /* back reference */
264 #define COLLEL 'I' /* start of [. */
265 #define ECLASS 'E' /* start of [= */
266 #define CCLASS 'C' /* start of [: */
267 #define END 'X' /* end of [. [= [: */
268 #define RANGE 'R' /* - within [] which might be range delim. */
269 #define LACON 'L' /* lookahead constraint subRE */
270 #define AHEAD 'a' /* color-lookahead arc */
271 #define BEHIND 'r' /* color-lookbehind arc */
272 #define WBDRY 'w' /* word boundary constraint */
273 #define NWBDRY 'W' /* non-word-boundary constraint */
274 #define SBEGIN 'A' /* beginning of string (even if not BOL) */
275 #define SEND 'Z' /* end of string (even if not EOL) */
276 #define PREFER 'P' /* length preference */
278 /* is an arc colored, and hence on a color chain? */
279 #define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
284 /* static function list */
285 static struct fns functions
= {
286 rfree
, /* regfree insides */
292 * regcomp - compile regular expression
299 return wx_regcomp(re
, string
, wx_strlen(string
), flags
);
302 wx_regcomp(regex_t
*re
,
308 struct vars
*v
= &var
;
314 FILE *debug
= (flags
& REG_PROGRESS
) ? stdout
: (FILE *) NULL
;
317 FILE *debug
= (FILE *) NULL
;
320 #define CNOERR() { if (ISERR()) return freev(v, v->err); }
324 if (re
== NULL
|| string
== NULL
)
326 if ((flags
& REG_QUOTE
) &&
327 (flags
& (REG_ADVANCED
| REG_EXPANDED
| REG_NEWLINE
)))
329 if (!(flags
& REG_EXTENDED
) && (flags
& REG_ADVF
))
332 /* initial setup (after which freev() is callable) */
334 v
->now
= (chr
*) string
;
335 v
->stop
= v
->now
+ len
;
336 v
->savenow
= v
->savestop
= NULL
;
342 for (j
= 0; j
< v
->nsubs
; j
++)
346 v
->nlcolor
= COLORLESS
;
356 re
->re_magic
= REMAGIC
;
357 re
->re_info
= 0; /* bits get set during parse */
358 re
->re_csize
= sizeof(chr
);
360 re
->re_fns
= VS(&functions
);
362 /* more complex setup, malloced things */
363 re
->re_guts
= VS(MALLOC(sizeof(struct guts
)));
364 if (re
->re_guts
== NULL
)
365 return freev(v
, REG_ESPACE
);
366 g
= (struct guts
*) re
->re_guts
;
373 v
->nfa
= newnfa(v
, v
->cm
, (struct nfa
*) NULL
);
375 v
->cv
= newcvec(100, 20, 10);
377 return freev(v
, REG_ESPACE
);
381 v
->mcces
= newcvec(nleaders(v
), 0, i
);
383 v
->mcces
= allmcces(v
, v
->mcces
);
384 leaders(v
, v
->mcces
);
385 addmcce(v
->mcces
, (chr
*) NULL
, (chr
*) NULL
); /* dummy */
390 lexstart(v
); /* also handles prefixes */
391 if ((v
->cflags
& REG_NLSTOP
) || (v
->cflags
& REG_NLANCH
))
393 /* assign newline a unique color */
394 v
->nlcolor
= subcolor(v
->cm
, newline());
395 okcolors(v
->nfa
, v
->cm
);
398 v
->tree
= parse(v
, EOS
, PLAIN
, v
->nfa
->init
, v
->nfa
->final
);
399 assert(SEE(EOS
)); /* even if error; ISERR() => SEE(EOS) */
401 assert(v
->tree
!= NULL
);
403 /* finish setup of nfa and its subre tree */
404 specialcolors(v
->nfa
);
409 fprintf(debug
, "\n\n\n========= RAW ==========\n");
410 dumpnfa(v
->nfa
, debug
);
411 dumpst(v
->tree
, debug
, 1);
415 v
->ntree
= numst(v
->tree
, 1);
421 fprintf(debug
, "\n\n\n========= TREE FIXED ==========\n");
422 dumpst(v
->tree
, debug
, 1);
426 /* build compacted NFAs for tree and lacons */
427 re
->re_info
|= nfatree(v
, v
->tree
, debug
);
429 assert(v
->nlacons
== 0 || v
->lacons
!= NULL
);
430 for (i
= 1; i
< v
->nlacons
; i
++)
434 fprintf(debug
, "\n\n\n========= LA%d ==========\n", i
);
436 nfanode(v
, &v
->lacons
[i
], debug
);
439 if (v
->tree
->flags
& SHORTER
)
442 /* build compacted NFAs for tree, lacons, fast search */
445 fprintf(debug
, "\n\n\n========= SEARCH ==========\n");
447 /* can sacrifice main NFA now, so use it as work area */
448 (DISCARD
) optimize(v
->nfa
, debug
);
450 makesearch(v
, v
->nfa
);
452 compact(v
->nfa
, &g
->search
);
455 /* looks okay, package it up */
456 re
->re_nsub
= v
->nsubexp
;
457 v
->re
= NULL
; /* freev no longer frees re */
458 g
->magic
= GUTSMAGIC
;
459 g
->cflags
= v
->cflags
;
460 g
->info
= re
->re_info
;
461 g
->nsub
= re
->re_nsub
;
465 g
->compare
= (v
->cflags
& REG_ICASE
) ? casecmp
: cmp
;
466 g
->lacons
= v
->lacons
;
468 g
->nlacons
= v
->nlacons
;
471 if (flags
& REG_DUMP
)
480 * moresubs - enlarge subRE vector
483 moresubs(struct vars
* v
,
484 int wanted
) /* want enough room for this one */
489 assert(wanted
> 0 && (size_t) wanted
>= v
->nsubs
);
490 n
= (size_t) wanted
*3 / 2 + 1;
492 if (v
->subs
== v
->sub10
)
494 p
= (struct subre
**) MALLOC(n
* sizeof(struct subre
*));
496 memcpy(VS(p
), VS(v
->subs
),
497 v
->nsubs
* sizeof(struct subre
*));
500 p
= (struct subre
**) REALLOC(v
->subs
, n
* sizeof(struct subre
*));
507 for (p
= &v
->subs
[v
->nsubs
]; v
->nsubs
< n
; p
++, v
->nsubs
++)
509 assert(v
->nsubs
== n
);
510 assert((size_t) wanted
< v
->nsubs
);
514 * freev - free vars struct's substructures where necessary
516 * Optionally does error-number setting, and always returns error code
517 * (if any), to make error-handling code terser.
520 freev(struct vars
* v
,
525 if (v
->subs
!= v
->sub10
)
530 freesubre(v
, v
->tree
);
531 if (v
->treechain
!= NULL
)
537 if (v
->mcces
!= NULL
)
539 if (v
->lacons
!= NULL
)
540 freelacons(v
->lacons
, v
->nlacons
);
541 ERR(err
); /* nop if err==0 */
547 * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
548 * NFA must have been optimize()d already.
551 makesearch(struct vars
* v
,
556 struct state
*pre
= nfa
->pre
;
561 /* no loops are needed if it's anchored */
562 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
)
564 assert(a
->type
== PLAIN
);
565 if (a
->co
!= nfa
->bos
[0] && a
->co
!= nfa
->bos
[1])
570 /* add implicit .* in front */
571 rainbow(nfa
, v
->cm
, PLAIN
, COLORLESS
, pre
, pre
);
573 /* and ^* and \A* too -- not always necessary, but harmless */
574 newarc(nfa
, PLAIN
, nfa
->bos
[0], pre
, pre
);
575 newarc(nfa
, PLAIN
, nfa
->bos
[1], pre
, pre
);
579 * Now here's the subtle part. Because many REs have no lookback
580 * constraints, often knowing when you were in the pre state tells you
581 * little; it's the next state(s) that are informative. But some of
582 * them may have other inarcs, i.e. it may be possible to make actual
583 * progress and then return to one of them. We must de-optimize such
584 * cases, splitting each such state into progress and no-progress
588 /* first, make a list of the states */
590 for (a
= pre
->outs
; a
!= NULL
; a
= a
->outchain
)
593 for (b
= s
->ins
; b
!= NULL
; b
= b
->inchain
)
597 { /* must be split */
604 for (s
= slist
; s
!= NULL
; s
= s2
)
607 copyouts(nfa
, s
, s2
);
608 for (a
= s
->ins
; a
!= NULL
; a
= b
)
613 cparc(nfa
, a
, a
->from
, s2
);
618 s
->tmp
= NULL
; /* clean up while we're at it */
623 * parse - parse an RE
625 * This is actually just the top level, which parses a bunch of branches
626 * tied together with '|'. They appear in the tree as the left children
627 * of a chain of '|' subres.
629 static struct subre
*
630 parse(struct vars
* v
,
631 int stopper
, /* EOS or ')' */
632 int type
, /* LACON (lookahead subRE) or PLAIN */
633 struct state
* init
, /* initial state */
634 struct state
* final
) /* final state */
636 struct state
*left
; /* scaffolding for branch */
638 struct subre
*branches
; /* top level */
639 struct subre
*branch
; /* current branch */
640 struct subre
*t
; /* temporary */
641 int firstbranch
; /* is this the first branch? */
643 assert(stopper
== ')' || stopper
== EOS
);
645 branches
= subre(v
, '|', LONGER
, init
, final
);
653 /* need a place to hang it */
654 branch
->right
= subre(v
, '|', LONGER
, init
, final
);
656 branch
= branch
->right
;
659 left
= newstate(v
->nfa
);
660 right
= newstate(v
->nfa
);
662 EMPTYARC(init
, left
);
663 EMPTYARC(right
, final
);
665 branch
->left
= parsebranch(v
, stopper
, type
, left
, right
, 0);
667 branch
->flags
|= UP(branch
->flags
| branch
->left
->flags
);
668 if ((branch
->flags
& ~branches
->flags
) != 0) /* new flags */
669 for (t
= branches
; t
!= branch
; t
= t
->right
)
670 t
->flags
|= branch
->flags
;
672 assert(SEE(stopper
) || SEE(EOS
));
676 assert(stopper
== ')' && SEE(EOS
));
680 /* optimize out simple cases */
681 if (branch
== branches
)
682 { /* only one branch */
683 assert(branch
->right
== NULL
);
686 freesubre(v
, branches
);
689 else if (!MESSY(branches
->flags
))
690 { /* no interesting innards */
691 freesubre(v
, branches
->left
);
692 branches
->left
= NULL
;
693 freesubre(v
, branches
->right
);
694 branches
->right
= NULL
;
702 * parsebranch - parse one branch of an RE
704 * This mostly manages concatenation, working closely with parseqatom().
705 * Concatenated things are bundled up as much as possible, with separate
706 * ',' nodes introduced only when necessary due to substructure.
708 static struct subre
*
709 parsebranch(struct vars
* v
,
710 int stopper
, /* EOS or ')' */
711 int type
, /* LACON (lookahead subRE) or PLAIN */
712 struct state
* left
, /* leftmost state */
713 struct state
* right
, /* rightmost state */
714 int partial
) /* is this only part of a branch? */
716 struct state
*lp
; /* left end of current construct */
717 int seencontent
; /* is there anything in this branch yet? */
722 t
= subre(v
, '=', 0, left
, right
); /* op '=' is tentative */
724 while (!SEE('|') && !SEE(stopper
) && !SEE(EOS
))
727 { /* implicit concat operator */
728 lp
= newstate(v
->nfa
);
730 moveins(v
->nfa
, right
, lp
);
734 /* NB, recursion in parseqatom() may swallow rest of branch */
735 parseqatom(v
, stopper
, type
, lp
, right
, t
);
743 EMPTYARC(left
, right
);
750 * parseqatom - parse one quantified atom or constraint of an RE
752 * The bookkeeping near the end cooperates very closely with parsebranch();
753 * in particular, it contains a recursion that can involve parsing the rest
754 * of the branch, making this function's name somewhat inaccurate.
757 parseqatom(struct vars
* v
,
758 int stopper
, /* EOS or ')' */
759 int type
, /* LACON (lookahead subRE) or PLAIN */
760 struct state
* lp
, /* left state to hang it on */
761 struct state
* rp
, /* right state to hang it on */
762 struct subre
* top
) /* subtree top */
764 struct state
*s
; /* temporaries for new states */
767 #define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
770 struct subre
*atom
; /* atom's subtree */
772 int cap
; /* capturing parens? */
773 int pos
; /* positive lookahead? */
774 int subno
; /* capturing-parens or backref number */
776 int qprefer
; /* quantifier short/long preference */
778 struct subre
**atomp
; /* where the pointer to atom is */
780 /* initial bookkeeping */
782 assert(lp
->nouts
== 0); /* must string new code */
783 assert(rp
->nins
== 0); /* between lp and rp */
784 subno
= 0; /* just to shut lint up */
786 /* an atom or constraint... */
787 atomtype
= v
->nexttype
;
790 /* first, constraints, which end by returning */
793 if (v
->cflags
& REG_NLANCH
)
794 ARCV(BEHIND
, v
->nlcolor
);
800 if (v
->cflags
& REG_NLANCH
)
801 ARCV(AHEAD
, v
->nlcolor
);
806 ARCV('^', 1); /* BOL */
807 ARCV('^', 0); /* or BOS */
812 ARCV('$', 1); /* EOL */
813 ARCV('$', 0); /* or EOS */
818 wordchrs(v
); /* does NEXT() */
819 s
= newstate(v
->nfa
);
821 nonword(v
, BEHIND
, lp
, s
);
822 word(v
, AHEAD
, s
, rp
);
826 wordchrs(v
); /* does NEXT() */
827 s
= newstate(v
->nfa
);
829 word(v
, BEHIND
, lp
, s
);
830 nonword(v
, AHEAD
, s
, rp
);
834 wordchrs(v
); /* does NEXT() */
835 s
= newstate(v
->nfa
);
837 nonword(v
, BEHIND
, lp
, s
);
838 word(v
, AHEAD
, s
, rp
);
839 s
= newstate(v
->nfa
);
841 word(v
, BEHIND
, lp
, s
);
842 nonword(v
, AHEAD
, s
, rp
);
846 wordchrs(v
); /* does NEXT() */
847 s
= newstate(v
->nfa
);
849 word(v
, BEHIND
, lp
, s
);
850 word(v
, AHEAD
, s
, rp
);
851 s
= newstate(v
->nfa
);
853 nonword(v
, BEHIND
, lp
, s
);
854 nonword(v
, AHEAD
, s
, rp
);
857 case LACON
: /* lookahead constraint */
860 s
= newstate(v
->nfa
);
861 s2
= newstate(v
->nfa
);
863 t
= parse(v
, ')', LACON
, s
, s2
);
864 freesubre(v
, t
); /* internal structure irrelevant */
865 assert(SEE(')') || ISERR());
867 n
= newlacon(v
, s
, s2
, pos
);
872 /* then errors, to get them out of the way */
884 /* then plain characters, and minor variants on that theme */
885 case ')': /* unbalanced paren */
886 if ((v
->cflags
& REG_ADVANCED
) != REG_EXTENDED
)
891 /* legal in EREs due to specification botch */
893 /* fallthrough into case PLAIN */
895 onechr(v
, v
->nextvalue
, lp
, rp
);
896 okcolors(v
->nfa
, v
->cm
);
901 if (v
->nextvalue
== 1)
905 assert(SEE(']') || ISERR());
909 rainbow(v
->nfa
, v
->cm
, PLAIN
,
910 (v
->cflags
& REG_NLSTOP
) ? v
->nlcolor
: COLORLESS
,
914 /* and finally the ugly stuff */
915 case '(': /* value flags as capturing or non */
916 cap
= (type
== LACON
) ? 0 : v
->nextvalue
;
921 if ((size_t) subno
>= v
->nsubs
)
923 assert((size_t) subno
< v
->nsubs
);
926 atomtype
= PLAIN
; /* something that's not '(' */
928 /* need new endpoints because tree will contain pointers */
929 s
= newstate(v
->nfa
);
930 s2
= newstate(v
->nfa
);
935 atom
= parse(v
, ')', PLAIN
, s
, s2
);
936 assert(SEE(')') || ISERR());
941 v
->subs
[subno
] = atom
;
942 t
= subre(v
, '(', atom
->flags
| CAP
, lp
, rp
);
948 /* postpone everything else pending possible {0} */
950 case BACKREF
: /* the Feature From The Black Lagoon */
951 INSIST(type
!= LACON
, REG_ESUBREG
);
952 INSIST(v
->nextvalue
< v
->nsubs
, REG_ESUBREG
);
953 INSIST(v
->subs
[v
->nextvalue
] != NULL
, REG_ESUBREG
);
955 assert(v
->nextvalue
> 0);
956 atom
= subre(v
, 'b', BACKR
, lp
, rp
);
957 subno
= v
->nextvalue
;
959 EMPTYARC(lp
, rp
); /* temporarily, so there's something */
964 /* ...and an atom may be followed by a quantifier */
970 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
976 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
982 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
999 /* {m,n} exercises preference, even if it's {m,m} */
1000 qprefer
= (v
->nextvalue
) ? LONGER
: SHORTER
;
1005 /* {m} passes operand's preference through */
1009 { /* catches errors too */
1015 default: /* no quantifier */
1021 /* annoying special case: {0} or {0,0} cancels everything */
1022 if (m
== 0 && n
== 0)
1026 if (atomtype
== '(')
1027 v
->subs
[subno
] = NULL
;
1028 delsub(v
->nfa
, lp
, rp
);
1033 /* if not a messy case, avoid hard part */
1034 assert(!MESSY(top
->flags
));
1035 f
= top
->flags
| qprefer
| ((atom
!= NULL
) ? atom
->flags
: 0);
1036 if (atomtype
!= '(' && atomtype
!= BACKREF
&& !MESSY(UP(f
)))
1038 if (!(m
== 1 && n
== 1))
1039 repeat(v
, lp
, rp
, m
, n
);
1047 * hard part: something messy That is, capturing parens, back
1048 * reference, short/long clash, or an atom with substructure
1049 * containing one of those.
1052 /* now we'll need a subre for the contents even if they're boring */
1055 atom
= subre(v
, '=', 0, lp
, rp
);
1060 * prepare a general-purpose state skeleton
1062 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /
1063 * [lp] ----> [s2] ----bypass---------------------
1065 * where bypass is an empty, and prefix is some repetitions of atom
1067 s
= newstate(v
->nfa
); /* first, new endpoints for the atom */
1068 s2
= newstate(v
->nfa
);
1070 moveouts(v
->nfa
, lp
, s
);
1071 moveins(v
->nfa
, rp
, s2
);
1075 s
= newstate(v
->nfa
); /* and spots for prefix and bypass */
1076 s2
= newstate(v
->nfa
);
1082 /* break remaining subRE into x{...} and what follows */
1083 t
= subre(v
, '.', COMBINE(qprefer
, atom
->flags
), lp
, rp
);
1086 /* here we should recurse... but we must postpone that to the end */
1088 /* split top into prefix and remaining */
1089 assert(top
->op
== '=' && top
->left
== NULL
&& top
->right
== NULL
);
1090 top
->left
= subre(v
, '=', top
->flags
, top
->begin
, lp
);
1094 /* if it's a backref, now is the time to replicate the subNFA */
1095 if (atomtype
== BACKREF
)
1097 assert(atom
->begin
->nouts
== 1); /* just the EMPTY */
1098 delsub(v
->nfa
, atom
->begin
, atom
->end
);
1099 assert(v
->subs
[subno
] != NULL
);
1100 /* and here's why the recursion got postponed: it must */
1101 /* wait until the skeleton is filled in, because it may */
1102 /* hit a backref that wants to copy the filled-in skeleton */
1103 dupnfa(v
->nfa
, v
->subs
[subno
]->begin
, v
->subs
[subno
]->end
,
1104 atom
->begin
, atom
->end
);
1108 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1111 EMPTYARC(s2
, atom
->end
); /* the bypass */
1112 assert(PREF(qprefer
) != 0);
1113 f
= COMBINE(qprefer
, atom
->flags
);
1114 t
= subre(v
, '|', f
, lp
, atom
->end
);
1117 t
->right
= subre(v
, '|', PREF(f
), s2
, atom
->end
);
1119 t
->right
->left
= subre(v
, '=', 0, s2
, atom
->end
);
1126 /* deal with the rest of the quantifier */
1127 if (atomtype
== BACKREF
)
1129 /* special case: backrefs have internal quantifiers */
1130 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1131 /* just stuff everything into atom */
1132 repeat(v
, atom
->begin
, atom
->end
, m
, n
);
1133 atom
->min
= (short) m
;
1134 atom
->max
= (short) n
;
1135 atom
->flags
|= COMBINE(qprefer
, atom
->flags
);
1137 else if (m
== 1 && n
== 1)
1139 /* no/vacuous quantifier: done */
1140 EMPTYARC(s
, atom
->begin
); /* empty prefix */
1144 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1145 /* parens in only second x */
1146 dupnfa(v
->nfa
, atom
->begin
, atom
->end
, s
, atom
->begin
);
1147 assert(m
>= 1 && m
!= INFINITY
&& n
>= 1);
1148 repeat(v
, s
, atom
->begin
, m
- 1, (n
== INFINITY
) ? n
: n
- 1);
1149 f
= COMBINE(qprefer
, atom
->flags
);
1150 t
= subre(v
, '.', f
, s
, atom
->end
); /* prefix and atom */
1152 t
->left
= subre(v
, '=', PREF(f
), s
, atom
->begin
);
1158 /* and finally, look after that postponed recursion */
1160 if (!(SEE('|') || SEE(stopper
) || SEE(EOS
)))
1161 t
->right
= parsebranch(v
, stopper
, type
, atom
->end
, rp
, 1);
1164 EMPTYARC(atom
->end
, rp
);
1165 t
->right
= subre(v
, '=', 0, atom
->end
, rp
);
1167 assert(SEE('|') || SEE(stopper
) || SEE(EOS
));
1168 t
->flags
|= COMBINE(t
->flags
, t
->right
->flags
);
1169 top
->flags
|= COMBINE(top
->flags
, t
->flags
);
1173 * nonword - generate arcs for non-word-character ahead or behind
1176 nonword(struct vars
* v
,
1177 int dir
, /* AHEAD or BEHIND */
1181 int anchor
= (dir
== AHEAD
) ? '$' : '^';
1183 assert(dir
== AHEAD
|| dir
== BEHIND
);
1184 newarc(v
->nfa
, anchor
, 1, lp
, rp
);
1185 newarc(v
->nfa
, anchor
, 0, lp
, rp
);
1186 colorcomplement(v
->nfa
, v
->cm
, dir
, v
->wordchrs
, lp
, rp
);
1187 /* (no need for special attention to \n) */
1191 * word - generate arcs for word character ahead or behind
1194 word(struct vars
* v
,
1195 int dir
, /* AHEAD or BEHIND */
1199 assert(dir
== AHEAD
|| dir
== BEHIND
);
1200 cloneouts(v
->nfa
, v
->wordchrs
, lp
, rp
, dir
);
1201 /* (no need for special attention to \n) */
1205 * scannum - scan a number
1207 static int /* value, <= DUPMAX */
1208 scannum(struct vars
* v
)
1212 while (SEE(DIGIT
) && n
< DUPMAX
)
1214 n
= n
* 10 + v
->nextvalue
;
1217 if (SEE(DIGIT
) || n
> DUPMAX
)
1226 * repeat - replicate subNFA for quantifiers
1228 * The duplication sequences used here are chosen carefully so that any
1229 * pointers starting out pointing into the subexpression end up pointing into
1230 * the last occurrence. (Note that it may not be strung between the same
1231 * left and right end states, however!) This used to be important for the
1232 * subRE tree, although the important bits are now handled by the in-line
1233 * code in parse(), and when this is called, it doesn't matter any more.
1236 repeat(struct vars
* v
,
1244 #define PAIR(x, y) ((x)*4 + (y))
1245 #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1246 const int rm
= REDUCE(m
);
1247 const int rn
= REDUCE(n
);
1251 switch (PAIR(rm
, rn
))
1253 case PAIR(0, 0): /* empty string */
1254 delsub(v
->nfa
, lp
, rp
);
1257 case PAIR(0, 1): /* do as x| */
1260 case PAIR(0, SOME
): /* do as x{1,n}| */
1261 repeat(v
, lp
, rp
, 1, n
);
1265 case PAIR(0, INF
): /* loop x around */
1266 s
= newstate(v
->nfa
);
1268 moveouts(v
->nfa
, lp
, s
);
1269 moveins(v
->nfa
, rp
, s
);
1273 case PAIR(1, 1): /* no action required */
1275 case PAIR(1, SOME
): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1276 s
= newstate(v
->nfa
);
1278 moveouts(v
->nfa
, lp
, s
);
1279 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1281 repeat(v
, lp
, s
, 1, n
- 1);
1285 case PAIR(1, INF
): /* add loopback arc */
1286 s
= newstate(v
->nfa
);
1287 s2
= newstate(v
->nfa
);
1289 moveouts(v
->nfa
, lp
, s
);
1290 moveins(v
->nfa
, rp
, s2
);
1295 case PAIR(SOME
, SOME
): /* do as x{m-1,n-1}x */
1296 s
= newstate(v
->nfa
);
1298 moveouts(v
->nfa
, lp
, s
);
1299 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1301 repeat(v
, lp
, s
, m
- 1, n
- 1);
1303 case PAIR(SOME
, INF
): /* do as x{m-1,}x */
1304 s
= newstate(v
->nfa
);
1306 moveouts(v
->nfa
, lp
, s
);
1307 dupnfa(v
->nfa
, s
, rp
, lp
, s
);
1309 repeat(v
, lp
, s
, m
- 1, n
);
1318 * bracket - handle non-complemented bracket expression
1319 * Also called from cbracket for complemented bracket expressions.
1322 bracket(struct vars
* v
,
1328 while (!SEE(']') && !SEE(EOS
))
1329 brackpart(v
, lp
, rp
);
1330 assert(SEE(']') || ISERR());
1331 okcolors(v
->nfa
, v
->cm
);
1335 * cbracket - handle complemented bracket expression
1336 * We do it by calling bracket() with dummy endpoints, and then complementing
1337 * the result. The alternative would be to invoke rainbow(), and then delete
1338 * arcs as the b.e. is seen... but that gets messy.
1341 cbracket(struct vars
* v
,
1345 struct state
*left
= newstate(v
->nfa
);
1346 struct state
*right
= newstate(v
->nfa
);
1348 struct arc
*a
; /* arc from lp */
1349 struct arc
*ba
; /* arc from left, from bracket() */
1350 struct arc
*pa
; /* MCCE-prototype arc */
1356 bracket(v
, left
, right
);
1357 if (v
->cflags
& REG_NLSTOP
)
1358 newarc(v
->nfa
, PLAIN
, v
->nlcolor
, left
, right
);
1361 assert(lp
->nouts
== 0); /* all outarcs will be ours */
1363 /* easy part of complementing */
1364 colorcomplement(v
->nfa
, v
->cm
, PLAIN
, left
, lp
, rp
);
1366 if (v
->mcces
== NULL
)
1367 { /* no MCCEs -- we're done */
1368 dropstate(v
->nfa
, left
);
1369 assert(right
->nins
== 0);
1370 freestate(v
->nfa
, right
);
1374 /* but complementing gets messy in the presence of MCCEs... */
1376 for (p
= v
->mcces
->chrs
, i
= v
->mcces
->nchrs
; i
> 0; p
++, i
--)
1378 co
= GETCOLOR(v
->cm
, *p
);
1379 a
= findarc(lp
, PLAIN
, co
);
1380 ba
= findarc(left
, PLAIN
, co
);
1388 s
= newstate(v
->nfa
);
1390 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1392 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1395 { /* easy case, need all of them */
1396 cloneouts(v
->nfa
, pa
->to
, s
, rp
, PLAIN
);
1397 newarc(v
->nfa
, '$', 1, s
, rp
);
1398 newarc(v
->nfa
, '$', 0, s
, rp
);
1399 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
, s
, rp
);
1402 { /* must be selective */
1403 if (findarc(ba
->to
, '$', 1) == NULL
)
1405 newarc(v
->nfa
, '$', 1, s
, rp
);
1406 newarc(v
->nfa
, '$', 0, s
, rp
);
1407 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, pa
->to
,
1410 for (pa
= pa
->to
->outs
; pa
!= NULL
; pa
= pa
->outchain
)
1411 if (findarc(ba
->to
, PLAIN
, pa
->co
) == NULL
)
1412 newarc(v
->nfa
, PLAIN
, pa
->co
, s
, rp
);
1413 if (s
->nouts
== 0) /* limit of selectivity: none */
1414 dropstate(v
->nfa
, s
); /* frees arc too */
1419 delsub(v
->nfa
, left
, right
);
1420 assert(left
->nouts
== 0);
1421 freestate(v
->nfa
, left
);
1422 assert(right
->nins
== 0);
1423 freestate(v
->nfa
, right
);
1427 * brackpart - handle one item (or range) within a bracket expression
1430 brackpart(struct vars
* v
,
1441 /* parse something, get rid of special cases, take shortcuts */
1442 switch (v
->nexttype
)
1444 case RANGE
: /* a-b-c or other botch */
1449 c
[0] = v
->nextvalue
;
1451 /* shortcut for ordinary chr (not range, not MCCE leader) */
1452 if (!SEE(RANGE
) && !ISCELEADER(v
, c
[0]))
1454 onechr(v
, c
[0], lp
, rp
);
1457 startc
= element(v
, c
, c
+ 1);
1462 endp
= scanplain(v
);
1463 INSIST(startp
< endp
, REG_ECOLLATE
);
1465 startc
= element(v
, startp
, endp
);
1470 endp
= scanplain(v
);
1471 INSIST(startp
< endp
, REG_ECOLLATE
);
1473 startc
= element(v
, startp
, endp
);
1475 cv
= eclass(v
, startc
, (v
->cflags
& REG_ICASE
));
1477 dovec(v
, cv
, lp
, rp
);
1482 endp
= scanplain(v
);
1483 INSIST(startp
< endp
, REG_ECTYPE
);
1485 cv
= cclass(v
, startp
, endp
, (v
->cflags
& REG_ICASE
));
1487 dovec(v
, cv
, lp
, rp
);
1499 switch (v
->nexttype
)
1503 c
[0] = v
->nextvalue
;
1505 endc
= element(v
, c
, c
+ 1);
1510 endp
= scanplain(v
);
1511 INSIST(startp
< endp
, REG_ECOLLATE
);
1513 endc
= element(v
, startp
, endp
);
1526 * Ranges are unportable. Actually, standard C does guarantee that
1527 * digits are contiguous, but making that an exception is just too
1532 cv
= range(v
, startc
, endc
, (v
->cflags
& REG_ICASE
));
1534 dovec(v
, cv
, lp
, rp
);
1538 * scanplain - scan PLAIN contents of [. etc.
1540 * Certain bits of trickery in lex.c know that this code does not try
1541 * to look past the final bracket of the [. etc.
1543 static chr
* /* just after end of sequence */
1544 scanplain(struct vars
* v
)
1548 assert(SEE(COLLEL
) || SEE(ECLASS
) || SEE(CCLASS
));
1558 assert(SEE(END
) || ISERR());
1565 * leaders - process a cvec of collating elements to also include leaders
1566 * Also gives all characters involved their own colors, which is almost
1567 * certainly necessary, and sets up little disconnected subNFA.
1570 leaders(struct vars
* v
,
1579 v
->mccepbegin
= newstate(v
->nfa
);
1580 v
->mccepend
= newstate(v
->nfa
);
1583 for (mcce
= 0; mcce
< cv
->nmcces
; mcce
++)
1585 p
= cv
->mcces
[mcce
];
1587 if (!haschr(cv
, leader
))
1590 s
= newstate(v
->nfa
);
1591 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, leader
),
1593 okcolors(v
->nfa
, v
->cm
);
1597 a
= findarc(v
->mccepbegin
, PLAIN
,
1598 GETCOLOR(v
->cm
, leader
));
1601 assert(s
!= v
->mccepend
);
1604 assert(*p
!= 0 && *(p
+ 1) == 0); /* only 2-char MCCEs for
1606 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, *p
), s
, v
->mccepend
);
1607 okcolors(v
->nfa
, v
->cm
);
1612 * onechr - fill in arcs for a plain character, and possible case complements
1613 * This is mostly a shortcut for efficient handling of the common case.
1616 onechr(struct vars
* v
,
1621 if (!(v
->cflags
& REG_ICASE
))
1623 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, c
), lp
, rp
);
1627 /* rats, need general case anyway... */
1628 dovec(v
, allcases(v
, c
), lp
, rp
);
1632 * dovec - fill in arcs for each element of a cvec
1633 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1636 dovec(struct vars
* v
,
1650 struct arc
*pa
; /* arc in prototype */
1652 struct state
*ps
; /* state in prototype */
1654 /* need a place to store leaders, if any */
1657 assert(v
->mcces
!= NULL
);
1658 if (v
->cv2
== NULL
|| v
->cv2
->nchrs
< v
->mcces
->nchrs
)
1662 v
->cv2
= newcvec(v
->mcces
->nchrs
, 0, v
->mcces
->nmcces
);
1667 leads
= clearcvec(v
->cv2
);
1672 /* first, get the ordinary characters out of the way */
1673 for (p
= cv
->chrs
, i
= cv
->nchrs
; i
> 0; p
++, i
--)
1676 if (!ISCELEADER(v
, ch
))
1677 newarc(v
->nfa
, PLAIN
, subcolor(v
->cm
, ch
), lp
, rp
);
1680 assert(singleton(v
->cm
, ch
));
1681 assert(leads
!= NULL
);
1682 if (!haschr(leads
, ch
))
1687 /* and the ranges */
1688 for (p
= cv
->ranges
, i
= cv
->nranges
; i
> 0; p
+= 2, i
--)
1692 while (from
<= to
&& (ce
= nextleader(v
, from
, to
)) != NOCELT
)
1695 subrange(v
, from
, ce
- 1, lp
, rp
);
1696 assert(singleton(v
->cm
, ce
));
1697 assert(leads
!= NULL
);
1698 if (!haschr(leads
, ce
))
1703 subrange(v
, from
, to
, lp
, rp
);
1706 if ((leads
== NULL
|| leads
->nchrs
== 0) && cv
->nmcces
== 0)
1709 /* deal with the MCCE leaders */
1711 for (p
= leads
->chrs
, i
= leads
->nchrs
; i
> 0; p
++, i
--)
1713 co
= GETCOLOR(v
->cm
, *p
);
1714 a
= findarc(lp
, PLAIN
, co
);
1719 s
= newstate(v
->nfa
);
1721 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1724 pa
= findarc(v
->mccepbegin
, PLAIN
, co
);
1727 newarc(v
->nfa
, '$', 1, s
, rp
);
1728 newarc(v
->nfa
, '$', 0, s
, rp
);
1729 colorcomplement(v
->nfa
, v
->cm
, AHEAD
, ps
, s
, rp
);
1734 for (i
= 0; i
< cv
->nmcces
; i
++)
1737 assert(singleton(v
->cm
, *p
));
1738 if (!singleton(v
->cm
, *p
))
1744 co
= GETCOLOR(v
->cm
, ch
);
1745 a
= findarc(lp
, PLAIN
, co
);
1750 s
= newstate(v
->nfa
);
1752 newarc(v
->nfa
, PLAIN
, co
, lp
, s
);
1755 assert(*p
!= 0); /* at least two chars */
1756 assert(singleton(v
->cm
, *p
));
1758 co
= GETCOLOR(v
->cm
, ch
);
1759 assert(*p
== 0); /* and only two, for now */
1760 newarc(v
->nfa
, PLAIN
, co
, s
, rp
);
1766 * nextleader - find next MCCE leader within range
1768 static celt
/* NOCELT means none */
1769 nextleader(struct vars
* v
,
1778 if (v
->mcces
== NULL
)
1781 for (i
= v
->mcces
->nchrs
, p
= v
->mcces
->chrs
; i
> 0; i
--, p
++)
1784 if (from
<= ch
&& ch
<= to
)
1785 if (it
== NOCELT
|| ch
< it
)
1792 * wordchrs - set up word-chr list for word-boundary stuff, if needed
1794 * The list is kept as a bunch of arcs between two dummy states; it's
1795 * disposed of by the unreachable-states sweep in NFA optimization.
1796 * Does NEXT(). Must not be called from any unusual lexical context.
1797 * This should be reconciled with the \w etc. handling in lex.c, and
1798 * should be cleaned up to reduce dependencies on input scanning.
1801 wordchrs(struct vars
* v
)
1804 struct state
*right
;
1806 if (v
->wordchrs
!= NULL
)
1808 NEXT(); /* for consistency */
1812 left
= newstate(v
->nfa
);
1813 right
= newstate(v
->nfa
);
1815 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1818 assert(v
->savenow
!= NULL
&& SEE('['));
1819 bracket(v
, left
, right
);
1820 assert((v
->savenow
!= NULL
&& SEE(']')) || ISERR());
1827 * subre - allocate a subre
1829 static struct subre
*
1830 subre(struct vars
* v
,
1833 struct state
* begin
,
1840 v
->treefree
= ret
->left
;
1843 ret
= (struct subre
*) MALLOC(sizeof(struct subre
));
1849 ret
->chain
= v
->treechain
;
1853 assert(strchr("|.b(=", op
) != NULL
);
1859 ret
->min
= ret
->max
= 1;
1870 * freesubre - free a subRE subtree
1873 freesubre(struct vars
* v
, /* might be NULL */
1879 if (sr
->left
!= NULL
)
1880 freesubre(v
, sr
->left
);
1881 if (sr
->right
!= NULL
)
1882 freesubre(v
, sr
->right
);
1888 * freesrnode - free one node in a subRE subtree
1891 freesrnode(struct vars
* v
, /* might be NULL */
1897 if (!NULLCNFA(sr
->cnfa
))
1898 freecnfa(&sr
->cnfa
);
1903 sr
->left
= v
->treefree
;
1911 * optst - optimize a subRE subtree
1914 optst(struct vars
* v
,
1920 /* recurse through children */
1921 if (t
->left
!= NULL
)
1923 if (t
->right
!= NULL
)
1928 * numst - number tree nodes (assigning retry indexes)
1930 static int /* next number */
1931 numst(struct subre
* t
,
1932 int start
) /* starting point for subtree numbers */
1939 t
->retry
= (short) i
++;
1940 if (t
->left
!= NULL
)
1941 i
= numst(t
->left
, i
);
1942 if (t
->right
!= NULL
)
1943 i
= numst(t
->right
, i
);
1948 * markst - mark tree nodes as INUSE
1951 markst(struct subre
* t
)
1956 if (t
->left
!= NULL
)
1958 if (t
->right
!= NULL
)
1963 * cleanst - free any tree nodes not marked INUSE
1966 cleanst(struct vars
* v
)
1971 for (t
= v
->treechain
; t
!= NULL
; t
= next
)
1974 if (!(t
->flags
& INUSE
))
1977 v
->treechain
= NULL
;
1978 v
->treefree
= NULL
; /* just on general principles */
1982 * nfatree - turn a subRE subtree into a tree of compacted NFAs
1984 static long /* optimize results from top node */
1985 nfatree(struct vars
* v
,
1987 FILE *f
) /* for debug output */
1989 assert(t
!= NULL
&& t
->begin
!= NULL
);
1991 if (t
->left
!= NULL
)
1992 (DISCARD
) nfatree(v
, t
->left
, f
);
1993 if (t
->right
!= NULL
)
1994 (DISCARD
) nfatree(v
, t
->right
, f
);
1996 return nfanode(v
, t
, f
);
2000 * nfanode - do one NFA for nfatree
2002 static long /* optimize results */
2003 nfanode(struct vars
* v
,
2005 FILE *f
) /* for debug output */
2010 assert(t
->begin
!= NULL
);
2017 fprintf(f
, "\n\n\n========= TREE NODE %s ==========\n",
2018 stid(t
, idbuf
, sizeof(idbuf
)));
2021 nfa
= newnfa(v
, v
->cm
, v
->nfa
);
2023 dupnfa(nfa
, t
->begin
, t
->end
, nfa
->init
, nfa
->final
);
2027 ret
= optimize(nfa
, f
);
2030 compact(nfa
, &t
->cnfa
);
2037 * newlacon - allocate a lookahead-constraint subRE
2039 static int /* lacon number */
2040 newlacon(struct vars
* v
,
2041 struct state
* begin
,
2048 if (v
->nlacons
== 0)
2050 v
->lacons
= (struct subre
*) MALLOC(2 * sizeof(struct subre
));
2051 n
= 1; /* skip 0th */
2056 v
->lacons
= (struct subre
*) REALLOC(v
->lacons
,
2057 (v
->nlacons
+ 1) * sizeof(struct subre
));
2060 if (v
->lacons
== NULL
)
2065 sub
= &v
->lacons
[n
];
2074 * freelacons - free lookahead-constraint subRE vector
2077 freelacons(struct subre
* subs
,
2084 for (sub
= subs
+ 1, i
= n
- 1; i
> 0; sub
++, i
--) /* no 0th */
2085 if (!NULLCNFA(sub
->cnfa
))
2086 freecnfa(&sub
->cnfa
);
2091 * rfree - free a whole RE (insides of regfree)
2098 if (re
== NULL
|| re
->re_magic
!= REMAGIC
)
2101 re
->re_magic
= 0; /* invalidate RE */
2102 g
= (struct guts
*) re
->re_guts
;
2107 if (g
->tree
!= NULL
)
2108 freesubre((struct vars
*) NULL
, g
->tree
);
2109 if (g
->lacons
!= NULL
)
2110 freelacons(g
->lacons
, g
->nlacons
);
2111 if (!NULLCNFA(g
->search
))
2112 freecnfa(&g
->search
);
2119 * dump - dump an RE in human-readable form
2128 if (re
->re_magic
!= REMAGIC
)
2129 fprintf(f
, "bad magic number (0x%x not 0x%x)\n", re
->re_magic
,
2131 if (re
->re_guts
== NULL
)
2133 fprintf(f
, "NULL guts!!!\n");
2136 g
= (struct guts
*) re
->re_guts
;
2137 if (g
->magic
!= GUTSMAGIC
)
2138 fprintf(f
, "bad guts magic number (0x%x not 0x%x)\n", g
->magic
,
2141 fprintf(f
, "\n\n\n========= DUMP ==========\n");
2142 fprintf(f
, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2143 re
->re_nsub
, re
->re_info
, re
->re_csize
, g
->ntree
);
2145 dumpcolors(&g
->cmap
, f
);
2146 if (!NULLCNFA(g
->search
))
2148 printf("\nsearch:\n");
2149 dumpcnfa(&g
->search
, f
);
2151 for (i
= 1; i
< g
->nlacons
; i
++)
2153 fprintf(f
, "\nla%d (%s):\n", i
,
2154 (g
->lacons
[i
].subno
) ? "positive" : "negative");
2155 dumpcnfa(&g
->lacons
[i
].cnfa
, f
);
2158 dumpst(g
->tree
, f
, 0);
2162 * dumpst - dump a subRE tree
2165 dumpst(struct subre
* t
,
2167 int nfapresent
) /* is the original NFA still around? */
2170 fprintf(f
, "null tree\n");
2172 stdump(t
, f
, nfapresent
);
2177 * stdump - recursive guts of dumpst
2180 stdump(struct subre
* t
,
2182 int nfapresent
) /* is the original NFA still around? */
2186 fprintf(f
, "%s. `%c'", stid(t
, idbuf
, sizeof(idbuf
)), t
->op
);
2187 if (t
->flags
& LONGER
)
2188 fprintf(f
, " longest");
2189 if (t
->flags
& SHORTER
)
2190 fprintf(f
, " shortest");
2191 if (t
->flags
& MIXED
)
2192 fprintf(f
, " hasmixed");
2194 fprintf(f
, " hascapture");
2195 if (t
->flags
& BACKR
)
2196 fprintf(f
, " hasbackref");
2197 if (!(t
->flags
& INUSE
))
2198 fprintf(f
, " UNUSED");
2200 fprintf(f
, " (#%d)", t
->subno
);
2201 if (t
->min
!= 1 || t
->max
!= 1)
2203 fprintf(f
, " {%d,", t
->min
);
2204 if (t
->max
!= INFINITY
)
2205 fprintf(f
, "%d", t
->max
);
2209 fprintf(f
, " %ld-%ld", (long) t
->begin
->no
, (long) t
->end
->no
);
2210 if (t
->left
!= NULL
)
2211 fprintf(f
, " L:%s", stid(t
->left
, idbuf
, sizeof(idbuf
)));
2212 if (t
->right
!= NULL
)
2213 fprintf(f
, " R:%s", stid(t
->right
, idbuf
, sizeof(idbuf
)));
2214 if (!NULLCNFA(t
->cnfa
))
2217 dumpcnfa(&t
->cnfa
, f
);
2220 if (t
->left
!= NULL
)
2221 stdump(t
->left
, f
, nfapresent
);
2222 if (t
->right
!= NULL
)
2223 stdump(t
->right
, f
, nfapresent
);
2227 * stid - identify a subtree node for dumping
2229 static char * /* points to buf or constant string */
2230 stid(struct subre
* t
,
2234 /* big enough for hex int or decimal t->retry? */
2235 if (bufsize
< sizeof(int) * 2 + 3 || bufsize
< sizeof(t
->retry
) * 3 + 1)
2238 sprintf(buf
, "%d", t
->retry
);
2240 sprintf(buf
, "0x%x", (int) t
); /* may lose bits, that's okay */
2243 #endif /* REG_DEBUG */
2246 #include "regc_lex.c"
2247 #include "regc_color.c"
2248 #include "regc_nfa.c"
2249 #include "regc_cvec.c"
2250 #include "regc_locale.c"