]> git.saurik.com Git - wxWidgets.git/blame - src/regex/regcomp.c
byte length for interim UniChar String corrected
[wxWidgets.git] / src / regex / regcomp.c
CommitLineData
c0a7edde
RN
1/*
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
4 *
3ca4086b
VS
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
c0a7edde
RN
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
3ca4086b
VS
10 * thanks all of them.
11 *
c0a7edde
RN
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
3ca4086b 16 *
c0a7edde
RN
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
3ca4086b 19 *
c0a7edde
RN
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
c0a7edde 31 */
5a419892 32
c0a7edde 33#include "regguts.h"
5a419892
VZ
34
35/*
c0a7edde 36 * forward declarations, up here so forward datatypes etc. are defined early
5a419892 37 */
3ca4086b
VS
38/* =====^!^===== begin forwards =====^!^===== */
39/* automatically gathered by fwd; do not hand-edit */
c0a7edde 40/* === regcomp.c === */
3ca4086b
VS
41int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
42static VOID moresubs _ANSI_ARGS_((struct vars *, int));
43static int freev _ANSI_ARGS_((struct vars *, int));
44static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
45static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
46static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
47static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
48static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
49static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
50static int scannum _ANSI_ARGS_((struct vars *));
51static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
52static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
53static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
54static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
55static chr *scanplain _ANSI_ARGS_((struct vars *));
56static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
57static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
58static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
59static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
60static VOID wordchrs _ANSI_ARGS_((struct vars *));
61static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
62static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
63static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
64static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
65static int numst _ANSI_ARGS_((struct subre *, int));
66static VOID markst _ANSI_ARGS_((struct subre *));
67static VOID cleanst _ANSI_ARGS_((struct vars *));
68static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
69static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
70static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
71static VOID freelacons _ANSI_ARGS_((struct subre *, int));
72static VOID rfree _ANSI_ARGS_((regex_t *));
73static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
74static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
75static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
76static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
77/* === regc_lex.c === */
78static VOID lexstart _ANSI_ARGS_((struct vars *));
79static VOID prefixes _ANSI_ARGS_((struct vars *));
80static VOID lexnest _ANSI_ARGS_((struct vars *, chr *, chr *));
81static VOID lexword _ANSI_ARGS_((struct vars *));
82static int next _ANSI_ARGS_((struct vars *));
83static int lexescape _ANSI_ARGS_((struct vars *));
84static chr lexdigits _ANSI_ARGS_((struct vars *, int, int, int));
85static int brenext _ANSI_ARGS_((struct vars *, pchr));
86static VOID skip _ANSI_ARGS_((struct vars *));
87static chr newline _ANSI_ARGS_((NOPARMS));
c0a7edde 88#ifdef REG_DEBUG
3ca4086b 89static chr *ch _ANSI_ARGS_((NOPARMS));
c0a7edde 90#endif
3ca4086b 91static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
c0a7edde 92/* === regc_color.c === */
3ca4086b
VS
93static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
94static VOID freecm _ANSI_ARGS_((struct colormap *));
95static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
96static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
97static color maxcolor _ANSI_ARGS_((struct colormap *));
98static color newcolor _ANSI_ARGS_((struct colormap *));
99static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
100static color pseudocolor _ANSI_ARGS_((struct colormap *));
101static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
102static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
103static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
104static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
105static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
106static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
107static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
108static int singleton _ANSI_ARGS_((struct colormap *, pchr c));
109static VOID rainbow _ANSI_ARGS_((struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *));
110static VOID colorcomplement _ANSI_ARGS_((struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *));
c0a7edde 111#ifdef REG_DEBUG
3ca4086b
VS
112static VOID dumpcolors _ANSI_ARGS_((struct colormap *, FILE *));
113static VOID fillcheck _ANSI_ARGS_((struct colormap *, union tree *, int, FILE *));
114static VOID dumpchr _ANSI_ARGS_((pchr, FILE *));
c0a7edde
RN
115#endif
116/* === regc_nfa.c === */
3ca4086b
VS
117static struct nfa *newnfa _ANSI_ARGS_((struct vars *, struct colormap *, struct nfa *));
118static VOID freenfa _ANSI_ARGS_((struct nfa *));
119static struct state *newstate _ANSI_ARGS_((struct nfa *));
120static struct state *newfstate _ANSI_ARGS_((struct nfa *, int flag));
121static VOID dropstate _ANSI_ARGS_((struct nfa *, struct state *));
122static VOID freestate _ANSI_ARGS_((struct nfa *, struct state *));
123static VOID destroystate _ANSI_ARGS_((struct nfa *, struct state *));
124static VOID newarc _ANSI_ARGS_((struct nfa *, int, pcolor, struct state *, struct state *));
125static struct arc *allocarc _ANSI_ARGS_((struct nfa *, struct state *));
126static VOID freearc _ANSI_ARGS_((struct nfa *, struct arc *));
127static struct arc *findarc _ANSI_ARGS_((struct state *, int, pcolor));
128static VOID cparc _ANSI_ARGS_((struct nfa *, struct arc *, struct state *, struct state *));
129static VOID moveins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
130static VOID copyins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
131static VOID moveouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
132static VOID copyouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
133static VOID cloneouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, int));
134static VOID delsub _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
135static VOID deltraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
136static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, struct state *));
137static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
138static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
139static VOID specialcolors _ANSI_ARGS_((struct nfa *));
140static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
141static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
142static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
143static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
144static int push _ANSI_ARGS_((struct nfa *, struct arc *));
145#define INCOMPATIBLE 1 /* destroys arc */
146#define SATISFIED 2 /* constraint satisfied */
147#define COMPATIBLE 3 /* compatible but not satisfied yet */
148static int combine _ANSI_ARGS_((struct arc *, struct arc *));
149static VOID fixempties _ANSI_ARGS_((struct nfa *, FILE *));
150static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
151static VOID cleanup _ANSI_ARGS_((struct nfa *));
152static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
153static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
154static long analyze _ANSI_ARGS_((struct nfa *));
155static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
156static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
157static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
158static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
c0a7edde 159#ifdef REG_DEBUG
3ca4086b
VS
160static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
161static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
162static int dumprarcs _ANSI_ARGS_((struct arc *, struct state *, FILE *, int));
163static VOID dumparc _ANSI_ARGS_((struct arc *, struct state *, FILE *));
164#endif
165static VOID dumpcnfa _ANSI_ARGS_((struct cnfa *, FILE *));
166#ifdef REG_DEBUG
167static VOID dumpcstate _ANSI_ARGS_((int, struct carc *, struct cnfa *, FILE *));
c0a7edde
RN
168#endif
169/* === regc_cvec.c === */
3ca4086b
VS
170static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
171static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
172static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
173static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
174static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
175static int haschr _ANSI_ARGS_((struct cvec *, pchr));
176static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
177static VOID freecvec _ANSI_ARGS_((struct cvec *));
c0a7edde 178/* === regc_locale.c === */
3ca4086b
VS
179static int nmcces _ANSI_ARGS_((struct vars *));
180static int nleaders _ANSI_ARGS_((struct vars *));
181static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
182static celt element _ANSI_ARGS_((struct vars *, chr *, chr *));
183static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
184static int before _ANSI_ARGS_((celt, celt));
185static struct cvec *eclass _ANSI_ARGS_((struct vars *, celt, int));
186static struct cvec *cclass _ANSI_ARGS_((struct vars *, chr *, chr *, int));
187static struct cvec *allcases _ANSI_ARGS_((struct vars *, pchr));
188static int cmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
189static int casecmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
190/* automatically gathered by fwd; do not hand-edit */
191/* =====^!^===== end forwards =====^!^===== */
192
c0a7edde
RN
193
194
195/* internal variables, bundled for easy passing around */
3ca4086b
VS
196struct vars {
197 regex_t *re;
198 chr *now; /* scan pointer into string */
199 chr *stop; /* end of string */
200 chr *savenow; /* saved now and stop for "subroutine call" */
201 chr *savestop;
202 int err; /* error code (0 if none) */
203 int cflags; /* copy of compile flags */
204 int lasttype; /* type of previous token */
205 int nexttype; /* type of next token */
206 chr nextvalue; /* value (if any) of next token */
207 int lexcon; /* lexical context type (see lex.c) */
208 int nsubexp; /* subexpression count */
209 struct subre **subs; /* subRE pointer vector */
210 size_t nsubs; /* length of vector */
c0a7edde 211 struct subre *sub10[10]; /* initial vector, enough for most */
3ca4086b
VS
212 struct nfa *nfa; /* the NFA */
213 struct colormap *cm; /* character color map */
214 color nlcolor; /* color of newline */
215 struct state *wordchrs; /* state in nfa holding word-char outarcs */
216 struct subre *tree; /* subexpression tree */
c0a7edde
RN
217 struct subre *treechain; /* all tree nodes allocated */
218 struct subre *treefree; /* any free tree nodes */
3ca4086b
VS
219 int ntree; /* number of tree nodes */
220 struct cvec *cv; /* interface cvec */
221 struct cvec *cv2; /* utility cvec */
222 struct cvec *mcces; /* collating-element information */
223# define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
c0a7edde 224 struct state *mccepbegin; /* in nfa, start of MCCE prototypes */
3ca4086b
VS
225 struct state *mccepend; /* in nfa, end of MCCE prototypes */
226 struct subre *lacons; /* lookahead-constraint vector */
227 int nlacons; /* size of lacons */
5a419892
VZ
228};
229
c0a7edde 230/* parsing macros; most know that `v' is the struct vars pointer */
3ca4086b
VS
231#define NEXT() (next(v)) /* advance by one token */
232#define SEE(t) (v->nexttype == (t)) /* is next token this? */
233#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
234#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
235#define ISERR() VISERR(v)
236#define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
c0a7edde 237 ((vv)->err = (e)))
3ca4086b
VS
238#define ERR(e) VERR(v, e) /* record an error */
239#define NOERR() {if (ISERR()) return;} /* if error seen, return */
240#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
241#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
242#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
243#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
244#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
c0a7edde
RN
245
246/* token type codes, some also used as NFA arc types */
3ca4086b
VS
247#define EMPTY 'n' /* no token present */
248#define EOS 'e' /* end of string */
249#define PLAIN 'p' /* ordinary character */
250#define DIGIT 'd' /* digit (in bound) */
251#define BACKREF 'b' /* back reference */
252#define COLLEL 'I' /* start of [. */
253#define ECLASS 'E' /* start of [= */
254#define CCLASS 'C' /* start of [: */
255#define END 'X' /* end of [. [= [: */
256#define RANGE 'R' /* - within [] which might be range delim. */
257#define LACON 'L' /* lookahead constraint subRE */
258#define AHEAD 'a' /* color-lookahead arc */
259#define BEHIND 'r' /* color-lookbehind arc */
260#define WBDRY 'w' /* word boundary constraint */
261#define NWBDRY 'W' /* non-word-boundary constraint */
262#define SBEGIN 'A' /* beginning of string (even if not BOL) */
263#define SEND 'Z' /* end of string (even if not EOL) */
264#define PREFER 'P' /* length preference */
c0a7edde
RN
265
266/* is an arc colored, and hence on a color chain? */
3ca4086b 267#define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
c0a7edde
RN
268 (a)->type == BEHIND)
269
270
271
272/* static function list */
273static struct fns functions = {
3ca4086b 274 rfree, /* regfree insides */
c0a7edde 275};
5a419892 276
5a419892 277
5a419892
VZ
278
279/*
3ca4086b
VS
280 - compile - compile regular expression
281 ^ int compile(regex_t *, CONST chr *, size_t, int);
5a419892 282 */
c0a7edde 283int
3ca4086b
VS
284compile(re, string, len, flags)
285regex_t *re;
286CONST chr *string;
287size_t len;
288int flags;
5a419892 289{
c0a7edde
RN
290 struct vars var;
291 struct vars *v = &var;
292 struct guts *g;
3ca4086b
VS
293 int i;
294 size_t j;
295 FILE *debug = (flags&REG_PROGRESS) ? stdout : (FILE *)NULL;
296# define CNOERR() { if (ISERR()) return freev(v, v->err); }
c0a7edde
RN
297
298 /* sanity checks */
299
300 if (re == NULL || string == NULL)
301 return REG_INVARG;
3ca4086b
VS
302 if ((flags&REG_QUOTE) &&
303 (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
c0a7edde 304 return REG_INVARG;
3ca4086b 305 if (!(flags&REG_EXTENDED) && (flags&REG_ADVF))
c0a7edde
RN
306 return REG_INVARG;
307
308 /* initial setup (after which freev() is callable) */
309 v->re = re;
3ca4086b 310 v->now = (chr *)string;
c0a7edde
RN
311 v->stop = v->now + len;
312 v->savenow = v->savestop = NULL;
313 v->err = 0;
314 v->cflags = flags;
315 v->nsubexp = 0;
316 v->subs = v->sub10;
317 v->nsubs = 10;
318 for (j = 0; j < v->nsubs; j++)
319 v->subs[j] = NULL;
320 v->nfa = NULL;
321 v->cm = NULL;
322 v->nlcolor = COLORLESS;
323 v->wordchrs = NULL;
324 v->tree = NULL;
325 v->treechain = NULL;
326 v->treefree = NULL;
327 v->cv = NULL;
328 v->cv2 = NULL;
329 v->mcces = NULL;
330 v->lacons = NULL;
331 v->nlacons = 0;
332 re->re_magic = REMAGIC;
3ca4086b 333 re->re_info = 0; /* bits get set during parse */
c0a7edde
RN
334 re->re_csize = sizeof(chr);
335 re->re_guts = NULL;
336 re->re_fns = VS(&functions);
337
338 /* more complex setup, malloced things */
339 re->re_guts = VS(MALLOC(sizeof(struct guts)));
340 if (re->re_guts == NULL)
341 return freev(v, REG_ESPACE);
3ca4086b 342 g = (struct guts *)re->re_guts;
c0a7edde
RN
343 g->tree = NULL;
344 initcm(v, &g->cmap);
345 v->cm = &g->cmap;
346 g->lacons = NULL;
347 g->nlacons = 0;
348 ZAPCNFA(g->search);
3ca4086b 349 v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
c0a7edde
RN
350 CNOERR();
351 v->cv = newcvec(100, 20, 10);
352 if (v->cv == NULL)
353 return freev(v, REG_ESPACE);
354 i = nmcces(v);
3ca4086b 355 if (i > 0) {
c0a7edde
RN
356 v->mcces = newcvec(nleaders(v), 0, i);
357 CNOERR();
358 v->mcces = allmcces(v, v->mcces);
359 leaders(v, v->mcces);
3ca4086b 360 addmcce(v->mcces, (chr *)NULL, (chr *)NULL); /* dummy */
c0a7edde
RN
361 }
362 CNOERR();
363
364 /* parsing */
3ca4086b
VS
365 lexstart(v); /* also handles prefixes */
366 if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
c0a7edde
RN
367 /* assign newline a unique color */
368 v->nlcolor = subcolor(v->cm, newline());
369 okcolors(v->nfa, v->cm);
370 }
371 CNOERR();
372 v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
3ca4086b 373 assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
c0a7edde
RN
374 CNOERR();
375 assert(v->tree != NULL);
376
377 /* finish setup of nfa and its subre tree */
378 specialcolors(v->nfa);
379 CNOERR();
3ca4086b 380 if (debug != NULL) {
c0a7edde
RN
381 fprintf(debug, "\n\n\n========= RAW ==========\n");
382 dumpnfa(v->nfa, debug);
383 dumpst(v->tree, debug, 1);
3ca4086b 384 }
c0a7edde
RN
385 optst(v, v->tree);
386 v->ntree = numst(v->tree, 1);
387 markst(v->tree);
388 cleanst(v);
3ca4086b 389 if (debug != NULL) {
c0a7edde
RN
390 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
391 dumpst(v->tree, debug, 1);
5a419892
VZ
392 }
393
c0a7edde
RN
394 /* build compacted NFAs for tree and lacons */
395 re->re_info |= nfatree(v, v->tree, debug);
396 CNOERR();
397 assert(v->nlacons == 0 || v->lacons != NULL);
3ca4086b 398 for (i = 1; i < v->nlacons; i++) {
c0a7edde
RN
399 if (debug != NULL)
400 fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
c0a7edde 401 nfanode(v, &v->lacons[i], debug);
5a419892 402 }
c0a7edde 403 CNOERR();
3ca4086b 404 if (v->tree->flags&SHORTER)
c0a7edde
RN
405 NOTE(REG_USHORTEST);
406
407 /* build compacted NFAs for tree, lacons, fast search */
c0a7edde
RN
408 if (debug != NULL)
409 fprintf(debug, "\n\n\n========= SEARCH ==========\n");
c0a7edde 410 /* can sacrifice main NFA now, so use it as work area */
3ca4086b 411 (DISCARD)optimize(v->nfa, debug);
c0a7edde
RN
412 CNOERR();
413 makesearch(v, v->nfa);
414 CNOERR();
415 compact(v->nfa, &g->search);
416 CNOERR();
417
418 /* looks okay, package it up */
419 re->re_nsub = v->nsubexp;
3ca4086b 420 v->re = NULL; /* freev no longer frees re */
c0a7edde
RN
421 g->magic = GUTSMAGIC;
422 g->cflags = v->cflags;
423 g->info = re->re_info;
424 g->nsub = re->re_nsub;
425 g->tree = v->tree;
426 v->tree = NULL;
427 g->ntree = v->ntree;
3ca4086b 428 g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
c0a7edde
RN
429 g->lacons = v->lacons;
430 v->lacons = NULL;
431 g->nlacons = v->nlacons;
432
3ca4086b 433 if (flags&REG_DUMP)
c0a7edde 434 dump(re, stdout);
5a419892 435
c0a7edde
RN
436 assert(v->err == 0);
437 return freev(v, 0);
5a419892
VZ
438}
439
440/*
3ca4086b
VS
441 - moresubs - enlarge subRE vector
442 ^ static VOID moresubs(struct vars *, int);
5a419892 443 */
3ca4086b
VS
444static VOID
445moresubs(v, wanted)
446struct vars *v;
447int wanted; /* want enough room for this one */
5a419892 448{
c0a7edde 449 struct subre **p;
3ca4086b 450 size_t n;
c0a7edde 451
3ca4086b
VS
452 assert(wanted > 0 && (size_t)wanted >= v->nsubs);
453 n = (size_t)wanted * 3 / 2 + 1;
454 if (v->subs == v->sub10) {
455 p = (struct subre **)MALLOC(n * sizeof(struct subre *));
c0a7edde
RN
456 if (p != NULL)
457 memcpy(VS(p), VS(v->subs),
3ca4086b
VS
458 v->nsubs * sizeof(struct subre *));
459 } else
460 p = (struct subre **)REALLOC(v->subs, n*sizeof(struct subre *));
461 if (p == NULL) {
c0a7edde 462 ERR(REG_ESPACE);
5a419892 463 return;
5a419892 464 }
c0a7edde
RN
465 v->subs = p;
466 for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
467 *p = NULL;
468 assert(v->nsubs == n);
3ca4086b 469 assert((size_t)wanted < v->nsubs);
5a419892
VZ
470}
471
472/*
3ca4086b 473 - freev - free vars struct's substructures where necessary
c0a7edde
RN
474 * Optionally does error-number setting, and always returns error code
475 * (if any), to make error-handling code terser.
3ca4086b 476 ^ static int freev(struct vars *, int);
5a419892 477 */
c0a7edde 478static int
3ca4086b
VS
479freev(v, err)
480struct vars *v;
481int err;
5a419892 482{
c0a7edde
RN
483 if (v->re != NULL)
484 rfree(v->re);
485 if (v->subs != v->sub10)
486 FREE(v->subs);
487 if (v->nfa != NULL)
488 freenfa(v->nfa);
489 if (v->tree != NULL)
490 freesubre(v, v->tree);
491 if (v->treechain != NULL)
492 cleanst(v);
493 if (v->cv != NULL)
494 freecvec(v->cv);
495 if (v->cv2 != NULL)
496 freecvec(v->cv2);
497 if (v->mcces != NULL)
498 freecvec(v->mcces);
499 if (v->lacons != NULL)
500 freelacons(v->lacons, v->nlacons);
3ca4086b 501 ERR(err); /* nop if err==0 */
c0a7edde
RN
502
503 return v->err;
5a419892
VZ
504}
505
506/*
3ca4086b 507 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
c0a7edde 508 * NFA must have been optimize()d already.
3ca4086b 509 ^ static VOID makesearch(struct vars *, struct nfa *);
5a419892 510 */
3ca4086b
VS
511static VOID
512makesearch(v, nfa)
513struct vars *v;
514struct nfa *nfa;
5a419892 515{
c0a7edde
RN
516 struct arc *a;
517 struct arc *b;
518 struct state *pre = nfa->pre;
519 struct state *s;
520 struct state *s2;
521 struct state *slist;
522
523 /* no loops are needed if it's anchored */
3ca4086b 524 for (a = pre->outs; a != NULL; a = a->outchain) {
c0a7edde
RN
525 assert(a->type == PLAIN);
526 if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
527 break;
5a419892 528 }
3ca4086b 529 if (a != NULL) {
c0a7edde
RN
530 /* add implicit .* in front */
531 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
532
533 /* and ^* and \A* too -- not always necessary, but harmless */
534 newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
535 newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
5a419892
VZ
536 }
537
c0a7edde
RN
538 /*
539 * Now here's the subtle part. Because many REs have no lookback
3ca4086b
VS
540 * constraints, often knowing when you were in the pre state tells
541 * you little; it's the next state(s) that are informative. But
542 * some of them may have other inarcs, i.e. it may be possible to
543 * make actual progress and then return to one of them. We must
544 * de-optimize such cases, splitting each such state into progress
545 * and no-progress states.
546 */
c0a7edde
RN
547
548 /* first, make a list of the states */
549 slist = NULL;
3ca4086b 550 for (a = pre->outs; a != NULL; a = a->outchain) {
c0a7edde
RN
551 s = a->to;
552 for (b = s->ins; b != NULL; b = b->inchain)
553 if (b->from != pre)
3ca4086b
VS
554 break;
555 if (b != NULL) { /* must be split */
556 if (s->tmp == NULL) { /* if not already in the list */
557 /* (fixes bugs 505048, 230589, */
558 /* 840258, 504785) */
559 s->tmp = slist;
560 slist = s;
561 }
5a419892 562 }
3ca4086b 563 }
5a419892 564
c0a7edde 565 /* do the splits */
3ca4086b 566 for (s = slist; s != NULL; s = s2) {
c0a7edde
RN
567 s2 = newstate(nfa);
568 copyouts(nfa, s, s2);
3ca4086b 569 for (a = s->ins; a != NULL; a = b) {
c0a7edde 570 b = a->inchain;
3ca4086b 571 if (a->from != pre) {
c0a7edde
RN
572 cparc(nfa, a, a->from, s2);
573 freearc(nfa, a);
574 }
575 }
576 s2 = s->tmp;
3ca4086b 577 s->tmp = NULL; /* clean up while we're at it */
c0a7edde 578 }
5a419892
VZ
579}
580
581/*
3ca4086b 582 - parse - parse an RE
c0a7edde 583 * This is actually just the top level, which parses a bunch of branches
3ca4086b 584 * tied together with '|'. They appear in the tree as the left children
c0a7edde 585 * of a chain of '|' subres.
3ca4086b
VS
586 ^ static struct subre *parse(struct vars *, int, int, struct state *,
587 ^ struct state *);
5a419892 588 */
c0a7edde 589static struct subre *
3ca4086b
VS
590parse(v, stopper, type, init, final)
591struct vars *v;
592int stopper; /* EOS or ')' */
593int type; /* LACON (lookahead subRE) or PLAIN */
594struct state *init; /* initial state */
595struct state *final; /* final state */
5a419892 596{
3ca4086b 597 struct state *left; /* scaffolding for branch */
c0a7edde 598 struct state *right;
3ca4086b
VS
599 struct subre *branches; /* top level */
600 struct subre *branch; /* current branch */
601 struct subre *t; /* temporary */
602 int firstbranch; /* is this the first branch? */
c0a7edde
RN
603
604 assert(stopper == ')' || stopper == EOS);
605
606 branches = subre(v, '|', LONGER, init, final);
607 NOERRN();
608 branch = branches;
609 firstbranch = 1;
3ca4086b
VS
610 do { /* a branch */
611 if (!firstbranch) {
c0a7edde
RN
612 /* need a place to hang it */
613 branch->right = subre(v, '|', LONGER, init, final);
614 NOERRN();
615 branch = branch->right;
616 }
617 firstbranch = 0;
618 left = newstate(v->nfa);
619 right = newstate(v->nfa);
620 NOERRN();
621 EMPTYARC(init, left);
622 EMPTYARC(right, final);
623 NOERRN();
624 branch->left = parsebranch(v, stopper, type, left, right, 0);
625 NOERRN();
626 branch->flags |= UP(branch->flags | branch->left->flags);
3ca4086b 627 if ((branch->flags &~ branches->flags) != 0) /* new flags */
c0a7edde
RN
628 for (t = branches; t != branch; t = t->right)
629 t->flags |= branch->flags;
630 } while (EAT('|'));
631 assert(SEE(stopper) || SEE(EOS));
632
3ca4086b 633 if (!SEE(stopper)) {
c0a7edde
RN
634 assert(stopper == ')' && SEE(EOS));
635 ERR(REG_EPAREN);
636 }
5a419892 637
c0a7edde 638 /* optimize out simple cases */
3ca4086b 639 if (branch == branches) { /* only one branch */
c0a7edde
RN
640 assert(branch->right == NULL);
641 t = branch->left;
642 branch->left = NULL;
643 freesubre(v, branches);
644 branches = t;
3ca4086b 645 } else if (!MESSY(branches->flags)) { /* no interesting innards */
c0a7edde
RN
646 freesubre(v, branches->left);
647 branches->left = NULL;
648 freesubre(v, branches->right);
649 branches->right = NULL;
650 branches->op = '=';
5a419892
VZ
651 }
652
c0a7edde 653 return branches;
5a419892
VZ
654}
655
656/*
3ca4086b 657 - parsebranch - parse one branch of an RE
c0a7edde
RN
658 * This mostly manages concatenation, working closely with parseqatom().
659 * Concatenated things are bundled up as much as possible, with separate
660 * ',' nodes introduced only when necessary due to substructure.
3ca4086b
VS
661 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
662 ^ struct state *, int);
5a419892 663 */
c0a7edde 664static struct subre *
3ca4086b
VS
665parsebranch(v, stopper, type, left, right, partial)
666struct vars *v;
667int stopper; /* EOS or ')' */
668int type; /* LACON (lookahead subRE) or PLAIN */
669struct state *left; /* leftmost state */
670struct state *right; /* rightmost state */
671int partial; /* is this only part of a branch? */
5a419892 672{
3ca4086b
VS
673 struct state *lp; /* left end of current construct */
674 int seencontent; /* is there anything in this branch yet? */
c0a7edde
RN
675 struct subre *t;
676
677 lp = left;
678 seencontent = 0;
679 t = subre(v, '=', 0, left, right); /* op '=' is tentative */
680 NOERRN();
3ca4086b
VS
681 while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
682 if (seencontent) { /* implicit concat operator */
c0a7edde
RN
683 lp = newstate(v->nfa);
684 NOERRN();
685 moveins(v->nfa, right, lp);
3ca4086b 686 }
c0a7edde 687 seencontent = 1;
5a419892 688
c0a7edde
RN
689 /* NB, recursion in parseqatom() may swallow rest of branch */
690 parseqatom(v, stopper, type, lp, right, t);
5a419892 691 }
5a419892 692
3ca4086b 693 if (!seencontent) { /* empty branch */
c0a7edde
RN
694 if (!partial)
695 NOTE(REG_UUNSPEC);
696 assert(lp == left);
697 EMPTYARC(left, right);
5a419892
VZ
698 }
699
c0a7edde 700 return t;
5a419892
VZ
701}
702
703/*
3ca4086b 704 - parseqatom - parse one quantified atom or constraint of an RE
c0a7edde
RN
705 * The bookkeeping near the end cooperates very closely with parsebranch();
706 * in particular, it contains a recursion that can involve parsing the rest
707 * of the branch, making this function's name somewhat inaccurate.
3ca4086b
VS
708 ^ static VOID parseqatom(struct vars *, int, int, struct state *,
709 ^ struct state *, struct subre *);
5a419892 710 */
3ca4086b
VS
711static VOID
712parseqatom(v, stopper, type, lp, rp, top)
713struct vars *v;
714int stopper; /* EOS or ')' */
715int type; /* LACON (lookahead subRE) or PLAIN */
716struct state *lp; /* left state to hang it on */
717struct state *rp; /* right state to hang it on */
718struct subre *top; /* subtree top */
5a419892 719{
3ca4086b 720 struct state *s; /* temporaries for new states */
c0a7edde 721 struct state *s2;
3ca4086b
VS
722# define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
723 int m, n;
724 struct subre *atom; /* atom's subtree */
c0a7edde 725 struct subre *t;
3ca4086b
VS
726 int cap; /* capturing parens? */
727 int pos; /* positive lookahead? */
728 int subno; /* capturing-parens or backref number */
729 int atomtype;
730 int qprefer; /* quantifier short/long preference */
731 int f;
732 struct subre **atomp; /* where the pointer to atom is */
c0a7edde
RN
733
734 /* initial bookkeeping */
735 atom = NULL;
3ca4086b
VS
736 assert(lp->nouts == 0); /* must string new code */
737 assert(rp->nins == 0); /* between lp and rp */
738 subno = 0; /* just to shut lint up */
c0a7edde
RN
739
740 /* an atom or constraint... */
741 atomtype = v->nexttype;
3ca4086b
VS
742 switch (atomtype) {
743 /* first, constraints, which end by returning */
744 case '^':
745 ARCV('^', 1);
746 if (v->cflags&REG_NLANCH)
747 ARCV(BEHIND, v->nlcolor);
748 NEXT();
749 return;
5a419892 750 break;
3ca4086b
VS
751 case '$':
752 ARCV('$', 1);
753 if (v->cflags&REG_NLANCH)
754 ARCV(AHEAD, v->nlcolor);
755 NEXT();
756 return;
757 break;
758 case SBEGIN:
759 ARCV('^', 1); /* BOL */
760 ARCV('^', 0); /* or BOS */
761 NEXT();
762 return;
763 break;
764 case SEND:
765 ARCV('$', 1); /* EOL */
766 ARCV('$', 0); /* or EOS */
767 NEXT();
768 return;
769 break;
770 case '<':
771 wordchrs(v); /* does NEXT() */
772 s = newstate(v->nfa);
773 NOERR();
774 nonword(v, BEHIND, lp, s);
775 word(v, AHEAD, s, rp);
776 return;
777 break;
778 case '>':
779 wordchrs(v); /* does NEXT() */
780 s = newstate(v->nfa);
781 NOERR();
782 word(v, BEHIND, lp, s);
783 nonword(v, AHEAD, s, rp);
784 return;
785 break;
786 case WBDRY:
787 wordchrs(v); /* does NEXT() */
788 s = newstate(v->nfa);
789 NOERR();
790 nonword(v, BEHIND, lp, s);
791 word(v, AHEAD, s, rp);
792 s = newstate(v->nfa);
793 NOERR();
794 word(v, BEHIND, lp, s);
795 nonword(v, AHEAD, s, rp);
796 return;
797 break;
798 case NWBDRY:
799 wordchrs(v); /* does NEXT() */
800 s = newstate(v->nfa);
801 NOERR();
802 word(v, BEHIND, lp, s);
803 word(v, AHEAD, s, rp);
804 s = newstate(v->nfa);
805 NOERR();
806 nonword(v, BEHIND, lp, s);
807 nonword(v, AHEAD, s, rp);
808 return;
809 break;
810 case LACON: /* lookahead constraint */
811 pos = v->nextvalue;
812 NEXT();
813 s = newstate(v->nfa);
814 s2 = newstate(v->nfa);
815 NOERR();
816 t = parse(v, ')', LACON, s, s2);
817 freesubre(v, t); /* internal structure irrelevant */
818 assert(SEE(')') || ISERR());
819 NEXT();
820 n = newlacon(v, s, s2, pos);
821 NOERR();
822 ARCV(LACON, n);
823 return;
824 break;
825 /* then errors, to get them out of the way */
826 case '*':
827 case '+':
828 case '?':
829 case '{':
830 ERR(REG_BADRPT);
831 return;
5a419892
VZ
832 break;
833 default:
3ca4086b
VS
834 ERR(REG_ASSERT);
835 return;
836 break;
837 /* then plain characters, and minor variants on that theme */
838 case ')': /* unbalanced paren */
839 if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
840 ERR(REG_EPAREN);
c0a7edde 841 return;
3ca4086b
VS
842 }
843 /* legal in EREs due to specification botch */
844 NOTE(REG_UPBOTCH);
845 /* fallthrough into case PLAIN */
846 case PLAIN:
847 onechr(v, v->nextvalue, lp, rp);
848 okcolors(v->nfa, v->cm);
849 NOERR();
850 NEXT();
5a419892 851 break;
3ca4086b
VS
852 case '[':
853 if (v->nextvalue == 1)
854 bracket(v, lp, rp);
855 else
856 cbracket(v, lp, rp);
857 assert(SEE(']') || ISERR());
858 NEXT();
5a419892 859 break;
3ca4086b
VS
860 case '.':
861 rainbow(v->nfa, v->cm, PLAIN,
862 (v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS,
863 lp, rp);
864 NEXT();
5a419892 865 break;
3ca4086b
VS
866 /* and finally the ugly stuff */
867 case '(': /* value flags as capturing or non */
868 cap = (type == LACON) ? 0 : v->nextvalue;
869 if (cap) {
870 v->nsubexp++;
871 subno = v->nsubexp;
872 if ((size_t)subno >= v->nsubs)
873 moresubs(v, subno);
874 assert((size_t)subno < v->nsubs);
875 } else
876 atomtype = PLAIN; /* something that's not '(' */
877 NEXT();
878 /* need new endpoints because tree will contain pointers */
879 s = newstate(v->nfa);
880 s2 = newstate(v->nfa);
881 NOERR();
882 EMPTYARC(lp, s);
883 EMPTYARC(s2, rp);
884 NOERR();
885 atom = parse(v, ')', PLAIN, s, s2);
886 assert(SEE(')') || ISERR());
887 NEXT();
888 NOERR();
889 if (cap) {
890 v->subs[subno] = atom;
891 t = subre(v, '(', atom->flags|CAP, lp, rp);
c0a7edde 892 NOERR();
3ca4086b
VS
893 t->subno = subno;
894 t->left = atom;
895 atom = t;
896 }
897 /* postpone everything else pending possible {0} */
898 break;
899 case BACKREF: /* the Feature From The Black Lagoon */
900 INSIST(type != LACON, REG_ESUBREG);
901 INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
902 INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
903 NOERR();
904 assert(v->nextvalue > 0);
905 atom = subre(v, 'b', BACKR, lp, rp);
906 subno = v->nextvalue;
907 atom->subno = subno;
908 EMPTYARC(lp, rp); /* temporarily, so there's something */
909 NEXT();
5a419892
VZ
910 break;
911 }
5a419892 912
c0a7edde 913 /* ...and an atom may be followed by a quantifier */
3ca4086b
VS
914 switch (v->nexttype) {
915 case '*':
916 m = 0;
917 n = INFINITY;
918 qprefer = (v->nextvalue) ? LONGER : SHORTER;
5a419892 919 NEXT();
3ca4086b
VS
920 break;
921 case '+':
922 m = 1;
923 n = INFINITY;
924 qprefer = (v->nextvalue) ? LONGER : SHORTER;
925 NEXT();
926 break;
927 case '?':
928 m = 0;
929 n = 1;
930 qprefer = (v->nextvalue) ? LONGER : SHORTER;
931 NEXT();
932 break;
933 case '{':
934 NEXT();
935 m = scannum(v);
936 if (EAT(',')) {
937 if (SEE(DIGIT))
938 n = scannum(v);
c0a7edde 939 else
3ca4086b
VS
940 n = INFINITY;
941 if (m > n) {
c0a7edde
RN
942 ERR(REG_BADBR);
943 return;
944 }
3ca4086b
VS
945 /* {m,n} exercises preference, even if it's {m,m} */
946 qprefer = (v->nextvalue) ? LONGER : SHORTER;
947 } else {
948 n = m;
949 /* {m} passes operand's preference through */
c0a7edde 950 qprefer = 0;
3ca4086b
VS
951 }
952 if (!SEE('}')) { /* catches errors too */
953 ERR(REG_BADBR);
954 return;
955 }
956 NEXT();
957 break;
958 default: /* no quantifier */
959 m = n = 1;
960 qprefer = 0;
961 break;
c0a7edde 962 }
5a419892 963
c0a7edde 964 /* annoying special case: {0} or {0,0} cancels everything */
3ca4086b 965 if (m == 0 && n == 0) {
c0a7edde
RN
966 if (atom != NULL)
967 freesubre(v, atom);
968 if (atomtype == '(')
969 v->subs[subno] = NULL;
970 delsub(v->nfa, lp, rp);
971 EMPTYARC(lp, rp);
972 return;
973 }
5a419892 974
c0a7edde
RN
975 /* if not a messy case, avoid hard part */
976 assert(!MESSY(top->flags));
977 f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
3ca4086b 978 if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
c0a7edde
RN
979 if (!(m == 1 && n == 1))
980 repeat(v, lp, rp, m, n);
981 if (atom != NULL)
982 freesubre(v, atom);
983 top->flags = f;
984 return;
985 }
986
987 /*
3ca4086b
VS
988 * hard part: something messy
989 * That is, capturing parens, back reference, short/long clash, or
990 * an atom with substructure containing one of those.
991 */
5a419892 992
c0a7edde 993 /* now we'll need a subre for the contents even if they're boring */
3ca4086b 994 if (atom == NULL) {
c0a7edde
RN
995 atom = subre(v, '=', 0, lp, rp);
996 NOERR();
997 }
5a419892 998
c0a7edde
RN
999 /*
1000 * prepare a general-purpose state skeleton
1001 *
3ca4086b
VS
1002 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1003 * / /
c0a7edde
RN
1004 * [lp] ----> [s2] ----bypass---------------------
1005 *
1006 * where bypass is an empty, and prefix is some repetitions of atom
3ca4086b 1007 */
c0a7edde
RN
1008 s = newstate(v->nfa); /* first, new endpoints for the atom */
1009 s2 = newstate(v->nfa);
1010 NOERR();
1011 moveouts(v->nfa, lp, s);
1012 moveins(v->nfa, rp, s2);
1013 NOERR();
1014 atom->begin = s;
1015 atom->end = s2;
1016 s = newstate(v->nfa); /* and spots for prefix and bypass */
1017 s2 = newstate(v->nfa);
1018 NOERR();
1019 EMPTYARC(lp, s);
1020 EMPTYARC(lp, s2);
1021 NOERR();
1022
1023 /* break remaining subRE into x{...} and what follows */
1024 t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1025 t->left = atom;
1026 atomp = &t->left;
1027 /* here we should recurse... but we must postpone that to the end */
1028
1029 /* split top into prefix and remaining */
1030 assert(top->op == '=' && top->left == NULL && top->right == NULL);
1031 top->left = subre(v, '=', top->flags, top->begin, lp);
1032 top->op = '.';
1033 top->right = t;
1034
1035 /* if it's a backref, now is the time to replicate the subNFA */
3ca4086b
VS
1036 if (atomtype == BACKREF) {
1037 assert(atom->begin->nouts == 1); /* just the EMPTY */
c0a7edde
RN
1038 delsub(v->nfa, atom->begin, atom->end);
1039 assert(v->subs[subno] != NULL);
1040 /* and here's why the recursion got postponed: it must */
1041 /* wait until the skeleton is filled in, because it may */
1042 /* hit a backref that wants to copy the filled-in skeleton */
1043 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
3ca4086b 1044 atom->begin, atom->end);
c0a7edde
RN
1045 NOERR();
1046 }
1047
1048 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
3ca4086b
VS
1049 if (m == 0) {
1050 EMPTYARC(s2, atom->end); /* the bypass */
c0a7edde
RN
1051 assert(PREF(qprefer) != 0);
1052 f = COMBINE(qprefer, atom->flags);
1053 t = subre(v, '|', f, lp, atom->end);
1054 NOERR();
1055 t->left = atom;
1056 t->right = subre(v, '|', PREF(f), s2, atom->end);
1057 NOERR();
1058 t->right->left = subre(v, '=', 0, s2, atom->end);
1059 NOERR();
1060 *atomp = t;
1061 atomp = &t->left;
1062 m = 1;
1063 }
5a419892 1064
c0a7edde 1065 /* deal with the rest of the quantifier */
3ca4086b 1066 if (atomtype == BACKREF) {
c0a7edde 1067 /* special case: backrefs have internal quantifiers */
3ca4086b 1068 EMPTYARC(s, atom->begin); /* empty prefix */
c0a7edde
RN
1069 /* just stuff everything into atom */
1070 repeat(v, atom->begin, atom->end, m, n);
3ca4086b
VS
1071 atom->min = (short)m;
1072 atom->max = (short)n;
c0a7edde 1073 atom->flags |= COMBINE(qprefer, atom->flags);
3ca4086b 1074 } else if (m == 1 && n == 1) {
c0a7edde 1075 /* no/vacuous quantifier: done */
3ca4086b
VS
1076 EMPTYARC(s, atom->begin); /* empty prefix */
1077 } else {
c0a7edde 1078 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
3ca4086b 1079 /* parens in only second x */
c0a7edde
RN
1080 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1081 assert(m >= 1 && m != INFINITY && n >= 1);
3ca4086b 1082 repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
c0a7edde 1083 f = COMBINE(qprefer, atom->flags);
3ca4086b 1084 t = subre(v, '.', f, s, atom->end); /* prefix and atom */
c0a7edde
RN
1085 NOERR();
1086 t->left = subre(v, '=', PREF(f), s, atom->begin);
1087 NOERR();
1088 t->right = atom;
1089 *atomp = t;
1090 }
5a419892 1091
c0a7edde
RN
1092 /* and finally, look after that postponed recursion */
1093 t = top->right;
1094 if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1095 t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
3ca4086b 1096 else {
c0a7edde
RN
1097 EMPTYARC(atom->end, rp);
1098 t->right = subre(v, '=', 0, atom->end, rp);
1099 }
1100 assert(SEE('|') || SEE(stopper) || SEE(EOS));
1101 t->flags |= COMBINE(t->flags, t->right->flags);
1102 top->flags |= COMBINE(top->flags, t->flags);
5a419892
VZ
1103}
1104
1105/*
3ca4086b
VS
1106 - nonword - generate arcs for non-word-character ahead or behind
1107 ^ static VOID nonword(struct vars *, int, struct state *, struct state *);
5a419892 1108 */
3ca4086b
VS
1109static VOID
1110nonword(v, dir, lp, rp)
1111struct vars *v;
1112int dir; /* AHEAD or BEHIND */
1113struct state *lp;
1114struct state *rp;
5a419892 1115{
3ca4086b 1116 int anchor = (dir == AHEAD) ? '$' : '^';
5a419892 1117
c0a7edde
RN
1118 assert(dir == AHEAD || dir == BEHIND);
1119 newarc(v->nfa, anchor, 1, lp, rp);
1120 newarc(v->nfa, anchor, 0, lp, rp);
1121 colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1122 /* (no need for special attention to \n) */
5a419892
VZ
1123}
1124
1125/*
3ca4086b
VS
1126 - word - generate arcs for word character ahead or behind
1127 ^ static VOID word(struct vars *, int, struct state *, struct state *);
5a419892 1128 */
3ca4086b
VS
1129static VOID
1130word(v, dir, lp, rp)
1131struct vars *v;
1132int dir; /* AHEAD or BEHIND */
1133struct state *lp;
1134struct state *rp;
5a419892 1135{
c0a7edde
RN
1136 assert(dir == AHEAD || dir == BEHIND);
1137 cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1138 /* (no need for special attention to \n) */
5a419892
VZ
1139}
1140
1141/*
3ca4086b
VS
1142 - scannum - scan a number
1143 ^ static int scannum(struct vars *);
5a419892 1144 */
3ca4086b
VS
1145static int /* value, <= DUPMAX */
1146scannum(v)
1147struct vars *v;
5a419892 1148{
3ca4086b 1149 int n = 0;
c0a7edde 1150
3ca4086b
VS
1151 while (SEE(DIGIT) && n < DUPMAX) {
1152 n = n*10 + v->nextvalue;
c0a7edde
RN
1153 NEXT();
1154 }
3ca4086b 1155 if (SEE(DIGIT) || n > DUPMAX) {
c0a7edde
RN
1156 ERR(REG_BADBR);
1157 return 0;
1158 }
1159 return n;
5a419892
VZ
1160}
1161
1162/*
3ca4086b 1163 - repeat - replicate subNFA for quantifiers
c0a7edde
RN
1164 * The duplication sequences used here are chosen carefully so that any
1165 * pointers starting out pointing into the subexpression end up pointing into
1166 * the last occurrence. (Note that it may not be strung between the same
1167 * left and right end states, however!) This used to be important for the
1168 * subRE tree, although the important bits are now handled by the in-line
1169 * code in parse(), and when this is called, it doesn't matter any more.
3ca4086b 1170 ^ static VOID repeat(struct vars *, struct state *, struct state *, int, int);
5a419892 1171 */
3ca4086b
VS
1172static VOID
1173repeat(v, lp, rp, m, n)
1174struct vars *v;
1175struct state *lp;
1176struct state *rp;
1177int m;
1178int n;
5a419892 1179{
3ca4086b
VS
1180# define SOME 2
1181# define INF 3
1182# define PAIR(x, y) ((x)*4 + (y))
1183# define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1184 CONST int rm = REDUCE(m);
1185 CONST int rn = REDUCE(n);
c0a7edde
RN
1186 struct state *s;
1187 struct state *s2;
1188
3ca4086b
VS
1189 switch (PAIR(rm, rn)) {
1190 case PAIR(0, 0): /* empty string */
1191 delsub(v->nfa, lp, rp);
1192 EMPTYARC(lp, rp);
1193 break;
1194 case PAIR(0, 1): /* do as x| */
1195 EMPTYARC(lp, rp);
1196 break;
1197 case PAIR(0, SOME): /* do as x{1,n}| */
1198 repeat(v, lp, rp, 1, n);
1199 NOERR();
1200 EMPTYARC(lp, rp);
1201 break;
1202 case PAIR(0, INF): /* loop x around */
1203 s = newstate(v->nfa);
1204 NOERR();
1205 moveouts(v->nfa, lp, s);
1206 moveins(v->nfa, rp, s);
1207 EMPTYARC(lp, s);
1208 EMPTYARC(s, rp);
1209 break;
1210 case PAIR(1, 1): /* no action required */
1211 break;
1212 case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1213 s = newstate(v->nfa);
1214 NOERR();
1215 moveouts(v->nfa, lp, s);
1216 dupnfa(v->nfa, s, rp, lp, s);
1217 NOERR();
1218 repeat(v, lp, s, 1, n-1);
1219 NOERR();
1220 EMPTYARC(lp, s);
1221 break;
1222 case PAIR(1, INF): /* add loopback arc */
1223 s = newstate(v->nfa);
1224 s2 = newstate(v->nfa);
1225 NOERR();
1226 moveouts(v->nfa, lp, s);
1227 moveins(v->nfa, rp, s2);
1228 EMPTYARC(lp, s);
1229 EMPTYARC(s2, rp);
1230 EMPTYARC(s2, s);
1231 break;
1232 case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1233 s = newstate(v->nfa);
1234 NOERR();
1235 moveouts(v->nfa, lp, s);
1236 dupnfa(v->nfa, s, rp, lp, s);
1237 NOERR();
1238 repeat(v, lp, s, m-1, n-1);
1239 break;
1240 case PAIR(SOME, INF): /* do as x{m-1,}x */
1241 s = newstate(v->nfa);
1242 NOERR();
1243 moveouts(v->nfa, lp, s);
1244 dupnfa(v->nfa, s, rp, lp, s);
1245 NOERR();
1246 repeat(v, lp, s, m-1, n);
1247 break;
1248 default:
1249 ERR(REG_ASSERT);
1250 break;
5a419892
VZ
1251 }
1252}
1253
1254/*
3ca4086b 1255 - bracket - handle non-complemented bracket expression
c0a7edde 1256 * Also called from cbracket for complemented bracket expressions.
3ca4086b 1257 ^ static VOID bracket(struct vars *, struct state *, struct state *);
5a419892 1258 */
3ca4086b
VS
1259static VOID
1260bracket(v, lp, rp)
1261struct vars *v;
1262struct state *lp;
1263struct state *rp;
5a419892 1264{
c0a7edde
RN
1265 assert(SEE('['));
1266 NEXT();
1267 while (!SEE(']') && !SEE(EOS))
1268 brackpart(v, lp, rp);
1269 assert(SEE(']') || ISERR());
1270 okcolors(v->nfa, v->cm);
5a419892
VZ
1271}
1272
1273/*
3ca4086b 1274 - cbracket - handle complemented bracket expression
c0a7edde 1275 * We do it by calling bracket() with dummy endpoints, and then complementing
3ca4086b 1276 * the result. The alternative would be to invoke rainbow(), and then delete
c0a7edde 1277 * arcs as the b.e. is seen... but that gets messy.
3ca4086b 1278 ^ static VOID cbracket(struct vars *, struct state *, struct state *);
5a419892 1279 */
3ca4086b
VS
1280static VOID
1281cbracket(v, lp, rp)
1282struct vars *v;
1283struct state *lp;
1284struct state *rp;
5a419892 1285{
c0a7edde
RN
1286 struct state *left = newstate(v->nfa);
1287 struct state *right = newstate(v->nfa);
1288 struct state *s;
3ca4086b
VS
1289 struct arc *a; /* arc from lp */
1290 struct arc *ba; /* arc from left, from bracket() */
1291 struct arc *pa; /* MCCE-prototype arc */
1292 color co;
1293 chr *p;
1294 int i;
c0a7edde
RN
1295
1296 NOERR();
1297 bracket(v, left, right);
3ca4086b 1298 if (v->cflags&REG_NLSTOP)
c0a7edde
RN
1299 newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1300 NOERR();
1301
1302 assert(lp->nouts == 0); /* all outarcs will be ours */
1303
1304 /* easy part of complementing */
1305 colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1306 NOERR();
3ca4086b 1307 if (v->mcces == NULL) { /* no MCCEs -- we're done */
c0a7edde
RN
1308 dropstate(v->nfa, left);
1309 assert(right->nins == 0);
1310 freestate(v->nfa, right);
5a419892 1311 return;
c0a7edde
RN
1312 }
1313
1314 /* but complementing gets messy in the presence of MCCEs... */
1315 NOTE(REG_ULOCALE);
3ca4086b 1316 for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) {
c0a7edde
RN
1317 co = GETCOLOR(v->cm, *p);
1318 a = findarc(lp, PLAIN, co);
1319 ba = findarc(left, PLAIN, co);
3ca4086b 1320 if (ba == NULL) {
c0a7edde
RN
1321 assert(a != NULL);
1322 freearc(v->nfa, a);
3ca4086b 1323 } else {
c0a7edde 1324 assert(a == NULL);
3ca4086b 1325 }
c0a7edde
RN
1326 s = newstate(v->nfa);
1327 NOERR();
1328 newarc(v->nfa, PLAIN, co, lp, s);
1329 NOERR();
1330 pa = findarc(v->mccepbegin, PLAIN, co);
1331 assert(pa != NULL);
3ca4086b 1332 if (ba == NULL) { /* easy case, need all of them */
c0a7edde
RN
1333 cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1334 newarc(v->nfa, '$', 1, s, rp);
1335 newarc(v->nfa, '$', 0, s, rp);
1336 colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
3ca4086b
VS
1337 } else { /* must be selective */
1338 if (findarc(ba->to, '$', 1) == NULL) {
c0a7edde
RN
1339 newarc(v->nfa, '$', 1, s, rp);
1340 newarc(v->nfa, '$', 0, s, rp);
1341 colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
3ca4086b 1342 s, rp);
c0a7edde
RN
1343 }
1344 for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1345 if (findarc(ba->to, PLAIN, pa->co) == NULL)
1346 newarc(v->nfa, PLAIN, pa->co, s, rp);
1347 if (s->nouts == 0) /* limit of selectivity: none */
1348 dropstate(v->nfa, s); /* frees arc too */
1349 }
1350 NOERR();
1351 }
5a419892 1352
c0a7edde
RN
1353 delsub(v->nfa, left, right);
1354 assert(left->nouts == 0);
1355 freestate(v->nfa, left);
1356 assert(right->nins == 0);
1357 freestate(v->nfa, right);
1358}
3ca4086b 1359
c0a7edde 1360/*
3ca4086b
VS
1361 - brackpart - handle one item (or range) within a bracket expression
1362 ^ static VOID brackpart(struct vars *, struct state *, struct state *);
c0a7edde 1363 */
3ca4086b
VS
1364static VOID
1365brackpart(v, lp, rp)
1366struct vars *v;
1367struct state *lp;
1368struct state *rp;
c0a7edde 1369{
3ca4086b
VS
1370 celt startc;
1371 celt endc;
c0a7edde 1372 struct cvec *cv;
3ca4086b
VS
1373 chr *startp;
1374 chr *endp;
1375 chr c[1];
c0a7edde
RN
1376
1377 /* parse something, get rid of special cases, take shortcuts */
3ca4086b
VS
1378 switch (v->nexttype) {
1379 case RANGE: /* a-b-c or other botch */
1380 ERR(REG_ERANGE);
1381 return;
1382 break;
1383 case PLAIN:
1384 c[0] = v->nextvalue;
1385 NEXT();
1386 /* shortcut for ordinary chr (not range, not MCCE leader) */
1387 if (!SEE(RANGE) && !ISCELEADER(v, c[0])) {
1388 onechr(v, c[0], lp, rp);
c0a7edde 1389 return;
3ca4086b
VS
1390 }
1391 startc = element(v, c, c+1);
1392 NOERR();
1393 break;
1394 case COLLEL:
1395 startp = v->now;
1396 endp = scanplain(v);
1397 INSIST(startp < endp, REG_ECOLLATE);
1398 NOERR();
1399 startc = element(v, startp, endp);
1400 NOERR();
1401 break;
1402 case ECLASS:
1403 startp = v->now;
1404 endp = scanplain(v);
1405 INSIST(startp < endp, REG_ECOLLATE);
1406 NOERR();
1407 startc = element(v, startp, endp);
1408 NOERR();
1409 cv = eclass(v, startc, (v->cflags&REG_ICASE));
1410 NOERR();
1411 dovec(v, cv, lp, rp);
1412 return;
5a419892 1413 break;
3ca4086b
VS
1414 case CCLASS:
1415 startp = v->now;
1416 endp = scanplain(v);
1417 INSIST(startp < endp, REG_ECTYPE);
1418 NOERR();
1419 cv = cclass(v, startp, endp, (v->cflags&REG_ICASE));
1420 NOERR();
1421 dovec(v, cv, lp, rp);
1422 return;
1423 break;
1424 default:
1425 ERR(REG_ASSERT);
1426 return;
1427 break;
1428 }
1429
1430 if (SEE(RANGE)) {
1431 NEXT();
1432 switch (v->nexttype) {
c0a7edde 1433 case PLAIN:
3ca4086b 1434 case RANGE:
c0a7edde
RN
1435 c[0] = v->nextvalue;
1436 NEXT();
3ca4086b 1437 endc = element(v, c, c+1);
c0a7edde 1438 NOERR();
3ca4086b 1439 break;
c0a7edde
RN
1440 case COLLEL:
1441 startp = v->now;
1442 endp = scanplain(v);
1443 INSIST(startp < endp, REG_ECOLLATE);
1444 NOERR();
3ca4086b 1445 endc = element(v, startp, endp);
c0a7edde
RN
1446 NOERR();
1447 break;
c0a7edde 1448 default:
3ca4086b 1449 ERR(REG_ERANGE);
c0a7edde 1450 return;
3ca4086b
VS
1451 break;
1452 }
1453 } else
c0a7edde
RN
1454 endc = startc;
1455
1456 /*
3ca4086b
VS
1457 * Ranges are unportable. Actually, standard C does
1458 * guarantee that digits are contiguous, but making
1459 * that an exception is just too complicated.
1460 */
c0a7edde
RN
1461 if (startc != endc)
1462 NOTE(REG_UUNPORT);
3ca4086b 1463 cv = range(v, startc, endc, (v->cflags&REG_ICASE));
c0a7edde
RN
1464 NOERR();
1465 dovec(v, cv, lp, rp);
5a419892
VZ
1466}
1467
1468/*
3ca4086b 1469 - scanplain - scan PLAIN contents of [. etc.
c0a7edde
RN
1470 * Certain bits of trickery in lex.c know that this code does not try
1471 * to look past the final bracket of the [. etc.
3ca4086b 1472 ^ static chr *scanplain(struct vars *);
5a419892 1473 */
3ca4086b
VS
1474static chr * /* just after end of sequence */
1475scanplain(v)
1476struct vars *v;
5a419892 1477{
3ca4086b 1478 chr *endp;
c0a7edde
RN
1479
1480 assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1481 NEXT();
1482
1483 endp = v->now;
3ca4086b 1484 while (SEE(PLAIN)) {
c0a7edde
RN
1485 endp = v->now;
1486 NEXT();
1487 }
1488
1489 assert(SEE(END) || ISERR());
1490 NEXT();
1491
1492 return endp;
5a419892
VZ
1493}
1494
1495/*
3ca4086b 1496 - leaders - process a cvec of collating elements to also include leaders
c0a7edde
RN
1497 * Also gives all characters involved their own colors, which is almost
1498 * certainly necessary, and sets up little disconnected subNFA.
3ca4086b 1499 ^ static VOID leaders(struct vars *, struct cvec *);
5a419892 1500 */
3ca4086b
VS
1501static VOID
1502leaders(v, cv)
1503struct vars *v;
1504struct cvec *cv;
5a419892 1505{
3ca4086b
VS
1506 int mcce;
1507 chr *p;
1508 chr leader;
c0a7edde
RN
1509 struct state *s;
1510 struct arc *a;
1511
1512 v->mccepbegin = newstate(v->nfa);
1513 v->mccepend = newstate(v->nfa);
1514 NOERR();
1515
3ca4086b 1516 for (mcce = 0; mcce < cv->nmcces; mcce++) {
c0a7edde
RN
1517 p = cv->mcces[mcce];
1518 leader = *p;
3ca4086b 1519 if (!haschr(cv, leader)) {
c0a7edde
RN
1520 addchr(cv, leader);
1521 s = newstate(v->nfa);
1522 newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
3ca4086b 1523 v->mccepbegin, s);
c0a7edde 1524 okcolors(v->nfa, v->cm);
3ca4086b 1525 } else {
c0a7edde
RN
1526 a = findarc(v->mccepbegin, PLAIN,
1527 GETCOLOR(v->cm, leader));
1528 assert(a != NULL);
1529 s = a->to;
1530 assert(s != v->mccepend);
5a419892 1531 }
c0a7edde 1532 p++;
3ca4086b 1533 assert(*p != 0 && *(p+1) == 0); /* only 2-char MCCEs for now */
c0a7edde
RN
1534 newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1535 okcolors(v->nfa, v->cm);
5a419892 1536 }
5a419892
VZ
1537}
1538
1539/*
3ca4086b 1540 - onechr - fill in arcs for a plain character, and possible case complements
c0a7edde 1541 * This is mostly a shortcut for efficient handling of the common case.
3ca4086b 1542 ^ static VOID onechr(struct vars *, pchr, struct state *, struct state *);
5a419892 1543 */
3ca4086b
VS
1544static VOID
1545onechr(v, c, lp, rp)
1546struct vars *v;
1547pchr c;
1548struct state *lp;
1549struct state *rp;
5a419892 1550{
3ca4086b 1551 if (!(v->cflags&REG_ICASE)) {
c0a7edde
RN
1552 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1553 return;
1554 }
1555
1556 /* rats, need general case anyway... */
1557 dovec(v, allcases(v, c), lp, rp);
5a419892
VZ
1558}
1559
1560/*
3ca4086b 1561 - dovec - fill in arcs for each element of a cvec
c0a7edde 1562 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
3ca4086b
VS
1563 ^ static VOID dovec(struct vars *, struct cvec *, struct state *,
1564 ^ struct state *);
5a419892 1565 */
3ca4086b
VS
1566static VOID
1567dovec(v, cv, lp, rp)
1568struct vars *v;
1569struct cvec *cv;
1570struct state *lp;
1571struct state *rp;
5a419892 1572{
3ca4086b
VS
1573 chr ch, from, to;
1574 celt ce;
1575 chr *p;
1576 int i;
1577 color co;
c0a7edde
RN
1578 struct cvec *leads;
1579 struct arc *a;
3ca4086b 1580 struct arc *pa; /* arc in prototype */
c0a7edde 1581 struct state *s;
3ca4086b 1582 struct state *ps; /* state in prototype */
c0a7edde
RN
1583
1584 /* need a place to store leaders, if any */
3ca4086b 1585 if (nmcces(v) > 0) {
c0a7edde 1586 assert(v->mcces != NULL);
3ca4086b 1587 if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) {
c0a7edde
RN
1588 if (v->cv2 != NULL)
1589 free(v->cv2);
1590 v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1591 NOERR();
1592 leads = v->cv2;
3ca4086b 1593 } else
c0a7edde 1594 leads = clearcvec(v->cv2);
3ca4086b 1595 } else
c0a7edde
RN
1596 leads = NULL;
1597
1598 /* first, get the ordinary characters out of the way */
3ca4086b 1599 for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
c0a7edde
RN
1600 ch = *p;
1601 if (!ISCELEADER(v, ch))
1602 newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
3ca4086b 1603 else {
c0a7edde
RN
1604 assert(singleton(v->cm, ch));
1605 assert(leads != NULL);
1606 if (!haschr(leads, ch))
1607 addchr(leads, ch);
3ca4086b 1608 }
c0a7edde 1609 }
5a419892 1610
c0a7edde 1611 /* and the ranges */
3ca4086b 1612 for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
c0a7edde 1613 from = *p;
3ca4086b
VS
1614 to = *(p+1);
1615 while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) {
c0a7edde
RN
1616 if (from < ce)
1617 subrange(v, from, ce - 1, lp, rp);
1618 assert(singleton(v->cm, ce));
1619 assert(leads != NULL);
1620 if (!haschr(leads, ce))
1621 addchr(leads, ce);
1622 from = ce + 1;
1623 }
1624 if (from <= to)
1625 subrange(v, from, to, lp, rp);
5a419892
VZ
1626 }
1627
c0a7edde
RN
1628 if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1629 return;
5a419892 1630
c0a7edde
RN
1631 /* deal with the MCCE leaders */
1632 NOTE(REG_ULOCALE);
3ca4086b 1633 for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) {
c0a7edde
RN
1634 co = GETCOLOR(v->cm, *p);
1635 a = findarc(lp, PLAIN, co);
1636 if (a != NULL)
1637 s = a->to;
3ca4086b 1638 else {
c0a7edde
RN
1639 s = newstate(v->nfa);
1640 NOERR();
1641 newarc(v->nfa, PLAIN, co, lp, s);
1642 NOERR();
1643 }
1644 pa = findarc(v->mccepbegin, PLAIN, co);
1645 assert(pa != NULL);
1646 ps = pa->to;
1647 newarc(v->nfa, '$', 1, s, rp);
1648 newarc(v->nfa, '$', 0, s, rp);
1649 colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1650 NOERR();
1651 }
1652
1653 /* and the MCCEs */
3ca4086b 1654 for (i = 0; i < cv->nmcces; i++) {
c0a7edde
RN
1655 p = cv->mcces[i];
1656 assert(singleton(v->cm, *p));
3ca4086b 1657 if (!singleton(v->cm, *p)) {
c0a7edde
RN
1658 ERR(REG_ASSERT);
1659 return;
1660 }
1661 ch = *p++;
1662 co = GETCOLOR(v->cm, ch);
1663 a = findarc(lp, PLAIN, co);
1664 if (a != NULL)
1665 s = a->to;
3ca4086b 1666 else {
c0a7edde
RN
1667 s = newstate(v->nfa);
1668 NOERR();
1669 newarc(v->nfa, PLAIN, co, lp, s);
1670 NOERR();
1671 }
3ca4086b 1672 assert(*p != 0); /* at least two chars */
c0a7edde
RN
1673 assert(singleton(v->cm, *p));
1674 ch = *p++;
1675 co = GETCOLOR(v->cm, ch);
3ca4086b 1676 assert(*p == 0); /* and only two, for now */
c0a7edde
RN
1677 newarc(v->nfa, PLAIN, co, s, rp);
1678 NOERR();
1679 }
5a419892
VZ
1680}
1681
1682/*
3ca4086b
VS
1683 - nextleader - find next MCCE leader within range
1684 ^ static celt nextleader(struct vars *, pchr, pchr);
5a419892 1685 */
3ca4086b
VS
1686static celt /* NOCELT means none */
1687nextleader(v, from, to)
1688struct vars *v;
1689pchr from;
1690pchr to;
5a419892 1691{
3ca4086b
VS
1692 int i;
1693 chr *p;
1694 chr ch;
1695 celt it = NOCELT;
c0a7edde
RN
1696
1697 if (v->mcces == NULL)
1698 return it;
1699
3ca4086b 1700 for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) {
c0a7edde
RN
1701 ch = *p;
1702 if (from <= ch && ch <= to)
1703 if (it == NOCELT || ch < it)
1704 it = ch;
1705 }
1706 return it;
5a419892
VZ
1707}
1708
1709/*
3ca4086b 1710 - wordchrs - set up word-chr list for word-boundary stuff, if needed
c0a7edde
RN
1711 * The list is kept as a bunch of arcs between two dummy states; it's
1712 * disposed of by the unreachable-states sweep in NFA optimization.
1713 * Does NEXT(). Must not be called from any unusual lexical context.
1714 * This should be reconciled with the \w etc. handling in lex.c, and
1715 * should be cleaned up to reduce dependencies on input scanning.
3ca4086b 1716 ^ static VOID wordchrs(struct vars *);
5a419892 1717 */
3ca4086b
VS
1718static VOID
1719wordchrs(v)
1720struct vars *v;
5a419892 1721{
c0a7edde
RN
1722 struct state *left;
1723 struct state *right;
5a419892 1724
3ca4086b
VS
1725 if (v->wordchrs != NULL) {
1726 NEXT(); /* for consistency */
5a419892
VZ
1727 return;
1728 }
1729
c0a7edde
RN
1730 left = newstate(v->nfa);
1731 right = newstate(v->nfa);
1732 NOERR();
3ca4086b 1733 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
c0a7edde
RN
1734 lexword(v);
1735 NEXT();
1736 assert(v->savenow != NULL && SEE('['));
1737 bracket(v, left, right);
1738 assert((v->savenow != NULL && SEE(']')) || ISERR());
1739 NEXT();
1740 NOERR();
1741 v->wordchrs = left;
5a419892
VZ
1742}
1743
1744/*
3ca4086b
VS
1745 - subre - allocate a subre
1746 ^ static struct subre *subre(struct vars *, int, int, struct state *,
1747 ^ struct state *);
5a419892 1748 */
c0a7edde 1749static struct subre *
3ca4086b
VS
1750subre(v, op, flags, begin, end)
1751struct vars *v;
1752int op;
1753int flags;
1754struct state *begin;
1755struct state *end;
5a419892 1756{
c0a7edde 1757 struct subre *ret;
5a419892 1758
c0a7edde
RN
1759 ret = v->treefree;
1760 if (ret != NULL)
1761 v->treefree = ret->left;
3ca4086b
VS
1762 else {
1763 ret = (struct subre *)MALLOC(sizeof(struct subre));
1764 if (ret == NULL) {
c0a7edde
RN
1765 ERR(REG_ESPACE);
1766 return NULL;
3ca4086b 1767 }
c0a7edde
RN
1768 ret->chain = v->treechain;
1769 v->treechain = ret;
5a419892
VZ
1770 }
1771
c0a7edde 1772 assert(strchr("|.b(=", op) != NULL);
5a419892 1773
c0a7edde
RN
1774 ret->op = op;
1775 ret->flags = flags;
1776 ret->retry = 0;
1777 ret->subno = 0;
1778 ret->min = ret->max = 1;
1779 ret->left = NULL;
1780 ret->right = NULL;
1781 ret->begin = begin;
1782 ret->end = end;
1783 ZAPCNFA(ret->cnfa);
1784
1785 return ret;
5a419892
VZ
1786}
1787
1788/*
3ca4086b
VS
1789 - freesubre - free a subRE subtree
1790 ^ static VOID freesubre(struct vars *, struct subre *);
5a419892 1791 */
3ca4086b
VS
1792static VOID
1793freesubre(v, sr)
1794struct vars *v; /* might be NULL */
1795struct subre *sr;
5a419892 1796{
c0a7edde
RN
1797 if (sr == NULL)
1798 return;
1799
1800 if (sr->left != NULL)
1801 freesubre(v, sr->left);
1802 if (sr->right != NULL)
1803 freesubre(v, sr->right);
1804
1805 freesrnode(v, sr);
5a419892
VZ
1806}
1807
1808/*
3ca4086b
VS
1809 - freesrnode - free one node in a subRE subtree
1810 ^ static VOID freesrnode(struct vars *, struct subre *);
5a419892 1811 */
3ca4086b
VS
1812static VOID
1813freesrnode(v, sr)
1814struct vars *v; /* might be NULL */
1815struct subre *sr;
5a419892 1816{
c0a7edde
RN
1817 if (sr == NULL)
1818 return;
1819
1820 if (!NULLCNFA(sr->cnfa))
1821 freecnfa(&sr->cnfa);
1822 sr->flags = 0;
1823
3ca4086b 1824 if (v != NULL) {
c0a7edde
RN
1825 sr->left = v->treefree;
1826 v->treefree = sr;
3ca4086b 1827 } else
c0a7edde 1828 FREE(sr);
5a419892
VZ
1829}
1830
1831/*
3ca4086b
VS
1832 - optst - optimize a subRE subtree
1833 ^ static VOID optst(struct vars *, struct subre *);
5a419892 1834 */
3ca4086b
VS
1835static VOID
1836optst(v, t)
1837struct vars *v;
1838struct subre *t;
5a419892 1839{
c0a7edde
RN
1840 if (t == NULL)
1841 return;
1842
1843 /* recurse through children */
1844 if (t->left != NULL)
1845 optst(v, t->left);
1846 if (t->right != NULL)
1847 optst(v, t->right);
5a419892
VZ
1848}
1849
1850/*
3ca4086b
VS
1851 - numst - number tree nodes (assigning retry indexes)
1852 ^ static int numst(struct subre *, int);
5a419892 1853 */
3ca4086b
VS
1854static int /* next number */
1855numst(t, start)
1856struct subre *t;
1857int start; /* starting point for subtree numbers */
5a419892 1858{
3ca4086b 1859 int i;
c0a7edde
RN
1860
1861 assert(t != NULL);
1862
1863 i = start;
3ca4086b 1864 t->retry = (short)i++;
c0a7edde
RN
1865 if (t->left != NULL)
1866 i = numst(t->left, i);
1867 if (t->right != NULL)
1868 i = numst(t->right, i);
1869 return i;
5a419892
VZ
1870}
1871
1872/*
3ca4086b
VS
1873 - markst - mark tree nodes as INUSE
1874 ^ static VOID markst(struct subre *);
5a419892 1875 */
3ca4086b
VS
1876static VOID
1877markst(t)
1878struct subre *t;
5a419892 1879{
c0a7edde
RN
1880 assert(t != NULL);
1881
1882 t->flags |= INUSE;
1883 if (t->left != NULL)
1884 markst(t->left);
1885 if (t->right != NULL)
1886 markst(t->right);
5a419892
VZ
1887}
1888
1889/*
3ca4086b
VS
1890 - cleanst - free any tree nodes not marked INUSE
1891 ^ static VOID cleanst(struct vars *);
5a419892 1892 */
3ca4086b
VS
1893static VOID
1894cleanst(v)
1895struct vars *v;
5a419892 1896{
c0a7edde
RN
1897 struct subre *t;
1898 struct subre *next;
1899
3ca4086b 1900 for (t = v->treechain; t != NULL; t = next) {
c0a7edde 1901 next = t->chain;
3ca4086b 1902 if (!(t->flags&INUSE))
c0a7edde 1903 FREE(t);
3ca4086b 1904 }
c0a7edde 1905 v->treechain = NULL;
3ca4086b 1906 v->treefree = NULL; /* just on general principles */
5a419892
VZ
1907}
1908
1909/*
3ca4086b
VS
1910 - nfatree - turn a subRE subtree into a tree of compacted NFAs
1911 ^ static long nfatree(struct vars *, struct subre *, FILE *);
5a419892 1912 */
3ca4086b
VS
1913static long /* optimize results from top node */
1914nfatree(v, t, f)
1915struct vars *v;
1916struct subre *t;
1917FILE *f; /* for debug output */
5a419892 1918{
c0a7edde
RN
1919 assert(t != NULL && t->begin != NULL);
1920
1921 if (t->left != NULL)
3ca4086b 1922 (DISCARD)nfatree(v, t->left, f);
c0a7edde 1923 if (t->right != NULL)
3ca4086b 1924 (DISCARD)nfatree(v, t->right, f);
c0a7edde
RN
1925
1926 return nfanode(v, t, f);
5a419892
VZ
1927}
1928
1929/*
3ca4086b
VS
1930 - nfanode - do one NFA for nfatree
1931 ^ static long nfanode(struct vars *, struct subre *, FILE *);
5a419892 1932 */
3ca4086b
VS
1933static long /* optimize results */
1934nfanode(v, t, f)
1935struct vars *v;
1936struct subre *t;
1937FILE *f; /* for debug output */
5a419892 1938{
c0a7edde 1939 struct nfa *nfa;
3ca4086b
VS
1940 long ret = 0;
1941 char idbuf[50];
5a419892 1942
c0a7edde 1943 assert(t->begin != NULL);
5a419892 1944
c0a7edde 1945 if (f != NULL)
c0a7edde 1946 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
3ca4086b 1947 stid(t, idbuf, sizeof(idbuf)));
c0a7edde
RN
1948 nfa = newnfa(v, v->cm, v->nfa);
1949 NOERRZ();
1950 dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
3ca4086b 1951 if (!ISERR()) {
c0a7edde
RN
1952 specialcolors(nfa);
1953 ret = optimize(nfa, f);
1954 }
1955 if (!ISERR())
1956 compact(nfa, &t->cnfa);
1957
1958 freenfa(nfa);
1959 return ret;
5a419892
VZ
1960}
1961
1962/*
3ca4086b
VS
1963 - newlacon - allocate a lookahead-constraint subRE
1964 ^ static int newlacon(struct vars *, struct state *, struct state *, int);
5a419892 1965 */
3ca4086b
VS
1966static int /* lacon number */
1967newlacon(v, begin, end, pos)
1968struct vars *v;
1969struct state *begin;
1970struct state *end;
1971int pos;
5a419892 1972{
3ca4086b 1973 int n;
c0a7edde
RN
1974 struct subre *sub;
1975
3ca4086b
VS
1976 if (v->nlacons == 0) {
1977 v->lacons = (struct subre *)MALLOC(2 * sizeof(struct subre));
1978 n = 1; /* skip 0th */
c0a7edde 1979 v->nlacons = 2;
3ca4086b
VS
1980 } else {
1981 v->lacons = (struct subre *)REALLOC(v->lacons,
1982 (v->nlacons+1)*sizeof(struct subre));
c0a7edde 1983 n = v->nlacons++;
3ca4086b
VS
1984 }
1985 if (v->lacons == NULL) {
c0a7edde
RN
1986 ERR(REG_ESPACE);
1987 return 0;
5a419892 1988 }
c0a7edde
RN
1989 sub = &v->lacons[n];
1990 sub->begin = begin;
1991 sub->end = end;
1992 sub->subno = pos;
1993 ZAPCNFA(sub->cnfa);
1994 return n;
5a419892
VZ
1995}
1996
1997/*
3ca4086b
VS
1998 - freelacons - free lookahead-constraint subRE vector
1999 ^ static VOID freelacons(struct subre *, int);
5a419892 2000 */
3ca4086b
VS
2001static VOID
2002freelacons(subs, n)
2003struct subre *subs;
2004int n;
5a419892 2005{
c0a7edde 2006 struct subre *sub;
3ca4086b 2007 int i;
c0a7edde
RN
2008
2009 assert(n > 0);
2010 for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
2011 if (!NULLCNFA(sub->cnfa))
2012 freecnfa(&sub->cnfa);
2013 FREE(subs);
5a419892
VZ
2014}
2015
2016/*
3ca4086b
VS
2017 - rfree - free a whole RE (insides of regfree)
2018 ^ static VOID rfree(regex_t *);
5a419892 2019 */
3ca4086b
VS
2020static VOID
2021rfree(re)
2022regex_t *re;
5a419892 2023{
c0a7edde 2024 struct guts *g;
5a419892 2025
c0a7edde 2026 if (re == NULL || re->re_magic != REMAGIC)
5a419892
VZ
2027 return;
2028
3ca4086b
VS
2029 re->re_magic = 0; /* invalidate RE */
2030 g = (struct guts *)re->re_guts;
c0a7edde
RN
2031 re->re_guts = NULL;
2032 re->re_fns = NULL;
2033 g->magic = 0;
2034 freecm(&g->cmap);
2035 if (g->tree != NULL)
3ca4086b 2036 freesubre((struct vars *)NULL, g->tree);
c0a7edde
RN
2037 if (g->lacons != NULL)
2038 freelacons(g->lacons, g->nlacons);
2039 if (!NULLCNFA(g->search))
2040 freecnfa(&g->search);
2041 FREE(g);
5a419892
VZ
2042}
2043
2044/*
3ca4086b
VS
2045 - dump - dump an RE in human-readable form
2046 ^ static VOID dump(regex_t *, FILE *);
5a419892 2047 */
3ca4086b
VS
2048static VOID
2049dump(re, f)
2050regex_t *re;
2051FILE *f;
5a419892 2052{
3ca4086b 2053#ifdef REG_DEBUG
c0a7edde 2054 struct guts *g;
3ca4086b 2055 int i;
c0a7edde
RN
2056
2057 if (re->re_magic != REMAGIC)
2058 fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
3ca4086b
VS
2059 REMAGIC);
2060 if (re->re_guts == NULL) {
c0a7edde
RN
2061 fprintf(f, "NULL guts!!!\n");
2062 return;
5a419892 2063 }
3ca4086b 2064 g = (struct guts *)re->re_guts;
c0a7edde
RN
2065 if (g->magic != GUTSMAGIC)
2066 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
3ca4086b 2067 GUTSMAGIC);
c0a7edde
RN
2068
2069 fprintf(f, "\n\n\n========= DUMP ==========\n");
3ca4086b
VS
2070 fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2071 re->re_nsub, re->re_info, re->re_csize, g->ntree);
c0a7edde
RN
2072
2073 dumpcolors(&g->cmap, f);
3ca4086b 2074 if (!NULLCNFA(g->search)) {
c0a7edde
RN
2075 printf("\nsearch:\n");
2076 dumpcnfa(&g->search, f);
2077 }
3ca4086b 2078 for (i = 1; i < g->nlacons; i++) {
c0a7edde
RN
2079 fprintf(f, "\nla%d (%s):\n", i,
2080 (g->lacons[i].subno) ? "positive" : "negative");
2081 dumpcnfa(&g->lacons[i].cnfa, f);
2082 }
2083 fprintf(f, "\n");
2084 dumpst(g->tree, f, 0);
3ca4086b 2085#endif
5a419892
VZ
2086}
2087
2088/*
3ca4086b
VS
2089 - dumpst - dump a subRE tree
2090 ^ static VOID dumpst(struct subre *, FILE *, int);
5a419892 2091 */
3ca4086b
VS
2092static VOID
2093dumpst(t, f, nfapresent)
2094struct subre *t;
2095FILE *f;
2096int nfapresent; /* is the original NFA still around? */
5a419892 2097{
c0a7edde
RN
2098 if (t == NULL)
2099 fprintf(f, "null tree\n");
2100 else
2101 stdump(t, f, nfapresent);
2102 fflush(f);
2103}
5a419892 2104
c0a7edde 2105/*
3ca4086b
VS
2106 - stdump - recursive guts of dumpst
2107 ^ static VOID stdump(struct subre *, FILE *, int);
c0a7edde 2108 */
3ca4086b
VS
2109static VOID
2110stdump(t, f, nfapresent)
2111struct subre *t;
2112FILE *f;
2113int nfapresent; /* is the original NFA still around? */
c0a7edde 2114{
3ca4086b 2115 char idbuf[50];
c0a7edde
RN
2116
2117 fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
3ca4086b 2118 if (t->flags&LONGER)
c0a7edde 2119 fprintf(f, " longest");
3ca4086b 2120 if (t->flags&SHORTER)
c0a7edde 2121 fprintf(f, " shortest");
3ca4086b 2122 if (t->flags&MIXED)
c0a7edde 2123 fprintf(f, " hasmixed");
3ca4086b 2124 if (t->flags&CAP)
c0a7edde 2125 fprintf(f, " hascapture");
3ca4086b 2126 if (t->flags&BACKR)
c0a7edde 2127 fprintf(f, " hasbackref");
3ca4086b 2128 if (!(t->flags&INUSE))
c0a7edde
RN
2129 fprintf(f, " UNUSED");
2130 if (t->subno != 0)
2131 fprintf(f, " (#%d)", t->subno);
3ca4086b 2132 if (t->min != 1 || t->max != 1) {
c0a7edde
RN
2133 fprintf(f, " {%d,", t->min);
2134 if (t->max != INFINITY)
2135 fprintf(f, "%d", t->max);
2136 fprintf(f, "}");
5a419892 2137 }
c0a7edde 2138 if (nfapresent)
3ca4086b 2139 fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
c0a7edde
RN
2140 if (t->left != NULL)
2141 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2142 if (t->right != NULL)
2143 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
3ca4086b 2144 if (!NULLCNFA(t->cnfa)) {
c0a7edde
RN
2145 fprintf(f, "\n");
2146 dumpcnfa(&t->cnfa, f);
2147 fprintf(f, "\n");
5a419892 2148 }
c0a7edde
RN
2149 if (t->left != NULL)
2150 stdump(t->left, f, nfapresent);
2151 if (t->right != NULL)
2152 stdump(t->right, f, nfapresent);
5a419892
VZ
2153}
2154
2155/*
3ca4086b
VS
2156 - stid - identify a subtree node for dumping
2157 ^ static char *stid(struct subre *, char *, size_t);
5a419892 2158 */
3ca4086b
VS
2159static char * /* points to buf or constant string */
2160stid(t, buf, bufsize)
2161struct subre *t;
2162char *buf;
2163size_t bufsize;
5a419892 2164{
c0a7edde 2165 /* big enough for hex int or decimal t->retry? */
3ca4086b 2166 if (bufsize < sizeof(int)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1)
c0a7edde
RN
2167 return "unable";
2168 if (t->retry != 0)
2169 sprintf(buf, "%d", t->retry);
2170 else
3ca4086b 2171 sprintf(buf, "0x%x", (int)t); /* may lose bits, that's okay */
c0a7edde 2172 return buf;
5a419892 2173}
c0a7edde
RN
2174
2175#include "regc_lex.c"
2176#include "regc_color.c"
2177#include "regc_nfa.c"
2178#include "regc_cvec.c"
2179#include "regc_locale.c"