]> git.saurik.com Git - wxWidgets.git/blame - src/regex/regcomp.c
reSWIGed
[wxWidgets.git] / src / regex / regcomp.c
CommitLineData
c0a7edde
RN
1/*
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $
32 *
33 */
5a419892 34
c0a7edde 35#include "regguts.h"
5a419892
VZ
36
37/*
c0a7edde 38 * forward declarations, up here so forward datatypes etc. are defined early
5a419892 39 */
c0a7edde
RN
40/* === regcomp.c === */
41static void moresubs(struct vars *, int);
42static int freev(struct vars *, int);
43static void makesearch(struct vars *, struct nfa *);
44static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
45static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
46static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
47static void nonword(struct vars *, int, struct state *, struct state *);
48static void word(struct vars *, int, struct state *, struct state *);
49static int scannum(struct vars *);
50static void repeat(struct vars *, struct state *, struct state *, int, int);
51static void bracket(struct vars *, struct state *, struct state *);
52static void cbracket(struct vars *, struct state *, struct state *);
53static void brackpart(struct vars *, struct state *, struct state *);
54static chr *scanplain(struct vars *);
55static void leaders(struct vars *, struct cvec *);
56static void onechr(struct vars *, chr, struct state *, struct state *);
57static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
58static celt nextleader(struct vars *, chr, chr);
59static void wordchrs(struct vars *);
60static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
61static void freesubre(struct vars *, struct subre *);
62static void freesrnode(struct vars *, struct subre *);
63static void optst(struct vars *, struct subre *);
64static int numst(struct subre *, int);
65static void markst(struct subre *);
66static void cleanst(struct vars *);
67static long nfatree(struct vars *, struct subre *, FILE *);
68static long nfanode(struct vars *, struct subre *, FILE *);
69static int newlacon(struct vars *, struct state *, struct state *, int);
70static void freelacons(struct subre *, int);
71static void rfree(regex_t *);
72
73#ifdef REG_DEBUG
74static void dump(regex_t *, FILE *);
75static void dumpst(struct subre *, FILE *, int);
76static void stdump(struct subre *, FILE *, int);
77static char *stid(struct subre *, char *, size_t);
78#endif
79/* === regc_lex.c === */
80static void lexstart(struct vars *);
81static void prefixes(struct vars *);
82static void lexnest(struct vars *, chr *, chr *);
83static void lexword(struct vars *);
84static int next(struct vars *);
85static int lexescape(struct vars *);
86static chr lexdigits(struct vars *, int, int, int);
87static int brenext(struct vars *, chr);
88static void skip(struct vars *);
89static chr newline(void);
90static chr chrnamed(struct vars *, chr *, chr *, chr);
91
92/* === regc_color.c === */
93static void initcm(struct vars *, struct colormap *);
94static void freecm(struct colormap *);
95static void cmtreefree(struct colormap *, union tree *, int);
96static color setcolor(struct colormap *, chr, pcolor);
97static color maxcolor(struct colormap *);
98static color newcolor(struct colormap *);
99static void freecolor(struct colormap *, pcolor);
100static color pseudocolor(struct colormap *);
101static color subcolor(struct colormap *, chr c);
102static color newsub(struct colormap *, pcolor);
103static void subrange(struct vars *, chr, chr, struct state *, struct state *);
104static void subblock(struct vars *, chr, struct state *, struct state *);
105static void okcolors(struct nfa *, struct colormap *);
106static void colorchain(struct colormap *, struct arc *);
107static void uncolorchain(struct colormap *, struct arc *);
108static int singleton(struct colormap *, chr c);
109static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
110static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
111
112#ifdef REG_DEBUG
113static void dumpcolors(struct colormap *, FILE *);
114static void fillcheck(struct colormap *, union tree *, int, FILE *);
115static void dumpchr(chr, FILE *);
116#endif
117/* === regc_nfa.c === */
118static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
119static void freenfa(struct nfa *);
120static struct state *newstate(struct nfa *);
121static struct state *newfstate(struct nfa *, int flag);
122static void dropstate(struct nfa *, struct state *);
123static void freestate(struct nfa *, struct state *);
124static void destroystate(struct nfa *, struct state *);
125static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
126static struct arc *allocarc(struct nfa *, struct state *);
127static void freearc(struct nfa *, struct arc *);
128static struct arc *findarc(struct state *, int, pcolor);
129static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
130static void moveins(struct nfa *, struct state *, struct state *);
131static void copyins(struct nfa *, struct state *, struct state *);
132static void moveouts(struct nfa *, struct state *, struct state *);
133static void copyouts(struct nfa *, struct state *, struct state *);
134static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
135static void delsub(struct nfa *, struct state *, struct state *);
136static void deltraverse(struct nfa *, struct state *, struct state *);
137static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
138static void duptraverse(struct nfa *, struct state *, struct state *);
139static void cleartraverse(struct nfa *, struct state *);
140static void specialcolors(struct nfa *);
141static long optimize(struct nfa *, FILE *);
142static void pullback(struct nfa *, FILE *);
143static int pull(struct nfa *, struct arc *);
144static void pushfwd(struct nfa *, FILE *);
145static int push(struct nfa *, struct arc *);
146
147#define INCOMPATIBLE 1 /* destroys arc */
148#define SATISFIED 2 /* constraint satisfied */
149#define COMPATIBLE 3 /* compatible but not satisfied yet */
150static int combine(struct arc *, struct arc *);
151static void fixempties(struct nfa *, FILE *);
152static int unempty(struct nfa *, struct arc *);
153static void cleanup(struct nfa *);
154static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
155static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
156static long analyze(struct nfa *);
157static void compact(struct nfa *, struct cnfa *);
158static void carcsort(struct carc *, struct carc *);
159static void freecnfa(struct cnfa *);
160static void dumpnfa(struct nfa *, FILE *);
161
162#ifdef REG_DEBUG
163static void dumpstate(struct state *, FILE *);
164static void dumparcs(struct state *, FILE *);
165static int dumprarcs(struct arc *, struct state *, FILE *, int);
166static void dumparc(struct arc *, struct state *, FILE *);
167static void dumpcnfa(struct cnfa *, FILE *);
168static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
169#endif
170/* === regc_cvec.c === */
171static struct cvec *newcvec(int, int, int);
172static struct cvec *clearcvec(struct cvec *);
173static void addchr(struct cvec *, chr);
174static void addrange(struct cvec *, chr, chr);
175static void addmcce(struct cvec *, chr *, chr *);
176static int haschr(struct cvec *, chr);
177static struct cvec *getcvec(struct vars *, int, int, int);
178static void freecvec(struct cvec *);
179
180/* === regc_locale.c === */
181static int wx_isdigit(wx_wchar c);
182static int wx_isalpha(wx_wchar c);
183static int wx_isalnum(wx_wchar c);
184static int wx_isupper(wx_wchar c);
185static int wx_islower(wx_wchar c);
186static int wx_isgraph(wx_wchar c);
187static int wx_ispunct(wx_wchar c);
188static int wx_isspace(wx_wchar c);
189static wx_wchar wx_toupper(wx_wchar c);
190static wx_wchar wx_tolower(wx_wchar c);
191static int nmcces(struct vars *);
192static int nleaders(struct vars *);
193static struct cvec *allmcces(struct vars *, struct cvec *);
194static celt element(struct vars *, chr *, chr *);
195static struct cvec *range(struct vars *, celt, celt, int);
196static int before(celt, celt);
197static struct cvec *eclass(struct vars *, celt, int);
198static struct cvec *cclass(struct vars *, chr *, chr *, int);
199static struct cvec *allcases(struct vars *, chr);
200static int cmp(const chr *, const chr *, size_t);
201static int casecmp(const chr *, const chr *, size_t);
202
203
204/* internal variables, bundled for easy passing around */
205struct vars
206{
207 regex_t *re;
208 chr *now; /* scan pointer into string */
209 chr *stop; /* end of string */
210 chr *savenow; /* saved now and stop for "subroutine
211 * call" */
212 chr *savestop;
213 int err; /* error code (0 if none) */
214 int cflags; /* copy of compile flags */
215 int lasttype; /* type of previous token */
216 int nexttype; /* type of next token */
217 chr nextvalue; /* value (if any) of next token */
218 int lexcon; /* lexical context type (see lex.c) */
219 int nsubexp; /* subexpression count */
220 struct subre **subs; /* subRE pointer vector */
221 size_t nsubs; /* length of vector */
222 struct subre *sub10[10]; /* initial vector, enough for most */
223 struct nfa *nfa; /* the NFA */
224 struct colormap *cm; /* character color map */
225 color nlcolor; /* color of newline */
226 struct state *wordchrs; /* state in nfa holding word-char outarcs */
227 struct subre *tree; /* subexpression tree */
228 struct subre *treechain; /* all tree nodes allocated */
229 struct subre *treefree; /* any free tree nodes */
230 int ntree; /* number of tree nodes */
231 struct cvec *cv; /* interface cvec */
232 struct cvec *cv2; /* utility cvec */
233 struct cvec *mcces; /* collating-element information */
234#define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
235 struct state *mccepbegin; /* in nfa, start of MCCE prototypes */
236 struct state *mccepend; /* in nfa, end of MCCE prototypes */
237 struct subre *lacons; /* lookahead-constraint vector */
238 int nlacons; /* size of lacons */
5a419892
VZ
239};
240
c0a7edde
RN
241/* parsing macros; most know that `v' is the struct vars pointer */
242#define NEXT() (next(v)) /* advance by one token */
243#define SEE(t) (v->nexttype == (t)) /* is next token this? */
244#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
245#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
246#define ISERR() VISERR(v)
247#define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
248 ((vv)->err = (e)))
249#define ERR(e) VERR(v, e) /* record an error */
250#define NOERR() {if (ISERR()) return;} /* if error seen, return */
251#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
252#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
253#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false,
254 * error */
255#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
256#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
257
258/* token type codes, some also used as NFA arc types */
259#define EMPTY 'n' /* no token present */
260#define EOS 'e' /* end of string */
261#define PLAIN 'p' /* ordinary character */
262#define DIGIT 'd' /* digit (in bound) */
263#define BACKREF 'b' /* back reference */
264#define COLLEL 'I' /* start of [. */
265#define ECLASS 'E' /* start of [= */
266#define CCLASS 'C' /* start of [: */
267#define END 'X' /* end of [. [= [: */
268#define RANGE 'R' /* - within [] which might be range delim. */
269#define LACON 'L' /* lookahead constraint subRE */
270#define AHEAD 'a' /* color-lookahead arc */
271#define BEHIND 'r' /* color-lookbehind arc */
272#define WBDRY 'w' /* word boundary constraint */
273#define NWBDRY 'W' /* non-word-boundary constraint */
274#define SBEGIN 'A' /* beginning of string (even if not BOL) */
275#define SEND 'Z' /* end of string (even if not EOL) */
276#define PREFER 'P' /* length preference */
277
278/* is an arc colored, and hence on a color chain? */
279#define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
280 (a)->type == BEHIND)
281
282
283
284/* static function list */
285static struct fns functions = {
286 rfree, /* regfree insides */
287};
5a419892 288
5a419892 289
5a419892
VZ
290
291/*
c0a7edde 292 * regcomp - compile regular expression
5a419892 293 */
c0a7edde
RN
294int
295regcomp(regex_t *re,
296 const chr *string,
297 int flags)
5a419892 298{
c0a7edde
RN
299
300 size_t nLen = 0;
301 chr* s2 = (chr*) string;
5a419892 302
c0a7edde
RN
303 if (string && *string)
304 {
305 while(*++s2);
306 }
5a419892 307
c0a7edde 308 nLen = ((s2 - string) / sizeof(chr));
5a419892 309
c0a7edde 310 return wx_regcomp(re, string, nLen, flags);
5a419892 311}
c0a7edde
RN
312int
313wx_regcomp(regex_t *re,
314 const chr *string,
315 size_t len,
316 int flags)
5a419892 317{
c0a7edde
RN
318 struct vars var;
319 struct vars *v = &var;
320 struct guts *g;
321 int i;
322 size_t j;
323
324#ifdef REG_DEBUG
325 FILE *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
326
327#else
328 FILE *debug = (FILE *) NULL;
329#endif
330
331#define CNOERR() { if (ISERR()) return freev(v, v->err); }
332
333 /* sanity checks */
334
335 if (re == NULL || string == NULL)
336 return REG_INVARG;
337 if ((flags & REG_QUOTE) &&
338 (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
339 return REG_INVARG;
340 if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
341 return REG_INVARG;
342
343 /* initial setup (after which freev() is callable) */
344 v->re = re;
345 v->now = (chr *) string;
346 v->stop = v->now + len;
347 v->savenow = v->savestop = NULL;
348 v->err = 0;
349 v->cflags = flags;
350 v->nsubexp = 0;
351 v->subs = v->sub10;
352 v->nsubs = 10;
353 for (j = 0; j < v->nsubs; j++)
354 v->subs[j] = NULL;
355 v->nfa = NULL;
356 v->cm = NULL;
357 v->nlcolor = COLORLESS;
358 v->wordchrs = NULL;
359 v->tree = NULL;
360 v->treechain = NULL;
361 v->treefree = NULL;
362 v->cv = NULL;
363 v->cv2 = NULL;
364 v->mcces = NULL;
365 v->lacons = NULL;
366 v->nlacons = 0;
367 re->re_magic = REMAGIC;
368 re->re_info = 0; /* bits get set during parse */
369 re->re_csize = sizeof(chr);
370 re->re_guts = NULL;
371 re->re_fns = VS(&functions);
372
373 /* more complex setup, malloced things */
374 re->re_guts = VS(MALLOC(sizeof(struct guts)));
375 if (re->re_guts == NULL)
376 return freev(v, REG_ESPACE);
377 g = (struct guts *) re->re_guts;
378 g->tree = NULL;
379 initcm(v, &g->cmap);
380 v->cm = &g->cmap;
381 g->lacons = NULL;
382 g->nlacons = 0;
383 ZAPCNFA(g->search);
384 v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
385 CNOERR();
386 v->cv = newcvec(100, 20, 10);
387 if (v->cv == NULL)
388 return freev(v, REG_ESPACE);
389 i = nmcces(v);
390 if (i > 0)
391 {
392 v->mcces = newcvec(nleaders(v), 0, i);
393 CNOERR();
394 v->mcces = allmcces(v, v->mcces);
395 leaders(v, v->mcces);
396 addmcce(v->mcces, (chr *) NULL, (chr *) NULL); /* dummy */
397 }
398 CNOERR();
399
400 /* parsing */
401 lexstart(v); /* also handles prefixes */
402 if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
403 {
404 /* assign newline a unique color */
405 v->nlcolor = subcolor(v->cm, newline());
406 okcolors(v->nfa, v->cm);
407 }
408 CNOERR();
409 v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
410 assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
411 CNOERR();
412 assert(v->tree != NULL);
413
414 /* finish setup of nfa and its subre tree */
415 specialcolors(v->nfa);
416 CNOERR();
417#ifdef REG_DEBUG
418 if (debug != NULL)
419 {
420 fprintf(debug, "\n\n\n========= RAW ==========\n");
421 dumpnfa(v->nfa, debug);
422 dumpst(v->tree, debug, 1);
5a419892 423 }
c0a7edde
RN
424#endif
425 optst(v, v->tree);
426 v->ntree = numst(v->tree, 1);
427 markst(v->tree);
428 cleanst(v);
429#ifdef REG_DEBUG
430 if (debug != NULL)
431 {
432 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
433 dumpst(v->tree, debug, 1);
5a419892 434 }
c0a7edde 435#endif
5a419892 436
c0a7edde
RN
437 /* build compacted NFAs for tree and lacons */
438 re->re_info |= nfatree(v, v->tree, debug);
439 CNOERR();
440 assert(v->nlacons == 0 || v->lacons != NULL);
441 for (i = 1; i < v->nlacons; i++)
442 {
443#ifdef REG_DEBUG
444 if (debug != NULL)
445 fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
446#endif
447 nfanode(v, &v->lacons[i], debug);
5a419892 448 }
c0a7edde
RN
449 CNOERR();
450 if (v->tree->flags & SHORTER)
451 NOTE(REG_USHORTEST);
452
453 /* build compacted NFAs for tree, lacons, fast search */
454#ifdef REG_DEBUG
455 if (debug != NULL)
456 fprintf(debug, "\n\n\n========= SEARCH ==========\n");
457#endif
458 /* can sacrifice main NFA now, so use it as work area */
459 (DISCARD) optimize(v->nfa, debug);
460 CNOERR();
461 makesearch(v, v->nfa);
462 CNOERR();
463 compact(v->nfa, &g->search);
464 CNOERR();
465
466 /* looks okay, package it up */
467 re->re_nsub = v->nsubexp;
468 v->re = NULL; /* freev no longer frees re */
469 g->magic = GUTSMAGIC;
470 g->cflags = v->cflags;
471 g->info = re->re_info;
472 g->nsub = re->re_nsub;
473 g->tree = v->tree;
474 v->tree = NULL;
475 g->ntree = v->ntree;
476 g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
477 g->lacons = v->lacons;
478 v->lacons = NULL;
479 g->nlacons = v->nlacons;
480
481#ifdef REG_DEBUG
482 if (flags & REG_DUMP)
483 dump(re, stdout);
484#endif
5a419892 485
c0a7edde
RN
486 assert(v->err == 0);
487 return freev(v, 0);
5a419892
VZ
488}
489
490/*
c0a7edde 491 * moresubs - enlarge subRE vector
5a419892
VZ
492 */
493static void
c0a7edde
RN
494moresubs(struct vars * v,
495 int wanted) /* want enough room for this one */
5a419892 496{
c0a7edde
RN
497 struct subre **p;
498 size_t n;
499
500 assert(wanted > 0 && (size_t) wanted >= v->nsubs);
501 n = (size_t) wanted *3 / 2 + 1;
502
503 if (v->subs == v->sub10)
504 {
505 p = (struct subre **) MALLOC(n * sizeof(struct subre *));
506 if (p != NULL)
507 memcpy(VS(p), VS(v->subs),
508 v->nsubs * sizeof(struct subre *));
5a419892 509 }
5a419892 510 else
c0a7edde
RN
511 p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
512 if (p == NULL)
513 {
514 ERR(REG_ESPACE);
5a419892 515 return;
5a419892 516 }
c0a7edde
RN
517 v->subs = p;
518 for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
519 *p = NULL;
520 assert(v->nsubs == n);
521 assert((size_t) wanted < v->nsubs);
5a419892
VZ
522}
523
524/*
c0a7edde
RN
525 * freev - free vars struct's substructures where necessary
526 *
527 * Optionally does error-number setting, and always returns error code
528 * (if any), to make error-handling code terser.
5a419892 529 */
c0a7edde
RN
530static int
531freev(struct vars * v,
532 int err)
5a419892 533{
c0a7edde
RN
534 if (v->re != NULL)
535 rfree(v->re);
536 if (v->subs != v->sub10)
537 FREE(v->subs);
538 if (v->nfa != NULL)
539 freenfa(v->nfa);
540 if (v->tree != NULL)
541 freesubre(v, v->tree);
542 if (v->treechain != NULL)
543 cleanst(v);
544 if (v->cv != NULL)
545 freecvec(v->cv);
546 if (v->cv2 != NULL)
547 freecvec(v->cv2);
548 if (v->mcces != NULL)
549 freecvec(v->mcces);
550 if (v->lacons != NULL)
551 freelacons(v->lacons, v->nlacons);
552 ERR(err); /* nop if err==0 */
553
554 return v->err;
5a419892
VZ
555}
556
557/*
c0a7edde
RN
558 * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
559 * NFA must have been optimize()d already.
5a419892
VZ
560 */
561static void
c0a7edde
RN
562makesearch(struct vars * v,
563 struct nfa * nfa)
5a419892 564{
c0a7edde
RN
565 struct arc *a;
566 struct arc *b;
567 struct state *pre = nfa->pre;
568 struct state *s;
569 struct state *s2;
570 struct state *slist;
571
572 /* no loops are needed if it's anchored */
573 for (a = pre->outs; a != NULL; a = a->outchain)
574 {
575 assert(a->type == PLAIN);
576 if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
577 break;
5a419892 578 }
c0a7edde
RN
579 if (a != NULL)
580 {
581 /* add implicit .* in front */
582 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
583
584 /* and ^* and \A* too -- not always necessary, but harmless */
585 newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
586 newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
5a419892
VZ
587 }
588
c0a7edde
RN
589 /*
590 * Now here's the subtle part. Because many REs have no lookback
591 * constraints, often knowing when you were in the pre state tells you
592 * little; it's the next state(s) that are informative. But some of
593 * them may have other inarcs, i.e. it may be possible to make actual
594 * progress and then return to one of them. We must de-optimize such
595 * cases, splitting each such state into progress and no-progress
596 * states.
5a419892 597 */
c0a7edde
RN
598
599 /* first, make a list of the states */
600 slist = NULL;
601 for (a = pre->outs; a != NULL; a = a->outchain)
602 {
603 s = a->to;
604 for (b = s->ins; b != NULL; b = b->inchain)
605 if (b->from != pre)
5a419892 606 break;
c0a7edde
RN
607 if (b != NULL)
608 { /* must be split */
609 s->tmp = slist;
610 slist = s;
5a419892 611 }
5a419892 612 }
5a419892 613
c0a7edde
RN
614 /* do the splits */
615 for (s = slist; s != NULL; s = s2)
616 {
617 s2 = newstate(nfa);
618 copyouts(nfa, s, s2);
619 for (a = s->ins; a != NULL; a = b)
620 {
621 b = a->inchain;
622 if (a->from != pre)
623 {
624 cparc(nfa, a, a->from, s2);
625 freearc(nfa, a);
626 }
627 }
628 s2 = s->tmp;
629 s->tmp = NULL; /* clean up while we're at it */
630 }
5a419892
VZ
631}
632
633/*
c0a7edde
RN
634 * parse - parse an RE
635 *
636 * This is actually just the top level, which parses a bunch of branches
637 * tied together with '|'. They appear in the tree as the left children
638 * of a chain of '|' subres.
5a419892 639 */
c0a7edde
RN
640static struct subre *
641parse(struct vars * v,
642 int stopper, /* EOS or ')' */
643 int type, /* LACON (lookahead subRE) or PLAIN */
644 struct state * init, /* initial state */
645 struct state * final) /* final state */
5a419892 646{
c0a7edde
RN
647 struct state *left; /* scaffolding for branch */
648 struct state *right;
649 struct subre *branches; /* top level */
650 struct subre *branch; /* current branch */
651 struct subre *t; /* temporary */
652 int firstbranch; /* is this the first branch? */
653
654 assert(stopper == ')' || stopper == EOS);
655
656 branches = subre(v, '|', LONGER, init, final);
657 NOERRN();
658 branch = branches;
659 firstbranch = 1;
660 do
661 { /* a branch */
662 if (!firstbranch)
663 {
664 /* need a place to hang it */
665 branch->right = subre(v, '|', LONGER, init, final);
666 NOERRN();
667 branch = branch->right;
668 }
669 firstbranch = 0;
670 left = newstate(v->nfa);
671 right = newstate(v->nfa);
672 NOERRN();
673 EMPTYARC(init, left);
674 EMPTYARC(right, final);
675 NOERRN();
676 branch->left = parsebranch(v, stopper, type, left, right, 0);
677 NOERRN();
678 branch->flags |= UP(branch->flags | branch->left->flags);
679 if ((branch->flags & ~branches->flags) != 0) /* new flags */
680 for (t = branches; t != branch; t = t->right)
681 t->flags |= branch->flags;
682 } while (EAT('|'));
683 assert(SEE(stopper) || SEE(EOS));
684
685 if (!SEE(stopper))
686 {
687 assert(stopper == ')' && SEE(EOS));
688 ERR(REG_EPAREN);
689 }
5a419892 690
c0a7edde
RN
691 /* optimize out simple cases */
692 if (branch == branches)
693 { /* only one branch */
694 assert(branch->right == NULL);
695 t = branch->left;
696 branch->left = NULL;
697 freesubre(v, branches);
698 branches = t;
699 }
700 else if (!MESSY(branches->flags))
701 { /* no interesting innards */
702 freesubre(v, branches->left);
703 branches->left = NULL;
704 freesubre(v, branches->right);
705 branches->right = NULL;
706 branches->op = '=';
5a419892
VZ
707 }
708
c0a7edde 709 return branches;
5a419892
VZ
710}
711
712/*
c0a7edde 713 * parsebranch - parse one branch of an RE
5a419892 714 *
c0a7edde
RN
715 * This mostly manages concatenation, working closely with parseqatom().
716 * Concatenated things are bundled up as much as possible, with separate
717 * ',' nodes introduced only when necessary due to substructure.
5a419892 718 */
c0a7edde
RN
719static struct subre *
720parsebranch(struct vars * v,
721 int stopper, /* EOS or ')' */
722 int type, /* LACON (lookahead subRE) or PLAIN */
723 struct state * left, /* leftmost state */
724 struct state * right, /* rightmost state */
725 int partial) /* is this only part of a branch? */
5a419892 726{
c0a7edde
RN
727 struct state *lp; /* left end of current construct */
728 int seencontent; /* is there anything in this branch yet? */
729 struct subre *t;
730
731 lp = left;
732 seencontent = 0;
733 t = subre(v, '=', 0, left, right); /* op '=' is tentative */
734 NOERRN();
735 while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
736 {
737 if (seencontent)
738 { /* implicit concat operator */
739 lp = newstate(v->nfa);
740 NOERRN();
741 moveins(v->nfa, right, lp);
5a419892 742 }
c0a7edde 743 seencontent = 1;
5a419892 744
c0a7edde
RN
745 /* NB, recursion in parseqatom() may swallow rest of branch */
746 parseqatom(v, stopper, type, lp, right, t);
5a419892 747 }
5a419892 748
c0a7edde
RN
749 if (!seencontent)
750 { /* empty branch */
751 if (!partial)
752 NOTE(REG_UUNSPEC);
753 assert(lp == left);
754 EMPTYARC(left, right);
5a419892
VZ
755 }
756
c0a7edde 757 return t;
5a419892
VZ
758}
759
760/*
c0a7edde
RN
761 * parseqatom - parse one quantified atom or constraint of an RE
762 *
763 * The bookkeeping near the end cooperates very closely with parsebranch();
764 * in particular, it contains a recursion that can involve parsing the rest
765 * of the branch, making this function's name somewhat inaccurate.
5a419892
VZ
766 */
767static void
c0a7edde
RN
768parseqatom(struct vars * v,
769 int stopper, /* EOS or ')' */
770 int type, /* LACON (lookahead subRE) or PLAIN */
771 struct state * lp, /* left state to hang it on */
772 struct state * rp, /* right state to hang it on */
773 struct subre * top) /* subtree top */
5a419892 774{
c0a7edde
RN
775 struct state *s; /* temporaries for new states */
776 struct state *s2;
777
778#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
779 int m,
780 n;
781 struct subre *atom; /* atom's subtree */
782 struct subre *t;
783 int cap; /* capturing parens? */
784 int pos; /* positive lookahead? */
785 int subno; /* capturing-parens or backref number */
786 int atomtype;
787 int qprefer; /* quantifier short/long preference */
788 int f;
789 struct subre **atomp; /* where the pointer to atom is */
790
791 /* initial bookkeeping */
792 atom = NULL;
793 assert(lp->nouts == 0); /* must string new code */
794 assert(rp->nins == 0); /* between lp and rp */
795 subno = 0; /* just to shut lint up */
796
797 /* an atom or constraint... */
798 atomtype = v->nexttype;
799 switch (atomtype)
800 {
801 /* first, constraints, which end by returning */
802 case '^':
803 ARCV('^', 1);
804 if (v->cflags & REG_NLANCH)
805 ARCV(BEHIND, v->nlcolor);
806 NEXT();
807 return;
808 break;
809 case '$':
810 ARCV('$', 1);
811 if (v->cflags & REG_NLANCH)
812 ARCV(AHEAD, v->nlcolor);
813 NEXT();
814 return;
815 break;
816 case SBEGIN:
817 ARCV('^', 1); /* BOL */
818 ARCV('^', 0); /* or BOS */
819 NEXT();
820 return;
821 break;
822 case SEND:
823 ARCV('$', 1); /* EOL */
824 ARCV('$', 0); /* or EOS */
825 NEXT();
826 return;
827 break;
828 case '<':
829 wordchrs(v); /* does NEXT() */
830 s = newstate(v->nfa);
831 NOERR();
832 nonword(v, BEHIND, lp, s);
833 word(v, AHEAD, s, rp);
834 return;
835 break;
836 case '>':
837 wordchrs(v); /* does NEXT() */
838 s = newstate(v->nfa);
839 NOERR();
840 word(v, BEHIND, lp, s);
841 nonword(v, AHEAD, s, rp);
842 return;
5a419892 843 break;
c0a7edde
RN
844 case WBDRY:
845 wordchrs(v); /* does NEXT() */
846 s = newstate(v->nfa);
847 NOERR();
848 nonword(v, BEHIND, lp, s);
849 word(v, AHEAD, s, rp);
850 s = newstate(v->nfa);
851 NOERR();
852 word(v, BEHIND, lp, s);
853 nonword(v, AHEAD, s, rp);
854 return;
855 break;
856 case NWBDRY:
857 wordchrs(v); /* does NEXT() */
858 s = newstate(v->nfa);
859 NOERR();
860 word(v, BEHIND, lp, s);
861 word(v, AHEAD, s, rp);
862 s = newstate(v->nfa);
863 NOERR();
864 nonword(v, BEHIND, lp, s);
865 nonword(v, AHEAD, s, rp);
866 return;
5a419892 867 break;
c0a7edde
RN
868 case LACON: /* lookahead constraint */
869 pos = v->nextvalue;
870 NEXT();
871 s = newstate(v->nfa);
872 s2 = newstate(v->nfa);
873 NOERR();
874 t = parse(v, ')', LACON, s, s2);
875 freesubre(v, t); /* internal structure irrelevant */
876 assert(SEE(')') || ISERR());
877 NEXT();
878 n = newlacon(v, s, s2, pos);
879 NOERR();
880 ARCV(LACON, n);
881 return;
882 break;
883 /* then errors, to get them out of the way */
884 case '*':
885 case '+':
886 case '?':
887 case '{':
888 ERR(REG_BADRPT);
889 return;
890 break;
5a419892 891 default:
c0a7edde
RN
892 ERR(REG_ASSERT);
893 return;
5a419892 894 break;
c0a7edde
RN
895 /* then plain characters, and minor variants on that theme */
896 case ')': /* unbalanced paren */
897 if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
898 {
899 ERR(REG_EPAREN);
900 return;
5a419892 901 }
c0a7edde
RN
902 /* legal in EREs due to specification botch */
903 NOTE(REG_UPBOTCH);
904 /* fallthrough into case PLAIN */
905 case PLAIN:
906 onechr(v, v->nextvalue, lp, rp);
907 okcolors(v->nfa, v->cm);
908 NOERR();
909 NEXT();
5a419892 910 break;
c0a7edde
RN
911 case '[':
912 if (v->nextvalue == 1)
913 bracket(v, lp, rp);
914 else
915 cbracket(v, lp, rp);
916 assert(SEE(']') || ISERR());
917 NEXT();
5a419892 918 break;
c0a7edde
RN
919 case '.':
920 rainbow(v->nfa, v->cm, PLAIN,
921 (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
922 lp, rp);
5a419892 923 NEXT();
c0a7edde
RN
924 break;
925 /* and finally the ugly stuff */
926 case '(': /* value flags as capturing or non */
927 cap = (type == LACON) ? 0 : v->nextvalue;
928 if (cap)
929 {
930 v->nsubexp++;
931 subno = v->nsubexp;
932 if ((size_t) subno >= v->nsubs)
933 moresubs(v, subno);
934 assert((size_t) subno < v->nsubs);
935 }
5a419892 936 else
c0a7edde
RN
937 atomtype = PLAIN; /* something that's not '(' */
938 NEXT();
939 /* need new endpoints because tree will contain pointers */
940 s = newstate(v->nfa);
941 s2 = newstate(v->nfa);
942 NOERR();
943 EMPTYARC(lp, s);
944 EMPTYARC(s2, rp);
945 NOERR();
946 atom = parse(v, ')', PLAIN, s, s2);
947 assert(SEE(')') || ISERR());
948 NEXT();
949 NOERR();
950 if (cap)
951 {
952 v->subs[subno] = atom;
953 t = subre(v, '(', atom->flags | CAP, lp, rp);
954 NOERR();
955 t->subno = subno;
956 t->left = atom;
957 atom = t;
958 }
959 /* postpone everything else pending possible {0} */
960 break;
961 case BACKREF: /* the Feature From The Black Lagoon */
962 INSIST(type != LACON, REG_ESUBREG);
963 INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
964 INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
965 NOERR();
966 assert(v->nextvalue > 0);
967 atom = subre(v, 'b', BACKR, lp, rp);
968 subno = v->nextvalue;
969 atom->subno = subno;
970 EMPTYARC(lp, rp); /* temporarily, so there's something */
971 NEXT();
5a419892
VZ
972 break;
973 }
5a419892 974
c0a7edde
RN
975 /* ...and an atom may be followed by a quantifier */
976 switch (v->nexttype)
977 {
978 case '*':
979 m = 0;
980 n = INFINITY;
981 qprefer = (v->nextvalue) ? LONGER : SHORTER;
982 NEXT();
983 break;
984 case '+':
985 m = 1;
986 n = INFINITY;
987 qprefer = (v->nextvalue) ? LONGER : SHORTER;
988 NEXT();
989 break;
990 case '?':
991 m = 0;
992 n = 1;
993 qprefer = (v->nextvalue) ? LONGER : SHORTER;
5a419892 994 NEXT();
5a419892 995 break;
c0a7edde
RN
996 case '{':
997 NEXT();
998 m = scannum(v);
999 if (EAT(','))
1000 {
1001 if (SEE(DIGIT))
1002 n = scannum(v);
1003 else
1004 n = INFINITY;
1005 if (m > n)
1006 {
1007 ERR(REG_BADBR);
5a419892
VZ
1008 return;
1009 }
c0a7edde
RN
1010 /* {m,n} exercises preference, even if it's {m,m} */
1011 qprefer = (v->nextvalue) ? LONGER : SHORTER;
1012 }
1013 else
1014 {
1015 n = m;
1016 /* {m} passes operand's preference through */
1017 qprefer = 0;
1018 }
1019 if (!SEE('}'))
1020 { /* catches errors too */
1021 ERR(REG_BADBR);
1022 return;
1023 }
1024 NEXT();
1025 break;
1026 default: /* no quantifier */
1027 m = n = 1;
1028 qprefer = 0;
1029 break;
1030 }
5a419892 1031
c0a7edde
RN
1032 /* annoying special case: {0} or {0,0} cancels everything */
1033 if (m == 0 && n == 0)
1034 {
1035 if (atom != NULL)
1036 freesubre(v, atom);
1037 if (atomtype == '(')
1038 v->subs[subno] = NULL;
1039 delsub(v->nfa, lp, rp);
1040 EMPTYARC(lp, rp);
1041 return;
1042 }
5a419892 1043
c0a7edde
RN
1044 /* if not a messy case, avoid hard part */
1045 assert(!MESSY(top->flags));
1046 f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
1047 if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
1048 {
1049 if (!(m == 1 && n == 1))
1050 repeat(v, lp, rp, m, n);
1051 if (atom != NULL)
1052 freesubre(v, atom);
1053 top->flags = f;
1054 return;
1055 }
1056
1057 /*
1058 * hard part: something messy That is, capturing parens, back
1059 * reference, short/long clash, or an atom with substructure
1060 * containing one of those.
5a419892 1061 */
5a419892 1062
c0a7edde
RN
1063 /* now we'll need a subre for the contents even if they're boring */
1064 if (atom == NULL)
1065 {
1066 atom = subre(v, '=', 0, lp, rp);
1067 NOERR();
1068 }
5a419892 1069
c0a7edde
RN
1070 /*
1071 * prepare a general-purpose state skeleton
1072 *
1073 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /
1074 * [lp] ----> [s2] ----bypass---------------------
1075 *
1076 * where bypass is an empty, and prefix is some repetitions of atom
5a419892 1077 */
c0a7edde
RN
1078 s = newstate(v->nfa); /* first, new endpoints for the atom */
1079 s2 = newstate(v->nfa);
1080 NOERR();
1081 moveouts(v->nfa, lp, s);
1082 moveins(v->nfa, rp, s2);
1083 NOERR();
1084 atom->begin = s;
1085 atom->end = s2;
1086 s = newstate(v->nfa); /* and spots for prefix and bypass */
1087 s2 = newstate(v->nfa);
1088 NOERR();
1089 EMPTYARC(lp, s);
1090 EMPTYARC(lp, s2);
1091 NOERR();
1092
1093 /* break remaining subRE into x{...} and what follows */
1094 t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1095 t->left = atom;
1096 atomp = &t->left;
1097 /* here we should recurse... but we must postpone that to the end */
1098
1099 /* split top into prefix and remaining */
1100 assert(top->op == '=' && top->left == NULL && top->right == NULL);
1101 top->left = subre(v, '=', top->flags, top->begin, lp);
1102 top->op = '.';
1103 top->right = t;
1104
1105 /* if it's a backref, now is the time to replicate the subNFA */
1106 if (atomtype == BACKREF)
1107 {
1108 assert(atom->begin->nouts == 1); /* just the EMPTY */
1109 delsub(v->nfa, atom->begin, atom->end);
1110 assert(v->subs[subno] != NULL);
1111 /* and here's why the recursion got postponed: it must */
1112 /* wait until the skeleton is filled in, because it may */
1113 /* hit a backref that wants to copy the filled-in skeleton */
1114 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1115 atom->begin, atom->end);
1116 NOERR();
1117 }
1118
1119 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1120 if (m == 0)
1121 {
1122 EMPTYARC(s2, atom->end); /* the bypass */
1123 assert(PREF(qprefer) != 0);
1124 f = COMBINE(qprefer, atom->flags);
1125 t = subre(v, '|', f, lp, atom->end);
1126 NOERR();
1127 t->left = atom;
1128 t->right = subre(v, '|', PREF(f), s2, atom->end);
1129 NOERR();
1130 t->right->left = subre(v, '=', 0, s2, atom->end);
1131 NOERR();
1132 *atomp = t;
1133 atomp = &t->left;
1134 m = 1;
1135 }
5a419892 1136
c0a7edde
RN
1137 /* deal with the rest of the quantifier */
1138 if (atomtype == BACKREF)
1139 {
1140 /* special case: backrefs have internal quantifiers */
1141 EMPTYARC(s, atom->begin); /* empty prefix */
1142 /* just stuff everything into atom */
1143 repeat(v, atom->begin, atom->end, m, n);
1144 atom->min = (short) m;
1145 atom->max = (short) n;
1146 atom->flags |= COMBINE(qprefer, atom->flags);
1147 }
1148 else if (m == 1 && n == 1)
1149 {
1150 /* no/vacuous quantifier: done */
1151 EMPTYARC(s, atom->begin); /* empty prefix */
1152 }
1153 else
1154 {
1155 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1156 /* parens in only second x */
1157 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1158 assert(m >= 1 && m != INFINITY && n >= 1);
1159 repeat(v, s, atom->begin, m - 1, (n == INFINITY) ? n : n - 1);
1160 f = COMBINE(qprefer, atom->flags);
1161 t = subre(v, '.', f, s, atom->end); /* prefix and atom */
1162 NOERR();
1163 t->left = subre(v, '=', PREF(f), s, atom->begin);
1164 NOERR();
1165 t->right = atom;
1166 *atomp = t;
1167 }
5a419892 1168
c0a7edde
RN
1169 /* and finally, look after that postponed recursion */
1170 t = top->right;
1171 if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1172 t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1173 else
1174 {
1175 EMPTYARC(atom->end, rp);
1176 t->right = subre(v, '=', 0, atom->end, rp);
1177 }
1178 assert(SEE('|') || SEE(stopper) || SEE(EOS));
1179 t->flags |= COMBINE(t->flags, t->right->flags);
1180 top->flags |= COMBINE(top->flags, t->flags);
5a419892
VZ
1181}
1182
1183/*
c0a7edde 1184 * nonword - generate arcs for non-word-character ahead or behind
5a419892 1185 */
c0a7edde
RN
1186static void
1187nonword(struct vars * v,
1188 int dir, /* AHEAD or BEHIND */
1189 struct state * lp,
1190 struct state * rp)
5a419892 1191{
c0a7edde 1192 int anchor = (dir == AHEAD) ? '$' : '^';
5a419892 1193
c0a7edde
RN
1194 assert(dir == AHEAD || dir == BEHIND);
1195 newarc(v->nfa, anchor, 1, lp, rp);
1196 newarc(v->nfa, anchor, 0, lp, rp);
1197 colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1198 /* (no need for special attention to \n) */
5a419892
VZ
1199}
1200
1201/*
c0a7edde 1202 * word - generate arcs for word character ahead or behind
5a419892 1203 */
c0a7edde
RN
1204static void
1205word(struct vars * v,
1206 int dir, /* AHEAD or BEHIND */
1207 struct state * lp,
1208 struct state * rp)
5a419892 1209{
c0a7edde
RN
1210 assert(dir == AHEAD || dir == BEHIND);
1211 cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1212 /* (no need for special attention to \n) */
5a419892
VZ
1213}
1214
1215/*
c0a7edde 1216 * scannum - scan a number
5a419892 1217 */
c0a7edde
RN
1218static int /* value, <= DUPMAX */
1219scannum(struct vars * v)
5a419892 1220{
c0a7edde
RN
1221 int n = 0;
1222
1223 while (SEE(DIGIT) && n < DUPMAX)
1224 {
1225 n = n * 10 + v->nextvalue;
1226 NEXT();
1227 }
1228 if (SEE(DIGIT) || n > DUPMAX)
1229 {
1230 ERR(REG_BADBR);
1231 return 0;
1232 }
1233 return n;
5a419892
VZ
1234}
1235
1236/*
c0a7edde
RN
1237 * repeat - replicate subNFA for quantifiers
1238 *
1239 * The duplication sequences used here are chosen carefully so that any
1240 * pointers starting out pointing into the subexpression end up pointing into
1241 * the last occurrence. (Note that it may not be strung between the same
1242 * left and right end states, however!) This used to be important for the
1243 * subRE tree, although the important bits are now handled by the in-line
1244 * code in parse(), and when this is called, it doesn't matter any more.
5a419892
VZ
1245 */
1246static void
c0a7edde
RN
1247repeat(struct vars * v,
1248 struct state * lp,
1249 struct state * rp,
1250 int m,
1251 int n)
5a419892 1252{
c0a7edde
RN
1253#define SOME 2
1254#define INF 3
1255#define PAIR(x, y) ((x)*4 + (y))
1256#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1257 const int rm = REDUCE(m);
1258 const int rn = REDUCE(n);
1259 struct state *s;
1260 struct state *s2;
1261
1262 switch (PAIR(rm, rn))
1263 {
1264 case PAIR(0, 0): /* empty string */
1265 delsub(v->nfa, lp, rp);
1266 EMPTYARC(lp, rp);
1267 break;
1268 case PAIR(0, 1): /* do as x| */
1269 EMPTYARC(lp, rp);
1270 break;
1271 case PAIR(0, SOME): /* do as x{1,n}| */
1272 repeat(v, lp, rp, 1, n);
1273 NOERR();
1274 EMPTYARC(lp, rp);
1275 break;
1276 case PAIR(0, INF): /* loop x around */
1277 s = newstate(v->nfa);
1278 NOERR();
1279 moveouts(v->nfa, lp, s);
1280 moveins(v->nfa, rp, s);
1281 EMPTYARC(lp, s);
1282 EMPTYARC(s, rp);
1283 break;
1284 case PAIR(1, 1): /* no action required */
1285 break;
1286 case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1287 s = newstate(v->nfa);
1288 NOERR();
1289 moveouts(v->nfa, lp, s);
1290 dupnfa(v->nfa, s, rp, lp, s);
1291 NOERR();
1292 repeat(v, lp, s, 1, n - 1);
1293 NOERR();
1294 EMPTYARC(lp, s);
1295 break;
1296 case PAIR(1, INF): /* add loopback arc */
1297 s = newstate(v->nfa);
1298 s2 = newstate(v->nfa);
1299 NOERR();
1300 moveouts(v->nfa, lp, s);
1301 moveins(v->nfa, rp, s2);
1302 EMPTYARC(lp, s);
1303 EMPTYARC(s2, rp);
1304 EMPTYARC(s2, s);
1305 break;
1306 case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1307 s = newstate(v->nfa);
1308 NOERR();
1309 moveouts(v->nfa, lp, s);
1310 dupnfa(v->nfa, s, rp, lp, s);
1311 NOERR();
1312 repeat(v, lp, s, m - 1, n - 1);
1313 break;
1314 case PAIR(SOME, INF): /* do as x{m-1,}x */
1315 s = newstate(v->nfa);
1316 NOERR();
1317 moveouts(v->nfa, lp, s);
1318 dupnfa(v->nfa, s, rp, lp, s);
1319 NOERR();
1320 repeat(v, lp, s, m - 1, n);
1321 break;
1322 default:
1323 ERR(REG_ASSERT);
1324 break;
5a419892
VZ
1325 }
1326}
1327
1328/*
c0a7edde
RN
1329 * bracket - handle non-complemented bracket expression
1330 * Also called from cbracket for complemented bracket expressions.
5a419892
VZ
1331 */
1332static void
c0a7edde
RN
1333bracket(struct vars * v,
1334 struct state * lp,
1335 struct state * rp)
5a419892 1336{
c0a7edde
RN
1337 assert(SEE('['));
1338 NEXT();
1339 while (!SEE(']') && !SEE(EOS))
1340 brackpart(v, lp, rp);
1341 assert(SEE(']') || ISERR());
1342 okcolors(v->nfa, v->cm);
5a419892
VZ
1343}
1344
1345/*
c0a7edde
RN
1346 * cbracket - handle complemented bracket expression
1347 * We do it by calling bracket() with dummy endpoints, and then complementing
1348 * the result. The alternative would be to invoke rainbow(), and then delete
1349 * arcs as the b.e. is seen... but that gets messy.
5a419892
VZ
1350 */
1351static void
c0a7edde
RN
1352cbracket(struct vars * v,
1353 struct state * lp,
1354 struct state * rp)
5a419892 1355{
c0a7edde
RN
1356 struct state *left = newstate(v->nfa);
1357 struct state *right = newstate(v->nfa);
1358 struct state *s;
1359 struct arc *a; /* arc from lp */
1360 struct arc *ba; /* arc from left, from bracket() */
1361 struct arc *pa; /* MCCE-prototype arc */
1362 color co;
1363 chr *p;
1364 int i;
1365
1366 NOERR();
1367 bracket(v, left, right);
1368 if (v->cflags & REG_NLSTOP)
1369 newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1370 NOERR();
1371
1372 assert(lp->nouts == 0); /* all outarcs will be ours */
1373
1374 /* easy part of complementing */
1375 colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1376 NOERR();
1377 if (v->mcces == NULL)
1378 { /* no MCCEs -- we're done */
1379 dropstate(v->nfa, left);
1380 assert(right->nins == 0);
1381 freestate(v->nfa, right);
5a419892 1382 return;
c0a7edde
RN
1383 }
1384
1385 /* but complementing gets messy in the presence of MCCEs... */
1386 NOTE(REG_ULOCALE);
1387 for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--)
1388 {
1389 co = GETCOLOR(v->cm, *p);
1390 a = findarc(lp, PLAIN, co);
1391 ba = findarc(left, PLAIN, co);
1392 if (ba == NULL)
1393 {
1394 assert(a != NULL);
1395 freearc(v->nfa, a);
1396 }
1397 else
1398 assert(a == NULL);
1399 s = newstate(v->nfa);
1400 NOERR();
1401 newarc(v->nfa, PLAIN, co, lp, s);
1402 NOERR();
1403 pa = findarc(v->mccepbegin, PLAIN, co);
1404 assert(pa != NULL);
1405 if (ba == NULL)
1406 { /* easy case, need all of them */
1407 cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1408 newarc(v->nfa, '$', 1, s, rp);
1409 newarc(v->nfa, '$', 0, s, rp);
1410 colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
1411 }
1412 else
1413 { /* must be selective */
1414 if (findarc(ba->to, '$', 1) == NULL)
1415 {
1416 newarc(v->nfa, '$', 1, s, rp);
1417 newarc(v->nfa, '$', 0, s, rp);
1418 colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
1419 s, rp);
1420 }
1421 for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1422 if (findarc(ba->to, PLAIN, pa->co) == NULL)
1423 newarc(v->nfa, PLAIN, pa->co, s, rp);
1424 if (s->nouts == 0) /* limit of selectivity: none */
1425 dropstate(v->nfa, s); /* frees arc too */
1426 }
1427 NOERR();
1428 }
5a419892 1429
c0a7edde
RN
1430 delsub(v->nfa, left, right);
1431 assert(left->nouts == 0);
1432 freestate(v->nfa, left);
1433 assert(right->nins == 0);
1434 freestate(v->nfa, right);
1435}
5a419892 1436
c0a7edde
RN
1437/*
1438 * brackpart - handle one item (or range) within a bracket expression
1439 */
1440static void
1441brackpart(struct vars * v,
1442 struct state * lp,
1443 struct state * rp)
1444{
1445 celt startc;
1446 celt endc;
1447 struct cvec *cv;
1448 chr *startp;
1449 chr *endp;
1450 chr c[1];
1451
1452 /* parse something, get rid of special cases, take shortcuts */
1453 switch (v->nexttype)
1454 {
1455 case RANGE: /* a-b-c or other botch */
1456 ERR(REG_ERANGE);
1457 return;
5a419892 1458 break;
c0a7edde
RN
1459 case PLAIN:
1460 c[0] = v->nextvalue;
1461 NEXT();
1462 /* shortcut for ordinary chr (not range, not MCCE leader) */
1463 if (!SEE(RANGE) && !ISCELEADER(v, c[0]))
1464 {
1465 onechr(v, c[0], lp, rp);
1466 return;
1467 }
1468 startc = element(v, c, c + 1);
1469 NOERR();
5a419892 1470 break;
c0a7edde
RN
1471 case COLLEL:
1472 startp = v->now;
1473 endp = scanplain(v);
1474 INSIST(startp < endp, REG_ECOLLATE);
1475 NOERR();
1476 startc = element(v, startp, endp);
1477 NOERR();
1478 break;
1479 case ECLASS:
1480 startp = v->now;
1481 endp = scanplain(v);
1482 INSIST(startp < endp, REG_ECOLLATE);
1483 NOERR();
1484 startc = element(v, startp, endp);
1485 NOERR();
1486 cv = eclass(v, startc, (v->cflags & REG_ICASE));
1487 NOERR();
1488 dovec(v, cv, lp, rp);
1489 return;
5a419892 1490 break;
c0a7edde
RN
1491 case CCLASS:
1492 startp = v->now;
1493 endp = scanplain(v);
1494 INSIST(startp < endp, REG_ECTYPE);
1495 NOERR();
1496 cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
1497 NOERR();
1498 dovec(v, cv, lp, rp);
1499 return;
5a419892 1500 break;
c0a7edde
RN
1501 default:
1502 ERR(REG_ASSERT);
1503 return;
5a419892 1504 break;
c0a7edde
RN
1505 }
1506
1507 if (SEE(RANGE))
1508 {
1509 NEXT();
1510 switch (v->nexttype)
1511 {
1512 case PLAIN:
1513 case RANGE:
1514 c[0] = v->nextvalue;
1515 NEXT();
1516 endc = element(v, c, c + 1);
1517 NOERR();
5a419892 1518 break;
c0a7edde
RN
1519 case COLLEL:
1520 startp = v->now;
1521 endp = scanplain(v);
1522 INSIST(startp < endp, REG_ECOLLATE);
1523 NOERR();
1524 endc = element(v, startp, endp);
1525 NOERR();
5a419892 1526 break;
c0a7edde
RN
1527 default:
1528 ERR(REG_ERANGE);
1529 return;
5a419892
VZ
1530 break;
1531 }
c0a7edde
RN
1532 }
1533 else
1534 endc = startc;
1535
1536 /*
1537 * Ranges are unportable. Actually, standard C does guarantee that
1538 * digits are contiguous, but making that an exception is just too
1539 * complicated.
1540 */
1541 if (startc != endc)
1542 NOTE(REG_UUNPORT);
1543 cv = range(v, startc, endc, (v->cflags & REG_ICASE));
1544 NOERR();
1545 dovec(v, cv, lp, rp);
5a419892
VZ
1546}
1547
1548/*
c0a7edde
RN
1549 * scanplain - scan PLAIN contents of [. etc.
1550 *
1551 * Certain bits of trickery in lex.c know that this code does not try
1552 * to look past the final bracket of the [. etc.
5a419892 1553 */
c0a7edde
RN
1554static chr * /* just after end of sequence */
1555scanplain(struct vars * v)
5a419892 1556{
c0a7edde
RN
1557 chr *endp;
1558
1559 assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1560 NEXT();
1561
1562 endp = v->now;
1563 while (SEE(PLAIN))
1564 {
1565 endp = v->now;
1566 NEXT();
1567 }
1568
1569 assert(SEE(END) || ISERR());
1570 NEXT();
1571
1572 return endp;
5a419892
VZ
1573}
1574
1575/*
c0a7edde
RN
1576 * leaders - process a cvec of collating elements to also include leaders
1577 * Also gives all characters involved their own colors, which is almost
1578 * certainly necessary, and sets up little disconnected subNFA.
5a419892 1579 */
c0a7edde
RN
1580static void
1581leaders(struct vars * v,
1582 struct cvec * cv)
5a419892 1583{
c0a7edde
RN
1584 int mcce;
1585 chr *p;
1586 chr leader;
1587 struct state *s;
1588 struct arc *a;
1589
1590 v->mccepbegin = newstate(v->nfa);
1591 v->mccepend = newstate(v->nfa);
1592 NOERR();
1593
1594 for (mcce = 0; mcce < cv->nmcces; mcce++)
1595 {
1596 p = cv->mcces[mcce];
1597 leader = *p;
1598 if (!haschr(cv, leader))
1599 {
1600 addchr(cv, leader);
1601 s = newstate(v->nfa);
1602 newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
1603 v->mccepbegin, s);
1604 okcolors(v->nfa, v->cm);
5a419892 1605 }
c0a7edde
RN
1606 else
1607 {
1608 a = findarc(v->mccepbegin, PLAIN,
1609 GETCOLOR(v->cm, leader));
1610 assert(a != NULL);
1611 s = a->to;
1612 assert(s != v->mccepend);
5a419892 1613 }
c0a7edde
RN
1614 p++;
1615 assert(*p != 0 && *(p + 1) == 0); /* only 2-char MCCEs for
1616 * now */
1617 newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1618 okcolors(v->nfa, v->cm);
5a419892 1619 }
5a419892
VZ
1620}
1621
1622/*
c0a7edde
RN
1623 * onechr - fill in arcs for a plain character, and possible case complements
1624 * This is mostly a shortcut for efficient handling of the common case.
5a419892
VZ
1625 */
1626static void
c0a7edde
RN
1627onechr(struct vars * v,
1628 chr c,
1629 struct state * lp,
1630 struct state * rp)
5a419892 1631{
c0a7edde
RN
1632 if (!(v->cflags & REG_ICASE))
1633 {
1634 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1635 return;
1636 }
1637
1638 /* rats, need general case anyway... */
1639 dovec(v, allcases(v, c), lp, rp);
5a419892
VZ
1640}
1641
1642/*
c0a7edde
RN
1643 * dovec - fill in arcs for each element of a cvec
1644 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
5a419892 1645 */
c0a7edde
RN
1646static void
1647dovec(struct vars * v,
1648 struct cvec * cv,
1649 struct state * lp,
1650 struct state * rp)
5a419892 1651{
c0a7edde
RN
1652 chr ch,
1653 from,
1654 to;
1655 celt ce;
1656 chr *p;
1657 int i;
1658 color co;
1659 struct cvec *leads;
1660 struct arc *a;
1661 struct arc *pa; /* arc in prototype */
1662 struct state *s;
1663 struct state *ps; /* state in prototype */
1664
1665 /* need a place to store leaders, if any */
1666 if (nmcces(v) > 0)
1667 {
1668 assert(v->mcces != NULL);
1669 if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs)
1670 {
1671 if (v->cv2 != NULL)
1672 free(v->cv2);
1673 v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1674 NOERR();
1675 leads = v->cv2;
5a419892 1676 }
c0a7edde
RN
1677 else
1678 leads = clearcvec(v->cv2);
1679 }
1680 else
1681 leads = NULL;
1682
1683 /* first, get the ordinary characters out of the way */
1684 for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--)
1685 {
1686 ch = *p;
1687 if (!ISCELEADER(v, ch))
1688 newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1689 else
1690 {
1691 assert(singleton(v->cm, ch));
1692 assert(leads != NULL);
1693 if (!haschr(leads, ch))
1694 addchr(leads, ch);
1695 }
1696 }
5a419892 1697
c0a7edde
RN
1698 /* and the ranges */
1699 for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--)
1700 {
1701 from = *p;
1702 to = *(p + 1);
1703 while (from <= to && (ce = nextleader(v, from, to)) != NOCELT)
1704 {
1705 if (from < ce)
1706 subrange(v, from, ce - 1, lp, rp);
1707 assert(singleton(v->cm, ce));
1708 assert(leads != NULL);
1709 if (!haschr(leads, ce))
1710 addchr(leads, ce);
1711 from = ce + 1;
1712 }
1713 if (from <= to)
1714 subrange(v, from, to, lp, rp);
5a419892
VZ
1715 }
1716
c0a7edde
RN
1717 if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1718 return;
5a419892 1719
c0a7edde
RN
1720 /* deal with the MCCE leaders */
1721 NOTE(REG_ULOCALE);
1722 for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--)
1723 {
1724 co = GETCOLOR(v->cm, *p);
1725 a = findarc(lp, PLAIN, co);
1726 if (a != NULL)
1727 s = a->to;
1728 else
1729 {
1730 s = newstate(v->nfa);
1731 NOERR();
1732 newarc(v->nfa, PLAIN, co, lp, s);
1733 NOERR();
1734 }
1735 pa = findarc(v->mccepbegin, PLAIN, co);
1736 assert(pa != NULL);
1737 ps = pa->to;
1738 newarc(v->nfa, '$', 1, s, rp);
1739 newarc(v->nfa, '$', 0, s, rp);
1740 colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1741 NOERR();
1742 }
1743
1744 /* and the MCCEs */
1745 for (i = 0; i < cv->nmcces; i++)
1746 {
1747 p = cv->mcces[i];
1748 assert(singleton(v->cm, *p));
1749 if (!singleton(v->cm, *p))
1750 {
1751 ERR(REG_ASSERT);
1752 return;
1753 }
1754 ch = *p++;
1755 co = GETCOLOR(v->cm, ch);
1756 a = findarc(lp, PLAIN, co);
1757 if (a != NULL)
1758 s = a->to;
1759 else
1760 {
1761 s = newstate(v->nfa);
1762 NOERR();
1763 newarc(v->nfa, PLAIN, co, lp, s);
1764 NOERR();
1765 }
1766 assert(*p != 0); /* at least two chars */
1767 assert(singleton(v->cm, *p));
1768 ch = *p++;
1769 co = GETCOLOR(v->cm, ch);
1770 assert(*p == 0); /* and only two, for now */
1771 newarc(v->nfa, PLAIN, co, s, rp);
1772 NOERR();
1773 }
5a419892
VZ
1774}
1775
1776/*
c0a7edde 1777 * nextleader - find next MCCE leader within range
5a419892 1778 */
c0a7edde
RN
1779static celt /* NOCELT means none */
1780nextleader(struct vars * v,
1781 chr from,
1782 chr to)
5a419892 1783{
c0a7edde
RN
1784 int i;
1785 chr *p;
1786 chr ch;
1787 celt it = NOCELT;
1788
1789 if (v->mcces == NULL)
1790 return it;
1791
1792 for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++)
1793 {
1794 ch = *p;
1795 if (from <= ch && ch <= to)
1796 if (it == NOCELT || ch < it)
1797 it = ch;
1798 }
1799 return it;
5a419892
VZ
1800}
1801
1802/*
c0a7edde
RN
1803 * wordchrs - set up word-chr list for word-boundary stuff, if needed
1804 *
1805 * The list is kept as a bunch of arcs between two dummy states; it's
1806 * disposed of by the unreachable-states sweep in NFA optimization.
1807 * Does NEXT(). Must not be called from any unusual lexical context.
1808 * This should be reconciled with the \w etc. handling in lex.c, and
1809 * should be cleaned up to reduce dependencies on input scanning.
5a419892
VZ
1810 */
1811static void
c0a7edde 1812wordchrs(struct vars * v)
5a419892 1813{
c0a7edde
RN
1814 struct state *left;
1815 struct state *right;
5a419892 1816
c0a7edde
RN
1817 if (v->wordchrs != NULL)
1818 {
1819 NEXT(); /* for consistency */
5a419892
VZ
1820 return;
1821 }
1822
c0a7edde
RN
1823 left = newstate(v->nfa);
1824 right = newstate(v->nfa);
1825 NOERR();
1826 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1827 lexword(v);
1828 NEXT();
1829 assert(v->savenow != NULL && SEE('['));
1830 bracket(v, left, right);
1831 assert((v->savenow != NULL && SEE(']')) || ISERR());
1832 NEXT();
1833 NOERR();
1834 v->wordchrs = left;
5a419892
VZ
1835}
1836
1837/*
c0a7edde 1838 * subre - allocate a subre
5a419892 1839 */
c0a7edde
RN
1840static struct subre *
1841subre(struct vars * v,
1842 int op,
1843 int flags,
1844 struct state * begin,
1845 struct state * end)
5a419892 1846{
c0a7edde 1847 struct subre *ret;
5a419892 1848
c0a7edde
RN
1849 ret = v->treefree;
1850 if (ret != NULL)
1851 v->treefree = ret->left;
1852 else
1853 {
1854 ret = (struct subre *) MALLOC(sizeof(struct subre));
1855 if (ret == NULL)
1856 {
1857 ERR(REG_ESPACE);
1858 return NULL;
1859 }
1860 ret->chain = v->treechain;
1861 v->treechain = ret;
5a419892
VZ
1862 }
1863
c0a7edde 1864 assert(strchr("|.b(=", op) != NULL);
5a419892 1865
c0a7edde
RN
1866 ret->op = op;
1867 ret->flags = flags;
1868 ret->retry = 0;
1869 ret->subno = 0;
1870 ret->min = ret->max = 1;
1871 ret->left = NULL;
1872 ret->right = NULL;
1873 ret->begin = begin;
1874 ret->end = end;
1875 ZAPCNFA(ret->cnfa);
1876
1877 return ret;
5a419892
VZ
1878}
1879
1880/*
c0a7edde 1881 * freesubre - free a subRE subtree
5a419892 1882 */
c0a7edde
RN
1883static void
1884freesubre(struct vars * v, /* might be NULL */
1885 struct subre * sr)
5a419892 1886{
c0a7edde
RN
1887 if (sr == NULL)
1888 return;
1889
1890 if (sr->left != NULL)
1891 freesubre(v, sr->left);
1892 if (sr->right != NULL)
1893 freesubre(v, sr->right);
1894
1895 freesrnode(v, sr);
5a419892
VZ
1896}
1897
1898/*
c0a7edde 1899 * freesrnode - free one node in a subRE subtree
5a419892
VZ
1900 */
1901static void
c0a7edde
RN
1902freesrnode(struct vars * v, /* might be NULL */
1903 struct subre * sr)
5a419892 1904{
c0a7edde
RN
1905 if (sr == NULL)
1906 return;
1907
1908 if (!NULLCNFA(sr->cnfa))
1909 freecnfa(&sr->cnfa);
1910 sr->flags = 0;
1911
1912 if (v != NULL)
1913 {
1914 sr->left = v->treefree;
1915 v->treefree = sr;
1916 }
1917 else
1918 FREE(sr);
5a419892
VZ
1919}
1920
1921/*
c0a7edde 1922 * optst - optimize a subRE subtree
5a419892
VZ
1923 */
1924static void
c0a7edde
RN
1925optst(struct vars * v,
1926 struct subre * t)
5a419892 1927{
c0a7edde
RN
1928 if (t == NULL)
1929 return;
1930
1931 /* recurse through children */
1932 if (t->left != NULL)
1933 optst(v, t->left);
1934 if (t->right != NULL)
1935 optst(v, t->right);
5a419892
VZ
1936}
1937
1938/*
c0a7edde 1939 * numst - number tree nodes (assigning retry indexes)
5a419892 1940 */
c0a7edde
RN
1941static int /* next number */
1942numst(struct subre * t,
1943 int start) /* starting point for subtree numbers */
5a419892 1944{
c0a7edde
RN
1945 int i;
1946
1947 assert(t != NULL);
1948
1949 i = start;
1950 t->retry = (short) i++;
1951 if (t->left != NULL)
1952 i = numst(t->left, i);
1953 if (t->right != NULL)
1954 i = numst(t->right, i);
1955 return i;
5a419892
VZ
1956}
1957
1958/*
c0a7edde 1959 * markst - mark tree nodes as INUSE
5a419892 1960 */
c0a7edde
RN
1961static void
1962markst(struct subre * t)
5a419892 1963{
c0a7edde
RN
1964 assert(t != NULL);
1965
1966 t->flags |= INUSE;
1967 if (t->left != NULL)
1968 markst(t->left);
1969 if (t->right != NULL)
1970 markst(t->right);
5a419892
VZ
1971}
1972
1973/*
c0a7edde 1974 * cleanst - free any tree nodes not marked INUSE
5a419892
VZ
1975 */
1976static void
c0a7edde 1977cleanst(struct vars * v)
5a419892 1978{
c0a7edde
RN
1979 struct subre *t;
1980 struct subre *next;
1981
1982 for (t = v->treechain; t != NULL; t = next)
1983 {
1984 next = t->chain;
1985 if (!(t->flags & INUSE))
1986 FREE(t);
5a419892 1987 }
c0a7edde
RN
1988 v->treechain = NULL;
1989 v->treefree = NULL; /* just on general principles */
5a419892
VZ
1990}
1991
1992/*
c0a7edde 1993 * nfatree - turn a subRE subtree into a tree of compacted NFAs
5a419892 1994 */
c0a7edde
RN
1995static long /* optimize results from top node */
1996nfatree(struct vars * v,
1997 struct subre * t,
1998 FILE *f) /* for debug output */
5a419892 1999{
c0a7edde
RN
2000 assert(t != NULL && t->begin != NULL);
2001
2002 if (t->left != NULL)
2003 (DISCARD) nfatree(v, t->left, f);
2004 if (t->right != NULL)
2005 (DISCARD) nfatree(v, t->right, f);
2006
2007 return nfanode(v, t, f);
5a419892
VZ
2008}
2009
2010/*
c0a7edde 2011 * nfanode - do one NFA for nfatree
5a419892 2012 */
c0a7edde
RN
2013static long /* optimize results */
2014nfanode(struct vars * v,
2015 struct subre * t,
2016 FILE *f) /* for debug output */
5a419892 2017{
c0a7edde
RN
2018 struct nfa *nfa;
2019 long ret = 0;
5a419892 2020
c0a7edde 2021 assert(t->begin != NULL);
5a419892 2022
c0a7edde
RN
2023#ifdef REG_DEBUG
2024 if (f != NULL)
2025 {
2026 char idbuf[50];
5a419892 2027
c0a7edde
RN
2028 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
2029 stid(t, idbuf, sizeof(idbuf)));
2030 }
2031#endif
2032 nfa = newnfa(v, v->cm, v->nfa);
2033 NOERRZ();
2034 dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
2035 if (!ISERR())
2036 {
2037 specialcolors(nfa);
2038 ret = optimize(nfa, f);
2039 }
2040 if (!ISERR())
2041 compact(nfa, &t->cnfa);
2042
2043 freenfa(nfa);
2044 return ret;
5a419892
VZ
2045}
2046
2047/*
c0a7edde 2048 * newlacon - allocate a lookahead-constraint subRE
5a419892 2049 */
c0a7edde
RN
2050static int /* lacon number */
2051newlacon(struct vars * v,
2052 struct state * begin,
2053 struct state * end,
2054 int pos)
5a419892 2055{
c0a7edde
RN
2056 int n;
2057 struct subre *sub;
2058
2059 if (v->nlacons == 0)
2060 {
2061 v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
2062 n = 1; /* skip 0th */
2063 v->nlacons = 2;
5a419892 2064 }
c0a7edde
RN
2065 else
2066 {
2067 v->lacons = (struct subre *) REALLOC(v->lacons,
2068 (v->nlacons + 1) * sizeof(struct subre));
2069 n = v->nlacons++;
5a419892 2070 }
c0a7edde
RN
2071 if (v->lacons == NULL)
2072 {
2073 ERR(REG_ESPACE);
2074 return 0;
5a419892 2075 }
c0a7edde
RN
2076 sub = &v->lacons[n];
2077 sub->begin = begin;
2078 sub->end = end;
2079 sub->subno = pos;
2080 ZAPCNFA(sub->cnfa);
2081 return n;
5a419892
VZ
2082}
2083
2084/*
c0a7edde 2085 * freelacons - free lookahead-constraint subRE vector
5a419892
VZ
2086 */
2087static void
c0a7edde
RN
2088freelacons(struct subre * subs,
2089 int n)
5a419892 2090{
c0a7edde
RN
2091 struct subre *sub;
2092 int i;
2093
2094 assert(n > 0);
2095 for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
2096 if (!NULLCNFA(sub->cnfa))
2097 freecnfa(&sub->cnfa);
2098 FREE(subs);
5a419892
VZ
2099}
2100
2101/*
c0a7edde 2102 * rfree - free a whole RE (insides of regfree)
5a419892
VZ
2103 */
2104static void
c0a7edde 2105rfree(regex_t *re)
5a419892 2106{
c0a7edde 2107 struct guts *g;
5a419892 2108
c0a7edde 2109 if (re == NULL || re->re_magic != REMAGIC)
5a419892
VZ
2110 return;
2111
c0a7edde
RN
2112 re->re_magic = 0; /* invalidate RE */
2113 g = (struct guts *) re->re_guts;
2114 re->re_guts = NULL;
2115 re->re_fns = NULL;
2116 g->magic = 0;
2117 freecm(&g->cmap);
2118 if (g->tree != NULL)
2119 freesubre((struct vars *) NULL, g->tree);
2120 if (g->lacons != NULL)
2121 freelacons(g->lacons, g->nlacons);
2122 if (!NULLCNFA(g->search))
2123 freecnfa(&g->search);
2124 FREE(g);
5a419892
VZ
2125}
2126
c0a7edde
RN
2127#ifdef REG_DEBUG
2128
5a419892 2129/*
c0a7edde 2130 * dump - dump an RE in human-readable form
5a419892
VZ
2131 */
2132static void
c0a7edde
RN
2133dump(regex_t *re,
2134 FILE *f)
5a419892 2135{
c0a7edde
RN
2136 struct guts *g;
2137 int i;
2138
2139 if (re->re_magic != REMAGIC)
2140 fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2141 REMAGIC);
2142 if (re->re_guts == NULL)
2143 {
2144 fprintf(f, "NULL guts!!!\n");
2145 return;
5a419892 2146 }
c0a7edde
RN
2147 g = (struct guts *) re->re_guts;
2148 if (g->magic != GUTSMAGIC)
2149 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2150 GUTSMAGIC);
2151
2152 fprintf(f, "\n\n\n========= DUMP ==========\n");
2153 fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2154 re->re_nsub, re->re_info, re->re_csize, g->ntree);
2155
2156 dumpcolors(&g->cmap, f);
2157 if (!NULLCNFA(g->search))
2158 {
2159 printf("\nsearch:\n");
2160 dumpcnfa(&g->search, f);
2161 }
2162 for (i = 1; i < g->nlacons; i++)
2163 {
2164 fprintf(f, "\nla%d (%s):\n", i,
2165 (g->lacons[i].subno) ? "positive" : "negative");
2166 dumpcnfa(&g->lacons[i].cnfa, f);
2167 }
2168 fprintf(f, "\n");
2169 dumpst(g->tree, f, 0);
5a419892
VZ
2170}
2171
2172/*
c0a7edde 2173 * dumpst - dump a subRE tree
5a419892
VZ
2174 */
2175static void
c0a7edde
RN
2176dumpst(struct subre * t,
2177 FILE *f,
2178 int nfapresent) /* is the original NFA still around? */
5a419892 2179{
c0a7edde
RN
2180 if (t == NULL)
2181 fprintf(f, "null tree\n");
2182 else
2183 stdump(t, f, nfapresent);
2184 fflush(f);
2185}
5a419892 2186
c0a7edde
RN
2187/*
2188 * stdump - recursive guts of dumpst
2189 */
2190static void
2191stdump(struct subre * t,
2192 FILE *f,
2193 int nfapresent) /* is the original NFA still around? */
2194{
2195 char idbuf[50];
2196
2197 fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2198 if (t->flags & LONGER)
2199 fprintf(f, " longest");
2200 if (t->flags & SHORTER)
2201 fprintf(f, " shortest");
2202 if (t->flags & MIXED)
2203 fprintf(f, " hasmixed");
2204 if (t->flags & CAP)
2205 fprintf(f, " hascapture");
2206 if (t->flags & BACKR)
2207 fprintf(f, " hasbackref");
2208 if (!(t->flags & INUSE))
2209 fprintf(f, " UNUSED");
2210 if (t->subno != 0)
2211 fprintf(f, " (#%d)", t->subno);
2212 if (t->min != 1 || t->max != 1)
2213 {
2214 fprintf(f, " {%d,", t->min);
2215 if (t->max != INFINITY)
2216 fprintf(f, "%d", t->max);
2217 fprintf(f, "}");
5a419892 2218 }
c0a7edde
RN
2219 if (nfapresent)
2220 fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
2221 if (t->left != NULL)
2222 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2223 if (t->right != NULL)
2224 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2225 if (!NULLCNFA(t->cnfa))
2226 {
2227 fprintf(f, "\n");
2228 dumpcnfa(&t->cnfa, f);
2229 fprintf(f, "\n");
5a419892 2230 }
c0a7edde
RN
2231 if (t->left != NULL)
2232 stdump(t->left, f, nfapresent);
2233 if (t->right != NULL)
2234 stdump(t->right, f, nfapresent);
5a419892
VZ
2235}
2236
2237/*
c0a7edde 2238 * stid - identify a subtree node for dumping
5a419892 2239 */
c0a7edde
RN
2240static char * /* points to buf or constant string */
2241stid(struct subre * t,
2242 char *buf,
2243 size_t bufsize)
5a419892 2244{
c0a7edde
RN
2245 /* big enough for hex int or decimal t->retry? */
2246 if (bufsize < sizeof(int) * 2 + 3 || bufsize < sizeof(t->retry) * 3 + 1)
2247 return "unable";
2248 if (t->retry != 0)
2249 sprintf(buf, "%d", t->retry);
2250 else
2251 sprintf(buf, "0x%x", (int) t); /* may lose bits, that's okay */
2252 return buf;
5a419892 2253}
c0a7edde
RN
2254#endif /* REG_DEBUG */
2255
2256
2257#include "regc_lex.c"
2258#include "regc_color.c"
2259#include "regc_nfa.c"
2260#include "regc_cvec.c"
2261#include "regc_locale.c"