]> git.saurik.com Git - wxWidgets.git/blob - src/regex/regcomp.c
reminder added
[wxWidgets.git] / src / regex / regcomp.c
1 /*
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regcomp.c,v 1.38 2003/08/08 21:41:56 momjian Exp $
32 *
33 */
34
35 #include "regguts.h"
36
37 /*
38 * forward declarations, up here so forward datatypes etc. are defined early
39 */
40 /* === regcomp.c === */
41 static void moresubs(struct vars *, int);
42 static int freev(struct vars *, int);
43 static void makesearch(struct vars *, struct nfa *);
44 static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
45 static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
46 static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
47 static void nonword(struct vars *, int, struct state *, struct state *);
48 static void word(struct vars *, int, struct state *, struct state *);
49 static int scannum(struct vars *);
50 static void repeat(struct vars *, struct state *, struct state *, int, int);
51 static void bracket(struct vars *, struct state *, struct state *);
52 static void cbracket(struct vars *, struct state *, struct state *);
53 static void brackpart(struct vars *, struct state *, struct state *);
54 static chr *scanplain(struct vars *);
55 static void leaders(struct vars *, struct cvec *);
56 static void onechr(struct vars *, chr, struct state *, struct state *);
57 static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
58 static celt nextleader(struct vars *, chr, chr);
59 static void wordchrs(struct vars *);
60 static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
61 static void freesubre(struct vars *, struct subre *);
62 static void freesrnode(struct vars *, struct subre *);
63 static void optst(struct vars *, struct subre *);
64 static int numst(struct subre *, int);
65 static void markst(struct subre *);
66 static void cleanst(struct vars *);
67 static long nfatree(struct vars *, struct subre *, FILE *);
68 static long nfanode(struct vars *, struct subre *, FILE *);
69 static int newlacon(struct vars *, struct state *, struct state *, int);
70 static void freelacons(struct subre *, int);
71 static void rfree(regex_t *);
72
73 #ifdef REG_DEBUG
74 static void dump(regex_t *, FILE *);
75 static void dumpst(struct subre *, FILE *, int);
76 static void stdump(struct subre *, FILE *, int);
77 static char *stid(struct subre *, char *, size_t);
78 #endif
79 /* === regc_lex.c === */
80 static void lexstart(struct vars *);
81 static void prefixes(struct vars *);
82 static void lexnest(struct vars *, chr *, chr *);
83 static void lexword(struct vars *);
84 static int next(struct vars *);
85 static int lexescape(struct vars *);
86 static chr lexdigits(struct vars *, int, int, int);
87 static int brenext(struct vars *, chr);
88 static void skip(struct vars *);
89 static chr newline(void);
90 static chr chrnamed(struct vars *, chr *, chr *, chr);
91
92 /* === regc_color.c === */
93 static void initcm(struct vars *, struct colormap *);
94 static void freecm(struct colormap *);
95 static void cmtreefree(struct colormap *, union tree *, int);
96 static color setcolor(struct colormap *, chr, pcolor);
97 static color maxcolor(struct colormap *);
98 static color newcolor(struct colormap *);
99 static void freecolor(struct colormap *, pcolor);
100 static color pseudocolor(struct colormap *);
101 static color subcolor(struct colormap *, chr c);
102 static color newsub(struct colormap *, pcolor);
103 static void subrange(struct vars *, chr, chr, struct state *, struct state *);
104 static void subblock(struct vars *, chr, struct state *, struct state *);
105 static void okcolors(struct nfa *, struct colormap *);
106 static void colorchain(struct colormap *, struct arc *);
107 static void uncolorchain(struct colormap *, struct arc *);
108 static int singleton(struct colormap *, chr c);
109 static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
110 static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
111
112 #ifdef REG_DEBUG
113 static void dumpcolors(struct colormap *, FILE *);
114 static void fillcheck(struct colormap *, union tree *, int, FILE *);
115 static void dumpchr(chr, FILE *);
116 #endif
117 /* === regc_nfa.c === */
118 static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
119 static void freenfa(struct nfa *);
120 static struct state *newstate(struct nfa *);
121 static struct state *newfstate(struct nfa *, int flag);
122 static void dropstate(struct nfa *, struct state *);
123 static void freestate(struct nfa *, struct state *);
124 static void destroystate(struct nfa *, struct state *);
125 static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
126 static struct arc *allocarc(struct nfa *, struct state *);
127 static void freearc(struct nfa *, struct arc *);
128 static struct arc *findarc(struct state *, int, pcolor);
129 static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
130 static void moveins(struct nfa *, struct state *, struct state *);
131 static void copyins(struct nfa *, struct state *, struct state *);
132 static void moveouts(struct nfa *, struct state *, struct state *);
133 static void copyouts(struct nfa *, struct state *, struct state *);
134 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
135 static void delsub(struct nfa *, struct state *, struct state *);
136 static void deltraverse(struct nfa *, struct state *, struct state *);
137 static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
138 static void duptraverse(struct nfa *, struct state *, struct state *);
139 static void cleartraverse(struct nfa *, struct state *);
140 static void specialcolors(struct nfa *);
141 static long optimize(struct nfa *, FILE *);
142 static void pullback(struct nfa *, FILE *);
143 static int pull(struct nfa *, struct arc *);
144 static void pushfwd(struct nfa *, FILE *);
145 static int push(struct nfa *, struct arc *);
146
147 #define INCOMPATIBLE 1 /* destroys arc */
148 #define SATISFIED 2 /* constraint satisfied */
149 #define COMPATIBLE 3 /* compatible but not satisfied yet */
150 static int combine(struct arc *, struct arc *);
151 static void fixempties(struct nfa *, FILE *);
152 static int unempty(struct nfa *, struct arc *);
153 static void cleanup(struct nfa *);
154 static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
155 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
156 static long analyze(struct nfa *);
157 static void compact(struct nfa *, struct cnfa *);
158 static void carcsort(struct carc *, struct carc *);
159 static void freecnfa(struct cnfa *);
160 static void dumpnfa(struct nfa *, FILE *);
161
162 #ifdef REG_DEBUG
163 static void dumpstate(struct state *, FILE *);
164 static void dumparcs(struct state *, FILE *);
165 static int dumprarcs(struct arc *, struct state *, FILE *, int);
166 static void dumparc(struct arc *, struct state *, FILE *);
167 static void dumpcnfa(struct cnfa *, FILE *);
168 static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
169 #endif
170 /* === regc_cvec.c === */
171 static struct cvec *newcvec(int, int, int);
172 static struct cvec *clearcvec(struct cvec *);
173 static void addchr(struct cvec *, chr);
174 static void addrange(struct cvec *, chr, chr);
175 static void addmcce(struct cvec *, chr *, chr *);
176 static int haschr(struct cvec *, chr);
177 static struct cvec *getcvec(struct vars *, int, int, int);
178 static void freecvec(struct cvec *);
179
180 /* === regc_locale.c === */
181 static int wx_isdigit(wx_wchar c);
182 static int wx_isalpha(wx_wchar c);
183 static int wx_isalnum(wx_wchar c);
184 static int wx_isupper(wx_wchar c);
185 static int wx_islower(wx_wchar c);
186 static int wx_isgraph(wx_wchar c);
187 static int wx_ispunct(wx_wchar c);
188 static int wx_isspace(wx_wchar c);
189 static wx_wchar wx_toupper(wx_wchar c);
190 static wx_wchar wx_tolower(wx_wchar c);
191 static int nmcces(struct vars *);
192 static int nleaders(struct vars *);
193 static struct cvec *allmcces(struct vars *, struct cvec *);
194 static celt element(struct vars *, chr *, chr *);
195 static struct cvec *range(struct vars *, celt, celt, int);
196 static int before(celt, celt);
197 static struct cvec *eclass(struct vars *, celt, int);
198 static struct cvec *cclass(struct vars *, chr *, chr *, int);
199 static struct cvec *allcases(struct vars *, chr);
200 static int cmp(const chr *, const chr *, size_t);
201 static int casecmp(const chr *, const chr *, size_t);
202
203
204 /* internal variables, bundled for easy passing around */
205 struct vars
206 {
207 regex_t *re;
208 chr *now; /* scan pointer into string */
209 chr *stop; /* end of string */
210 chr *savenow; /* saved now and stop for "subroutine
211 * call" */
212 chr *savestop;
213 int err; /* error code (0 if none) */
214 int cflags; /* copy of compile flags */
215 int lasttype; /* type of previous token */
216 int nexttype; /* type of next token */
217 chr nextvalue; /* value (if any) of next token */
218 int lexcon; /* lexical context type (see lex.c) */
219 int nsubexp; /* subexpression count */
220 struct subre **subs; /* subRE pointer vector */
221 size_t nsubs; /* length of vector */
222 struct subre *sub10[10]; /* initial vector, enough for most */
223 struct nfa *nfa; /* the NFA */
224 struct colormap *cm; /* character color map */
225 color nlcolor; /* color of newline */
226 struct state *wordchrs; /* state in nfa holding word-char outarcs */
227 struct subre *tree; /* subexpression tree */
228 struct subre *treechain; /* all tree nodes allocated */
229 struct subre *treefree; /* any free tree nodes */
230 int ntree; /* number of tree nodes */
231 struct cvec *cv; /* interface cvec */
232 struct cvec *cv2; /* utility cvec */
233 struct cvec *mcces; /* collating-element information */
234 #define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
235 struct state *mccepbegin; /* in nfa, start of MCCE prototypes */
236 struct state *mccepend; /* in nfa, end of MCCE prototypes */
237 struct subre *lacons; /* lookahead-constraint vector */
238 int nlacons; /* size of lacons */
239 };
240
241 /* parsing macros; most know that `v' is the struct vars pointer */
242 #define NEXT() (next(v)) /* advance by one token */
243 #define SEE(t) (v->nexttype == (t)) /* is next token this? */
244 #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
245 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
246 #define ISERR() VISERR(v)
247 #define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
248 ((vv)->err = (e)))
249 #define ERR(e) VERR(v, e) /* record an error */
250 #define NOERR() {if (ISERR()) return;} /* if error seen, return */
251 #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
252 #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
253 #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false,
254 * error */
255 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
256 #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
257
258 /* token type codes, some also used as NFA arc types */
259 #define EMPTY 'n' /* no token present */
260 #define EOS 'e' /* end of string */
261 #define PLAIN 'p' /* ordinary character */
262 #define DIGIT 'd' /* digit (in bound) */
263 #define BACKREF 'b' /* back reference */
264 #define COLLEL 'I' /* start of [. */
265 #define ECLASS 'E' /* start of [= */
266 #define CCLASS 'C' /* start of [: */
267 #define END 'X' /* end of [. [= [: */
268 #define RANGE 'R' /* - within [] which might be range delim. */
269 #define LACON 'L' /* lookahead constraint subRE */
270 #define AHEAD 'a' /* color-lookahead arc */
271 #define BEHIND 'r' /* color-lookbehind arc */
272 #define WBDRY 'w' /* word boundary constraint */
273 #define NWBDRY 'W' /* non-word-boundary constraint */
274 #define SBEGIN 'A' /* beginning of string (even if not BOL) */
275 #define SEND 'Z' /* end of string (even if not EOL) */
276 #define PREFER 'P' /* length preference */
277
278 /* is an arc colored, and hence on a color chain? */
279 #define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
280 (a)->type == BEHIND)
281
282
283
284 /* static function list */
285 static struct fns functions = {
286 rfree, /* regfree insides */
287 };
288
289
290
291 /*
292 * regcomp - compile regular expression
293 */
294 int
295 regcomp(regex_t *re,
296 const chr *string,
297 int flags)
298 {
299
300 size_t nLen = 0;
301 chr* s2 = (chr*) string;
302
303 if (string && *string)
304 {
305 while(*++s2);
306 }
307
308 nLen = ((s2 - string) / sizeof(chr));
309
310 return wx_regcomp(re, string, nLen, flags);
311 }
312 int
313 wx_regcomp(regex_t *re,
314 const chr *string,
315 size_t len,
316 int flags)
317 {
318 struct vars var;
319 struct vars *v = &var;
320 struct guts *g;
321 int i;
322 size_t j;
323
324 #ifdef REG_DEBUG
325 FILE *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
326
327 #else
328 FILE *debug = (FILE *) NULL;
329 #endif
330
331 #define CNOERR() { if (ISERR()) return freev(v, v->err); }
332
333 /* sanity checks */
334
335 if (re == NULL || string == NULL)
336 return REG_INVARG;
337 if ((flags & REG_QUOTE) &&
338 (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
339 return REG_INVARG;
340 if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
341 return REG_INVARG;
342
343 /* initial setup (after which freev() is callable) */
344 v->re = re;
345 v->now = (chr *) string;
346 v->stop = v->now + len;
347 v->savenow = v->savestop = NULL;
348 v->err = 0;
349 v->cflags = flags;
350 v->nsubexp = 0;
351 v->subs = v->sub10;
352 v->nsubs = 10;
353 for (j = 0; j < v->nsubs; j++)
354 v->subs[j] = NULL;
355 v->nfa = NULL;
356 v->cm = NULL;
357 v->nlcolor = COLORLESS;
358 v->wordchrs = NULL;
359 v->tree = NULL;
360 v->treechain = NULL;
361 v->treefree = NULL;
362 v->cv = NULL;
363 v->cv2 = NULL;
364 v->mcces = NULL;
365 v->lacons = NULL;
366 v->nlacons = 0;
367 re->re_magic = REMAGIC;
368 re->re_info = 0; /* bits get set during parse */
369 re->re_csize = sizeof(chr);
370 re->re_guts = NULL;
371 re->re_fns = VS(&functions);
372
373 /* more complex setup, malloced things */
374 re->re_guts = VS(MALLOC(sizeof(struct guts)));
375 if (re->re_guts == NULL)
376 return freev(v, REG_ESPACE);
377 g = (struct guts *) re->re_guts;
378 g->tree = NULL;
379 initcm(v, &g->cmap);
380 v->cm = &g->cmap;
381 g->lacons = NULL;
382 g->nlacons = 0;
383 ZAPCNFA(g->search);
384 v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
385 CNOERR();
386 v->cv = newcvec(100, 20, 10);
387 if (v->cv == NULL)
388 return freev(v, REG_ESPACE);
389 i = nmcces(v);
390 if (i > 0)
391 {
392 v->mcces = newcvec(nleaders(v), 0, i);
393 CNOERR();
394 v->mcces = allmcces(v, v->mcces);
395 leaders(v, v->mcces);
396 addmcce(v->mcces, (chr *) NULL, (chr *) NULL); /* dummy */
397 }
398 CNOERR();
399
400 /* parsing */
401 lexstart(v); /* also handles prefixes */
402 if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
403 {
404 /* assign newline a unique color */
405 v->nlcolor = subcolor(v->cm, newline());
406 okcolors(v->nfa, v->cm);
407 }
408 CNOERR();
409 v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
410 assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
411 CNOERR();
412 assert(v->tree != NULL);
413
414 /* finish setup of nfa and its subre tree */
415 specialcolors(v->nfa);
416 CNOERR();
417 #ifdef REG_DEBUG
418 if (debug != NULL)
419 {
420 fprintf(debug, "\n\n\n========= RAW ==========\n");
421 dumpnfa(v->nfa, debug);
422 dumpst(v->tree, debug, 1);
423 }
424 #endif
425 optst(v, v->tree);
426 v->ntree = numst(v->tree, 1);
427 markst(v->tree);
428 cleanst(v);
429 #ifdef REG_DEBUG
430 if (debug != NULL)
431 {
432 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
433 dumpst(v->tree, debug, 1);
434 }
435 #endif
436
437 /* build compacted NFAs for tree and lacons */
438 re->re_info |= nfatree(v, v->tree, debug);
439 CNOERR();
440 assert(v->nlacons == 0 || v->lacons != NULL);
441 for (i = 1; i < v->nlacons; i++)
442 {
443 #ifdef REG_DEBUG
444 if (debug != NULL)
445 fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
446 #endif
447 nfanode(v, &v->lacons[i], debug);
448 }
449 CNOERR();
450 if (v->tree->flags & SHORTER)
451 NOTE(REG_USHORTEST);
452
453 /* build compacted NFAs for tree, lacons, fast search */
454 #ifdef REG_DEBUG
455 if (debug != NULL)
456 fprintf(debug, "\n\n\n========= SEARCH ==========\n");
457 #endif
458 /* can sacrifice main NFA now, so use it as work area */
459 (DISCARD) optimize(v->nfa, debug);
460 CNOERR();
461 makesearch(v, v->nfa);
462 CNOERR();
463 compact(v->nfa, &g->search);
464 CNOERR();
465
466 /* looks okay, package it up */
467 re->re_nsub = v->nsubexp;
468 v->re = NULL; /* freev no longer frees re */
469 g->magic = GUTSMAGIC;
470 g->cflags = v->cflags;
471 g->info = re->re_info;
472 g->nsub = re->re_nsub;
473 g->tree = v->tree;
474 v->tree = NULL;
475 g->ntree = v->ntree;
476 g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
477 g->lacons = v->lacons;
478 v->lacons = NULL;
479 g->nlacons = v->nlacons;
480
481 #ifdef REG_DEBUG
482 if (flags & REG_DUMP)
483 dump(re, stdout);
484 #endif
485
486 assert(v->err == 0);
487 return freev(v, 0);
488 }
489
490 /*
491 * moresubs - enlarge subRE vector
492 */
493 static void
494 moresubs(struct vars * v,
495 int wanted) /* want enough room for this one */
496 {
497 struct subre **p;
498 size_t n;
499
500 assert(wanted > 0 && (size_t) wanted >= v->nsubs);
501 n = (size_t) wanted *3 / 2 + 1;
502
503 if (v->subs == v->sub10)
504 {
505 p = (struct subre **) MALLOC(n * sizeof(struct subre *));
506 if (p != NULL)
507 memcpy(VS(p), VS(v->subs),
508 v->nsubs * sizeof(struct subre *));
509 }
510 else
511 p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
512 if (p == NULL)
513 {
514 ERR(REG_ESPACE);
515 return;
516 }
517 v->subs = p;
518 for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
519 *p = NULL;
520 assert(v->nsubs == n);
521 assert((size_t) wanted < v->nsubs);
522 }
523
524 /*
525 * freev - free vars struct's substructures where necessary
526 *
527 * Optionally does error-number setting, and always returns error code
528 * (if any), to make error-handling code terser.
529 */
530 static int
531 freev(struct vars * v,
532 int err)
533 {
534 if (v->re != NULL)
535 rfree(v->re);
536 if (v->subs != v->sub10)
537 FREE(v->subs);
538 if (v->nfa != NULL)
539 freenfa(v->nfa);
540 if (v->tree != NULL)
541 freesubre(v, v->tree);
542 if (v->treechain != NULL)
543 cleanst(v);
544 if (v->cv != NULL)
545 freecvec(v->cv);
546 if (v->cv2 != NULL)
547 freecvec(v->cv2);
548 if (v->mcces != NULL)
549 freecvec(v->mcces);
550 if (v->lacons != NULL)
551 freelacons(v->lacons, v->nlacons);
552 ERR(err); /* nop if err==0 */
553
554 return v->err;
555 }
556
557 /*
558 * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
559 * NFA must have been optimize()d already.
560 */
561 static void
562 makesearch(struct vars * v,
563 struct nfa * nfa)
564 {
565 struct arc *a;
566 struct arc *b;
567 struct state *pre = nfa->pre;
568 struct state *s;
569 struct state *s2;
570 struct state *slist;
571
572 /* no loops are needed if it's anchored */
573 for (a = pre->outs; a != NULL; a = a->outchain)
574 {
575 assert(a->type == PLAIN);
576 if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
577 break;
578 }
579 if (a != NULL)
580 {
581 /* add implicit .* in front */
582 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
583
584 /* and ^* and \A* too -- not always necessary, but harmless */
585 newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
586 newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
587 }
588
589 /*
590 * Now here's the subtle part. Because many REs have no lookback
591 * constraints, often knowing when you were in the pre state tells you
592 * little; it's the next state(s) that are informative. But some of
593 * them may have other inarcs, i.e. it may be possible to make actual
594 * progress and then return to one of them. We must de-optimize such
595 * cases, splitting each such state into progress and no-progress
596 * states.
597 */
598
599 /* first, make a list of the states */
600 slist = NULL;
601 for (a = pre->outs; a != NULL; a = a->outchain)
602 {
603 s = a->to;
604 for (b = s->ins; b != NULL; b = b->inchain)
605 if (b->from != pre)
606 break;
607 if (b != NULL)
608 { /* must be split */
609 s->tmp = slist;
610 slist = s;
611 }
612 }
613
614 /* do the splits */
615 for (s = slist; s != NULL; s = s2)
616 {
617 s2 = newstate(nfa);
618 copyouts(nfa, s, s2);
619 for (a = s->ins; a != NULL; a = b)
620 {
621 b = a->inchain;
622 if (a->from != pre)
623 {
624 cparc(nfa, a, a->from, s2);
625 freearc(nfa, a);
626 }
627 }
628 s2 = s->tmp;
629 s->tmp = NULL; /* clean up while we're at it */
630 }
631 }
632
633 /*
634 * parse - parse an RE
635 *
636 * This is actually just the top level, which parses a bunch of branches
637 * tied together with '|'. They appear in the tree as the left children
638 * of a chain of '|' subres.
639 */
640 static struct subre *
641 parse(struct vars * v,
642 int stopper, /* EOS or ')' */
643 int type, /* LACON (lookahead subRE) or PLAIN */
644 struct state * init, /* initial state */
645 struct state * final) /* final state */
646 {
647 struct state *left; /* scaffolding for branch */
648 struct state *right;
649 struct subre *branches; /* top level */
650 struct subre *branch; /* current branch */
651 struct subre *t; /* temporary */
652 int firstbranch; /* is this the first branch? */
653
654 assert(stopper == ')' || stopper == EOS);
655
656 branches = subre(v, '|', LONGER, init, final);
657 NOERRN();
658 branch = branches;
659 firstbranch = 1;
660 do
661 { /* a branch */
662 if (!firstbranch)
663 {
664 /* need a place to hang it */
665 branch->right = subre(v, '|', LONGER, init, final);
666 NOERRN();
667 branch = branch->right;
668 }
669 firstbranch = 0;
670 left = newstate(v->nfa);
671 right = newstate(v->nfa);
672 NOERRN();
673 EMPTYARC(init, left);
674 EMPTYARC(right, final);
675 NOERRN();
676 branch->left = parsebranch(v, stopper, type, left, right, 0);
677 NOERRN();
678 branch->flags |= UP(branch->flags | branch->left->flags);
679 if ((branch->flags & ~branches->flags) != 0) /* new flags */
680 for (t = branches; t != branch; t = t->right)
681 t->flags |= branch->flags;
682 } while (EAT('|'));
683 assert(SEE(stopper) || SEE(EOS));
684
685 if (!SEE(stopper))
686 {
687 assert(stopper == ')' && SEE(EOS));
688 ERR(REG_EPAREN);
689 }
690
691 /* optimize out simple cases */
692 if (branch == branches)
693 { /* only one branch */
694 assert(branch->right == NULL);
695 t = branch->left;
696 branch->left = NULL;
697 freesubre(v, branches);
698 branches = t;
699 }
700 else if (!MESSY(branches->flags))
701 { /* no interesting innards */
702 freesubre(v, branches->left);
703 branches->left = NULL;
704 freesubre(v, branches->right);
705 branches->right = NULL;
706 branches->op = '=';
707 }
708
709 return branches;
710 }
711
712 /*
713 * parsebranch - parse one branch of an RE
714 *
715 * This mostly manages concatenation, working closely with parseqatom().
716 * Concatenated things are bundled up as much as possible, with separate
717 * ',' nodes introduced only when necessary due to substructure.
718 */
719 static struct subre *
720 parsebranch(struct vars * v,
721 int stopper, /* EOS or ')' */
722 int type, /* LACON (lookahead subRE) or PLAIN */
723 struct state * left, /* leftmost state */
724 struct state * right, /* rightmost state */
725 int partial) /* is this only part of a branch? */
726 {
727 struct state *lp; /* left end of current construct */
728 int seencontent; /* is there anything in this branch yet? */
729 struct subre *t;
730
731 lp = left;
732 seencontent = 0;
733 t = subre(v, '=', 0, left, right); /* op '=' is tentative */
734 NOERRN();
735 while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
736 {
737 if (seencontent)
738 { /* implicit concat operator */
739 lp = newstate(v->nfa);
740 NOERRN();
741 moveins(v->nfa, right, lp);
742 }
743 seencontent = 1;
744
745 /* NB, recursion in parseqatom() may swallow rest of branch */
746 parseqatom(v, stopper, type, lp, right, t);
747 }
748
749 if (!seencontent)
750 { /* empty branch */
751 if (!partial)
752 NOTE(REG_UUNSPEC);
753 assert(lp == left);
754 EMPTYARC(left, right);
755 }
756
757 return t;
758 }
759
760 /*
761 * parseqatom - parse one quantified atom or constraint of an RE
762 *
763 * The bookkeeping near the end cooperates very closely with parsebranch();
764 * in particular, it contains a recursion that can involve parsing the rest
765 * of the branch, making this function's name somewhat inaccurate.
766 */
767 static void
768 parseqatom(struct vars * v,
769 int stopper, /* EOS or ')' */
770 int type, /* LACON (lookahead subRE) or PLAIN */
771 struct state * lp, /* left state to hang it on */
772 struct state * rp, /* right state to hang it on */
773 struct subre * top) /* subtree top */
774 {
775 struct state *s; /* temporaries for new states */
776 struct state *s2;
777
778 #define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
779 int m,
780 n;
781 struct subre *atom; /* atom's subtree */
782 struct subre *t;
783 int cap; /* capturing parens? */
784 int pos; /* positive lookahead? */
785 int subno; /* capturing-parens or backref number */
786 int atomtype;
787 int qprefer; /* quantifier short/long preference */
788 int f;
789 struct subre **atomp; /* where the pointer to atom is */
790
791 /* initial bookkeeping */
792 atom = NULL;
793 assert(lp->nouts == 0); /* must string new code */
794 assert(rp->nins == 0); /* between lp and rp */
795 subno = 0; /* just to shut lint up */
796
797 /* an atom or constraint... */
798 atomtype = v->nexttype;
799 switch (atomtype)
800 {
801 /* first, constraints, which end by returning */
802 case '^':
803 ARCV('^', 1);
804 if (v->cflags & REG_NLANCH)
805 ARCV(BEHIND, v->nlcolor);
806 NEXT();
807 return;
808 break;
809 case '$':
810 ARCV('$', 1);
811 if (v->cflags & REG_NLANCH)
812 ARCV(AHEAD, v->nlcolor);
813 NEXT();
814 return;
815 break;
816 case SBEGIN:
817 ARCV('^', 1); /* BOL */
818 ARCV('^', 0); /* or BOS */
819 NEXT();
820 return;
821 break;
822 case SEND:
823 ARCV('$', 1); /* EOL */
824 ARCV('$', 0); /* or EOS */
825 NEXT();
826 return;
827 break;
828 case '<':
829 wordchrs(v); /* does NEXT() */
830 s = newstate(v->nfa);
831 NOERR();
832 nonword(v, BEHIND, lp, s);
833 word(v, AHEAD, s, rp);
834 return;
835 break;
836 case '>':
837 wordchrs(v); /* does NEXT() */
838 s = newstate(v->nfa);
839 NOERR();
840 word(v, BEHIND, lp, s);
841 nonword(v, AHEAD, s, rp);
842 return;
843 break;
844 case WBDRY:
845 wordchrs(v); /* does NEXT() */
846 s = newstate(v->nfa);
847 NOERR();
848 nonword(v, BEHIND, lp, s);
849 word(v, AHEAD, s, rp);
850 s = newstate(v->nfa);
851 NOERR();
852 word(v, BEHIND, lp, s);
853 nonword(v, AHEAD, s, rp);
854 return;
855 break;
856 case NWBDRY:
857 wordchrs(v); /* does NEXT() */
858 s = newstate(v->nfa);
859 NOERR();
860 word(v, BEHIND, lp, s);
861 word(v, AHEAD, s, rp);
862 s = newstate(v->nfa);
863 NOERR();
864 nonword(v, BEHIND, lp, s);
865 nonword(v, AHEAD, s, rp);
866 return;
867 break;
868 case LACON: /* lookahead constraint */
869 pos = v->nextvalue;
870 NEXT();
871 s = newstate(v->nfa);
872 s2 = newstate(v->nfa);
873 NOERR();
874 t = parse(v, ')', LACON, s, s2);
875 freesubre(v, t); /* internal structure irrelevant */
876 assert(SEE(')') || ISERR());
877 NEXT();
878 n = newlacon(v, s, s2, pos);
879 NOERR();
880 ARCV(LACON, n);
881 return;
882 break;
883 /* then errors, to get them out of the way */
884 case '*':
885 case '+':
886 case '?':
887 case '{':
888 ERR(REG_BADRPT);
889 return;
890 break;
891 default:
892 ERR(REG_ASSERT);
893 return;
894 break;
895 /* then plain characters, and minor variants on that theme */
896 case ')': /* unbalanced paren */
897 if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
898 {
899 ERR(REG_EPAREN);
900 return;
901 }
902 /* legal in EREs due to specification botch */
903 NOTE(REG_UPBOTCH);
904 /* fallthrough into case PLAIN */
905 case PLAIN:
906 onechr(v, v->nextvalue, lp, rp);
907 okcolors(v->nfa, v->cm);
908 NOERR();
909 NEXT();
910 break;
911 case '[':
912 if (v->nextvalue == 1)
913 bracket(v, lp, rp);
914 else
915 cbracket(v, lp, rp);
916 assert(SEE(']') || ISERR());
917 NEXT();
918 break;
919 case '.':
920 rainbow(v->nfa, v->cm, PLAIN,
921 (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
922 lp, rp);
923 NEXT();
924 break;
925 /* and finally the ugly stuff */
926 case '(': /* value flags as capturing or non */
927 cap = (type == LACON) ? 0 : v->nextvalue;
928 if (cap)
929 {
930 v->nsubexp++;
931 subno = v->nsubexp;
932 if ((size_t) subno >= v->nsubs)
933 moresubs(v, subno);
934 assert((size_t) subno < v->nsubs);
935 }
936 else
937 atomtype = PLAIN; /* something that's not '(' */
938 NEXT();
939 /* need new endpoints because tree will contain pointers */
940 s = newstate(v->nfa);
941 s2 = newstate(v->nfa);
942 NOERR();
943 EMPTYARC(lp, s);
944 EMPTYARC(s2, rp);
945 NOERR();
946 atom = parse(v, ')', PLAIN, s, s2);
947 assert(SEE(')') || ISERR());
948 NEXT();
949 NOERR();
950 if (cap)
951 {
952 v->subs[subno] = atom;
953 t = subre(v, '(', atom->flags | CAP, lp, rp);
954 NOERR();
955 t->subno = subno;
956 t->left = atom;
957 atom = t;
958 }
959 /* postpone everything else pending possible {0} */
960 break;
961 case BACKREF: /* the Feature From The Black Lagoon */
962 INSIST(type != LACON, REG_ESUBREG);
963 INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
964 INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
965 NOERR();
966 assert(v->nextvalue > 0);
967 atom = subre(v, 'b', BACKR, lp, rp);
968 subno = v->nextvalue;
969 atom->subno = subno;
970 EMPTYARC(lp, rp); /* temporarily, so there's something */
971 NEXT();
972 break;
973 }
974
975 /* ...and an atom may be followed by a quantifier */
976 switch (v->nexttype)
977 {
978 case '*':
979 m = 0;
980 n = INFINITY;
981 qprefer = (v->nextvalue) ? LONGER : SHORTER;
982 NEXT();
983 break;
984 case '+':
985 m = 1;
986 n = INFINITY;
987 qprefer = (v->nextvalue) ? LONGER : SHORTER;
988 NEXT();
989 break;
990 case '?':
991 m = 0;
992 n = 1;
993 qprefer = (v->nextvalue) ? LONGER : SHORTER;
994 NEXT();
995 break;
996 case '{':
997 NEXT();
998 m = scannum(v);
999 if (EAT(','))
1000 {
1001 if (SEE(DIGIT))
1002 n = scannum(v);
1003 else
1004 n = INFINITY;
1005 if (m > n)
1006 {
1007 ERR(REG_BADBR);
1008 return;
1009 }
1010 /* {m,n} exercises preference, even if it's {m,m} */
1011 qprefer = (v->nextvalue) ? LONGER : SHORTER;
1012 }
1013 else
1014 {
1015 n = m;
1016 /* {m} passes operand's preference through */
1017 qprefer = 0;
1018 }
1019 if (!SEE('}'))
1020 { /* catches errors too */
1021 ERR(REG_BADBR);
1022 return;
1023 }
1024 NEXT();
1025 break;
1026 default: /* no quantifier */
1027 m = n = 1;
1028 qprefer = 0;
1029 break;
1030 }
1031
1032 /* annoying special case: {0} or {0,0} cancels everything */
1033 if (m == 0 && n == 0)
1034 {
1035 if (atom != NULL)
1036 freesubre(v, atom);
1037 if (atomtype == '(')
1038 v->subs[subno] = NULL;
1039 delsub(v->nfa, lp, rp);
1040 EMPTYARC(lp, rp);
1041 return;
1042 }
1043
1044 /* if not a messy case, avoid hard part */
1045 assert(!MESSY(top->flags));
1046 f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
1047 if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
1048 {
1049 if (!(m == 1 && n == 1))
1050 repeat(v, lp, rp, m, n);
1051 if (atom != NULL)
1052 freesubre(v, atom);
1053 top->flags = f;
1054 return;
1055 }
1056
1057 /*
1058 * hard part: something messy That is, capturing parens, back
1059 * reference, short/long clash, or an atom with substructure
1060 * containing one of those.
1061 */
1062
1063 /* now we'll need a subre for the contents even if they're boring */
1064 if (atom == NULL)
1065 {
1066 atom = subre(v, '=', 0, lp, rp);
1067 NOERR();
1068 }
1069
1070 /*
1071 * prepare a general-purpose state skeleton
1072 *
1073 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /
1074 * [lp] ----> [s2] ----bypass---------------------
1075 *
1076 * where bypass is an empty, and prefix is some repetitions of atom
1077 */
1078 s = newstate(v->nfa); /* first, new endpoints for the atom */
1079 s2 = newstate(v->nfa);
1080 NOERR();
1081 moveouts(v->nfa, lp, s);
1082 moveins(v->nfa, rp, s2);
1083 NOERR();
1084 atom->begin = s;
1085 atom->end = s2;
1086 s = newstate(v->nfa); /* and spots for prefix and bypass */
1087 s2 = newstate(v->nfa);
1088 NOERR();
1089 EMPTYARC(lp, s);
1090 EMPTYARC(lp, s2);
1091 NOERR();
1092
1093 /* break remaining subRE into x{...} and what follows */
1094 t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1095 t->left = atom;
1096 atomp = &t->left;
1097 /* here we should recurse... but we must postpone that to the end */
1098
1099 /* split top into prefix and remaining */
1100 assert(top->op == '=' && top->left == NULL && top->right == NULL);
1101 top->left = subre(v, '=', top->flags, top->begin, lp);
1102 top->op = '.';
1103 top->right = t;
1104
1105 /* if it's a backref, now is the time to replicate the subNFA */
1106 if (atomtype == BACKREF)
1107 {
1108 assert(atom->begin->nouts == 1); /* just the EMPTY */
1109 delsub(v->nfa, atom->begin, atom->end);
1110 assert(v->subs[subno] != NULL);
1111 /* and here's why the recursion got postponed: it must */
1112 /* wait until the skeleton is filled in, because it may */
1113 /* hit a backref that wants to copy the filled-in skeleton */
1114 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1115 atom->begin, atom->end);
1116 NOERR();
1117 }
1118
1119 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1120 if (m == 0)
1121 {
1122 EMPTYARC(s2, atom->end); /* the bypass */
1123 assert(PREF(qprefer) != 0);
1124 f = COMBINE(qprefer, atom->flags);
1125 t = subre(v, '|', f, lp, atom->end);
1126 NOERR();
1127 t->left = atom;
1128 t->right = subre(v, '|', PREF(f), s2, atom->end);
1129 NOERR();
1130 t->right->left = subre(v, '=', 0, s2, atom->end);
1131 NOERR();
1132 *atomp = t;
1133 atomp = &t->left;
1134 m = 1;
1135 }
1136
1137 /* deal with the rest of the quantifier */
1138 if (atomtype == BACKREF)
1139 {
1140 /* special case: backrefs have internal quantifiers */
1141 EMPTYARC(s, atom->begin); /* empty prefix */
1142 /* just stuff everything into atom */
1143 repeat(v, atom->begin, atom->end, m, n);
1144 atom->min = (short) m;
1145 atom->max = (short) n;
1146 atom->flags |= COMBINE(qprefer, atom->flags);
1147 }
1148 else if (m == 1 && n == 1)
1149 {
1150 /* no/vacuous quantifier: done */
1151 EMPTYARC(s, atom->begin); /* empty prefix */
1152 }
1153 else
1154 {
1155 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1156 /* parens in only second x */
1157 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1158 assert(m >= 1 && m != INFINITY && n >= 1);
1159 repeat(v, s, atom->begin, m - 1, (n == INFINITY) ? n : n - 1);
1160 f = COMBINE(qprefer, atom->flags);
1161 t = subre(v, '.', f, s, atom->end); /* prefix and atom */
1162 NOERR();
1163 t->left = subre(v, '=', PREF(f), s, atom->begin);
1164 NOERR();
1165 t->right = atom;
1166 *atomp = t;
1167 }
1168
1169 /* and finally, look after that postponed recursion */
1170 t = top->right;
1171 if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1172 t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1173 else
1174 {
1175 EMPTYARC(atom->end, rp);
1176 t->right = subre(v, '=', 0, atom->end, rp);
1177 }
1178 assert(SEE('|') || SEE(stopper) || SEE(EOS));
1179 t->flags |= COMBINE(t->flags, t->right->flags);
1180 top->flags |= COMBINE(top->flags, t->flags);
1181 }
1182
1183 /*
1184 * nonword - generate arcs for non-word-character ahead or behind
1185 */
1186 static void
1187 nonword(struct vars * v,
1188 int dir, /* AHEAD or BEHIND */
1189 struct state * lp,
1190 struct state * rp)
1191 {
1192 int anchor = (dir == AHEAD) ? '$' : '^';
1193
1194 assert(dir == AHEAD || dir == BEHIND);
1195 newarc(v->nfa, anchor, 1, lp, rp);
1196 newarc(v->nfa, anchor, 0, lp, rp);
1197 colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1198 /* (no need for special attention to \n) */
1199 }
1200
1201 /*
1202 * word - generate arcs for word character ahead or behind
1203 */
1204 static void
1205 word(struct vars * v,
1206 int dir, /* AHEAD or BEHIND */
1207 struct state * lp,
1208 struct state * rp)
1209 {
1210 assert(dir == AHEAD || dir == BEHIND);
1211 cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1212 /* (no need for special attention to \n) */
1213 }
1214
1215 /*
1216 * scannum - scan a number
1217 */
1218 static int /* value, <= DUPMAX */
1219 scannum(struct vars * v)
1220 {
1221 int n = 0;
1222
1223 while (SEE(DIGIT) && n < DUPMAX)
1224 {
1225 n = n * 10 + v->nextvalue;
1226 NEXT();
1227 }
1228 if (SEE(DIGIT) || n > DUPMAX)
1229 {
1230 ERR(REG_BADBR);
1231 return 0;
1232 }
1233 return n;
1234 }
1235
1236 /*
1237 * repeat - replicate subNFA for quantifiers
1238 *
1239 * The duplication sequences used here are chosen carefully so that any
1240 * pointers starting out pointing into the subexpression end up pointing into
1241 * the last occurrence. (Note that it may not be strung between the same
1242 * left and right end states, however!) This used to be important for the
1243 * subRE tree, although the important bits are now handled by the in-line
1244 * code in parse(), and when this is called, it doesn't matter any more.
1245 */
1246 static void
1247 repeat(struct vars * v,
1248 struct state * lp,
1249 struct state * rp,
1250 int m,
1251 int n)
1252 {
1253 #define SOME 2
1254 #define INF 3
1255 #define PAIR(x, y) ((x)*4 + (y))
1256 #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1257 const int rm = REDUCE(m);
1258 const int rn = REDUCE(n);
1259 struct state *s;
1260 struct state *s2;
1261
1262 switch (PAIR(rm, rn))
1263 {
1264 case PAIR(0, 0): /* empty string */
1265 delsub(v->nfa, lp, rp);
1266 EMPTYARC(lp, rp);
1267 break;
1268 case PAIR(0, 1): /* do as x| */
1269 EMPTYARC(lp, rp);
1270 break;
1271 case PAIR(0, SOME): /* do as x{1,n}| */
1272 repeat(v, lp, rp, 1, n);
1273 NOERR();
1274 EMPTYARC(lp, rp);
1275 break;
1276 case PAIR(0, INF): /* loop x around */
1277 s = newstate(v->nfa);
1278 NOERR();
1279 moveouts(v->nfa, lp, s);
1280 moveins(v->nfa, rp, s);
1281 EMPTYARC(lp, s);
1282 EMPTYARC(s, rp);
1283 break;
1284 case PAIR(1, 1): /* no action required */
1285 break;
1286 case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1287 s = newstate(v->nfa);
1288 NOERR();
1289 moveouts(v->nfa, lp, s);
1290 dupnfa(v->nfa, s, rp, lp, s);
1291 NOERR();
1292 repeat(v, lp, s, 1, n - 1);
1293 NOERR();
1294 EMPTYARC(lp, s);
1295 break;
1296 case PAIR(1, INF): /* add loopback arc */
1297 s = newstate(v->nfa);
1298 s2 = newstate(v->nfa);
1299 NOERR();
1300 moveouts(v->nfa, lp, s);
1301 moveins(v->nfa, rp, s2);
1302 EMPTYARC(lp, s);
1303 EMPTYARC(s2, rp);
1304 EMPTYARC(s2, s);
1305 break;
1306 case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1307 s = newstate(v->nfa);
1308 NOERR();
1309 moveouts(v->nfa, lp, s);
1310 dupnfa(v->nfa, s, rp, lp, s);
1311 NOERR();
1312 repeat(v, lp, s, m - 1, n - 1);
1313 break;
1314 case PAIR(SOME, INF): /* do as x{m-1,}x */
1315 s = newstate(v->nfa);
1316 NOERR();
1317 moveouts(v->nfa, lp, s);
1318 dupnfa(v->nfa, s, rp, lp, s);
1319 NOERR();
1320 repeat(v, lp, s, m - 1, n);
1321 break;
1322 default:
1323 ERR(REG_ASSERT);
1324 break;
1325 }
1326 }
1327
1328 /*
1329 * bracket - handle non-complemented bracket expression
1330 * Also called from cbracket for complemented bracket expressions.
1331 */
1332 static void
1333 bracket(struct vars * v,
1334 struct state * lp,
1335 struct state * rp)
1336 {
1337 assert(SEE('['));
1338 NEXT();
1339 while (!SEE(']') && !SEE(EOS))
1340 brackpart(v, lp, rp);
1341 assert(SEE(']') || ISERR());
1342 okcolors(v->nfa, v->cm);
1343 }
1344
1345 /*
1346 * cbracket - handle complemented bracket expression
1347 * We do it by calling bracket() with dummy endpoints, and then complementing
1348 * the result. The alternative would be to invoke rainbow(), and then delete
1349 * arcs as the b.e. is seen... but that gets messy.
1350 */
1351 static void
1352 cbracket(struct vars * v,
1353 struct state * lp,
1354 struct state * rp)
1355 {
1356 struct state *left = newstate(v->nfa);
1357 struct state *right = newstate(v->nfa);
1358 struct state *s;
1359 struct arc *a; /* arc from lp */
1360 struct arc *ba; /* arc from left, from bracket() */
1361 struct arc *pa; /* MCCE-prototype arc */
1362 color co;
1363 chr *p;
1364 int i;
1365
1366 NOERR();
1367 bracket(v, left, right);
1368 if (v->cflags & REG_NLSTOP)
1369 newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1370 NOERR();
1371
1372 assert(lp->nouts == 0); /* all outarcs will be ours */
1373
1374 /* easy part of complementing */
1375 colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1376 NOERR();
1377 if (v->mcces == NULL)
1378 { /* no MCCEs -- we're done */
1379 dropstate(v->nfa, left);
1380 assert(right->nins == 0);
1381 freestate(v->nfa, right);
1382 return;
1383 }
1384
1385 /* but complementing gets messy in the presence of MCCEs... */
1386 NOTE(REG_ULOCALE);
1387 for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--)
1388 {
1389 co = GETCOLOR(v->cm, *p);
1390 a = findarc(lp, PLAIN, co);
1391 ba = findarc(left, PLAIN, co);
1392 if (ba == NULL)
1393 {
1394 assert(a != NULL);
1395 freearc(v->nfa, a);
1396 }
1397 else
1398 assert(a == NULL);
1399 s = newstate(v->nfa);
1400 NOERR();
1401 newarc(v->nfa, PLAIN, co, lp, s);
1402 NOERR();
1403 pa = findarc(v->mccepbegin, PLAIN, co);
1404 assert(pa != NULL);
1405 if (ba == NULL)
1406 { /* easy case, need all of them */
1407 cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1408 newarc(v->nfa, '$', 1, s, rp);
1409 newarc(v->nfa, '$', 0, s, rp);
1410 colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
1411 }
1412 else
1413 { /* must be selective */
1414 if (findarc(ba->to, '$', 1) == NULL)
1415 {
1416 newarc(v->nfa, '$', 1, s, rp);
1417 newarc(v->nfa, '$', 0, s, rp);
1418 colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
1419 s, rp);
1420 }
1421 for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1422 if (findarc(ba->to, PLAIN, pa->co) == NULL)
1423 newarc(v->nfa, PLAIN, pa->co, s, rp);
1424 if (s->nouts == 0) /* limit of selectivity: none */
1425 dropstate(v->nfa, s); /* frees arc too */
1426 }
1427 NOERR();
1428 }
1429
1430 delsub(v->nfa, left, right);
1431 assert(left->nouts == 0);
1432 freestate(v->nfa, left);
1433 assert(right->nins == 0);
1434 freestate(v->nfa, right);
1435 }
1436
1437 /*
1438 * brackpart - handle one item (or range) within a bracket expression
1439 */
1440 static void
1441 brackpart(struct vars * v,
1442 struct state * lp,
1443 struct state * rp)
1444 {
1445 celt startc;
1446 celt endc;
1447 struct cvec *cv;
1448 chr *startp;
1449 chr *endp;
1450 chr c[1];
1451
1452 /* parse something, get rid of special cases, take shortcuts */
1453 switch (v->nexttype)
1454 {
1455 case RANGE: /* a-b-c or other botch */
1456 ERR(REG_ERANGE);
1457 return;
1458 break;
1459 case PLAIN:
1460 c[0] = v->nextvalue;
1461 NEXT();
1462 /* shortcut for ordinary chr (not range, not MCCE leader) */
1463 if (!SEE(RANGE) && !ISCELEADER(v, c[0]))
1464 {
1465 onechr(v, c[0], lp, rp);
1466 return;
1467 }
1468 startc = element(v, c, c + 1);
1469 NOERR();
1470 break;
1471 case COLLEL:
1472 startp = v->now;
1473 endp = scanplain(v);
1474 INSIST(startp < endp, REG_ECOLLATE);
1475 NOERR();
1476 startc = element(v, startp, endp);
1477 NOERR();
1478 break;
1479 case ECLASS:
1480 startp = v->now;
1481 endp = scanplain(v);
1482 INSIST(startp < endp, REG_ECOLLATE);
1483 NOERR();
1484 startc = element(v, startp, endp);
1485 NOERR();
1486 cv = eclass(v, startc, (v->cflags & REG_ICASE));
1487 NOERR();
1488 dovec(v, cv, lp, rp);
1489 return;
1490 break;
1491 case CCLASS:
1492 startp = v->now;
1493 endp = scanplain(v);
1494 INSIST(startp < endp, REG_ECTYPE);
1495 NOERR();
1496 cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
1497 NOERR();
1498 dovec(v, cv, lp, rp);
1499 return;
1500 break;
1501 default:
1502 ERR(REG_ASSERT);
1503 return;
1504 break;
1505 }
1506
1507 if (SEE(RANGE))
1508 {
1509 NEXT();
1510 switch (v->nexttype)
1511 {
1512 case PLAIN:
1513 case RANGE:
1514 c[0] = v->nextvalue;
1515 NEXT();
1516 endc = element(v, c, c + 1);
1517 NOERR();
1518 break;
1519 case COLLEL:
1520 startp = v->now;
1521 endp = scanplain(v);
1522 INSIST(startp < endp, REG_ECOLLATE);
1523 NOERR();
1524 endc = element(v, startp, endp);
1525 NOERR();
1526 break;
1527 default:
1528 ERR(REG_ERANGE);
1529 return;
1530 break;
1531 }
1532 }
1533 else
1534 endc = startc;
1535
1536 /*
1537 * Ranges are unportable. Actually, standard C does guarantee that
1538 * digits are contiguous, but making that an exception is just too
1539 * complicated.
1540 */
1541 if (startc != endc)
1542 NOTE(REG_UUNPORT);
1543 cv = range(v, startc, endc, (v->cflags & REG_ICASE));
1544 NOERR();
1545 dovec(v, cv, lp, rp);
1546 }
1547
1548 /*
1549 * scanplain - scan PLAIN contents of [. etc.
1550 *
1551 * Certain bits of trickery in lex.c know that this code does not try
1552 * to look past the final bracket of the [. etc.
1553 */
1554 static chr * /* just after end of sequence */
1555 scanplain(struct vars * v)
1556 {
1557 chr *endp;
1558
1559 assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1560 NEXT();
1561
1562 endp = v->now;
1563 while (SEE(PLAIN))
1564 {
1565 endp = v->now;
1566 NEXT();
1567 }
1568
1569 assert(SEE(END) || ISERR());
1570 NEXT();
1571
1572 return endp;
1573 }
1574
1575 /*
1576 * leaders - process a cvec of collating elements to also include leaders
1577 * Also gives all characters involved their own colors, which is almost
1578 * certainly necessary, and sets up little disconnected subNFA.
1579 */
1580 static void
1581 leaders(struct vars * v,
1582 struct cvec * cv)
1583 {
1584 int mcce;
1585 chr *p;
1586 chr leader;
1587 struct state *s;
1588 struct arc *a;
1589
1590 v->mccepbegin = newstate(v->nfa);
1591 v->mccepend = newstate(v->nfa);
1592 NOERR();
1593
1594 for (mcce = 0; mcce < cv->nmcces; mcce++)
1595 {
1596 p = cv->mcces[mcce];
1597 leader = *p;
1598 if (!haschr(cv, leader))
1599 {
1600 addchr(cv, leader);
1601 s = newstate(v->nfa);
1602 newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
1603 v->mccepbegin, s);
1604 okcolors(v->nfa, v->cm);
1605 }
1606 else
1607 {
1608 a = findarc(v->mccepbegin, PLAIN,
1609 GETCOLOR(v->cm, leader));
1610 assert(a != NULL);
1611 s = a->to;
1612 assert(s != v->mccepend);
1613 }
1614 p++;
1615 assert(*p != 0 && *(p + 1) == 0); /* only 2-char MCCEs for
1616 * now */
1617 newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1618 okcolors(v->nfa, v->cm);
1619 }
1620 }
1621
1622 /*
1623 * onechr - fill in arcs for a plain character, and possible case complements
1624 * This is mostly a shortcut for efficient handling of the common case.
1625 */
1626 static void
1627 onechr(struct vars * v,
1628 chr c,
1629 struct state * lp,
1630 struct state * rp)
1631 {
1632 if (!(v->cflags & REG_ICASE))
1633 {
1634 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1635 return;
1636 }
1637
1638 /* rats, need general case anyway... */
1639 dovec(v, allcases(v, c), lp, rp);
1640 }
1641
1642 /*
1643 * dovec - fill in arcs for each element of a cvec
1644 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1645 */
1646 static void
1647 dovec(struct vars * v,
1648 struct cvec * cv,
1649 struct state * lp,
1650 struct state * rp)
1651 {
1652 chr ch,
1653 from,
1654 to;
1655 celt ce;
1656 chr *p;
1657 int i;
1658 color co;
1659 struct cvec *leads;
1660 struct arc *a;
1661 struct arc *pa; /* arc in prototype */
1662 struct state *s;
1663 struct state *ps; /* state in prototype */
1664
1665 /* need a place to store leaders, if any */
1666 if (nmcces(v) > 0)
1667 {
1668 assert(v->mcces != NULL);
1669 if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs)
1670 {
1671 if (v->cv2 != NULL)
1672 free(v->cv2);
1673 v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1674 NOERR();
1675 leads = v->cv2;
1676 }
1677 else
1678 leads = clearcvec(v->cv2);
1679 }
1680 else
1681 leads = NULL;
1682
1683 /* first, get the ordinary characters out of the way */
1684 for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--)
1685 {
1686 ch = *p;
1687 if (!ISCELEADER(v, ch))
1688 newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1689 else
1690 {
1691 assert(singleton(v->cm, ch));
1692 assert(leads != NULL);
1693 if (!haschr(leads, ch))
1694 addchr(leads, ch);
1695 }
1696 }
1697
1698 /* and the ranges */
1699 for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--)
1700 {
1701 from = *p;
1702 to = *(p + 1);
1703 while (from <= to && (ce = nextleader(v, from, to)) != NOCELT)
1704 {
1705 if (from < ce)
1706 subrange(v, from, ce - 1, lp, rp);
1707 assert(singleton(v->cm, ce));
1708 assert(leads != NULL);
1709 if (!haschr(leads, ce))
1710 addchr(leads, ce);
1711 from = ce + 1;
1712 }
1713 if (from <= to)
1714 subrange(v, from, to, lp, rp);
1715 }
1716
1717 if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1718 return;
1719
1720 /* deal with the MCCE leaders */
1721 NOTE(REG_ULOCALE);
1722 for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--)
1723 {
1724 co = GETCOLOR(v->cm, *p);
1725 a = findarc(lp, PLAIN, co);
1726 if (a != NULL)
1727 s = a->to;
1728 else
1729 {
1730 s = newstate(v->nfa);
1731 NOERR();
1732 newarc(v->nfa, PLAIN, co, lp, s);
1733 NOERR();
1734 }
1735 pa = findarc(v->mccepbegin, PLAIN, co);
1736 assert(pa != NULL);
1737 ps = pa->to;
1738 newarc(v->nfa, '$', 1, s, rp);
1739 newarc(v->nfa, '$', 0, s, rp);
1740 colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1741 NOERR();
1742 }
1743
1744 /* and the MCCEs */
1745 for (i = 0; i < cv->nmcces; i++)
1746 {
1747 p = cv->mcces[i];
1748 assert(singleton(v->cm, *p));
1749 if (!singleton(v->cm, *p))
1750 {
1751 ERR(REG_ASSERT);
1752 return;
1753 }
1754 ch = *p++;
1755 co = GETCOLOR(v->cm, ch);
1756 a = findarc(lp, PLAIN, co);
1757 if (a != NULL)
1758 s = a->to;
1759 else
1760 {
1761 s = newstate(v->nfa);
1762 NOERR();
1763 newarc(v->nfa, PLAIN, co, lp, s);
1764 NOERR();
1765 }
1766 assert(*p != 0); /* at least two chars */
1767 assert(singleton(v->cm, *p));
1768 ch = *p++;
1769 co = GETCOLOR(v->cm, ch);
1770 assert(*p == 0); /* and only two, for now */
1771 newarc(v->nfa, PLAIN, co, s, rp);
1772 NOERR();
1773 }
1774 }
1775
1776 /*
1777 * nextleader - find next MCCE leader within range
1778 */
1779 static celt /* NOCELT means none */
1780 nextleader(struct vars * v,
1781 chr from,
1782 chr to)
1783 {
1784 int i;
1785 chr *p;
1786 chr ch;
1787 celt it = NOCELT;
1788
1789 if (v->mcces == NULL)
1790 return it;
1791
1792 for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++)
1793 {
1794 ch = *p;
1795 if (from <= ch && ch <= to)
1796 if (it == NOCELT || ch < it)
1797 it = ch;
1798 }
1799 return it;
1800 }
1801
1802 /*
1803 * wordchrs - set up word-chr list for word-boundary stuff, if needed
1804 *
1805 * The list is kept as a bunch of arcs between two dummy states; it's
1806 * disposed of by the unreachable-states sweep in NFA optimization.
1807 * Does NEXT(). Must not be called from any unusual lexical context.
1808 * This should be reconciled with the \w etc. handling in lex.c, and
1809 * should be cleaned up to reduce dependencies on input scanning.
1810 */
1811 static void
1812 wordchrs(struct vars * v)
1813 {
1814 struct state *left;
1815 struct state *right;
1816
1817 if (v->wordchrs != NULL)
1818 {
1819 NEXT(); /* for consistency */
1820 return;
1821 }
1822
1823 left = newstate(v->nfa);
1824 right = newstate(v->nfa);
1825 NOERR();
1826 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1827 lexword(v);
1828 NEXT();
1829 assert(v->savenow != NULL && SEE('['));
1830 bracket(v, left, right);
1831 assert((v->savenow != NULL && SEE(']')) || ISERR());
1832 NEXT();
1833 NOERR();
1834 v->wordchrs = left;
1835 }
1836
1837 /*
1838 * subre - allocate a subre
1839 */
1840 static struct subre *
1841 subre(struct vars * v,
1842 int op,
1843 int flags,
1844 struct state * begin,
1845 struct state * end)
1846 {
1847 struct subre *ret;
1848
1849 ret = v->treefree;
1850 if (ret != NULL)
1851 v->treefree = ret->left;
1852 else
1853 {
1854 ret = (struct subre *) MALLOC(sizeof(struct subre));
1855 if (ret == NULL)
1856 {
1857 ERR(REG_ESPACE);
1858 return NULL;
1859 }
1860 ret->chain = v->treechain;
1861 v->treechain = ret;
1862 }
1863
1864 assert(strchr("|.b(=", op) != NULL);
1865
1866 ret->op = op;
1867 ret->flags = flags;
1868 ret->retry = 0;
1869 ret->subno = 0;
1870 ret->min = ret->max = 1;
1871 ret->left = NULL;
1872 ret->right = NULL;
1873 ret->begin = begin;
1874 ret->end = end;
1875 ZAPCNFA(ret->cnfa);
1876
1877 return ret;
1878 }
1879
1880 /*
1881 * freesubre - free a subRE subtree
1882 */
1883 static void
1884 freesubre(struct vars * v, /* might be NULL */
1885 struct subre * sr)
1886 {
1887 if (sr == NULL)
1888 return;
1889
1890 if (sr->left != NULL)
1891 freesubre(v, sr->left);
1892 if (sr->right != NULL)
1893 freesubre(v, sr->right);
1894
1895 freesrnode(v, sr);
1896 }
1897
1898 /*
1899 * freesrnode - free one node in a subRE subtree
1900 */
1901 static void
1902 freesrnode(struct vars * v, /* might be NULL */
1903 struct subre * sr)
1904 {
1905 if (sr == NULL)
1906 return;
1907
1908 if (!NULLCNFA(sr->cnfa))
1909 freecnfa(&sr->cnfa);
1910 sr->flags = 0;
1911
1912 if (v != NULL)
1913 {
1914 sr->left = v->treefree;
1915 v->treefree = sr;
1916 }
1917 else
1918 FREE(sr);
1919 }
1920
1921 /*
1922 * optst - optimize a subRE subtree
1923 */
1924 static void
1925 optst(struct vars * v,
1926 struct subre * t)
1927 {
1928 if (t == NULL)
1929 return;
1930
1931 /* recurse through children */
1932 if (t->left != NULL)
1933 optst(v, t->left);
1934 if (t->right != NULL)
1935 optst(v, t->right);
1936 }
1937
1938 /*
1939 * numst - number tree nodes (assigning retry indexes)
1940 */
1941 static int /* next number */
1942 numst(struct subre * t,
1943 int start) /* starting point for subtree numbers */
1944 {
1945 int i;
1946
1947 assert(t != NULL);
1948
1949 i = start;
1950 t->retry = (short) i++;
1951 if (t->left != NULL)
1952 i = numst(t->left, i);
1953 if (t->right != NULL)
1954 i = numst(t->right, i);
1955 return i;
1956 }
1957
1958 /*
1959 * markst - mark tree nodes as INUSE
1960 */
1961 static void
1962 markst(struct subre * t)
1963 {
1964 assert(t != NULL);
1965
1966 t->flags |= INUSE;
1967 if (t->left != NULL)
1968 markst(t->left);
1969 if (t->right != NULL)
1970 markst(t->right);
1971 }
1972
1973 /*
1974 * cleanst - free any tree nodes not marked INUSE
1975 */
1976 static void
1977 cleanst(struct vars * v)
1978 {
1979 struct subre *t;
1980 struct subre *next;
1981
1982 for (t = v->treechain; t != NULL; t = next)
1983 {
1984 next = t->chain;
1985 if (!(t->flags & INUSE))
1986 FREE(t);
1987 }
1988 v->treechain = NULL;
1989 v->treefree = NULL; /* just on general principles */
1990 }
1991
1992 /*
1993 * nfatree - turn a subRE subtree into a tree of compacted NFAs
1994 */
1995 static long /* optimize results from top node */
1996 nfatree(struct vars * v,
1997 struct subre * t,
1998 FILE *f) /* for debug output */
1999 {
2000 assert(t != NULL && t->begin != NULL);
2001
2002 if (t->left != NULL)
2003 (DISCARD) nfatree(v, t->left, f);
2004 if (t->right != NULL)
2005 (DISCARD) nfatree(v, t->right, f);
2006
2007 return nfanode(v, t, f);
2008 }
2009
2010 /*
2011 * nfanode - do one NFA for nfatree
2012 */
2013 static long /* optimize results */
2014 nfanode(struct vars * v,
2015 struct subre * t,
2016 FILE *f) /* for debug output */
2017 {
2018 struct nfa *nfa;
2019 long ret = 0;
2020
2021 assert(t->begin != NULL);
2022
2023 #ifdef REG_DEBUG
2024 if (f != NULL)
2025 {
2026 char idbuf[50];
2027
2028 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
2029 stid(t, idbuf, sizeof(idbuf)));
2030 }
2031 #endif
2032 nfa = newnfa(v, v->cm, v->nfa);
2033 NOERRZ();
2034 dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
2035 if (!ISERR())
2036 {
2037 specialcolors(nfa);
2038 ret = optimize(nfa, f);
2039 }
2040 if (!ISERR())
2041 compact(nfa, &t->cnfa);
2042
2043 freenfa(nfa);
2044 return ret;
2045 }
2046
2047 /*
2048 * newlacon - allocate a lookahead-constraint subRE
2049 */
2050 static int /* lacon number */
2051 newlacon(struct vars * v,
2052 struct state * begin,
2053 struct state * end,
2054 int pos)
2055 {
2056 int n;
2057 struct subre *sub;
2058
2059 if (v->nlacons == 0)
2060 {
2061 v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
2062 n = 1; /* skip 0th */
2063 v->nlacons = 2;
2064 }
2065 else
2066 {
2067 v->lacons = (struct subre *) REALLOC(v->lacons,
2068 (v->nlacons + 1) * sizeof(struct subre));
2069 n = v->nlacons++;
2070 }
2071 if (v->lacons == NULL)
2072 {
2073 ERR(REG_ESPACE);
2074 return 0;
2075 }
2076 sub = &v->lacons[n];
2077 sub->begin = begin;
2078 sub->end = end;
2079 sub->subno = pos;
2080 ZAPCNFA(sub->cnfa);
2081 return n;
2082 }
2083
2084 /*
2085 * freelacons - free lookahead-constraint subRE vector
2086 */
2087 static void
2088 freelacons(struct subre * subs,
2089 int n)
2090 {
2091 struct subre *sub;
2092 int i;
2093
2094 assert(n > 0);
2095 for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
2096 if (!NULLCNFA(sub->cnfa))
2097 freecnfa(&sub->cnfa);
2098 FREE(subs);
2099 }
2100
2101 /*
2102 * rfree - free a whole RE (insides of regfree)
2103 */
2104 static void
2105 rfree(regex_t *re)
2106 {
2107 struct guts *g;
2108
2109 if (re == NULL || re->re_magic != REMAGIC)
2110 return;
2111
2112 re->re_magic = 0; /* invalidate RE */
2113 g = (struct guts *) re->re_guts;
2114 re->re_guts = NULL;
2115 re->re_fns = NULL;
2116 g->magic = 0;
2117 freecm(&g->cmap);
2118 if (g->tree != NULL)
2119 freesubre((struct vars *) NULL, g->tree);
2120 if (g->lacons != NULL)
2121 freelacons(g->lacons, g->nlacons);
2122 if (!NULLCNFA(g->search))
2123 freecnfa(&g->search);
2124 FREE(g);
2125 }
2126
2127 #ifdef REG_DEBUG
2128
2129 /*
2130 * dump - dump an RE in human-readable form
2131 */
2132 static void
2133 dump(regex_t *re,
2134 FILE *f)
2135 {
2136 struct guts *g;
2137 int i;
2138
2139 if (re->re_magic != REMAGIC)
2140 fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2141 REMAGIC);
2142 if (re->re_guts == NULL)
2143 {
2144 fprintf(f, "NULL guts!!!\n");
2145 return;
2146 }
2147 g = (struct guts *) re->re_guts;
2148 if (g->magic != GUTSMAGIC)
2149 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2150 GUTSMAGIC);
2151
2152 fprintf(f, "\n\n\n========= DUMP ==========\n");
2153 fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2154 re->re_nsub, re->re_info, re->re_csize, g->ntree);
2155
2156 dumpcolors(&g->cmap, f);
2157 if (!NULLCNFA(g->search))
2158 {
2159 printf("\nsearch:\n");
2160 dumpcnfa(&g->search, f);
2161 }
2162 for (i = 1; i < g->nlacons; i++)
2163 {
2164 fprintf(f, "\nla%d (%s):\n", i,
2165 (g->lacons[i].subno) ? "positive" : "negative");
2166 dumpcnfa(&g->lacons[i].cnfa, f);
2167 }
2168 fprintf(f, "\n");
2169 dumpst(g->tree, f, 0);
2170 }
2171
2172 /*
2173 * dumpst - dump a subRE tree
2174 */
2175 static void
2176 dumpst(struct subre * t,
2177 FILE *f,
2178 int nfapresent) /* is the original NFA still around? */
2179 {
2180 if (t == NULL)
2181 fprintf(f, "null tree\n");
2182 else
2183 stdump(t, f, nfapresent);
2184 fflush(f);
2185 }
2186
2187 /*
2188 * stdump - recursive guts of dumpst
2189 */
2190 static void
2191 stdump(struct subre * t,
2192 FILE *f,
2193 int nfapresent) /* is the original NFA still around? */
2194 {
2195 char idbuf[50];
2196
2197 fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2198 if (t->flags & LONGER)
2199 fprintf(f, " longest");
2200 if (t->flags & SHORTER)
2201 fprintf(f, " shortest");
2202 if (t->flags & MIXED)
2203 fprintf(f, " hasmixed");
2204 if (t->flags & CAP)
2205 fprintf(f, " hascapture");
2206 if (t->flags & BACKR)
2207 fprintf(f, " hasbackref");
2208 if (!(t->flags & INUSE))
2209 fprintf(f, " UNUSED");
2210 if (t->subno != 0)
2211 fprintf(f, " (#%d)", t->subno);
2212 if (t->min != 1 || t->max != 1)
2213 {
2214 fprintf(f, " {%d,", t->min);
2215 if (t->max != INFINITY)
2216 fprintf(f, "%d", t->max);
2217 fprintf(f, "}");
2218 }
2219 if (nfapresent)
2220 fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
2221 if (t->left != NULL)
2222 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2223 if (t->right != NULL)
2224 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2225 if (!NULLCNFA(t->cnfa))
2226 {
2227 fprintf(f, "\n");
2228 dumpcnfa(&t->cnfa, f);
2229 fprintf(f, "\n");
2230 }
2231 if (t->left != NULL)
2232 stdump(t->left, f, nfapresent);
2233 if (t->right != NULL)
2234 stdump(t->right, f, nfapresent);
2235 }
2236
2237 /*
2238 * stid - identify a subtree node for dumping
2239 */
2240 static char * /* points to buf or constant string */
2241 stid(struct subre * t,
2242 char *buf,
2243 size_t bufsize)
2244 {
2245 /* big enough for hex int or decimal t->retry? */
2246 if (bufsize < sizeof(int) * 2 + 3 || bufsize < sizeof(t->retry) * 3 + 1)
2247 return "unable";
2248 if (t->retry != 0)
2249 sprintf(buf, "%d", t->retry);
2250 else
2251 sprintf(buf, "0x%x", (int) t); /* may lose bits, that's okay */
2252 return buf;
2253 }
2254 #endif /* REG_DEBUG */
2255
2256
2257 #include "regc_lex.c"
2258 #include "regc_color.c"
2259 #include "regc_nfa.c"
2260 #include "regc_cvec.c"
2261 #include "regc_locale.c"