]> git.saurik.com Git - wxWidgets.git/blob - src/regex/regcomp.c
wxGetInstance must be defined as extern C (patch 1085837)
[wxWidgets.git] / src / regex / regcomp.c
1 /*
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33 #include "regguts.h"
34
35 /*
36 * forward declarations, up here so forward datatypes etc. are defined early
37 */
38 /* =====^!^===== begin forwards =====^!^===== */
39 /* automatically gathered by fwd; do not hand-edit */
40 /* === regcomp.c === */
41 int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
42 static VOID moresubs _ANSI_ARGS_((struct vars *, int));
43 static int freev _ANSI_ARGS_((struct vars *, int));
44 static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
45 static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
46 static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
47 static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
48 static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
49 static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
50 static int scannum _ANSI_ARGS_((struct vars *));
51 static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
52 static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
53 static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
54 static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
55 static chr *scanplain _ANSI_ARGS_((struct vars *));
56 static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
57 static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
58 static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
59 static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
60 static VOID wordchrs _ANSI_ARGS_((struct vars *));
61 static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
62 static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
63 static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
64 static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
65 static int numst _ANSI_ARGS_((struct subre *, int));
66 static VOID markst _ANSI_ARGS_((struct subre *));
67 static VOID cleanst _ANSI_ARGS_((struct vars *));
68 static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
69 static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
70 static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
71 static VOID freelacons _ANSI_ARGS_((struct subre *, int));
72 static VOID rfree _ANSI_ARGS_((regex_t *));
73 static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
74 static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
75 static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
76 static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
77 /* === regc_lex.c === */
78 static VOID lexstart _ANSI_ARGS_((struct vars *));
79 static VOID prefixes _ANSI_ARGS_((struct vars *));
80 static VOID lexnest _ANSI_ARGS_((struct vars *, chr *, chr *));
81 static VOID lexword _ANSI_ARGS_((struct vars *));
82 static int next _ANSI_ARGS_((struct vars *));
83 static int lexescape _ANSI_ARGS_((struct vars *));
84 static chr lexdigits _ANSI_ARGS_((struct vars *, int, int, int));
85 static int brenext _ANSI_ARGS_((struct vars *, pchr));
86 static VOID skip _ANSI_ARGS_((struct vars *));
87 static chr newline _ANSI_ARGS_((NOPARMS));
88 #ifdef REG_DEBUG
89 static chr *ch _ANSI_ARGS_((NOPARMS));
90 #endif
91 static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
92 /* === regc_color.c === */
93 static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
94 static VOID freecm _ANSI_ARGS_((struct colormap *));
95 static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
96 static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
97 static color maxcolor _ANSI_ARGS_((struct colormap *));
98 static color newcolor _ANSI_ARGS_((struct colormap *));
99 static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
100 static color pseudocolor _ANSI_ARGS_((struct colormap *));
101 static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
102 static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
103 static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
104 static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
105 static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
106 static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
107 static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
108 static int singleton _ANSI_ARGS_((struct colormap *, pchr c));
109 static VOID rainbow _ANSI_ARGS_((struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *));
110 static VOID colorcomplement _ANSI_ARGS_((struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *));
111 #ifdef REG_DEBUG
112 static VOID dumpcolors _ANSI_ARGS_((struct colormap *, FILE *));
113 static VOID fillcheck _ANSI_ARGS_((struct colormap *, union tree *, int, FILE *));
114 static VOID dumpchr _ANSI_ARGS_((pchr, FILE *));
115 #endif
116 /* === regc_nfa.c === */
117 static struct nfa *newnfa _ANSI_ARGS_((struct vars *, struct colormap *, struct nfa *));
118 static VOID freenfa _ANSI_ARGS_((struct nfa *));
119 static struct state *newstate _ANSI_ARGS_((struct nfa *));
120 static struct state *newfstate _ANSI_ARGS_((struct nfa *, int flag));
121 static VOID dropstate _ANSI_ARGS_((struct nfa *, struct state *));
122 static VOID freestate _ANSI_ARGS_((struct nfa *, struct state *));
123 static VOID destroystate _ANSI_ARGS_((struct nfa *, struct state *));
124 static VOID newarc _ANSI_ARGS_((struct nfa *, int, pcolor, struct state *, struct state *));
125 static struct arc *allocarc _ANSI_ARGS_((struct nfa *, struct state *));
126 static VOID freearc _ANSI_ARGS_((struct nfa *, struct arc *));
127 static struct arc *findarc _ANSI_ARGS_((struct state *, int, pcolor));
128 static VOID cparc _ANSI_ARGS_((struct nfa *, struct arc *, struct state *, struct state *));
129 static VOID moveins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
130 static VOID copyins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
131 static VOID moveouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
132 static VOID copyouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
133 static VOID cloneouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, int));
134 static VOID delsub _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
135 static VOID deltraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
136 static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, struct state *));
137 static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
138 static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
139 static VOID specialcolors _ANSI_ARGS_((struct nfa *));
140 static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
141 static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
142 static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
143 static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
144 static int push _ANSI_ARGS_((struct nfa *, struct arc *));
145 #define INCOMPATIBLE 1 /* destroys arc */
146 #define SATISFIED 2 /* constraint satisfied */
147 #define COMPATIBLE 3 /* compatible but not satisfied yet */
148 static int combine _ANSI_ARGS_((struct arc *, struct arc *));
149 static VOID fixempties _ANSI_ARGS_((struct nfa *, FILE *));
150 static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
151 static VOID cleanup _ANSI_ARGS_((struct nfa *));
152 static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
153 static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
154 static long analyze _ANSI_ARGS_((struct nfa *));
155 static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
156 static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
157 static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
158 static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
159 #ifdef REG_DEBUG
160 static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
161 static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
162 static int dumprarcs _ANSI_ARGS_((struct arc *, struct state *, FILE *, int));
163 static VOID dumparc _ANSI_ARGS_((struct arc *, struct state *, FILE *));
164 #endif
165 static VOID dumpcnfa _ANSI_ARGS_((struct cnfa *, FILE *));
166 #ifdef REG_DEBUG
167 static VOID dumpcstate _ANSI_ARGS_((int, struct carc *, struct cnfa *, FILE *));
168 #endif
169 /* === regc_cvec.c === */
170 static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
171 static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
172 static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
173 static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
174 static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
175 static int haschr _ANSI_ARGS_((struct cvec *, pchr));
176 static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
177 static VOID freecvec _ANSI_ARGS_((struct cvec *));
178 /* === regc_locale.c === */
179 static int nmcces _ANSI_ARGS_((struct vars *));
180 static int nleaders _ANSI_ARGS_((struct vars *));
181 static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
182 static celt element _ANSI_ARGS_((struct vars *, chr *, chr *));
183 static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
184 static int before _ANSI_ARGS_((celt, celt));
185 static struct cvec *eclass _ANSI_ARGS_((struct vars *, celt, int));
186 static struct cvec *cclass _ANSI_ARGS_((struct vars *, chr *, chr *, int));
187 static struct cvec *allcases _ANSI_ARGS_((struct vars *, pchr));
188 static int cmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
189 static int casecmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
190 /* automatically gathered by fwd; do not hand-edit */
191 /* =====^!^===== end forwards =====^!^===== */
192
193
194
195 /* internal variables, bundled for easy passing around */
196 struct vars {
197 regex_t *re;
198 chr *now; /* scan pointer into string */
199 chr *stop; /* end of string */
200 chr *savenow; /* saved now and stop for "subroutine call" */
201 chr *savestop;
202 int err; /* error code (0 if none) */
203 int cflags; /* copy of compile flags */
204 int lasttype; /* type of previous token */
205 int nexttype; /* type of next token */
206 chr nextvalue; /* value (if any) of next token */
207 int lexcon; /* lexical context type (see lex.c) */
208 int nsubexp; /* subexpression count */
209 struct subre **subs; /* subRE pointer vector */
210 size_t nsubs; /* length of vector */
211 struct subre *sub10[10]; /* initial vector, enough for most */
212 struct nfa *nfa; /* the NFA */
213 struct colormap *cm; /* character color map */
214 color nlcolor; /* color of newline */
215 struct state *wordchrs; /* state in nfa holding word-char outarcs */
216 struct subre *tree; /* subexpression tree */
217 struct subre *treechain; /* all tree nodes allocated */
218 struct subre *treefree; /* any free tree nodes */
219 int ntree; /* number of tree nodes */
220 struct cvec *cv; /* interface cvec */
221 struct cvec *cv2; /* utility cvec */
222 struct cvec *mcces; /* collating-element information */
223 # define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
224 struct state *mccepbegin; /* in nfa, start of MCCE prototypes */
225 struct state *mccepend; /* in nfa, end of MCCE prototypes */
226 struct subre *lacons; /* lookahead-constraint vector */
227 int nlacons; /* size of lacons */
228 };
229
230 /* parsing macros; most know that `v' is the struct vars pointer */
231 #define NEXT() (next(v)) /* advance by one token */
232 #define SEE(t) (v->nexttype == (t)) /* is next token this? */
233 #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
234 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
235 #define ISERR() VISERR(v)
236 #define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
237 ((vv)->err = (e)))
238 #define ERR(e) VERR(v, e) /* record an error */
239 #define NOERR() {if (ISERR()) return;} /* if error seen, return */
240 #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
241 #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
242 #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
243 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
244 #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
245
246 /* token type codes, some also used as NFA arc types */
247 #define EMPTY 'n' /* no token present */
248 #define EOS 'e' /* end of string */
249 #define PLAIN 'p' /* ordinary character */
250 #define DIGIT 'd' /* digit (in bound) */
251 #define BACKREF 'b' /* back reference */
252 #define COLLEL 'I' /* start of [. */
253 #define ECLASS 'E' /* start of [= */
254 #define CCLASS 'C' /* start of [: */
255 #define END 'X' /* end of [. [= [: */
256 #define RANGE 'R' /* - within [] which might be range delim. */
257 #define LACON 'L' /* lookahead constraint subRE */
258 #define AHEAD 'a' /* color-lookahead arc */
259 #define BEHIND 'r' /* color-lookbehind arc */
260 #define WBDRY 'w' /* word boundary constraint */
261 #define NWBDRY 'W' /* non-word-boundary constraint */
262 #define SBEGIN 'A' /* beginning of string (even if not BOL) */
263 #define SEND 'Z' /* end of string (even if not EOL) */
264 #define PREFER 'P' /* length preference */
265
266 /* is an arc colored, and hence on a color chain? */
267 #define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
268 (a)->type == BEHIND)
269
270
271
272 /* static function list */
273 static struct fns functions = {
274 rfree, /* regfree insides */
275 };
276
277
278
279 /*
280 - compile - compile regular expression
281 ^ int compile(regex_t *, CONST chr *, size_t, int);
282 */
283 int
284 compile(re, string, len, flags)
285 regex_t *re;
286 CONST chr *string;
287 size_t len;
288 int flags;
289 {
290 struct vars var;
291 struct vars *v = &var;
292 struct guts *g;
293 int i;
294 size_t j;
295 FILE *debug = (flags&REG_PROGRESS) ? stdout : (FILE *)NULL;
296 # define CNOERR() { if (ISERR()) return freev(v, v->err); }
297
298 /* sanity checks */
299
300 if (re == NULL || string == NULL)
301 return REG_INVARG;
302 if ((flags&REG_QUOTE) &&
303 (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
304 return REG_INVARG;
305 if (!(flags&REG_EXTENDED) && (flags&REG_ADVF))
306 return REG_INVARG;
307
308 /* initial setup (after which freev() is callable) */
309 v->re = re;
310 v->now = (chr *)string;
311 v->stop = v->now + len;
312 v->savenow = v->savestop = NULL;
313 v->err = 0;
314 v->cflags = flags;
315 v->nsubexp = 0;
316 v->subs = v->sub10;
317 v->nsubs = 10;
318 for (j = 0; j < v->nsubs; j++)
319 v->subs[j] = NULL;
320 v->nfa = NULL;
321 v->cm = NULL;
322 v->nlcolor = COLORLESS;
323 v->wordchrs = NULL;
324 v->tree = NULL;
325 v->treechain = NULL;
326 v->treefree = NULL;
327 v->cv = NULL;
328 v->cv2 = NULL;
329 v->mcces = NULL;
330 v->lacons = NULL;
331 v->nlacons = 0;
332 re->re_magic = REMAGIC;
333 re->re_info = 0; /* bits get set during parse */
334 re->re_csize = sizeof(chr);
335 re->re_guts = NULL;
336 re->re_fns = VS(&functions);
337
338 /* more complex setup, malloced things */
339 re->re_guts = VS(MALLOC(sizeof(struct guts)));
340 if (re->re_guts == NULL)
341 return freev(v, REG_ESPACE);
342 g = (struct guts *)re->re_guts;
343 g->tree = NULL;
344 initcm(v, &g->cmap);
345 v->cm = &g->cmap;
346 g->lacons = NULL;
347 g->nlacons = 0;
348 ZAPCNFA(g->search);
349 v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
350 CNOERR();
351 v->cv = newcvec(100, 20, 10);
352 if (v->cv == NULL)
353 return freev(v, REG_ESPACE);
354 i = nmcces(v);
355 if (i > 0) {
356 v->mcces = newcvec(nleaders(v), 0, i);
357 CNOERR();
358 v->mcces = allmcces(v, v->mcces);
359 leaders(v, v->mcces);
360 addmcce(v->mcces, (chr *)NULL, (chr *)NULL); /* dummy */
361 }
362 CNOERR();
363
364 /* parsing */
365 lexstart(v); /* also handles prefixes */
366 if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
367 /* assign newline a unique color */
368 v->nlcolor = subcolor(v->cm, newline());
369 okcolors(v->nfa, v->cm);
370 }
371 CNOERR();
372 v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
373 assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
374 CNOERR();
375 assert(v->tree != NULL);
376
377 /* finish setup of nfa and its subre tree */
378 specialcolors(v->nfa);
379 CNOERR();
380 if (debug != NULL) {
381 fprintf(debug, "\n\n\n========= RAW ==========\n");
382 dumpnfa(v->nfa, debug);
383 dumpst(v->tree, debug, 1);
384 }
385 optst(v, v->tree);
386 v->ntree = numst(v->tree, 1);
387 markst(v->tree);
388 cleanst(v);
389 if (debug != NULL) {
390 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
391 dumpst(v->tree, debug, 1);
392 }
393
394 /* build compacted NFAs for tree and lacons */
395 re->re_info |= nfatree(v, v->tree, debug);
396 CNOERR();
397 assert(v->nlacons == 0 || v->lacons != NULL);
398 for (i = 1; i < v->nlacons; i++) {
399 if (debug != NULL)
400 fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
401 nfanode(v, &v->lacons[i], debug);
402 }
403 CNOERR();
404 if (v->tree->flags&SHORTER)
405 NOTE(REG_USHORTEST);
406
407 /* build compacted NFAs for tree, lacons, fast search */
408 if (debug != NULL)
409 fprintf(debug, "\n\n\n========= SEARCH ==========\n");
410 /* can sacrifice main NFA now, so use it as work area */
411 (DISCARD)optimize(v->nfa, debug);
412 CNOERR();
413 makesearch(v, v->nfa);
414 CNOERR();
415 compact(v->nfa, &g->search);
416 CNOERR();
417
418 /* looks okay, package it up */
419 re->re_nsub = v->nsubexp;
420 v->re = NULL; /* freev no longer frees re */
421 g->magic = GUTSMAGIC;
422 g->cflags = v->cflags;
423 g->info = re->re_info;
424 g->nsub = re->re_nsub;
425 g->tree = v->tree;
426 v->tree = NULL;
427 g->ntree = v->ntree;
428 g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
429 g->lacons = v->lacons;
430 v->lacons = NULL;
431 g->nlacons = v->nlacons;
432
433 if (flags&REG_DUMP)
434 dump(re, stdout);
435
436 assert(v->err == 0);
437 return freev(v, 0);
438 }
439
440 /*
441 - moresubs - enlarge subRE vector
442 ^ static VOID moresubs(struct vars *, int);
443 */
444 static VOID
445 moresubs(v, wanted)
446 struct vars *v;
447 int wanted; /* want enough room for this one */
448 {
449 struct subre **p;
450 size_t n;
451
452 assert(wanted > 0 && (size_t)wanted >= v->nsubs);
453 n = (size_t)wanted * 3 / 2 + 1;
454 if (v->subs == v->sub10) {
455 p = (struct subre **)MALLOC(n * sizeof(struct subre *));
456 if (p != NULL)
457 memcpy(VS(p), VS(v->subs),
458 v->nsubs * sizeof(struct subre *));
459 } else
460 p = (struct subre **)REALLOC(v->subs, n*sizeof(struct subre *));
461 if (p == NULL) {
462 ERR(REG_ESPACE);
463 return;
464 }
465 v->subs = p;
466 for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
467 *p = NULL;
468 assert(v->nsubs == n);
469 assert((size_t)wanted < v->nsubs);
470 }
471
472 /*
473 - freev - free vars struct's substructures where necessary
474 * Optionally does error-number setting, and always returns error code
475 * (if any), to make error-handling code terser.
476 ^ static int freev(struct vars *, int);
477 */
478 static int
479 freev(v, err)
480 struct vars *v;
481 int err;
482 {
483 if (v->re != NULL)
484 rfree(v->re);
485 if (v->subs != v->sub10)
486 FREE(v->subs);
487 if (v->nfa != NULL)
488 freenfa(v->nfa);
489 if (v->tree != NULL)
490 freesubre(v, v->tree);
491 if (v->treechain != NULL)
492 cleanst(v);
493 if (v->cv != NULL)
494 freecvec(v->cv);
495 if (v->cv2 != NULL)
496 freecvec(v->cv2);
497 if (v->mcces != NULL)
498 freecvec(v->mcces);
499 if (v->lacons != NULL)
500 freelacons(v->lacons, v->nlacons);
501 ERR(err); /* nop if err==0 */
502
503 return v->err;
504 }
505
506 /*
507 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
508 * NFA must have been optimize()d already.
509 ^ static VOID makesearch(struct vars *, struct nfa *);
510 */
511 static VOID
512 makesearch(v, nfa)
513 struct vars *v;
514 struct nfa *nfa;
515 {
516 struct arc *a;
517 struct arc *b;
518 struct state *pre = nfa->pre;
519 struct state *s;
520 struct state *s2;
521 struct state *slist;
522
523 /* no loops are needed if it's anchored */
524 for (a = pre->outs; a != NULL; a = a->outchain) {
525 assert(a->type == PLAIN);
526 if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
527 break;
528 }
529 if (a != NULL) {
530 /* add implicit .* in front */
531 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
532
533 /* and ^* and \A* too -- not always necessary, but harmless */
534 newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
535 newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
536 }
537
538 /*
539 * Now here's the subtle part. Because many REs have no lookback
540 * constraints, often knowing when you were in the pre state tells
541 * you little; it's the next state(s) that are informative. But
542 * some of them may have other inarcs, i.e. it may be possible to
543 * make actual progress and then return to one of them. We must
544 * de-optimize such cases, splitting each such state into progress
545 * and no-progress states.
546 */
547
548 /* first, make a list of the states */
549 slist = NULL;
550 for (a = pre->outs; a != NULL; a = a->outchain) {
551 s = a->to;
552 for (b = s->ins; b != NULL; b = b->inchain)
553 if (b->from != pre)
554 break;
555 if (b != NULL) { /* must be split */
556 if (s->tmp == NULL) { /* if not already in the list */
557 /* (fixes bugs 505048, 230589, */
558 /* 840258, 504785) */
559 s->tmp = slist;
560 slist = s;
561 }
562 }
563 }
564
565 /* do the splits */
566 for (s = slist; s != NULL; s = s2) {
567 s2 = newstate(nfa);
568 copyouts(nfa, s, s2);
569 for (a = s->ins; a != NULL; a = b) {
570 b = a->inchain;
571 if (a->from != pre) {
572 cparc(nfa, a, a->from, s2);
573 freearc(nfa, a);
574 }
575 }
576 s2 = s->tmp;
577 s->tmp = NULL; /* clean up while we're at it */
578 }
579 }
580
581 /*
582 - parse - parse an RE
583 * This is actually just the top level, which parses a bunch of branches
584 * tied together with '|'. They appear in the tree as the left children
585 * of a chain of '|' subres.
586 ^ static struct subre *parse(struct vars *, int, int, struct state *,
587 ^ struct state *);
588 */
589 static struct subre *
590 parse(v, stopper, type, init, final)
591 struct vars *v;
592 int stopper; /* EOS or ')' */
593 int type; /* LACON (lookahead subRE) or PLAIN */
594 struct state *init; /* initial state */
595 struct state *final; /* final state */
596 {
597 struct state *left; /* scaffolding for branch */
598 struct state *right;
599 struct subre *branches; /* top level */
600 struct subre *branch; /* current branch */
601 struct subre *t; /* temporary */
602 int firstbranch; /* is this the first branch? */
603
604 assert(stopper == ')' || stopper == EOS);
605
606 branches = subre(v, '|', LONGER, init, final);
607 NOERRN();
608 branch = branches;
609 firstbranch = 1;
610 do { /* a branch */
611 if (!firstbranch) {
612 /* need a place to hang it */
613 branch->right = subre(v, '|', LONGER, init, final);
614 NOERRN();
615 branch = branch->right;
616 }
617 firstbranch = 0;
618 left = newstate(v->nfa);
619 right = newstate(v->nfa);
620 NOERRN();
621 EMPTYARC(init, left);
622 EMPTYARC(right, final);
623 NOERRN();
624 branch->left = parsebranch(v, stopper, type, left, right, 0);
625 NOERRN();
626 branch->flags |= UP(branch->flags | branch->left->flags);
627 if ((branch->flags &~ branches->flags) != 0) /* new flags */
628 for (t = branches; t != branch; t = t->right)
629 t->flags |= branch->flags;
630 } while (EAT('|'));
631 assert(SEE(stopper) || SEE(EOS));
632
633 if (!SEE(stopper)) {
634 assert(stopper == ')' && SEE(EOS));
635 ERR(REG_EPAREN);
636 }
637
638 /* optimize out simple cases */
639 if (branch == branches) { /* only one branch */
640 assert(branch->right == NULL);
641 t = branch->left;
642 branch->left = NULL;
643 freesubre(v, branches);
644 branches = t;
645 } else if (!MESSY(branches->flags)) { /* no interesting innards */
646 freesubre(v, branches->left);
647 branches->left = NULL;
648 freesubre(v, branches->right);
649 branches->right = NULL;
650 branches->op = '=';
651 }
652
653 return branches;
654 }
655
656 /*
657 - parsebranch - parse one branch of an RE
658 * This mostly manages concatenation, working closely with parseqatom().
659 * Concatenated things are bundled up as much as possible, with separate
660 * ',' nodes introduced only when necessary due to substructure.
661 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
662 ^ struct state *, int);
663 */
664 static struct subre *
665 parsebranch(v, stopper, type, left, right, partial)
666 struct vars *v;
667 int stopper; /* EOS or ')' */
668 int type; /* LACON (lookahead subRE) or PLAIN */
669 struct state *left; /* leftmost state */
670 struct state *right; /* rightmost state */
671 int partial; /* is this only part of a branch? */
672 {
673 struct state *lp; /* left end of current construct */
674 int seencontent; /* is there anything in this branch yet? */
675 struct subre *t;
676
677 lp = left;
678 seencontent = 0;
679 t = subre(v, '=', 0, left, right); /* op '=' is tentative */
680 NOERRN();
681 while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
682 if (seencontent) { /* implicit concat operator */
683 lp = newstate(v->nfa);
684 NOERRN();
685 moveins(v->nfa, right, lp);
686 }
687 seencontent = 1;
688
689 /* NB, recursion in parseqatom() may swallow rest of branch */
690 parseqatom(v, stopper, type, lp, right, t);
691 }
692
693 if (!seencontent) { /* empty branch */
694 if (!partial)
695 NOTE(REG_UUNSPEC);
696 assert(lp == left);
697 EMPTYARC(left, right);
698 }
699
700 return t;
701 }
702
703 /*
704 - parseqatom - parse one quantified atom or constraint of an RE
705 * The bookkeeping near the end cooperates very closely with parsebranch();
706 * in particular, it contains a recursion that can involve parsing the rest
707 * of the branch, making this function's name somewhat inaccurate.
708 ^ static VOID parseqatom(struct vars *, int, int, struct state *,
709 ^ struct state *, struct subre *);
710 */
711 static VOID
712 parseqatom(v, stopper, type, lp, rp, top)
713 struct vars *v;
714 int stopper; /* EOS or ')' */
715 int type; /* LACON (lookahead subRE) or PLAIN */
716 struct state *lp; /* left state to hang it on */
717 struct state *rp; /* right state to hang it on */
718 struct subre *top; /* subtree top */
719 {
720 struct state *s; /* temporaries for new states */
721 struct state *s2;
722 # define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
723 int m, n;
724 struct subre *atom; /* atom's subtree */
725 struct subre *t;
726 int cap; /* capturing parens? */
727 int pos; /* positive lookahead? */
728 int subno; /* capturing-parens or backref number */
729 int atomtype;
730 int qprefer; /* quantifier short/long preference */
731 int f;
732 struct subre **atomp; /* where the pointer to atom is */
733
734 /* initial bookkeeping */
735 atom = NULL;
736 assert(lp->nouts == 0); /* must string new code */
737 assert(rp->nins == 0); /* between lp and rp */
738 subno = 0; /* just to shut lint up */
739
740 /* an atom or constraint... */
741 atomtype = v->nexttype;
742 switch (atomtype) {
743 /* first, constraints, which end by returning */
744 case '^':
745 ARCV('^', 1);
746 if (v->cflags&REG_NLANCH)
747 ARCV(BEHIND, v->nlcolor);
748 NEXT();
749 return;
750 break;
751 case '$':
752 ARCV('$', 1);
753 if (v->cflags&REG_NLANCH)
754 ARCV(AHEAD, v->nlcolor);
755 NEXT();
756 return;
757 break;
758 case SBEGIN:
759 ARCV('^', 1); /* BOL */
760 ARCV('^', 0); /* or BOS */
761 NEXT();
762 return;
763 break;
764 case SEND:
765 ARCV('$', 1); /* EOL */
766 ARCV('$', 0); /* or EOS */
767 NEXT();
768 return;
769 break;
770 case '<':
771 wordchrs(v); /* does NEXT() */
772 s = newstate(v->nfa);
773 NOERR();
774 nonword(v, BEHIND, lp, s);
775 word(v, AHEAD, s, rp);
776 return;
777 break;
778 case '>':
779 wordchrs(v); /* does NEXT() */
780 s = newstate(v->nfa);
781 NOERR();
782 word(v, BEHIND, lp, s);
783 nonword(v, AHEAD, s, rp);
784 return;
785 break;
786 case WBDRY:
787 wordchrs(v); /* does NEXT() */
788 s = newstate(v->nfa);
789 NOERR();
790 nonword(v, BEHIND, lp, s);
791 word(v, AHEAD, s, rp);
792 s = newstate(v->nfa);
793 NOERR();
794 word(v, BEHIND, lp, s);
795 nonword(v, AHEAD, s, rp);
796 return;
797 break;
798 case NWBDRY:
799 wordchrs(v); /* does NEXT() */
800 s = newstate(v->nfa);
801 NOERR();
802 word(v, BEHIND, lp, s);
803 word(v, AHEAD, s, rp);
804 s = newstate(v->nfa);
805 NOERR();
806 nonword(v, BEHIND, lp, s);
807 nonword(v, AHEAD, s, rp);
808 return;
809 break;
810 case LACON: /* lookahead constraint */
811 pos = v->nextvalue;
812 NEXT();
813 s = newstate(v->nfa);
814 s2 = newstate(v->nfa);
815 NOERR();
816 t = parse(v, ')', LACON, s, s2);
817 freesubre(v, t); /* internal structure irrelevant */
818 assert(SEE(')') || ISERR());
819 NEXT();
820 n = newlacon(v, s, s2, pos);
821 NOERR();
822 ARCV(LACON, n);
823 return;
824 break;
825 /* then errors, to get them out of the way */
826 case '*':
827 case '+':
828 case '?':
829 case '{':
830 ERR(REG_BADRPT);
831 return;
832 break;
833 default:
834 ERR(REG_ASSERT);
835 return;
836 break;
837 /* then plain characters, and minor variants on that theme */
838 case ')': /* unbalanced paren */
839 if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
840 ERR(REG_EPAREN);
841 return;
842 }
843 /* legal in EREs due to specification botch */
844 NOTE(REG_UPBOTCH);
845 /* fallthrough into case PLAIN */
846 case PLAIN:
847 onechr(v, v->nextvalue, lp, rp);
848 okcolors(v->nfa, v->cm);
849 NOERR();
850 NEXT();
851 break;
852 case '[':
853 if (v->nextvalue == 1)
854 bracket(v, lp, rp);
855 else
856 cbracket(v, lp, rp);
857 assert(SEE(']') || ISERR());
858 NEXT();
859 break;
860 case '.':
861 rainbow(v->nfa, v->cm, PLAIN,
862 (v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS,
863 lp, rp);
864 NEXT();
865 break;
866 /* and finally the ugly stuff */
867 case '(': /* value flags as capturing or non */
868 cap = (type == LACON) ? 0 : v->nextvalue;
869 if (cap) {
870 v->nsubexp++;
871 subno = v->nsubexp;
872 if ((size_t)subno >= v->nsubs)
873 moresubs(v, subno);
874 assert((size_t)subno < v->nsubs);
875 } else
876 atomtype = PLAIN; /* something that's not '(' */
877 NEXT();
878 /* need new endpoints because tree will contain pointers */
879 s = newstate(v->nfa);
880 s2 = newstate(v->nfa);
881 NOERR();
882 EMPTYARC(lp, s);
883 EMPTYARC(s2, rp);
884 NOERR();
885 atom = parse(v, ')', PLAIN, s, s2);
886 assert(SEE(')') || ISERR());
887 NEXT();
888 NOERR();
889 if (cap) {
890 v->subs[subno] = atom;
891 t = subre(v, '(', atom->flags|CAP, lp, rp);
892 NOERR();
893 t->subno = subno;
894 t->left = atom;
895 atom = t;
896 }
897 /* postpone everything else pending possible {0} */
898 break;
899 case BACKREF: /* the Feature From The Black Lagoon */
900 INSIST(type != LACON, REG_ESUBREG);
901 INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
902 INSIST(v->subs[(int)v->nextvalue] != NULL, REG_ESUBREG);
903 NOERR();
904 assert(v->nextvalue > 0);
905 atom = subre(v, 'b', BACKR, lp, rp);
906 subno = v->nextvalue;
907 atom->subno = subno;
908 EMPTYARC(lp, rp); /* temporarily, so there's something */
909 NEXT();
910 break;
911 }
912
913 /* ...and an atom may be followed by a quantifier */
914 switch (v->nexttype) {
915 case '*':
916 m = 0;
917 n = INFINITY;
918 qprefer = (v->nextvalue) ? LONGER : SHORTER;
919 NEXT();
920 break;
921 case '+':
922 m = 1;
923 n = INFINITY;
924 qprefer = (v->nextvalue) ? LONGER : SHORTER;
925 NEXT();
926 break;
927 case '?':
928 m = 0;
929 n = 1;
930 qprefer = (v->nextvalue) ? LONGER : SHORTER;
931 NEXT();
932 break;
933 case '{':
934 NEXT();
935 m = scannum(v);
936 if (EAT(',')) {
937 if (SEE(DIGIT))
938 n = scannum(v);
939 else
940 n = INFINITY;
941 if (m > n) {
942 ERR(REG_BADBR);
943 return;
944 }
945 /* {m,n} exercises preference, even if it's {m,m} */
946 qprefer = (v->nextvalue) ? LONGER : SHORTER;
947 } else {
948 n = m;
949 /* {m} passes operand's preference through */
950 qprefer = 0;
951 }
952 if (!SEE('}')) { /* catches errors too */
953 ERR(REG_BADBR);
954 return;
955 }
956 NEXT();
957 break;
958 default: /* no quantifier */
959 m = n = 1;
960 qprefer = 0;
961 break;
962 }
963
964 /* annoying special case: {0} or {0,0} cancels everything */
965 if (m == 0 && n == 0) {
966 if (atom != NULL)
967 freesubre(v, atom);
968 if (atomtype == '(')
969 v->subs[subno] = NULL;
970 delsub(v->nfa, lp, rp);
971 EMPTYARC(lp, rp);
972 return;
973 }
974
975 /* if not a messy case, avoid hard part */
976 assert(!MESSY(top->flags));
977 f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
978 if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
979 if (!(m == 1 && n == 1))
980 repeat(v, lp, rp, m, n);
981 if (atom != NULL)
982 freesubre(v, atom);
983 top->flags = f;
984 return;
985 }
986
987 /*
988 * hard part: something messy
989 * That is, capturing parens, back reference, short/long clash, or
990 * an atom with substructure containing one of those.
991 */
992
993 /* now we'll need a subre for the contents even if they're boring */
994 if (atom == NULL) {
995 atom = subre(v, '=', 0, lp, rp);
996 NOERR();
997 }
998
999 /*
1000 * prepare a general-purpose state skeleton
1001 *
1002 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1003 * / /
1004 * [lp] ----> [s2] ----bypass---------------------
1005 *
1006 * where bypass is an empty, and prefix is some repetitions of atom
1007 */
1008 s = newstate(v->nfa); /* first, new endpoints for the atom */
1009 s2 = newstate(v->nfa);
1010 NOERR();
1011 moveouts(v->nfa, lp, s);
1012 moveins(v->nfa, rp, s2);
1013 NOERR();
1014 atom->begin = s;
1015 atom->end = s2;
1016 s = newstate(v->nfa); /* and spots for prefix and bypass */
1017 s2 = newstate(v->nfa);
1018 NOERR();
1019 EMPTYARC(lp, s);
1020 EMPTYARC(lp, s2);
1021 NOERR();
1022
1023 /* break remaining subRE into x{...} and what follows */
1024 t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1025 t->left = atom;
1026 atomp = &t->left;
1027 /* here we should recurse... but we must postpone that to the end */
1028
1029 /* split top into prefix and remaining */
1030 assert(top->op == '=' && top->left == NULL && top->right == NULL);
1031 top->left = subre(v, '=', top->flags, top->begin, lp);
1032 top->op = '.';
1033 top->right = t;
1034
1035 /* if it's a backref, now is the time to replicate the subNFA */
1036 if (atomtype == BACKREF) {
1037 assert(atom->begin->nouts == 1); /* just the EMPTY */
1038 delsub(v->nfa, atom->begin, atom->end);
1039 assert(v->subs[subno] != NULL);
1040 /* and here's why the recursion got postponed: it must */
1041 /* wait until the skeleton is filled in, because it may */
1042 /* hit a backref that wants to copy the filled-in skeleton */
1043 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1044 atom->begin, atom->end);
1045 NOERR();
1046 }
1047
1048 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1049 if (m == 0) {
1050 EMPTYARC(s2, atom->end); /* the bypass */
1051 assert(PREF(qprefer) != 0);
1052 f = COMBINE(qprefer, atom->flags);
1053 t = subre(v, '|', f, lp, atom->end);
1054 NOERR();
1055 t->left = atom;
1056 t->right = subre(v, '|', PREF(f), s2, atom->end);
1057 NOERR();
1058 t->right->left = subre(v, '=', 0, s2, atom->end);
1059 NOERR();
1060 *atomp = t;
1061 atomp = &t->left;
1062 m = 1;
1063 }
1064
1065 /* deal with the rest of the quantifier */
1066 if (atomtype == BACKREF) {
1067 /* special case: backrefs have internal quantifiers */
1068 EMPTYARC(s, atom->begin); /* empty prefix */
1069 /* just stuff everything into atom */
1070 repeat(v, atom->begin, atom->end, m, n);
1071 atom->min = (short)m;
1072 atom->max = (short)n;
1073 atom->flags |= COMBINE(qprefer, atom->flags);
1074 } else if (m == 1 && n == 1) {
1075 /* no/vacuous quantifier: done */
1076 EMPTYARC(s, atom->begin); /* empty prefix */
1077 } else {
1078 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1079 /* parens in only second x */
1080 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1081 assert(m >= 1 && m != INFINITY && n >= 1);
1082 repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
1083 f = COMBINE(qprefer, atom->flags);
1084 t = subre(v, '.', f, s, atom->end); /* prefix and atom */
1085 NOERR();
1086 t->left = subre(v, '=', PREF(f), s, atom->begin);
1087 NOERR();
1088 t->right = atom;
1089 *atomp = t;
1090 }
1091
1092 /* and finally, look after that postponed recursion */
1093 t = top->right;
1094 if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1095 t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1096 else {
1097 EMPTYARC(atom->end, rp);
1098 t->right = subre(v, '=', 0, atom->end, rp);
1099 }
1100 assert(SEE('|') || SEE(stopper) || SEE(EOS));
1101 t->flags |= COMBINE(t->flags, t->right->flags);
1102 top->flags |= COMBINE(top->flags, t->flags);
1103 }
1104
1105 /*
1106 - nonword - generate arcs for non-word-character ahead or behind
1107 ^ static VOID nonword(struct vars *, int, struct state *, struct state *);
1108 */
1109 static VOID
1110 nonword(v, dir, lp, rp)
1111 struct vars *v;
1112 int dir; /* AHEAD or BEHIND */
1113 struct state *lp;
1114 struct state *rp;
1115 {
1116 int anchor = (dir == AHEAD) ? '$' : '^';
1117
1118 assert(dir == AHEAD || dir == BEHIND);
1119 newarc(v->nfa, anchor, 1, lp, rp);
1120 newarc(v->nfa, anchor, 0, lp, rp);
1121 colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1122 /* (no need for special attention to \n) */
1123 }
1124
1125 /*
1126 - word - generate arcs for word character ahead or behind
1127 ^ static VOID word(struct vars *, int, struct state *, struct state *);
1128 */
1129 static VOID
1130 word(v, dir, lp, rp)
1131 struct vars *v;
1132 int dir; /* AHEAD or BEHIND */
1133 struct state *lp;
1134 struct state *rp;
1135 {
1136 assert(dir == AHEAD || dir == BEHIND);
1137 cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1138 /* (no need for special attention to \n) */
1139 }
1140
1141 /*
1142 - scannum - scan a number
1143 ^ static int scannum(struct vars *);
1144 */
1145 static int /* value, <= DUPMAX */
1146 scannum(v)
1147 struct vars *v;
1148 {
1149 int n = 0;
1150
1151 while (SEE(DIGIT) && n < DUPMAX) {
1152 n = n*10 + v->nextvalue;
1153 NEXT();
1154 }
1155 if (SEE(DIGIT) || n > DUPMAX) {
1156 ERR(REG_BADBR);
1157 return 0;
1158 }
1159 return n;
1160 }
1161
1162 /*
1163 - repeat - replicate subNFA for quantifiers
1164 * The duplication sequences used here are chosen carefully so that any
1165 * pointers starting out pointing into the subexpression end up pointing into
1166 * the last occurrence. (Note that it may not be strung between the same
1167 * left and right end states, however!) This used to be important for the
1168 * subRE tree, although the important bits are now handled by the in-line
1169 * code in parse(), and when this is called, it doesn't matter any more.
1170 ^ static VOID repeat(struct vars *, struct state *, struct state *, int, int);
1171 */
1172 static VOID
1173 repeat(v, lp, rp, m, n)
1174 struct vars *v;
1175 struct state *lp;
1176 struct state *rp;
1177 int m;
1178 int n;
1179 {
1180 # define SOME 2
1181 # define INF 3
1182 # define PAIR(x, y) ((x)*4 + (y))
1183 # define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1184 CONST int rm = REDUCE(m);
1185 CONST int rn = REDUCE(n);
1186 struct state *s;
1187 struct state *s2;
1188
1189 switch (PAIR(rm, rn)) {
1190 case PAIR(0, 0): /* empty string */
1191 delsub(v->nfa, lp, rp);
1192 EMPTYARC(lp, rp);
1193 break;
1194 case PAIR(0, 1): /* do as x| */
1195 EMPTYARC(lp, rp);
1196 break;
1197 case PAIR(0, SOME): /* do as x{1,n}| */
1198 repeat(v, lp, rp, 1, n);
1199 NOERR();
1200 EMPTYARC(lp, rp);
1201 break;
1202 case PAIR(0, INF): /* loop x around */
1203 s = newstate(v->nfa);
1204 NOERR();
1205 moveouts(v->nfa, lp, s);
1206 moveins(v->nfa, rp, s);
1207 EMPTYARC(lp, s);
1208 EMPTYARC(s, rp);
1209 break;
1210 case PAIR(1, 1): /* no action required */
1211 break;
1212 case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1213 s = newstate(v->nfa);
1214 NOERR();
1215 moveouts(v->nfa, lp, s);
1216 dupnfa(v->nfa, s, rp, lp, s);
1217 NOERR();
1218 repeat(v, lp, s, 1, n-1);
1219 NOERR();
1220 EMPTYARC(lp, s);
1221 break;
1222 case PAIR(1, INF): /* add loopback arc */
1223 s = newstate(v->nfa);
1224 s2 = newstate(v->nfa);
1225 NOERR();
1226 moveouts(v->nfa, lp, s);
1227 moveins(v->nfa, rp, s2);
1228 EMPTYARC(lp, s);
1229 EMPTYARC(s2, rp);
1230 EMPTYARC(s2, s);
1231 break;
1232 case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1233 s = newstate(v->nfa);
1234 NOERR();
1235 moveouts(v->nfa, lp, s);
1236 dupnfa(v->nfa, s, rp, lp, s);
1237 NOERR();
1238 repeat(v, lp, s, m-1, n-1);
1239 break;
1240 case PAIR(SOME, INF): /* do as x{m-1,}x */
1241 s = newstate(v->nfa);
1242 NOERR();
1243 moveouts(v->nfa, lp, s);
1244 dupnfa(v->nfa, s, rp, lp, s);
1245 NOERR();
1246 repeat(v, lp, s, m-1, n);
1247 break;
1248 default:
1249 ERR(REG_ASSERT);
1250 break;
1251 }
1252 }
1253
1254 /*
1255 - bracket - handle non-complemented bracket expression
1256 * Also called from cbracket for complemented bracket expressions.
1257 ^ static VOID bracket(struct vars *, struct state *, struct state *);
1258 */
1259 static VOID
1260 bracket(v, lp, rp)
1261 struct vars *v;
1262 struct state *lp;
1263 struct state *rp;
1264 {
1265 assert(SEE('['));
1266 NEXT();
1267 while (!SEE(']') && !SEE(EOS))
1268 brackpart(v, lp, rp);
1269 assert(SEE(']') || ISERR());
1270 okcolors(v->nfa, v->cm);
1271 }
1272
1273 /*
1274 - cbracket - handle complemented bracket expression
1275 * We do it by calling bracket() with dummy endpoints, and then complementing
1276 * the result. The alternative would be to invoke rainbow(), and then delete
1277 * arcs as the b.e. is seen... but that gets messy.
1278 ^ static VOID cbracket(struct vars *, struct state *, struct state *);
1279 */
1280 static VOID
1281 cbracket(v, lp, rp)
1282 struct vars *v;
1283 struct state *lp;
1284 struct state *rp;
1285 {
1286 struct state *left = newstate(v->nfa);
1287 struct state *right = newstate(v->nfa);
1288 struct state *s;
1289 struct arc *a; /* arc from lp */
1290 struct arc *ba; /* arc from left, from bracket() */
1291 struct arc *pa; /* MCCE-prototype arc */
1292 color co;
1293 chr *p;
1294 int i;
1295
1296 NOERR();
1297 bracket(v, left, right);
1298 if (v->cflags&REG_NLSTOP)
1299 newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1300 NOERR();
1301
1302 assert(lp->nouts == 0); /* all outarcs will be ours */
1303
1304 /* easy part of complementing */
1305 colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1306 NOERR();
1307 if (v->mcces == NULL) { /* no MCCEs -- we're done */
1308 dropstate(v->nfa, left);
1309 assert(right->nins == 0);
1310 freestate(v->nfa, right);
1311 return;
1312 }
1313
1314 /* but complementing gets messy in the presence of MCCEs... */
1315 NOTE(REG_ULOCALE);
1316 for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) {
1317 co = GETCOLOR(v->cm, *p);
1318 a = findarc(lp, PLAIN, co);
1319 ba = findarc(left, PLAIN, co);
1320 if (ba == NULL) {
1321 assert(a != NULL);
1322 freearc(v->nfa, a);
1323 } else {
1324 assert(a == NULL);
1325 }
1326 s = newstate(v->nfa);
1327 NOERR();
1328 newarc(v->nfa, PLAIN, co, lp, s);
1329 NOERR();
1330 pa = findarc(v->mccepbegin, PLAIN, co);
1331 assert(pa != NULL);
1332 if (ba == NULL) { /* easy case, need all of them */
1333 cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1334 newarc(v->nfa, '$', 1, s, rp);
1335 newarc(v->nfa, '$', 0, s, rp);
1336 colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
1337 } else { /* must be selective */
1338 if (findarc(ba->to, '$', 1) == NULL) {
1339 newarc(v->nfa, '$', 1, s, rp);
1340 newarc(v->nfa, '$', 0, s, rp);
1341 colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
1342 s, rp);
1343 }
1344 for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1345 if (findarc(ba->to, PLAIN, pa->co) == NULL)
1346 newarc(v->nfa, PLAIN, pa->co, s, rp);
1347 if (s->nouts == 0) /* limit of selectivity: none */
1348 dropstate(v->nfa, s); /* frees arc too */
1349 }
1350 NOERR();
1351 }
1352
1353 delsub(v->nfa, left, right);
1354 assert(left->nouts == 0);
1355 freestate(v->nfa, left);
1356 assert(right->nins == 0);
1357 freestate(v->nfa, right);
1358 }
1359
1360 /*
1361 - brackpart - handle one item (or range) within a bracket expression
1362 ^ static VOID brackpart(struct vars *, struct state *, struct state *);
1363 */
1364 static VOID
1365 brackpart(v, lp, rp)
1366 struct vars *v;
1367 struct state *lp;
1368 struct state *rp;
1369 {
1370 celt startc;
1371 celt endc;
1372 struct cvec *cv;
1373 chr *startp;
1374 chr *endp;
1375 chr c[1];
1376
1377 /* parse something, get rid of special cases, take shortcuts */
1378 switch (v->nexttype) {
1379 case RANGE: /* a-b-c or other botch */
1380 ERR(REG_ERANGE);
1381 return;
1382 break;
1383 case PLAIN:
1384 c[0] = v->nextvalue;
1385 NEXT();
1386 /* shortcut for ordinary chr (not range, not MCCE leader) */
1387 if (!SEE(RANGE) && !ISCELEADER(v, c[0])) {
1388 onechr(v, c[0], lp, rp);
1389 return;
1390 }
1391 startc = element(v, c, c+1);
1392 NOERR();
1393 break;
1394 case COLLEL:
1395 startp = v->now;
1396 endp = scanplain(v);
1397 INSIST(startp < endp, REG_ECOLLATE);
1398 NOERR();
1399 startc = element(v, startp, endp);
1400 NOERR();
1401 break;
1402 case ECLASS:
1403 startp = v->now;
1404 endp = scanplain(v);
1405 INSIST(startp < endp, REG_ECOLLATE);
1406 NOERR();
1407 startc = element(v, startp, endp);
1408 NOERR();
1409 cv = eclass(v, startc, (v->cflags&REG_ICASE));
1410 NOERR();
1411 dovec(v, cv, lp, rp);
1412 return;
1413 break;
1414 case CCLASS:
1415 startp = v->now;
1416 endp = scanplain(v);
1417 INSIST(startp < endp, REG_ECTYPE);
1418 NOERR();
1419 cv = cclass(v, startp, endp, (v->cflags&REG_ICASE));
1420 NOERR();
1421 dovec(v, cv, lp, rp);
1422 return;
1423 break;
1424 default:
1425 ERR(REG_ASSERT);
1426 return;
1427 break;
1428 }
1429
1430 if (SEE(RANGE)) {
1431 NEXT();
1432 switch (v->nexttype) {
1433 case PLAIN:
1434 case RANGE:
1435 c[0] = v->nextvalue;
1436 NEXT();
1437 endc = element(v, c, c+1);
1438 NOERR();
1439 break;
1440 case COLLEL:
1441 startp = v->now;
1442 endp = scanplain(v);
1443 INSIST(startp < endp, REG_ECOLLATE);
1444 NOERR();
1445 endc = element(v, startp, endp);
1446 NOERR();
1447 break;
1448 default:
1449 ERR(REG_ERANGE);
1450 return;
1451 break;
1452 }
1453 } else
1454 endc = startc;
1455
1456 /*
1457 * Ranges are unportable. Actually, standard C does
1458 * guarantee that digits are contiguous, but making
1459 * that an exception is just too complicated.
1460 */
1461 if (startc != endc)
1462 NOTE(REG_UUNPORT);
1463 cv = range(v, startc, endc, (v->cflags&REG_ICASE));
1464 NOERR();
1465 dovec(v, cv, lp, rp);
1466 }
1467
1468 /*
1469 - scanplain - scan PLAIN contents of [. etc.
1470 * Certain bits of trickery in lex.c know that this code does not try
1471 * to look past the final bracket of the [. etc.
1472 ^ static chr *scanplain(struct vars *);
1473 */
1474 static chr * /* just after end of sequence */
1475 scanplain(v)
1476 struct vars *v;
1477 {
1478 chr *endp;
1479
1480 assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1481 NEXT();
1482
1483 endp = v->now;
1484 while (SEE(PLAIN)) {
1485 endp = v->now;
1486 NEXT();
1487 }
1488
1489 assert(SEE(END) || ISERR());
1490 NEXT();
1491
1492 return endp;
1493 }
1494
1495 /*
1496 - leaders - process a cvec of collating elements to also include leaders
1497 * Also gives all characters involved their own colors, which is almost
1498 * certainly necessary, and sets up little disconnected subNFA.
1499 ^ static VOID leaders(struct vars *, struct cvec *);
1500 */
1501 static VOID
1502 leaders(v, cv)
1503 struct vars *v;
1504 struct cvec *cv;
1505 {
1506 int mcce;
1507 chr *p;
1508 chr leader;
1509 struct state *s;
1510 struct arc *a;
1511
1512 v->mccepbegin = newstate(v->nfa);
1513 v->mccepend = newstate(v->nfa);
1514 NOERR();
1515
1516 for (mcce = 0; mcce < cv->nmcces; mcce++) {
1517 p = cv->mcces[mcce];
1518 leader = *p;
1519 if (!haschr(cv, leader)) {
1520 addchr(cv, leader);
1521 s = newstate(v->nfa);
1522 newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
1523 v->mccepbegin, s);
1524 okcolors(v->nfa, v->cm);
1525 } else {
1526 a = findarc(v->mccepbegin, PLAIN,
1527 GETCOLOR(v->cm, leader));
1528 assert(a != NULL);
1529 s = a->to;
1530 assert(s != v->mccepend);
1531 }
1532 p++;
1533 assert(*p != 0 && *(p+1) == 0); /* only 2-char MCCEs for now */
1534 newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1535 okcolors(v->nfa, v->cm);
1536 }
1537 }
1538
1539 /*
1540 - onechr - fill in arcs for a plain character, and possible case complements
1541 * This is mostly a shortcut for efficient handling of the common case.
1542 ^ static VOID onechr(struct vars *, pchr, struct state *, struct state *);
1543 */
1544 static VOID
1545 onechr(v, c, lp, rp)
1546 struct vars *v;
1547 pchr c;
1548 struct state *lp;
1549 struct state *rp;
1550 {
1551 if (!(v->cflags&REG_ICASE)) {
1552 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1553 return;
1554 }
1555
1556 /* rats, need general case anyway... */
1557 dovec(v, allcases(v, c), lp, rp);
1558 }
1559
1560 /*
1561 - dovec - fill in arcs for each element of a cvec
1562 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1563 ^ static VOID dovec(struct vars *, struct cvec *, struct state *,
1564 ^ struct state *);
1565 */
1566 static VOID
1567 dovec(v, cv, lp, rp)
1568 struct vars *v;
1569 struct cvec *cv;
1570 struct state *lp;
1571 struct state *rp;
1572 {
1573 chr ch, from, to;
1574 celt ce;
1575 chr *p;
1576 int i;
1577 color co;
1578 struct cvec *leads;
1579 struct arc *a;
1580 struct arc *pa; /* arc in prototype */
1581 struct state *s;
1582 struct state *ps; /* state in prototype */
1583
1584 /* need a place to store leaders, if any */
1585 if (nmcces(v) > 0) {
1586 assert(v->mcces != NULL);
1587 if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) {
1588 if (v->cv2 != NULL)
1589 free(v->cv2);
1590 v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1591 NOERR();
1592 leads = v->cv2;
1593 } else
1594 leads = clearcvec(v->cv2);
1595 } else
1596 leads = NULL;
1597
1598 /* first, get the ordinary characters out of the way */
1599 for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
1600 ch = *p;
1601 if (!ISCELEADER(v, ch))
1602 newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1603 else {
1604 assert(singleton(v->cm, ch));
1605 assert(leads != NULL);
1606 if (!haschr(leads, ch))
1607 addchr(leads, ch);
1608 }
1609 }
1610
1611 /* and the ranges */
1612 for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
1613 from = *p;
1614 to = *(p+1);
1615 while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) {
1616 if (from < ce)
1617 subrange(v, from, ce - 1, lp, rp);
1618 assert(singleton(v->cm, ce));
1619 assert(leads != NULL);
1620 if (!haschr(leads, ce))
1621 addchr(leads, ce);
1622 from = ce + 1;
1623 }
1624 if (from <= to)
1625 subrange(v, from, to, lp, rp);
1626 }
1627
1628 if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1629 return;
1630
1631 /* deal with the MCCE leaders */
1632 NOTE(REG_ULOCALE);
1633 for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) {
1634 co = GETCOLOR(v->cm, *p);
1635 a = findarc(lp, PLAIN, co);
1636 if (a != NULL)
1637 s = a->to;
1638 else {
1639 s = newstate(v->nfa);
1640 NOERR();
1641 newarc(v->nfa, PLAIN, co, lp, s);
1642 NOERR();
1643 }
1644 pa = findarc(v->mccepbegin, PLAIN, co);
1645 assert(pa != NULL);
1646 ps = pa->to;
1647 newarc(v->nfa, '$', 1, s, rp);
1648 newarc(v->nfa, '$', 0, s, rp);
1649 colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1650 NOERR();
1651 }
1652
1653 /* and the MCCEs */
1654 for (i = 0; i < cv->nmcces; i++) {
1655 p = cv->mcces[i];
1656 assert(singleton(v->cm, *p));
1657 if (!singleton(v->cm, *p)) {
1658 ERR(REG_ASSERT);
1659 return;
1660 }
1661 ch = *p++;
1662 co = GETCOLOR(v->cm, ch);
1663 a = findarc(lp, PLAIN, co);
1664 if (a != NULL)
1665 s = a->to;
1666 else {
1667 s = newstate(v->nfa);
1668 NOERR();
1669 newarc(v->nfa, PLAIN, co, lp, s);
1670 NOERR();
1671 }
1672 assert(*p != 0); /* at least two chars */
1673 assert(singleton(v->cm, *p));
1674 ch = *p++;
1675 co = GETCOLOR(v->cm, ch);
1676 assert(*p == 0); /* and only two, for now */
1677 newarc(v->nfa, PLAIN, co, s, rp);
1678 NOERR();
1679 }
1680 }
1681
1682 /*
1683 - nextleader - find next MCCE leader within range
1684 ^ static celt nextleader(struct vars *, pchr, pchr);
1685 */
1686 static celt /* NOCELT means none */
1687 nextleader(v, from, to)
1688 struct vars *v;
1689 pchr from;
1690 pchr to;
1691 {
1692 int i;
1693 chr *p;
1694 chr ch;
1695 celt it = NOCELT;
1696
1697 if (v->mcces == NULL)
1698 return it;
1699
1700 for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) {
1701 ch = *p;
1702 if (from <= ch && ch <= to)
1703 if (it == NOCELT || ch < it)
1704 it = ch;
1705 }
1706 return it;
1707 }
1708
1709 /*
1710 - wordchrs - set up word-chr list for word-boundary stuff, if needed
1711 * The list is kept as a bunch of arcs between two dummy states; it's
1712 * disposed of by the unreachable-states sweep in NFA optimization.
1713 * Does NEXT(). Must not be called from any unusual lexical context.
1714 * This should be reconciled with the \w etc. handling in lex.c, and
1715 * should be cleaned up to reduce dependencies on input scanning.
1716 ^ static VOID wordchrs(struct vars *);
1717 */
1718 static VOID
1719 wordchrs(v)
1720 struct vars *v;
1721 {
1722 struct state *left;
1723 struct state *right;
1724
1725 if (v->wordchrs != NULL) {
1726 NEXT(); /* for consistency */
1727 return;
1728 }
1729
1730 left = newstate(v->nfa);
1731 right = newstate(v->nfa);
1732 NOERR();
1733 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1734 lexword(v);
1735 NEXT();
1736 assert(v->savenow != NULL && SEE('['));
1737 bracket(v, left, right);
1738 assert((v->savenow != NULL && SEE(']')) || ISERR());
1739 NEXT();
1740 NOERR();
1741 v->wordchrs = left;
1742 }
1743
1744 /*
1745 - subre - allocate a subre
1746 ^ static struct subre *subre(struct vars *, int, int, struct state *,
1747 ^ struct state *);
1748 */
1749 static struct subre *
1750 subre(v, op, flags, begin, end)
1751 struct vars *v;
1752 int op;
1753 int flags;
1754 struct state *begin;
1755 struct state *end;
1756 {
1757 struct subre *ret;
1758
1759 ret = v->treefree;
1760 if (ret != NULL)
1761 v->treefree = ret->left;
1762 else {
1763 ret = (struct subre *)MALLOC(sizeof(struct subre));
1764 if (ret == NULL) {
1765 ERR(REG_ESPACE);
1766 return NULL;
1767 }
1768 ret->chain = v->treechain;
1769 v->treechain = ret;
1770 }
1771
1772 assert(strchr("|.b(=", op) != NULL);
1773
1774 ret->op = op;
1775 ret->flags = flags;
1776 ret->retry = 0;
1777 ret->subno = 0;
1778 ret->min = ret->max = 1;
1779 ret->left = NULL;
1780 ret->right = NULL;
1781 ret->begin = begin;
1782 ret->end = end;
1783 ZAPCNFA(ret->cnfa);
1784
1785 return ret;
1786 }
1787
1788 /*
1789 - freesubre - free a subRE subtree
1790 ^ static VOID freesubre(struct vars *, struct subre *);
1791 */
1792 static VOID
1793 freesubre(v, sr)
1794 struct vars *v; /* might be NULL */
1795 struct subre *sr;
1796 {
1797 if (sr == NULL)
1798 return;
1799
1800 if (sr->left != NULL)
1801 freesubre(v, sr->left);
1802 if (sr->right != NULL)
1803 freesubre(v, sr->right);
1804
1805 freesrnode(v, sr);
1806 }
1807
1808 /*
1809 - freesrnode - free one node in a subRE subtree
1810 ^ static VOID freesrnode(struct vars *, struct subre *);
1811 */
1812 static VOID
1813 freesrnode(v, sr)
1814 struct vars *v; /* might be NULL */
1815 struct subre *sr;
1816 {
1817 if (sr == NULL)
1818 return;
1819
1820 if (!NULLCNFA(sr->cnfa))
1821 freecnfa(&sr->cnfa);
1822 sr->flags = 0;
1823
1824 if (v != NULL) {
1825 sr->left = v->treefree;
1826 v->treefree = sr;
1827 } else
1828 FREE(sr);
1829 }
1830
1831 /*
1832 - optst - optimize a subRE subtree
1833 ^ static VOID optst(struct vars *, struct subre *);
1834 */
1835 static VOID
1836 optst(v, t)
1837 struct vars *v;
1838 struct subre *t;
1839 {
1840 if (t == NULL)
1841 return;
1842
1843 /* recurse through children */
1844 if (t->left != NULL)
1845 optst(v, t->left);
1846 if (t->right != NULL)
1847 optst(v, t->right);
1848 }
1849
1850 /*
1851 - numst - number tree nodes (assigning retry indexes)
1852 ^ static int numst(struct subre *, int);
1853 */
1854 static int /* next number */
1855 numst(t, start)
1856 struct subre *t;
1857 int start; /* starting point for subtree numbers */
1858 {
1859 int i;
1860
1861 assert(t != NULL);
1862
1863 i = start;
1864 t->retry = (short)i++;
1865 if (t->left != NULL)
1866 i = numst(t->left, i);
1867 if (t->right != NULL)
1868 i = numst(t->right, i);
1869 return i;
1870 }
1871
1872 /*
1873 - markst - mark tree nodes as INUSE
1874 ^ static VOID markst(struct subre *);
1875 */
1876 static VOID
1877 markst(t)
1878 struct subre *t;
1879 {
1880 assert(t != NULL);
1881
1882 t->flags |= INUSE;
1883 if (t->left != NULL)
1884 markst(t->left);
1885 if (t->right != NULL)
1886 markst(t->right);
1887 }
1888
1889 /*
1890 - cleanst - free any tree nodes not marked INUSE
1891 ^ static VOID cleanst(struct vars *);
1892 */
1893 static VOID
1894 cleanst(v)
1895 struct vars *v;
1896 {
1897 struct subre *t;
1898 struct subre *next;
1899
1900 for (t = v->treechain; t != NULL; t = next) {
1901 next = t->chain;
1902 if (!(t->flags&INUSE))
1903 FREE(t);
1904 }
1905 v->treechain = NULL;
1906 v->treefree = NULL; /* just on general principles */
1907 }
1908
1909 /*
1910 - nfatree - turn a subRE subtree into a tree of compacted NFAs
1911 ^ static long nfatree(struct vars *, struct subre *, FILE *);
1912 */
1913 static long /* optimize results from top node */
1914 nfatree(v, t, f)
1915 struct vars *v;
1916 struct subre *t;
1917 FILE *f; /* for debug output */
1918 {
1919 assert(t != NULL && t->begin != NULL);
1920
1921 if (t->left != NULL)
1922 (DISCARD)nfatree(v, t->left, f);
1923 if (t->right != NULL)
1924 (DISCARD)nfatree(v, t->right, f);
1925
1926 return nfanode(v, t, f);
1927 }
1928
1929 /*
1930 - nfanode - do one NFA for nfatree
1931 ^ static long nfanode(struct vars *, struct subre *, FILE *);
1932 */
1933 static long /* optimize results */
1934 nfanode(v, t, f)
1935 struct vars *v;
1936 struct subre *t;
1937 FILE *f; /* for debug output */
1938 {
1939 struct nfa *nfa;
1940 long ret = 0;
1941 char idbuf[50];
1942
1943 assert(t->begin != NULL);
1944
1945 if (f != NULL)
1946 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1947 stid(t, idbuf, sizeof(idbuf)));
1948 nfa = newnfa(v, v->cm, v->nfa);
1949 NOERRZ();
1950 dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1951 if (!ISERR()) {
1952 specialcolors(nfa);
1953 ret = optimize(nfa, f);
1954 }
1955 if (!ISERR())
1956 compact(nfa, &t->cnfa);
1957
1958 freenfa(nfa);
1959 return ret;
1960 }
1961
1962 /*
1963 - newlacon - allocate a lookahead-constraint subRE
1964 ^ static int newlacon(struct vars *, struct state *, struct state *, int);
1965 */
1966 static int /* lacon number */
1967 newlacon(v, begin, end, pos)
1968 struct vars *v;
1969 struct state *begin;
1970 struct state *end;
1971 int pos;
1972 {
1973 int n;
1974 struct subre *sub;
1975
1976 if (v->nlacons == 0) {
1977 v->lacons = (struct subre *)MALLOC(2 * sizeof(struct subre));
1978 n = 1; /* skip 0th */
1979 v->nlacons = 2;
1980 } else {
1981 v->lacons = (struct subre *)REALLOC(v->lacons,
1982 (v->nlacons+1)*sizeof(struct subre));
1983 n = v->nlacons++;
1984 }
1985 if (v->lacons == NULL) {
1986 ERR(REG_ESPACE);
1987 return 0;
1988 }
1989 sub = &v->lacons[n];
1990 sub->begin = begin;
1991 sub->end = end;
1992 sub->subno = pos;
1993 ZAPCNFA(sub->cnfa);
1994 return n;
1995 }
1996
1997 /*
1998 - freelacons - free lookahead-constraint subRE vector
1999 ^ static VOID freelacons(struct subre *, int);
2000 */
2001 static VOID
2002 freelacons(subs, n)
2003 struct subre *subs;
2004 int n;
2005 {
2006 struct subre *sub;
2007 int i;
2008
2009 assert(n > 0);
2010 for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
2011 if (!NULLCNFA(sub->cnfa))
2012 freecnfa(&sub->cnfa);
2013 FREE(subs);
2014 }
2015
2016 /*
2017 - rfree - free a whole RE (insides of regfree)
2018 ^ static VOID rfree(regex_t *);
2019 */
2020 static VOID
2021 rfree(re)
2022 regex_t *re;
2023 {
2024 struct guts *g;
2025
2026 if (re == NULL || re->re_magic != REMAGIC)
2027 return;
2028
2029 re->re_magic = 0; /* invalidate RE */
2030 g = (struct guts *)re->re_guts;
2031 re->re_guts = NULL;
2032 re->re_fns = NULL;
2033 g->magic = 0;
2034 freecm(&g->cmap);
2035 if (g->tree != NULL)
2036 freesubre((struct vars *)NULL, g->tree);
2037 if (g->lacons != NULL)
2038 freelacons(g->lacons, g->nlacons);
2039 if (!NULLCNFA(g->search))
2040 freecnfa(&g->search);
2041 FREE(g);
2042 }
2043
2044 /*
2045 - dump - dump an RE in human-readable form
2046 ^ static VOID dump(regex_t *, FILE *);
2047 */
2048 static VOID
2049 dump(re, f)
2050 regex_t *re;
2051 FILE *f;
2052 {
2053 #ifdef REG_DEBUG
2054 struct guts *g;
2055 int i;
2056
2057 if (re->re_magic != REMAGIC)
2058 fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2059 REMAGIC);
2060 if (re->re_guts == NULL) {
2061 fprintf(f, "NULL guts!!!\n");
2062 return;
2063 }
2064 g = (struct guts *)re->re_guts;
2065 if (g->magic != GUTSMAGIC)
2066 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2067 GUTSMAGIC);
2068
2069 fprintf(f, "\n\n\n========= DUMP ==========\n");
2070 fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2071 re->re_nsub, re->re_info, re->re_csize, g->ntree);
2072
2073 dumpcolors(&g->cmap, f);
2074 if (!NULLCNFA(g->search)) {
2075 printf("\nsearch:\n");
2076 dumpcnfa(&g->search, f);
2077 }
2078 for (i = 1; i < g->nlacons; i++) {
2079 fprintf(f, "\nla%d (%s):\n", i,
2080 (g->lacons[i].subno) ? "positive" : "negative");
2081 dumpcnfa(&g->lacons[i].cnfa, f);
2082 }
2083 fprintf(f, "\n");
2084 dumpst(g->tree, f, 0);
2085 #endif
2086 }
2087
2088 /*
2089 - dumpst - dump a subRE tree
2090 ^ static VOID dumpst(struct subre *, FILE *, int);
2091 */
2092 static VOID
2093 dumpst(t, f, nfapresent)
2094 struct subre *t;
2095 FILE *f;
2096 int nfapresent; /* is the original NFA still around? */
2097 {
2098 if (t == NULL)
2099 fprintf(f, "null tree\n");
2100 else
2101 stdump(t, f, nfapresent);
2102 fflush(f);
2103 }
2104
2105 /*
2106 - stdump - recursive guts of dumpst
2107 ^ static VOID stdump(struct subre *, FILE *, int);
2108 */
2109 static VOID
2110 stdump(t, f, nfapresent)
2111 struct subre *t;
2112 FILE *f;
2113 int nfapresent; /* is the original NFA still around? */
2114 {
2115 char idbuf[50];
2116
2117 fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2118 if (t->flags&LONGER)
2119 fprintf(f, " longest");
2120 if (t->flags&SHORTER)
2121 fprintf(f, " shortest");
2122 if (t->flags&MIXED)
2123 fprintf(f, " hasmixed");
2124 if (t->flags&CAP)
2125 fprintf(f, " hascapture");
2126 if (t->flags&BACKR)
2127 fprintf(f, " hasbackref");
2128 if (!(t->flags&INUSE))
2129 fprintf(f, " UNUSED");
2130 if (t->subno != 0)
2131 fprintf(f, " (#%d)", t->subno);
2132 if (t->min != 1 || t->max != 1) {
2133 fprintf(f, " {%d,", t->min);
2134 if (t->max != INFINITY)
2135 fprintf(f, "%d", t->max);
2136 fprintf(f, "}");
2137 }
2138 if (nfapresent)
2139 fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
2140 if (t->left != NULL)
2141 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2142 if (t->right != NULL)
2143 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2144 if (!NULLCNFA(t->cnfa)) {
2145 fprintf(f, "\n");
2146 dumpcnfa(&t->cnfa, f);
2147 fprintf(f, "\n");
2148 }
2149 if (t->left != NULL)
2150 stdump(t->left, f, nfapresent);
2151 if (t->right != NULL)
2152 stdump(t->right, f, nfapresent);
2153 }
2154
2155 /*
2156 - stid - identify a subtree node for dumping
2157 ^ static char *stid(struct subre *, char *, size_t);
2158 */
2159 static char * /* points to buf or constant string */
2160 stid(t, buf, bufsize)
2161 struct subre *t;
2162 char *buf;
2163 size_t bufsize;
2164 {
2165 /* big enough for hex int or decimal t->retry? */
2166 if (bufsize < sizeof(int)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1)
2167 return "unable";
2168 if (t->retry != 0)
2169 sprintf(buf, "%d", t->retry);
2170 else
2171 sprintf(buf, "0x%x", (int)t); /* may lose bits, that's okay */
2172 return buf;
2173 }
2174
2175 #include "regc_lex.c"
2176 #include "regc_color.c"
2177 #include "regc_nfa.c"
2178 #include "regc_cvec.c"
2179 #include "regc_locale.c"