]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * re_*exec and friends - match REs | |
3 | * | |
4 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. | |
5 | * | |
6 | * Development of this software was funded, in part, by Cray Research Inc., | |
7 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics | |
8 | * Corporation, none of whom are responsible for the results. The author | |
9 | * thanks all of them. | |
10 | * | |
11 | * Redistribution and use in source and binary forms -- with or without | |
12 | * modification -- are permitted for any purpose, provided that | |
13 | * redistributions in source form retain this entire copyright notice and | |
14 | * indicate the origin and nature of any modifications. | |
15 | * | |
16 | * I'd appreciate being given credit for this package in the documentation | |
17 | * of software which uses it, but that is not a requirement. | |
18 | * | |
19 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, | |
20 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |
21 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL | |
22 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
25 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
26 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
27 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
28 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | * | |
30 | */ | |
31 | ||
32 | #include "regguts.h" | |
33 | ||
34 | ||
35 | ||
36 | /* lazy-DFA representation */ | |
37 | struct arcp { /* "pointer" to an outarc */ | |
38 | struct sset *ss; | |
39 | color co; | |
40 | }; | |
41 | ||
42 | struct sset { /* state set */ | |
43 | unsigned *states; /* pointer to bitvector */ | |
44 | unsigned hash; /* hash of bitvector */ | |
45 | # define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) | |
46 | # define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ | |
47 | memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) | |
48 | int flags; | |
49 | # define STARTER 01 /* the initial state set */ | |
50 | # define POSTSTATE 02 /* includes the goal state */ | |
51 | # define LOCKED 04 /* locked in cache */ | |
52 | # define NOPROGRESS 010 /* zero-progress state set */ | |
53 | struct arcp ins; /* chain of inarcs pointing here */ | |
54 | chr *lastseen; /* last entered on arrival here */ | |
55 | struct sset **outs; /* outarc vector indexed by color */ | |
56 | struct arcp *inchain; /* chain-pointer vector for outarcs */ | |
57 | }; | |
58 | ||
59 | struct dfa { | |
60 | int nssets; /* size of cache */ | |
61 | int nssused; /* how many entries occupied yet */ | |
62 | int nstates; /* number of states */ | |
63 | int ncolors; /* length of outarc and inchain vectors */ | |
64 | int wordsper; /* length of state-set bitvectors */ | |
65 | struct sset *ssets; /* state-set cache */ | |
66 | unsigned *statesarea; /* bitvector storage */ | |
67 | unsigned *work; /* pointer to work area within statesarea */ | |
68 | struct sset **outsarea; /* outarc-vector storage */ | |
69 | struct arcp *incarea; /* inchain storage */ | |
70 | struct cnfa *cnfa; | |
71 | struct colormap *cm; | |
72 | chr *lastpost; /* location of last cache-flushed success */ | |
73 | chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ | |
74 | struct sset *search; /* replacement-search-pointer memory */ | |
75 | int cptsmalloced; /* were the areas individually malloced? */ | |
76 | char *mallocarea; /* self, or master malloced area, or NULL */ | |
77 | }; | |
78 | ||
79 | #define WORK 1 /* number of work bitvectors needed */ | |
80 | ||
81 | /* setup for non-malloc allocation for small cases */ | |
82 | #define FEWSTATES 20 /* must be less than UBITS */ | |
83 | #define FEWCOLORS 15 | |
84 | struct smalldfa { | |
85 | struct dfa dfa; | |
86 | struct sset ssets[FEWSTATES*2]; | |
87 | unsigned statesarea[FEWSTATES*2 + WORK]; | |
88 | struct sset *outsarea[FEWSTATES*2 * FEWCOLORS]; | |
89 | struct arcp incarea[FEWSTATES*2 * FEWCOLORS]; | |
90 | }; | |
91 | #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ | |
92 | ||
93 | ||
94 | ||
95 | /* internal variables, bundled for easy passing around */ | |
96 | struct vars { | |
97 | regex_t *re; | |
98 | struct guts *g; | |
99 | int eflags; /* copies of arguments */ | |
100 | size_t nmatch; | |
101 | regmatch_t *pmatch; | |
102 | rm_detail_t *details; | |
103 | chr *start; /* start of string */ | |
104 | chr *stop; /* just past end of string */ | |
105 | int err; /* error code if any (0 none) */ | |
106 | regoff_t *mem; /* memory vector for backtracking */ | |
107 | struct smalldfa dfa1; | |
108 | struct smalldfa dfa2; | |
109 | }; | |
110 | #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ | |
111 | #define ISERR() VISERR(v) | |
112 | #define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e))) | |
113 | #define ERR(e) VERR(v, e) /* record an error */ | |
114 | #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ | |
115 | #define OFF(p) ((p) - v->start) | |
116 | #define LOFF(p) ((long)OFF(p)) | |
117 | ||
118 | ||
119 | ||
120 | /* | |
121 | * forward declarations | |
122 | */ | |
123 | /* =====^!^===== begin forwards =====^!^===== */ | |
124 | /* automatically gathered by fwd; do not hand-edit */ | |
125 | /* === regexec.c === */ | |
126 | int exec _ANSI_ARGS_((regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int)); | |
127 | static int find _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); | |
128 | static int cfind _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); | |
129 | static int cfindloop _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **)); | |
130 | static VOID zapsubs _ANSI_ARGS_((regmatch_t *, size_t)); | |
131 | static VOID zapmem _ANSI_ARGS_((struct vars *, struct subre *)); | |
132 | static VOID subset _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
133 | static int dissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
134 | static int condissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
135 | static int altdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
136 | static int cdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
137 | static int ccondissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
138 | static int crevdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
139 | static int cbrdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
140 | static int caltdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); | |
141 | /* === rege_dfa.c === */ | |
142 | static chr *longest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, int *)); | |
143 | static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *)); | |
144 | static chr *lastcold _ANSI_ARGS_((struct vars *, struct dfa *)); | |
145 | static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct smalldfa *)); | |
146 | static VOID freedfa _ANSI_ARGS_((struct dfa *)); | |
147 | static unsigned hash _ANSI_ARGS_((unsigned *, int)); | |
148 | static struct sset *initialize _ANSI_ARGS_((struct vars *, struct dfa *, chr *)); | |
149 | static struct sset *miss _ANSI_ARGS_((struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *)); | |
150 | static int lacon _ANSI_ARGS_((struct vars *, struct cnfa *, chr *, pcolor)); | |
151 | static struct sset *getvacant _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); | |
152 | static struct sset *pickss _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); | |
153 | /* automatically gathered by fwd; do not hand-edit */ | |
154 | /* =====^!^===== end forwards =====^!^===== */ | |
155 | ||
156 | ||
157 | ||
158 | /* | |
159 | - exec - match regular expression | |
160 | ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, | |
161 | ^ size_t, regmatch_t [], int); | |
162 | */ | |
163 | int | |
164 | exec(re, string, len, details, nmatch, pmatch, flags) | |
165 | regex_t *re; | |
166 | CONST chr *string; | |
167 | size_t len; | |
168 | rm_detail_t *details; | |
169 | size_t nmatch; | |
170 | regmatch_t pmatch[]; | |
171 | int flags; | |
172 | { | |
173 | struct vars var; | |
174 | register struct vars *v = &var; | |
175 | int st; | |
176 | size_t n; | |
177 | int backref; | |
178 | # define LOCALMAT 20 | |
179 | regmatch_t mat[LOCALMAT]; | |
180 | # define LOCALMEM 40 | |
181 | regoff_t mem[LOCALMEM]; | |
182 | ||
183 | /* sanity checks */ | |
184 | if (re == NULL || string == NULL || re->re_magic != REMAGIC) | |
185 | return REG_INVARG; | |
186 | if (re->re_csize != sizeof(chr)) | |
187 | return REG_MIXED; | |
188 | ||
189 | /* setup */ | |
190 | v->re = re; | |
191 | v->g = (struct guts *)re->re_guts; | |
192 | if ((v->g->cflags®_EXPECT) && details == NULL) | |
193 | return REG_INVARG; | |
194 | if (v->g->info®_UIMPOSSIBLE) | |
195 | return REG_NOMATCH; | |
196 | backref = (v->g->info®_UBACKREF) ? 1 : 0; | |
197 | v->eflags = flags; | |
198 | if (v->g->cflags®_NOSUB) | |
199 | nmatch = 0; /* override client */ | |
200 | v->nmatch = nmatch; | |
201 | if (backref) { | |
202 | /* need work area */ | |
203 | if (v->g->nsub + 1 <= LOCALMAT) | |
204 | v->pmatch = mat; | |
205 | else | |
206 | v->pmatch = (regmatch_t *)MALLOC((v->g->nsub + 1) * | |
207 | sizeof(regmatch_t)); | |
208 | if (v->pmatch == NULL) | |
209 | return REG_ESPACE; | |
210 | v->nmatch = v->g->nsub + 1; | |
211 | } else | |
212 | v->pmatch = pmatch; | |
213 | v->details = details; | |
214 | v->start = (chr *)string; | |
215 | v->stop = (chr *)string + len; | |
216 | v->err = 0; | |
217 | if (backref) { | |
218 | /* need retry memory */ | |
219 | assert(v->g->ntree >= 0); | |
220 | n = (size_t)v->g->ntree; | |
221 | if (n <= LOCALMEM) | |
222 | v->mem = mem; | |
223 | else | |
224 | v->mem = (regoff_t *)MALLOC(n*sizeof(regoff_t)); | |
225 | if (v->mem == NULL) { | |
226 | if (v->pmatch != pmatch && v->pmatch != mat) | |
227 | FREE(v->pmatch); | |
228 | return REG_ESPACE; | |
229 | } | |
230 | } else | |
231 | v->mem = NULL; | |
232 | ||
233 | /* do it */ | |
234 | assert(v->g->tree != NULL); | |
235 | if (backref) | |
236 | st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); | |
237 | else | |
238 | st = find(v, &v->g->tree->cnfa, &v->g->cmap); | |
239 | ||
240 | /* copy (portion of) match vector over if necessary */ | |
241 | if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { | |
242 | zapsubs(pmatch, nmatch); | |
243 | n = (nmatch < v->nmatch) ? nmatch : v->nmatch; | |
244 | memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t)); | |
245 | } | |
246 | ||
247 | /* clean up */ | |
248 | if (v->pmatch != pmatch && v->pmatch != mat) | |
249 | FREE(v->pmatch); | |
250 | if (v->mem != NULL && v->mem != mem) | |
251 | FREE(v->mem); | |
252 | return st; | |
253 | } | |
254 | ||
255 | /* | |
256 | - find - find a match for the main NFA (no-complications case) | |
257 | ^ static int find(struct vars *, struct cnfa *, struct colormap *); | |
258 | */ | |
259 | static int | |
260 | find(v, cnfa, cm) | |
261 | struct vars *v; | |
262 | struct cnfa *cnfa; | |
263 | struct colormap *cm; | |
264 | { | |
265 | struct dfa *s; | |
266 | struct dfa *d; | |
267 | chr *begin; | |
268 | chr *end = NULL; | |
269 | chr *cold; | |
270 | chr *open; /* open and close of range of possible starts */ | |
271 | chr *close; | |
272 | int hitend; | |
273 | int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0; | |
274 | ||
275 | /* first, a shot with the search RE */ | |
276 | s = newdfa(v, &v->g->search, cm, &v->dfa1); | |
277 | assert(!(ISERR() && s != NULL)); | |
278 | NOERR(); | |
279 | MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); | |
280 | cold = NULL; | |
281 | close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)NULL); | |
282 | freedfa(s); | |
283 | NOERR(); | |
284 | if (v->g->cflags®_EXPECT) { | |
285 | assert(v->details != NULL); | |
286 | if (cold != NULL) | |
287 | v->details->rm_extend.rm_so = OFF(cold); | |
288 | else | |
289 | v->details->rm_extend.rm_so = OFF(v->stop); | |
290 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
291 | } | |
292 | if (close == NULL) /* not found */ | |
293 | return REG_NOMATCH; | |
294 | if (v->nmatch == 0) /* found, don't need exact location */ | |
295 | return REG_OKAY; | |
296 | ||
297 | /* find starting point and match */ | |
298 | assert(cold != NULL); | |
299 | open = cold; | |
300 | cold = NULL; | |
301 | MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); | |
302 | d = newdfa(v, cnfa, cm, &v->dfa1); | |
303 | assert(!(ISERR() && d != NULL)); | |
304 | NOERR(); | |
305 | for (begin = open; begin <= close; begin++) { | |
306 | MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); | |
307 | if (shorter) | |
308 | end = shortest(v, d, begin, begin, v->stop, | |
309 | (chr **)NULL, &hitend); | |
310 | else | |
311 | end = longest(v, d, begin, v->stop, &hitend); | |
312 | NOERR(); | |
313 | if (hitend && cold == NULL) | |
314 | cold = begin; | |
315 | if (end != NULL) | |
316 | break; /* NOTE BREAK OUT */ | |
317 | } | |
318 | assert(end != NULL); /* search RE succeeded so loop should */ | |
319 | freedfa(d); | |
320 | ||
321 | /* and pin down details */ | |
322 | assert(v->nmatch > 0); | |
323 | v->pmatch[0].rm_so = OFF(begin); | |
324 | v->pmatch[0].rm_eo = OFF(end); | |
325 | if (v->g->cflags®_EXPECT) { | |
326 | if (cold != NULL) | |
327 | v->details->rm_extend.rm_so = OFF(cold); | |
328 | else | |
329 | v->details->rm_extend.rm_so = OFF(v->stop); | |
330 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
331 | } | |
332 | if (v->nmatch == 1) /* no need for submatches */ | |
333 | return REG_OKAY; | |
334 | ||
335 | /* submatches */ | |
336 | zapsubs(v->pmatch, v->nmatch); | |
337 | return dissect(v, v->g->tree, begin, end); | |
338 | } | |
339 | ||
340 | /* | |
341 | - cfind - find a match for the main NFA (with complications) | |
342 | ^ static int cfind(struct vars *, struct cnfa *, struct colormap *); | |
343 | */ | |
344 | static int | |
345 | cfind(v, cnfa, cm) | |
346 | struct vars *v; | |
347 | struct cnfa *cnfa; | |
348 | struct colormap *cm; | |
349 | { | |
350 | struct dfa *s; | |
351 | struct dfa *d; | |
352 | chr *cold; | |
353 | int ret; | |
354 | ||
355 | s = newdfa(v, &v->g->search, cm, &v->dfa1); | |
356 | NOERR(); | |
357 | d = newdfa(v, cnfa, cm, &v->dfa2); | |
358 | if (ISERR()) { | |
359 | assert(d == NULL); | |
360 | freedfa(s); | |
361 | return v->err; | |
362 | } | |
363 | ||
364 | ret = cfindloop(v, cnfa, cm, d, s, &cold); | |
365 | ||
366 | freedfa(d); | |
367 | freedfa(s); | |
368 | NOERR(); | |
369 | if (v->g->cflags®_EXPECT) { | |
370 | assert(v->details != NULL); | |
371 | if (cold != NULL) | |
372 | v->details->rm_extend.rm_so = OFF(cold); | |
373 | else | |
374 | v->details->rm_extend.rm_so = OFF(v->stop); | |
375 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
376 | } | |
377 | return ret; | |
378 | } | |
379 | ||
380 | /* | |
381 | - cfindloop - the heart of cfind | |
382 | ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, | |
383 | ^ struct dfa *, struct dfa *, chr **); | |
384 | */ | |
385 | static int | |
386 | cfindloop(v, cnfa, cm, d, s, coldp) | |
387 | struct vars *v; | |
388 | struct cnfa *cnfa; | |
389 | struct colormap *cm; | |
390 | struct dfa *d; | |
391 | struct dfa *s; | |
392 | chr **coldp; /* where to put coldstart pointer */ | |
393 | { | |
394 | chr *begin; | |
395 | chr *end; | |
396 | chr *cold; | |
397 | chr *open; /* open and close of range of possible starts */ | |
398 | chr *close; | |
399 | chr *estart; | |
400 | chr *estop; | |
401 | int er; | |
402 | int shorter = v->g->tree->flags&SHORTER; | |
403 | int hitend; | |
404 | ||
405 | assert(d != NULL && s != NULL); | |
406 | cold = NULL; | |
407 | close = v->start; | |
408 | do { | |
409 | MDEBUG(("\ncsearch at %ld\n", LOFF(close))); | |
410 | close = shortest(v, s, close, close, v->stop, &cold, (int *)NULL); | |
411 | if (close == NULL) | |
412 | break; /* NOTE BREAK */ | |
413 | assert(cold != NULL); | |
414 | open = cold; | |
415 | cold = NULL; | |
416 | MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); | |
417 | for (begin = open; begin <= close; begin++) { | |
418 | MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); | |
419 | estart = begin; | |
420 | estop = v->stop; | |
421 | for (;;) { | |
422 | if (shorter) | |
423 | end = shortest(v, d, begin, estart, | |
424 | estop, (chr **)NULL, &hitend); | |
425 | else | |
426 | end = longest(v, d, begin, estop, | |
427 | &hitend); | |
428 | if (hitend && cold == NULL) | |
429 | cold = begin; | |
430 | if (end == NULL) | |
431 | break; /* NOTE BREAK OUT */ | |
432 | MDEBUG(("tentative end %ld\n", LOFF(end))); | |
433 | zapsubs(v->pmatch, v->nmatch); | |
434 | zapmem(v, v->g->tree); | |
435 | er = cdissect(v, v->g->tree, begin, end); | |
436 | if (er == REG_OKAY) { | |
437 | if (v->nmatch > 0) { | |
438 | v->pmatch[0].rm_so = OFF(begin); | |
439 | v->pmatch[0].rm_eo = OFF(end); | |
440 | } | |
441 | *coldp = cold; | |
442 | return REG_OKAY; | |
443 | } | |
444 | if (er != REG_NOMATCH) { | |
445 | ERR(er); | |
446 | return er; | |
447 | } | |
448 | if ((shorter) ? end == estop : end == begin) { | |
449 | /* no point in trying again */ | |
450 | *coldp = cold; | |
451 | return REG_NOMATCH; | |
452 | } | |
453 | /* go around and try again */ | |
454 | if (shorter) | |
455 | estart = end + 1; | |
456 | else | |
457 | estop = end - 1; | |
458 | } | |
459 | } | |
460 | } while (close < v->stop); | |
461 | ||
462 | *coldp = cold; | |
463 | return REG_NOMATCH; | |
464 | } | |
465 | ||
466 | /* | |
467 | - zapsubs - initialize the subexpression matches to "no match" | |
468 | ^ static VOID zapsubs(regmatch_t *, size_t); | |
469 | */ | |
470 | static VOID | |
471 | zapsubs(p, n) | |
472 | regmatch_t *p; | |
473 | size_t n; | |
474 | { | |
475 | size_t i; | |
476 | ||
477 | for (i = n-1; i > 0; i--) { | |
478 | p[i].rm_so = -1; | |
479 | p[i].rm_eo = -1; | |
480 | } | |
481 | } | |
482 | ||
483 | /* | |
484 | - zapmem - initialize the retry memory of a subtree to zeros | |
485 | ^ static VOID zapmem(struct vars *, struct subre *); | |
486 | */ | |
487 | static VOID | |
488 | zapmem(v, t) | |
489 | struct vars *v; | |
490 | struct subre *t; | |
491 | { | |
492 | if (t == NULL) | |
493 | return; | |
494 | ||
495 | assert(v->mem != NULL); | |
496 | v->mem[t->retry] = 0; | |
497 | if (t->op == '(') { | |
498 | assert(t->subno > 0); | |
499 | v->pmatch[t->subno].rm_so = -1; | |
500 | v->pmatch[t->subno].rm_eo = -1; | |
501 | } | |
502 | ||
503 | if (t->left != NULL) | |
504 | zapmem(v, t->left); | |
505 | if (t->right != NULL) | |
506 | zapmem(v, t->right); | |
507 | } | |
508 | ||
509 | /* | |
510 | - subset - set any subexpression relevant to a successful subre | |
511 | ^ static VOID subset(struct vars *, struct subre *, chr *, chr *); | |
512 | */ | |
513 | static VOID | |
514 | subset(v, sub, begin, end) | |
515 | struct vars *v; | |
516 | struct subre *sub; | |
517 | chr *begin; | |
518 | chr *end; | |
519 | { | |
520 | int n = sub->subno; | |
521 | ||
522 | assert(n > 0); | |
523 | if ((size_t)n >= v->nmatch) | |
524 | return; | |
525 | ||
526 | MDEBUG(("setting %d\n", n)); | |
527 | v->pmatch[n].rm_so = OFF(begin); | |
528 | v->pmatch[n].rm_eo = OFF(end); | |
529 | } | |
530 | ||
531 | /* | |
532 | - dissect - determine subexpression matches (uncomplicated case) | |
533 | ^ static int dissect(struct vars *, struct subre *, chr *, chr *); | |
534 | */ | |
535 | static int /* regexec return code */ | |
536 | dissect(v, t, begin, end) | |
537 | struct vars *v; | |
538 | struct subre *t; | |
539 | chr *begin; /* beginning of relevant substring */ | |
540 | chr *end; /* end of same */ | |
541 | { | |
542 | assert(t != NULL); | |
543 | MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); | |
544 | ||
545 | switch (t->op) { | |
546 | case '=': /* terminal node */ | |
547 | assert(t->left == NULL && t->right == NULL); | |
548 | return REG_OKAY; /* no action, parent did the work */ | |
549 | break; | |
550 | case '|': /* alternation */ | |
551 | assert(t->left != NULL); | |
552 | return altdissect(v, t, begin, end); | |
553 | break; | |
554 | case 'b': /* back ref -- shouldn't be calling us! */ | |
555 | return REG_ASSERT; | |
556 | break; | |
557 | case '.': /* concatenation */ | |
558 | assert(t->left != NULL && t->right != NULL); | |
559 | return condissect(v, t, begin, end); | |
560 | break; | |
561 | case '(': /* capturing */ | |
562 | assert(t->left != NULL && t->right == NULL); | |
563 | assert(t->subno > 0); | |
564 | subset(v, t, begin, end); | |
565 | return dissect(v, t->left, begin, end); | |
566 | break; | |
567 | default: | |
568 | return REG_ASSERT; | |
569 | break; | |
570 | } | |
571 | } | |
572 | ||
573 | /* | |
574 | - condissect - determine concatenation subexpression matches (uncomplicated) | |
575 | ^ static int condissect(struct vars *, struct subre *, chr *, chr *); | |
576 | */ | |
577 | static int /* regexec return code */ | |
578 | condissect(v, t, begin, end) | |
579 | struct vars *v; | |
580 | struct subre *t; | |
581 | chr *begin; /* beginning of relevant substring */ | |
582 | chr *end; /* end of same */ | |
583 | { | |
584 | struct dfa *d; | |
585 | struct dfa *d2; | |
586 | chr *mid; | |
587 | int i; | |
588 | int shorter = (t->left->flags&SHORTER) ? 1 : 0; | |
589 | chr *stop = (shorter) ? end : begin; | |
590 | ||
591 | assert(t->op == '.'); | |
592 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
593 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
594 | ||
595 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); | |
596 | NOERR(); | |
597 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); | |
598 | if (ISERR()) { | |
599 | assert(d2 == NULL); | |
600 | freedfa(d); | |
601 | return v->err; | |
602 | } | |
603 | ||
604 | /* pick a tentative midpoint */ | |
605 | if (shorter) | |
606 | mid = shortest(v, d, begin, begin, end, (chr **)NULL, | |
607 | (int *)NULL); | |
608 | else | |
609 | mid = longest(v, d, begin, end, (int *)NULL); | |
610 | if (mid == NULL) { | |
611 | freedfa(d); | |
612 | freedfa(d2); | |
613 | return REG_ASSERT; | |
614 | } | |
615 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
616 | ||
617 | /* iterate until satisfaction or failure */ | |
618 | while (longest(v, d2, mid, end, (int *)NULL) != end) { | |
619 | /* that midpoint didn't work, find a new one */ | |
620 | if (mid == stop) { | |
621 | /* all possibilities exhausted! */ | |
622 | MDEBUG(("no midpoint!\n")); | |
623 | freedfa(d); | |
624 | freedfa(d2); | |
625 | return REG_ASSERT; | |
626 | } | |
627 | if (shorter) | |
628 | mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, | |
629 | (int *)NULL); | |
630 | else | |
631 | mid = longest(v, d, begin, mid-1, (int *)NULL); | |
632 | if (mid == NULL) { | |
633 | /* failed to find a new one! */ | |
634 | MDEBUG(("failed midpoint!\n")); | |
635 | freedfa(d); | |
636 | freedfa(d2); | |
637 | return REG_ASSERT; | |
638 | } | |
639 | MDEBUG(("new midpoint %ld\n", LOFF(mid))); | |
640 | } | |
641 | ||
642 | /* satisfaction */ | |
643 | MDEBUG(("successful\n")); | |
644 | freedfa(d); | |
645 | freedfa(d2); | |
646 | i = dissect(v, t->left, begin, mid); | |
647 | if (i != REG_OKAY) | |
648 | return i; | |
649 | return dissect(v, t->right, mid, end); | |
650 | } | |
651 | ||
652 | /* | |
653 | - altdissect - determine alternative subexpression matches (uncomplicated) | |
654 | ^ static int altdissect(struct vars *, struct subre *, chr *, chr *); | |
655 | */ | |
656 | static int /* regexec return code */ | |
657 | altdissect(v, t, begin, end) | |
658 | struct vars *v; | |
659 | struct subre *t; | |
660 | chr *begin; /* beginning of relevant substring */ | |
661 | chr *end; /* end of same */ | |
662 | { | |
663 | struct dfa *d; | |
664 | int i; | |
665 | ||
666 | assert(t != NULL); | |
667 | assert(t->op == '|'); | |
668 | ||
669 | for (i = 0; t != NULL; t = t->right, i++) { | |
670 | MDEBUG(("trying %dth\n", i)); | |
671 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
672 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); | |
673 | if (ISERR()) | |
674 | return v->err; | |
675 | if (longest(v, d, begin, end, (int *)NULL) == end) { | |
676 | MDEBUG(("success\n")); | |
677 | freedfa(d); | |
678 | return dissect(v, t->left, begin, end); | |
679 | } | |
680 | freedfa(d); | |
681 | } | |
682 | return REG_ASSERT; /* none of them matched?!? */ | |
683 | } | |
684 | ||
685 | /* | |
686 | - cdissect - determine subexpression matches (with complications) | |
687 | * The retry memory stores the offset of the trial midpoint from begin, | |
688 | * plus 1 so that 0 uniquely means "clean slate". | |
689 | ^ static int cdissect(struct vars *, struct subre *, chr *, chr *); | |
690 | */ | |
691 | static int /* regexec return code */ | |
692 | cdissect(v, t, begin, end) | |
693 | struct vars *v; | |
694 | struct subre *t; | |
695 | chr *begin; /* beginning of relevant substring */ | |
696 | chr *end; /* end of same */ | |
697 | { | |
698 | int er; | |
699 | ||
700 | assert(t != NULL); | |
701 | MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); | |
702 | ||
703 | switch (t->op) { | |
704 | case '=': /* terminal node */ | |
705 | assert(t->left == NULL && t->right == NULL); | |
706 | return REG_OKAY; /* no action, parent did the work */ | |
707 | break; | |
708 | case '|': /* alternation */ | |
709 | assert(t->left != NULL); | |
710 | return caltdissect(v, t, begin, end); | |
711 | break; | |
712 | case 'b': /* back ref -- shouldn't be calling us! */ | |
713 | assert(t->left == NULL && t->right == NULL); | |
714 | return cbrdissect(v, t, begin, end); | |
715 | break; | |
716 | case '.': /* concatenation */ | |
717 | assert(t->left != NULL && t->right != NULL); | |
718 | return ccondissect(v, t, begin, end); | |
719 | break; | |
720 | case '(': /* capturing */ | |
721 | assert(t->left != NULL && t->right == NULL); | |
722 | assert(t->subno > 0); | |
723 | er = cdissect(v, t->left, begin, end); | |
724 | if (er == REG_OKAY) | |
725 | subset(v, t, begin, end); | |
726 | return er; | |
727 | break; | |
728 | default: | |
729 | return REG_ASSERT; | |
730 | break; | |
731 | } | |
732 | } | |
733 | ||
734 | /* | |
735 | - ccondissect - concatenation subexpression matches (with complications) | |
736 | * The retry memory stores the offset of the trial midpoint from begin, | |
737 | * plus 1 so that 0 uniquely means "clean slate". | |
738 | ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *); | |
739 | */ | |
740 | static int /* regexec return code */ | |
741 | ccondissect(v, t, begin, end) | |
742 | struct vars *v; | |
743 | struct subre *t; | |
744 | chr *begin; /* beginning of relevant substring */ | |
745 | chr *end; /* end of same */ | |
746 | { | |
747 | struct dfa *d; | |
748 | struct dfa *d2; | |
749 | chr *mid; | |
750 | int er; | |
751 | ||
752 | assert(t->op == '.'); | |
753 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
754 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
755 | ||
756 | if (t->left->flags&SHORTER) /* reverse scan */ | |
757 | return crevdissect(v, t, begin, end); | |
758 | ||
759 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
760 | if (ISERR()) | |
761 | return v->err; | |
762 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); | |
763 | if (ISERR()) { | |
764 | freedfa(d); | |
765 | return v->err; | |
766 | } | |
767 | MDEBUG(("cconcat %d\n", t->retry)); | |
768 | ||
769 | /* pick a tentative midpoint */ | |
770 | if (v->mem[t->retry] == 0) { | |
771 | mid = longest(v, d, begin, end, (int *)NULL); | |
772 | if (mid == NULL) { | |
773 | freedfa(d); | |
774 | freedfa(d2); | |
775 | return REG_NOMATCH; | |
776 | } | |
777 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
778 | v->mem[t->retry] = (mid - begin) + 1; | |
779 | } else { | |
780 | mid = begin + (v->mem[t->retry] - 1); | |
781 | MDEBUG(("working midpoint %ld\n", LOFF(mid))); | |
782 | } | |
783 | ||
784 | /* iterate until satisfaction or failure */ | |
785 | for (;;) { | |
786 | /* try this midpoint on for size */ | |
787 | er = cdissect(v, t->left, begin, mid); | |
788 | if (er == REG_OKAY && | |
789 | longest(v, d2, mid, end, (int *)NULL) == end && | |
790 | (er = cdissect(v, t->right, mid, end)) == | |
791 | REG_OKAY) | |
792 | break; /* NOTE BREAK OUT */ | |
793 | if (er != REG_OKAY && er != REG_NOMATCH) { | |
794 | freedfa(d); | |
795 | freedfa(d2); | |
796 | return er; | |
797 | } | |
798 | ||
799 | /* that midpoint didn't work, find a new one */ | |
800 | if (mid == begin) { | |
801 | /* all possibilities exhausted */ | |
802 | MDEBUG(("%d no midpoint\n", t->retry)); | |
803 | freedfa(d); | |
804 | freedfa(d2); | |
805 | return REG_NOMATCH; | |
806 | } | |
807 | mid = longest(v, d, begin, mid-1, (int *)NULL); | |
808 | if (mid == NULL) { | |
809 | /* failed to find a new one */ | |
810 | MDEBUG(("%d failed midpoint\n", t->retry)); | |
811 | freedfa(d); | |
812 | freedfa(d2); | |
813 | return REG_NOMATCH; | |
814 | } | |
815 | MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); | |
816 | v->mem[t->retry] = (mid - begin) + 1; | |
817 | zapmem(v, t->left); | |
818 | zapmem(v, t->right); | |
819 | } | |
820 | ||
821 | /* satisfaction */ | |
822 | MDEBUG(("successful\n")); | |
823 | freedfa(d); | |
824 | freedfa(d2); | |
825 | return REG_OKAY; | |
826 | } | |
827 | ||
828 | /* | |
829 | - crevdissect - determine backref shortest-first subexpression matches | |
830 | * The retry memory stores the offset of the trial midpoint from begin, | |
831 | * plus 1 so that 0 uniquely means "clean slate". | |
832 | ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *); | |
833 | */ | |
834 | static int /* regexec return code */ | |
835 | crevdissect(v, t, begin, end) | |
836 | struct vars *v; | |
837 | struct subre *t; | |
838 | chr *begin; /* beginning of relevant substring */ | |
839 | chr *end; /* end of same */ | |
840 | { | |
841 | struct dfa *d; | |
842 | struct dfa *d2; | |
843 | chr *mid; | |
844 | int er; | |
845 | ||
846 | assert(t->op == '.'); | |
847 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
848 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
849 | assert(t->left->flags&SHORTER); | |
850 | ||
851 | /* concatenation -- need to split the substring between parts */ | |
852 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
853 | if (ISERR()) | |
854 | return v->err; | |
855 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); | |
856 | if (ISERR()) { | |
857 | freedfa(d); | |
858 | return v->err; | |
859 | } | |
860 | MDEBUG(("crev %d\n", t->retry)); | |
861 | ||
862 | /* pick a tentative midpoint */ | |
863 | if (v->mem[t->retry] == 0) { | |
864 | mid = shortest(v, d, begin, begin, end, (chr **)NULL, (int *)NULL); | |
865 | if (mid == NULL) { | |
866 | freedfa(d); | |
867 | freedfa(d2); | |
868 | return REG_NOMATCH; | |
869 | } | |
870 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
871 | v->mem[t->retry] = (mid - begin) + 1; | |
872 | } else { | |
873 | mid = begin + (v->mem[t->retry] - 1); | |
874 | MDEBUG(("working midpoint %ld\n", LOFF(mid))); | |
875 | } | |
876 | ||
877 | /* iterate until satisfaction or failure */ | |
878 | for (;;) { | |
879 | /* try this midpoint on for size */ | |
880 | er = cdissect(v, t->left, begin, mid); | |
881 | if (er == REG_OKAY && | |
882 | longest(v, d2, mid, end, (int *)NULL) == end && | |
883 | (er = cdissect(v, t->right, mid, end)) == | |
884 | REG_OKAY) | |
885 | break; /* NOTE BREAK OUT */ | |
886 | if (er != REG_OKAY && er != REG_NOMATCH) { | |
887 | freedfa(d); | |
888 | freedfa(d2); | |
889 | return er; | |
890 | } | |
891 | ||
892 | /* that midpoint didn't work, find a new one */ | |
893 | if (mid == end) { | |
894 | /* all possibilities exhausted */ | |
895 | MDEBUG(("%d no midpoint\n", t->retry)); | |
896 | freedfa(d); | |
897 | freedfa(d2); | |
898 | return REG_NOMATCH; | |
899 | } | |
900 | mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, (int *)NULL); | |
901 | if (mid == NULL) { | |
902 | /* failed to find a new one */ | |
903 | MDEBUG(("%d failed midpoint\n", t->retry)); | |
904 | freedfa(d); | |
905 | freedfa(d2); | |
906 | return REG_NOMATCH; | |
907 | } | |
908 | MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); | |
909 | v->mem[t->retry] = (mid - begin) + 1; | |
910 | zapmem(v, t->left); | |
911 | zapmem(v, t->right); | |
912 | } | |
913 | ||
914 | /* satisfaction */ | |
915 | MDEBUG(("successful\n")); | |
916 | freedfa(d); | |
917 | freedfa(d2); | |
918 | return REG_OKAY; | |
919 | } | |
920 | ||
921 | /* | |
922 | - cbrdissect - determine backref subexpression matches | |
923 | ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *); | |
924 | */ | |
925 | static int /* regexec return code */ | |
926 | cbrdissect(v, t, begin, end) | |
927 | struct vars *v; | |
928 | struct subre *t; | |
929 | chr *begin; /* beginning of relevant substring */ | |
930 | chr *end; /* end of same */ | |
931 | { | |
932 | int i; | |
933 | int n = t->subno; | |
934 | size_t len; | |
935 | chr *paren; | |
936 | chr *p; | |
937 | chr *stop; | |
938 | int min = t->min; | |
939 | int max = t->max; | |
940 | ||
941 | assert(t != NULL); | |
942 | assert(t->op == 'b'); | |
943 | assert(n >= 0); | |
944 | assert((size_t)n < v->nmatch); | |
945 | ||
946 | MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); | |
947 | ||
948 | if (v->pmatch[n].rm_so == -1) | |
949 | return REG_NOMATCH; | |
950 | paren = v->start + v->pmatch[n].rm_so; | |
951 | len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; | |
952 | ||
953 | /* no room to maneuver -- retries are pointless */ | |
954 | if (v->mem[t->retry]) | |
955 | return REG_NOMATCH; | |
956 | v->mem[t->retry] = 1; | |
957 | ||
958 | /* special-case zero-length string */ | |
959 | if (len == 0) { | |
960 | if (begin == end) | |
961 | return REG_OKAY; | |
962 | return REG_NOMATCH; | |
963 | } | |
964 | ||
965 | /* and too-short string */ | |
966 | assert(end >= begin); | |
967 | if ((size_t)(end - begin) < len) | |
968 | return REG_NOMATCH; | |
969 | stop = end - len; | |
970 | ||
971 | /* count occurrences */ | |
972 | i = 0; | |
973 | for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) { | |
974 | if ((*v->g->compare)(paren, p, len) != 0) | |
975 | break; | |
976 | i++; | |
977 | } | |
978 | MDEBUG(("cbackref found %d\n", i)); | |
979 | ||
980 | /* and sort it out */ | |
981 | if (p != end) /* didn't consume all of it */ | |
982 | return REG_NOMATCH; | |
983 | if (min <= i && (i <= max || max == INFINITY)) | |
984 | return REG_OKAY; | |
985 | return REG_NOMATCH; /* out of range */ | |
986 | } | |
987 | ||
988 | /* | |
989 | - caltdissect - determine alternative subexpression matches (w. complications) | |
990 | ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *); | |
991 | */ | |
992 | static int /* regexec return code */ | |
993 | caltdissect(v, t, begin, end) | |
994 | struct vars *v; | |
995 | struct subre *t; | |
996 | chr *begin; /* beginning of relevant substring */ | |
997 | chr *end; /* end of same */ | |
998 | { | |
999 | struct dfa *d; | |
1000 | int er; | |
1001 | # define UNTRIED 0 /* not yet tried at all */ | |
1002 | # define TRYING 1 /* top matched, trying submatches */ | |
1003 | # define TRIED 2 /* top didn't match or submatches exhausted */ | |
1004 | ||
1005 | if (t == NULL) | |
1006 | return REG_NOMATCH; | |
1007 | assert(t->op == '|'); | |
1008 | if (v->mem[t->retry] == TRIED) | |
1009 | return caltdissect(v, t->right, begin, end); | |
1010 | ||
1011 | MDEBUG(("calt n%d\n", t->retry)); | |
1012 | assert(t->left != NULL); | |
1013 | ||
1014 | if (v->mem[t->retry] == UNTRIED) { | |
1015 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
1016 | if (ISERR()) | |
1017 | return v->err; | |
1018 | if (longest(v, d, begin, end, (int *)NULL) != end) { | |
1019 | freedfa(d); | |
1020 | v->mem[t->retry] = TRIED; | |
1021 | return caltdissect(v, t->right, begin, end); | |
1022 | } | |
1023 | freedfa(d); | |
1024 | MDEBUG(("calt matched\n")); | |
1025 | v->mem[t->retry] = TRYING; | |
1026 | } | |
1027 | ||
1028 | er = cdissect(v, t->left, begin, end); | |
1029 | if (er != REG_NOMATCH) | |
1030 | return er; | |
1031 | ||
1032 | v->mem[t->retry] = TRIED; | |
1033 | return caltdissect(v, t->right, begin, end); | |
1034 | } | |
1035 | ||
1036 | ||
1037 | ||
1038 | #include "rege_dfa.c" |