]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * re_*exec and friends - match REs | |
3 | * | |
4 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. | |
5 | * | |
6 | * Development of this software was funded, in part, by Cray Research Inc., | |
7 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics | |
8 | * Corporation, none of whom are responsible for the results. The author | |
9 | * thanks all of them. | |
10 | * | |
11 | * Redistribution and use in source and binary forms -- with or without | |
12 | * modification -- are permitted for any purpose, provided that | |
13 | * redistributions in source form retain this entire copyright notice and | |
14 | * indicate the origin and nature of any modifications. | |
15 | * | |
16 | * I'd appreciate being given credit for this package in the documentation | |
17 | * of software which uses it, but that is not a requirement. | |
18 | * | |
19 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, | |
20 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |
21 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL | |
22 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
25 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
26 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
27 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
28 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | * | |
30 | * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regexec.c,v 1.23 2003/08/08 21:41:56 momjian Exp $ | |
31 | * | |
32 | */ | |
33 | ||
34 | #include "regguts.h" | |
35 | ||
36 | ||
37 | ||
38 | /* lazy-DFA representation */ | |
39 | struct arcp | |
40 | { /* "pointer" to an outarc */ | |
41 | struct sset *ss; | |
42 | color co; | |
43 | }; | |
44 | ||
45 | struct sset | |
46 | { /* state set */ | |
47 | unsigned *states; /* pointer to bitvector */ | |
48 | unsigned hash; /* hash of bitvector */ | |
49 | #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) | |
50 | #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ | |
51 | memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) | |
52 | int flags; | |
53 | #define STARTER 01 /* the initial state set */ | |
54 | #define POSTSTATE 02 /* includes the goal state */ | |
55 | #define LOCKED 04 /* locked in cache */ | |
56 | #define NOPROGRESS 010 /* zero-progress state set */ | |
57 | struct arcp ins; /* chain of inarcs pointing here */ | |
58 | chr *lastseen; /* last entered on arrival here */ | |
59 | struct sset **outs; /* outarc vector indexed by color */ | |
60 | struct arcp *inchain; /* chain-pointer vector for outarcs */ | |
61 | }; | |
62 | ||
63 | struct dfa | |
64 | { | |
65 | int nssets; /* size of cache */ | |
66 | int nssused; /* how many entries occupied yet */ | |
67 | int nstates; /* number of states */ | |
68 | int ncolors; /* length of outarc and inchain vectors */ | |
69 | int wordsper; /* length of state-set bitvectors */ | |
70 | struct sset *ssets; /* state-set cache */ | |
71 | unsigned *statesarea; /* bitvector storage */ | |
72 | unsigned *work; /* pointer to work area within statesarea */ | |
73 | struct sset **outsarea; /* outarc-vector storage */ | |
74 | struct arcp *incarea; /* inchain storage */ | |
75 | struct cnfa *cnfa; | |
76 | struct colormap *cm; | |
77 | chr *lastpost; /* location of last cache-flushed success */ | |
78 | chr *lastnopr; /* location of last cache-flushed | |
79 | * NOPROGRESS */ | |
80 | struct sset *search; /* replacement-search-pointer memory */ | |
81 | int cptsmalloced; /* were the areas individually malloced? */ | |
82 | char *mallocarea; /* self, or master malloced area, or NULL */ | |
83 | }; | |
84 | ||
85 | #define WORK 1 /* number of work bitvectors needed */ | |
86 | ||
87 | /* setup for non-malloc allocation for small cases */ | |
88 | #define FEWSTATES 20 /* must be less than UBITS */ | |
89 | #define FEWCOLORS 15 | |
90 | struct smalldfa | |
91 | { | |
92 | struct dfa dfa; | |
93 | struct sset ssets[FEWSTATES * 2]; | |
94 | unsigned statesarea[FEWSTATES * 2 + WORK]; | |
95 | struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS]; | |
96 | struct arcp incarea[FEWSTATES * 2 * FEWCOLORS]; | |
97 | }; | |
98 | ||
99 | #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ | |
100 | ||
101 | ||
102 | ||
103 | /* internal variables, bundled for easy passing around */ | |
104 | struct vars | |
105 | { | |
106 | regex_t *re; | |
107 | struct guts *g; | |
108 | int eflags; /* copies of arguments */ | |
109 | size_t nmatch; | |
110 | regmatch_t *pmatch; | |
111 | rm_detail_t *details; | |
112 | chr *start; /* start of string */ | |
113 | chr *stop; /* just past end of string */ | |
114 | int err; /* error code if any (0 none) */ | |
115 | regoff_t *mem; /* memory vector for backtracking */ | |
116 | struct smalldfa dfa1; | |
117 | struct smalldfa dfa2; | |
118 | }; | |
119 | ||
120 | #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ | |
121 | #define ISERR() VISERR(v) | |
122 | #define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e))) | |
123 | #define ERR(e) VERR(v, e) /* record an error */ | |
124 | #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return | |
125 | * it */ | |
126 | #define OFF(p) ((p) - v->start) | |
127 | #define LOFF(p) ((long)OFF(p)) | |
128 | ||
129 | ||
130 | ||
131 | /* | |
132 | * forward declarations | |
133 | */ | |
134 | /* === regexec.c === */ | |
135 | static int find(struct vars *, struct cnfa *, struct colormap *); | |
136 | static int cfind(struct vars *, struct cnfa *, struct colormap *); | |
137 | static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); | |
138 | static void zapsubs(regmatch_t *, size_t); | |
139 | static void zapmem(struct vars *, struct subre *); | |
140 | static void subset(struct vars *, struct subre *, chr *, chr *); | |
141 | static int dissect(struct vars *, struct subre *, chr *, chr *); | |
142 | static int condissect(struct vars *, struct subre *, chr *, chr *); | |
143 | static int altdissect(struct vars *, struct subre *, chr *, chr *); | |
144 | static int cdissect(struct vars *, struct subre *, chr *, chr *); | |
145 | static int ccondissect(struct vars *, struct subre *, chr *, chr *); | |
146 | static int crevdissect(struct vars *, struct subre *, chr *, chr *); | |
147 | static int cbrdissect(struct vars *, struct subre *, chr *, chr *); | |
148 | static int caltdissect(struct vars *, struct subre *, chr *, chr *); | |
149 | ||
150 | /* === rege_dfa.c === */ | |
151 | static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); | |
152 | static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); | |
153 | static chr *lastcold(struct vars *, struct dfa *); | |
154 | static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); | |
155 | static void freedfa(struct dfa *); | |
156 | static unsigned hash(unsigned *, int); | |
157 | static struct sset *initialize(struct vars *, struct dfa *, chr *); | |
158 | static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *); | |
159 | static int lacon(struct vars *, struct cnfa *, chr *, pcolor); | |
160 | static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); | |
161 | static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); | |
162 | ||
163 | ||
164 | /* | |
165 | * regexec - match regular expression | |
166 | */ | |
167 | int | |
168 | regexec(regex_t *re, | |
169 | const chr *string, | |
170 | size_t nmatch, | |
171 | regmatch_t pmatch[], | |
172 | int flags) | |
173 | { | |
174 | rm_detail_t det; | |
175 | return wx_regexec(re, string, wx_strlen(string), &det, nmatch, pmatch, flags); | |
176 | } | |
177 | int | |
178 | wx_regexec(regex_t *re, | |
179 | const chr *string, | |
180 | size_t len, | |
181 | rm_detail_t *details, | |
182 | size_t nmatch, | |
183 | regmatch_t pmatch[], | |
184 | int flags) | |
185 | { | |
186 | struct vars var; | |
187 | register struct vars *v = &var; | |
188 | int st; | |
189 | size_t n; | |
190 | int backref; | |
191 | ||
192 | #define LOCALMAT 20 | |
193 | regmatch_t mat[LOCALMAT]; | |
194 | ||
195 | #define LOCALMEM 40 | |
196 | regoff_t mem[LOCALMEM]; | |
197 | ||
198 | /* sanity checks */ | |
199 | if (re == NULL || string == NULL || re->re_magic != REMAGIC) | |
200 | return REG_INVARG; | |
201 | if (re->re_csize != sizeof(chr)) | |
202 | return REG_MIXED; | |
203 | ||
204 | /* setup */ | |
205 | v->re = re; | |
206 | v->g = (struct guts *) re->re_guts; | |
207 | if ((v->g->cflags & REG_EXPECT) && details == NULL) | |
208 | return REG_INVARG; | |
209 | if (v->g->info & REG_UIMPOSSIBLE) | |
210 | return REG_NOMATCH; | |
211 | backref = (v->g->info & REG_UBACKREF) ? 1 : 0; | |
212 | v->eflags = flags; | |
213 | if (v->g->cflags & REG_NOSUB) | |
214 | nmatch = 0; /* override client */ | |
215 | v->nmatch = nmatch; | |
216 | if (backref) | |
217 | { | |
218 | /* need work area */ | |
219 | if (v->g->nsub + 1 <= LOCALMAT) | |
220 | v->pmatch = mat; | |
221 | else | |
222 | v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) * | |
223 | sizeof(regmatch_t)); | |
224 | if (v->pmatch == NULL) | |
225 | return REG_ESPACE; | |
226 | v->nmatch = v->g->nsub + 1; | |
227 | } | |
228 | else | |
229 | v->pmatch = pmatch; | |
230 | v->details = details; | |
231 | v->start = (chr *) string; | |
232 | v->stop = (chr *) string + len; | |
233 | v->err = 0; | |
234 | if (backref) | |
235 | { | |
236 | /* need retry memory */ | |
237 | assert(v->g->ntree >= 0); | |
238 | n = (size_t) v->g->ntree; | |
239 | if (n <= LOCALMEM) | |
240 | v->mem = mem; | |
241 | else | |
242 | v->mem = (regoff_t *) MALLOC(n * sizeof(regoff_t)); | |
243 | if (v->mem == NULL) | |
244 | { | |
245 | if (v->pmatch != pmatch && v->pmatch != mat) | |
246 | FREE(v->pmatch); | |
247 | return REG_ESPACE; | |
248 | } | |
249 | } | |
250 | else | |
251 | v->mem = NULL; | |
252 | ||
253 | /* do it */ | |
254 | assert(v->g->tree != NULL); | |
255 | if (backref) | |
256 | st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); | |
257 | else | |
258 | st = find(v, &v->g->tree->cnfa, &v->g->cmap); | |
259 | ||
260 | /* copy (portion of) match vector over if necessary */ | |
261 | if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) | |
262 | { | |
263 | zapsubs(pmatch, nmatch); | |
264 | n = (nmatch < v->nmatch) ? nmatch : v->nmatch; | |
265 | memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t)); | |
266 | } | |
267 | ||
268 | /* clean up */ | |
269 | if (v->pmatch != pmatch && v->pmatch != mat) | |
270 | FREE(v->pmatch); | |
271 | if (v->mem != NULL && v->mem != mem) | |
272 | FREE(v->mem); | |
273 | return st; | |
274 | } | |
275 | ||
276 | /* | |
277 | * find - find a match for the main NFA (no-complications case) | |
278 | */ | |
279 | static int | |
280 | find(struct vars * v, | |
281 | struct cnfa * cnfa, | |
282 | struct colormap * cm) | |
283 | { | |
284 | struct dfa *s; | |
285 | struct dfa *d; | |
286 | chr *begin; | |
287 | chr *end = NULL; | |
288 | chr *cold; | |
289 | chr *open; /* open and close of range of possible | |
290 | * starts */ | |
291 | chr *close; | |
292 | int hitend; | |
293 | int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0; | |
294 | ||
295 | /* first, a shot with the search RE */ | |
296 | s = newdfa(v, &v->g->search, cm, &v->dfa1); | |
297 | assert(!(ISERR() && s != NULL)); | |
298 | NOERR(); | |
299 | MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); | |
300 | cold = NULL; | |
301 | close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL); | |
302 | freedfa(s); | |
303 | NOERR(); | |
304 | if (v->g->cflags & REG_EXPECT) | |
305 | { | |
306 | assert(v->details != NULL); | |
307 | if (cold != NULL) | |
308 | v->details->rm_extend.rm_so = OFF(cold); | |
309 | else | |
310 | v->details->rm_extend.rm_so = OFF(v->stop); | |
311 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
312 | } | |
313 | if (close == NULL) /* not found */ | |
314 | return REG_NOMATCH; | |
315 | if (v->nmatch == 0) /* found, don't need exact location */ | |
316 | return REG_OKAY; | |
317 | ||
318 | /* find starting point and match */ | |
319 | assert(cold != NULL); | |
320 | open = cold; | |
321 | cold = NULL; | |
322 | MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); | |
323 | d = newdfa(v, cnfa, cm, &v->dfa1); | |
324 | assert(!(ISERR() && d != NULL)); | |
325 | NOERR(); | |
326 | for (begin = open; begin <= close; begin++) | |
327 | { | |
328 | MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); | |
329 | if (shorter) | |
330 | end = shortest(v, d, begin, begin, v->stop, | |
331 | (chr **) NULL, &hitend); | |
332 | else | |
333 | end = longest(v, d, begin, v->stop, &hitend); | |
334 | NOERR(); | |
335 | if (hitend && cold == NULL) | |
336 | cold = begin; | |
337 | if (end != NULL) | |
338 | break; /* NOTE BREAK OUT */ | |
339 | } | |
340 | assert(end != NULL); /* search RE succeeded so loop should */ | |
341 | freedfa(d); | |
342 | ||
343 | /* and pin down details */ | |
344 | assert(v->nmatch > 0); | |
345 | v->pmatch[0].rm_so = OFF(begin); | |
346 | v->pmatch[0].rm_eo = OFF(end); | |
347 | if (v->g->cflags & REG_EXPECT) | |
348 | { | |
349 | if (cold != NULL) | |
350 | v->details->rm_extend.rm_so = OFF(cold); | |
351 | else | |
352 | v->details->rm_extend.rm_so = OFF(v->stop); | |
353 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
354 | } | |
355 | if (v->nmatch == 1) /* no need for submatches */ | |
356 | return REG_OKAY; | |
357 | ||
358 | /* submatches */ | |
359 | zapsubs(v->pmatch, v->nmatch); | |
360 | return dissect(v, v->g->tree, begin, end); | |
361 | } | |
362 | ||
363 | /* | |
364 | * cfind - find a match for the main NFA (with complications) | |
365 | */ | |
366 | static int | |
367 | cfind(struct vars * v, | |
368 | struct cnfa * cnfa, | |
369 | struct colormap * cm) | |
370 | { | |
371 | struct dfa *s; | |
372 | struct dfa *d; | |
373 | chr *cold; | |
374 | int ret; | |
375 | ||
376 | s = newdfa(v, &v->g->search, cm, &v->dfa1); | |
377 | NOERR(); | |
378 | d = newdfa(v, cnfa, cm, &v->dfa2); | |
379 | if (ISERR()) | |
380 | { | |
381 | assert(d == NULL); | |
382 | freedfa(s); | |
383 | return v->err; | |
384 | } | |
385 | ||
386 | ret = cfindloop(v, cnfa, cm, d, s, &cold); | |
387 | ||
388 | freedfa(d); | |
389 | freedfa(s); | |
390 | NOERR(); | |
391 | if (v->g->cflags & REG_EXPECT) | |
392 | { | |
393 | assert(v->details != NULL); | |
394 | if (cold != NULL) | |
395 | v->details->rm_extend.rm_so = OFF(cold); | |
396 | else | |
397 | v->details->rm_extend.rm_so = OFF(v->stop); | |
398 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
399 | } | |
400 | return ret; | |
401 | } | |
402 | ||
403 | /* | |
404 | * cfindloop - the heart of cfind | |
405 | */ | |
406 | static int | |
407 | cfindloop(struct vars * v, | |
408 | struct cnfa * cnfa, | |
409 | struct colormap * cm, | |
410 | struct dfa * d, | |
411 | struct dfa * s, | |
412 | chr **coldp) /* where to put coldstart pointer */ | |
413 | { | |
414 | chr *begin; | |
415 | chr *end; | |
416 | chr *cold; | |
417 | chr *open; /* open and close of range of possible | |
418 | * starts */ | |
419 | chr *close; | |
420 | chr *estart; | |
421 | chr *estop; | |
422 | int er; | |
423 | int shorter = v->g->tree->flags & SHORTER; | |
424 | int hitend; | |
425 | ||
426 | assert(d != NULL && s != NULL); | |
427 | cold = NULL; | |
428 | close = v->start; | |
429 | do | |
430 | { | |
431 | MDEBUG(("\ncsearch at %ld\n", LOFF(close))); | |
432 | close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL); | |
433 | if (close == NULL) | |
434 | break; /* NOTE BREAK */ | |
435 | assert(cold != NULL); | |
436 | open = cold; | |
437 | cold = NULL; | |
438 | MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); | |
439 | for (begin = open; begin <= close; begin++) | |
440 | { | |
441 | MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); | |
442 | estart = begin; | |
443 | estop = v->stop; | |
444 | for (;;) | |
445 | { | |
446 | if (shorter) | |
447 | end = shortest(v, d, begin, estart, | |
448 | estop, (chr **) NULL, &hitend); | |
449 | else | |
450 | end = longest(v, d, begin, estop, | |
451 | &hitend); | |
452 | if (hitend && cold == NULL) | |
453 | cold = begin; | |
454 | if (end == NULL) | |
455 | break; /* NOTE BREAK OUT */ | |
456 | MDEBUG(("tentative end %ld\n", LOFF(end))); | |
457 | zapsubs(v->pmatch, v->nmatch); | |
458 | zapmem(v, v->g->tree); | |
459 | er = cdissect(v, v->g->tree, begin, end); | |
460 | if (er == REG_OKAY) | |
461 | { | |
462 | if (v->nmatch > 0) | |
463 | { | |
464 | v->pmatch[0].rm_so = OFF(begin); | |
465 | v->pmatch[0].rm_eo = OFF(end); | |
466 | } | |
467 | *coldp = cold; | |
468 | return REG_OKAY; | |
469 | } | |
470 | if (er != REG_NOMATCH) | |
471 | { | |
472 | ERR(er); | |
473 | return er; | |
474 | } | |
475 | if ((shorter) ? end == estop : end == begin) | |
476 | { | |
477 | /* no point in trying again */ | |
478 | *coldp = cold; | |
479 | return REG_NOMATCH; | |
480 | } | |
481 | /* go around and try again */ | |
482 | if (shorter) | |
483 | estart = end + 1; | |
484 | else | |
485 | estop = end - 1; | |
486 | } | |
487 | } | |
488 | } while (close < v->stop); | |
489 | ||
490 | *coldp = cold; | |
491 | return REG_NOMATCH; | |
492 | } | |
493 | ||
494 | /* | |
495 | * zapsubs - initialize the subexpression matches to "no match" | |
496 | */ | |
497 | static void | |
498 | zapsubs(regmatch_t *p, | |
499 | size_t n) | |
500 | { | |
501 | size_t i; | |
502 | ||
503 | for (i = n - 1; i > 0; i--) | |
504 | { | |
505 | p[i].rm_so = -1; | |
506 | p[i].rm_eo = -1; | |
507 | } | |
508 | } | |
509 | ||
510 | /* | |
511 | * zapmem - initialize the retry memory of a subtree to zeros | |
512 | */ | |
513 | static void | |
514 | zapmem(struct vars * v, | |
515 | struct subre * t) | |
516 | { | |
517 | if (t == NULL) | |
518 | return; | |
519 | ||
520 | assert(v->mem != NULL); | |
521 | v->mem[t->retry] = 0; | |
522 | if (t->op == '(') | |
523 | { | |
524 | assert(t->subno > 0); | |
525 | v->pmatch[t->subno].rm_so = -1; | |
526 | v->pmatch[t->subno].rm_eo = -1; | |
527 | } | |
528 | ||
529 | if (t->left != NULL) | |
530 | zapmem(v, t->left); | |
531 | if (t->right != NULL) | |
532 | zapmem(v, t->right); | |
533 | } | |
534 | ||
535 | /* | |
536 | * subset - set any subexpression relevant to a successful subre | |
537 | */ | |
538 | static void | |
539 | subset(struct vars * v, | |
540 | struct subre * sub, | |
541 | chr *begin, | |
542 | chr *end) | |
543 | { | |
544 | int n = sub->subno; | |
545 | ||
546 | assert(n > 0); | |
547 | if ((size_t) n >= v->nmatch) | |
548 | return; | |
549 | ||
550 | MDEBUG(("setting %d\n", n)); | |
551 | v->pmatch[n].rm_so = OFF(begin); | |
552 | v->pmatch[n].rm_eo = OFF(end); | |
553 | } | |
554 | ||
555 | /* | |
556 | * dissect - determine subexpression matches (uncomplicated case) | |
557 | */ | |
558 | static int /* regexec return code */ | |
559 | dissect(struct vars * v, | |
560 | struct subre * t, | |
561 | chr *begin, /* beginning of relevant substring */ | |
562 | chr *end) /* end of same */ | |
563 | { | |
564 | assert(t != NULL); | |
565 | MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); | |
566 | ||
567 | switch (t->op) | |
568 | { | |
569 | case '=': /* terminal node */ | |
570 | assert(t->left == NULL && t->right == NULL); | |
571 | return REG_OKAY; /* no action, parent did the work */ | |
572 | break; | |
573 | case '|': /* alternation */ | |
574 | assert(t->left != NULL); | |
575 | return altdissect(v, t, begin, end); | |
576 | break; | |
577 | case 'b': /* back ref -- shouldn't be calling us! */ | |
578 | return REG_ASSERT; | |
579 | break; | |
580 | case '.': /* concatenation */ | |
581 | assert(t->left != NULL && t->right != NULL); | |
582 | return condissect(v, t, begin, end); | |
583 | break; | |
584 | case '(': /* capturing */ | |
585 | assert(t->left != NULL && t->right == NULL); | |
586 | assert(t->subno > 0); | |
587 | subset(v, t, begin, end); | |
588 | return dissect(v, t->left, begin, end); | |
589 | break; | |
590 | default: | |
591 | return REG_ASSERT; | |
592 | break; | |
593 | } | |
594 | } | |
595 | ||
596 | /* | |
597 | * condissect - determine concatenation subexpression matches (uncomplicated) | |
598 | */ | |
599 | static int /* regexec return code */ | |
600 | condissect(struct vars * v, | |
601 | struct subre * t, | |
602 | chr *begin, /* beginning of relevant substring */ | |
603 | chr *end) /* end of same */ | |
604 | { | |
605 | struct dfa *d; | |
606 | struct dfa *d2; | |
607 | chr *mid; | |
608 | int i; | |
609 | int shorter = (t->left->flags & SHORTER) ? 1 : 0; | |
610 | chr *stop = (shorter) ? end : begin; | |
611 | ||
612 | assert(t->op == '.'); | |
613 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
614 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
615 | ||
616 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); | |
617 | NOERR(); | |
618 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); | |
619 | if (ISERR()) | |
620 | { | |
621 | assert(d2 == NULL); | |
622 | freedfa(d); | |
623 | return v->err; | |
624 | } | |
625 | ||
626 | /* pick a tentative midpoint */ | |
627 | if (shorter) | |
628 | mid = shortest(v, d, begin, begin, end, (chr **) NULL, | |
629 | (int *) NULL); | |
630 | else | |
631 | mid = longest(v, d, begin, end, (int *) NULL); | |
632 | if (mid == NULL) | |
633 | { | |
634 | freedfa(d); | |
635 | freedfa(d2); | |
636 | return REG_ASSERT; | |
637 | } | |
638 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
639 | ||
640 | /* iterate until satisfaction or failure */ | |
641 | while (longest(v, d2, mid, end, (int *) NULL) != end) | |
642 | { | |
643 | /* that midpoint didn't work, find a new one */ | |
644 | if (mid == stop) | |
645 | { | |
646 | /* all possibilities exhausted! */ | |
647 | MDEBUG(("no midpoint!\n")); | |
648 | freedfa(d); | |
649 | freedfa(d2); | |
650 | return REG_ASSERT; | |
651 | } | |
652 | if (shorter) | |
653 | mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, | |
654 | (int *) NULL); | |
655 | else | |
656 | mid = longest(v, d, begin, mid - 1, (int *) NULL); | |
657 | if (mid == NULL) | |
658 | { | |
659 | /* failed to find a new one! */ | |
660 | MDEBUG(("failed midpoint!\n")); | |
661 | freedfa(d); | |
662 | freedfa(d2); | |
663 | return REG_ASSERT; | |
664 | } | |
665 | MDEBUG(("new midpoint %ld\n", LOFF(mid))); | |
666 | } | |
667 | ||
668 | /* satisfaction */ | |
669 | MDEBUG(("successful\n")); | |
670 | freedfa(d); | |
671 | freedfa(d2); | |
672 | i = dissect(v, t->left, begin, mid); | |
673 | if (i != REG_OKAY) | |
674 | return i; | |
675 | return dissect(v, t->right, mid, end); | |
676 | } | |
677 | ||
678 | /* | |
679 | * altdissect - determine alternative subexpression matches (uncomplicated) | |
680 | */ | |
681 | static int /* regexec return code */ | |
682 | altdissect(struct vars * v, | |
683 | struct subre * t, | |
684 | chr *begin, /* beginning of relevant substring */ | |
685 | chr *end) /* end of same */ | |
686 | { | |
687 | struct dfa *d; | |
688 | int i; | |
689 | ||
690 | assert(t != NULL); | |
691 | assert(t->op == '|'); | |
692 | ||
693 | for (i = 0; t != NULL; t = t->right, i++) | |
694 | { | |
695 | MDEBUG(("trying %dth\n", i)); | |
696 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
697 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); | |
698 | if (ISERR()) | |
699 | return v->err; | |
700 | if (longest(v, d, begin, end, (int *) NULL) == end) | |
701 | { | |
702 | MDEBUG(("success\n")); | |
703 | freedfa(d); | |
704 | return dissect(v, t->left, begin, end); | |
705 | } | |
706 | freedfa(d); | |
707 | } | |
708 | return REG_ASSERT; /* none of them matched?!? */ | |
709 | } | |
710 | ||
711 | /* | |
712 | * cdissect - determine subexpression matches (with complications) | |
713 | * The retry memory stores the offset of the trial midpoint from begin, | |
714 | * plus 1 so that 0 uniquely means "clean slate". | |
715 | */ | |
716 | static int /* regexec return code */ | |
717 | cdissect(struct vars * v, | |
718 | struct subre * t, | |
719 | chr *begin, /* beginning of relevant substring */ | |
720 | chr *end) /* end of same */ | |
721 | { | |
722 | int er; | |
723 | ||
724 | assert(t != NULL); | |
725 | MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); | |
726 | ||
727 | switch (t->op) | |
728 | { | |
729 | case '=': /* terminal node */ | |
730 | assert(t->left == NULL && t->right == NULL); | |
731 | return REG_OKAY; /* no action, parent did the work */ | |
732 | break; | |
733 | case '|': /* alternation */ | |
734 | assert(t->left != NULL); | |
735 | return caltdissect(v, t, begin, end); | |
736 | break; | |
737 | case 'b': /* back ref -- shouldn't be calling us! */ | |
738 | assert(t->left == NULL && t->right == NULL); | |
739 | return cbrdissect(v, t, begin, end); | |
740 | break; | |
741 | case '.': /* concatenation */ | |
742 | assert(t->left != NULL && t->right != NULL); | |
743 | return ccondissect(v, t, begin, end); | |
744 | break; | |
745 | case '(': /* capturing */ | |
746 | assert(t->left != NULL && t->right == NULL); | |
747 | assert(t->subno > 0); | |
748 | er = cdissect(v, t->left, begin, end); | |
749 | if (er == REG_OKAY) | |
750 | subset(v, t, begin, end); | |
751 | return er; | |
752 | break; | |
753 | default: | |
754 | return REG_ASSERT; | |
755 | break; | |
756 | } | |
757 | } | |
758 | ||
759 | /* | |
760 | * ccondissect - concatenation subexpression matches (with complications) | |
761 | * The retry memory stores the offset of the trial midpoint from begin, | |
762 | * plus 1 so that 0 uniquely means "clean slate". | |
763 | */ | |
764 | static int /* regexec return code */ | |
765 | ccondissect(struct vars * v, | |
766 | struct subre * t, | |
767 | chr *begin, /* beginning of relevant substring */ | |
768 | chr *end) /* end of same */ | |
769 | { | |
770 | struct dfa *d; | |
771 | struct dfa *d2; | |
772 | chr *mid; | |
773 | int er; | |
774 | ||
775 | assert(t->op == '.'); | |
776 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
777 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
778 | ||
779 | if (t->left->flags & SHORTER) /* reverse scan */ | |
780 | return crevdissect(v, t, begin, end); | |
781 | ||
782 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
783 | if (ISERR()) | |
784 | return v->err; | |
785 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); | |
786 | if (ISERR()) | |
787 | { | |
788 | freedfa(d); | |
789 | return v->err; | |
790 | } | |
791 | MDEBUG(("cconcat %d\n", t->retry)); | |
792 | ||
793 | /* pick a tentative midpoint */ | |
794 | if (v->mem[t->retry] == 0) | |
795 | { | |
796 | mid = longest(v, d, begin, end, (int *) NULL); | |
797 | if (mid == NULL) | |
798 | { | |
799 | freedfa(d); | |
800 | freedfa(d2); | |
801 | return REG_NOMATCH; | |
802 | } | |
803 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
804 | v->mem[t->retry] = (mid - begin) + 1; | |
805 | } | |
806 | else | |
807 | { | |
808 | mid = begin + (v->mem[t->retry] - 1); | |
809 | MDEBUG(("working midpoint %ld\n", LOFF(mid))); | |
810 | } | |
811 | ||
812 | /* iterate until satisfaction or failure */ | |
813 | for (;;) | |
814 | { | |
815 | /* try this midpoint on for size */ | |
816 | er = cdissect(v, t->left, begin, mid); | |
817 | if (er == REG_OKAY && | |
818 | longest(v, d2, mid, end, (int *) NULL) == end && | |
819 | (er = cdissect(v, t->right, mid, end)) == | |
820 | REG_OKAY) | |
821 | break; /* NOTE BREAK OUT */ | |
822 | if (er != REG_OKAY && er != REG_NOMATCH) | |
823 | { | |
824 | freedfa(d); | |
825 | freedfa(d2); | |
826 | return er; | |
827 | } | |
828 | ||
829 | /* that midpoint didn't work, find a new one */ | |
830 | if (mid == begin) | |
831 | { | |
832 | /* all possibilities exhausted */ | |
833 | MDEBUG(("%d no midpoint\n", t->retry)); | |
834 | freedfa(d); | |
835 | freedfa(d2); | |
836 | return REG_NOMATCH; | |
837 | } | |
838 | mid = longest(v, d, begin, mid - 1, (int *) NULL); | |
839 | if (mid == NULL) | |
840 | { | |
841 | /* failed to find a new one */ | |
842 | MDEBUG(("%d failed midpoint\n", t->retry)); | |
843 | freedfa(d); | |
844 | freedfa(d2); | |
845 | return REG_NOMATCH; | |
846 | } | |
847 | MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); | |
848 | v->mem[t->retry] = (mid - begin) + 1; | |
849 | zapmem(v, t->left); | |
850 | zapmem(v, t->right); | |
851 | } | |
852 | ||
853 | /* satisfaction */ | |
854 | MDEBUG(("successful\n")); | |
855 | freedfa(d); | |
856 | freedfa(d2); | |
857 | return REG_OKAY; | |
858 | } | |
859 | ||
860 | /* | |
861 | * crevdissect - determine backref shortest-first subexpression matches | |
862 | * The retry memory stores the offset of the trial midpoint from begin, | |
863 | * plus 1 so that 0 uniquely means "clean slate". | |
864 | */ | |
865 | static int /* regexec return code */ | |
866 | crevdissect(struct vars * v, | |
867 | struct subre * t, | |
868 | chr *begin, /* beginning of relevant substring */ | |
869 | chr *end) /* end of same */ | |
870 | { | |
871 | struct dfa *d; | |
872 | struct dfa *d2; | |
873 | chr *mid; | |
874 | int er; | |
875 | ||
876 | assert(t->op == '.'); | |
877 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
878 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
879 | assert(t->left->flags & SHORTER); | |
880 | ||
881 | /* concatenation -- need to split the substring between parts */ | |
882 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
883 | if (ISERR()) | |
884 | return v->err; | |
885 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); | |
886 | if (ISERR()) | |
887 | { | |
888 | freedfa(d); | |
889 | return v->err; | |
890 | } | |
891 | MDEBUG(("crev %d\n", t->retry)); | |
892 | ||
893 | /* pick a tentative midpoint */ | |
894 | if (v->mem[t->retry] == 0) | |
895 | { | |
896 | mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); | |
897 | if (mid == NULL) | |
898 | { | |
899 | freedfa(d); | |
900 | freedfa(d2); | |
901 | return REG_NOMATCH; | |
902 | } | |
903 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
904 | v->mem[t->retry] = (mid - begin) + 1; | |
905 | } | |
906 | else | |
907 | { | |
908 | mid = begin + (v->mem[t->retry] - 1); | |
909 | MDEBUG(("working midpoint %ld\n", LOFF(mid))); | |
910 | } | |
911 | ||
912 | /* iterate until satisfaction or failure */ | |
913 | for (;;) | |
914 | { | |
915 | /* try this midpoint on for size */ | |
916 | er = cdissect(v, t->left, begin, mid); | |
917 | if (er == REG_OKAY && | |
918 | longest(v, d2, mid, end, (int *) NULL) == end && | |
919 | (er = cdissect(v, t->right, mid, end)) == | |
920 | REG_OKAY) | |
921 | break; /* NOTE BREAK OUT */ | |
922 | if (er != REG_OKAY && er != REG_NOMATCH) | |
923 | { | |
924 | freedfa(d); | |
925 | freedfa(d2); | |
926 | return er; | |
927 | } | |
928 | ||
929 | /* that midpoint didn't work, find a new one */ | |
930 | if (mid == end) | |
931 | { | |
932 | /* all possibilities exhausted */ | |
933 | MDEBUG(("%d no midpoint\n", t->retry)); | |
934 | freedfa(d); | |
935 | freedfa(d2); | |
936 | return REG_NOMATCH; | |
937 | } | |
938 | mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); | |
939 | if (mid == NULL) | |
940 | { | |
941 | /* failed to find a new one */ | |
942 | MDEBUG(("%d failed midpoint\n", t->retry)); | |
943 | freedfa(d); | |
944 | freedfa(d2); | |
945 | return REG_NOMATCH; | |
946 | } | |
947 | MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); | |
948 | v->mem[t->retry] = (mid - begin) + 1; | |
949 | zapmem(v, t->left); | |
950 | zapmem(v, t->right); | |
951 | } | |
952 | ||
953 | /* satisfaction */ | |
954 | MDEBUG(("successful\n")); | |
955 | freedfa(d); | |
956 | freedfa(d2); | |
957 | return REG_OKAY; | |
958 | } | |
959 | ||
960 | /* | |
961 | * cbrdissect - determine backref subexpression matches | |
962 | */ | |
963 | static int /* regexec return code */ | |
964 | cbrdissect(struct vars * v, | |
965 | struct subre * t, | |
966 | chr *begin, /* beginning of relevant substring */ | |
967 | chr *end) /* end of same */ | |
968 | { | |
969 | int i; | |
970 | int n = t->subno; | |
971 | size_t len; | |
972 | chr *paren; | |
973 | chr *p; | |
974 | chr *stop; | |
975 | int min = t->min; | |
976 | int max = t->max; | |
977 | ||
978 | assert(t != NULL); | |
979 | assert(t->op == 'b'); | |
980 | assert(n >= 0); | |
981 | assert((size_t) n < v->nmatch); | |
982 | ||
983 | MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); | |
984 | ||
985 | if (v->pmatch[n].rm_so == -1) | |
986 | return REG_NOMATCH; | |
987 | paren = v->start + v->pmatch[n].rm_so; | |
988 | len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; | |
989 | ||
990 | /* no room to maneuver -- retries are pointless */ | |
991 | if (v->mem[t->retry]) | |
992 | return REG_NOMATCH; | |
993 | v->mem[t->retry] = 1; | |
994 | ||
995 | /* special-case zero-length string */ | |
996 | if (len == 0) | |
997 | { | |
998 | if (begin == end) | |
999 | return REG_OKAY; | |
1000 | return REG_NOMATCH; | |
1001 | } | |
1002 | ||
1003 | /* and too-short string */ | |
1004 | assert(end >= begin); | |
1005 | if ((size_t) (end - begin) < len) | |
1006 | return REG_NOMATCH; | |
1007 | stop = end - len; | |
1008 | ||
1009 | /* count occurrences */ | |
1010 | i = 0; | |
1011 | for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) | |
1012 | { | |
1013 | if ((*v->g->compare) (paren, p, len) != 0) | |
1014 | break; | |
1015 | i++; | |
1016 | } | |
1017 | MDEBUG(("cbackref found %d\n", i)); | |
1018 | ||
1019 | /* and sort it out */ | |
1020 | if (p != end) /* didn't consume all of it */ | |
1021 | return REG_NOMATCH; | |
1022 | if (min <= i && (i <= max || max == INFINITY)) | |
1023 | return REG_OKAY; | |
1024 | return REG_NOMATCH; /* out of range */ | |
1025 | } | |
1026 | ||
1027 | /* | |
1028 | * caltdissect - determine alternative subexpression matches (w. complications) | |
1029 | */ | |
1030 | static int /* regexec return code */ | |
1031 | caltdissect(struct vars * v, | |
1032 | struct subre * t, | |
1033 | chr *begin, /* beginning of relevant substring */ | |
1034 | chr *end) /* end of same */ | |
1035 | { | |
1036 | struct dfa *d; | |
1037 | int er; | |
1038 | ||
1039 | #define UNTRIED 0 /* not yet tried at all */ | |
1040 | #define TRYING 1 /* top matched, trying submatches */ | |
1041 | #define TRIED 2 /* top didn't match or submatches | |
1042 | * exhausted */ | |
1043 | ||
1044 | if (t == NULL) | |
1045 | return REG_NOMATCH; | |
1046 | assert(t->op == '|'); | |
1047 | if (v->mem[t->retry] == TRIED) | |
1048 | return caltdissect(v, t->right, begin, end); | |
1049 | ||
1050 | MDEBUG(("calt n%d\n", t->retry)); | |
1051 | assert(t->left != NULL); | |
1052 | ||
1053 | if (v->mem[t->retry] == UNTRIED) | |
1054 | { | |
1055 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
1056 | if (ISERR()) | |
1057 | return v->err; | |
1058 | if (longest(v, d, begin, end, (int *) NULL) != end) | |
1059 | { | |
1060 | freedfa(d); | |
1061 | v->mem[t->retry] = TRIED; | |
1062 | return caltdissect(v, t->right, begin, end); | |
1063 | } | |
1064 | freedfa(d); | |
1065 | MDEBUG(("calt matched\n")); | |
1066 | v->mem[t->retry] = TRYING; | |
1067 | } | |
1068 | ||
1069 | er = cdissect(v, t->left, begin, end); | |
1070 | if (er != REG_NOMATCH) | |
1071 | return er; | |
1072 | ||
1073 | v->mem[t->retry] = TRIED; | |
1074 | return caltdissect(v, t->right, begin, end); | |
1075 | } | |
1076 | ||
1077 | ||
1078 | ||
1079 | #include "rege_dfa.c" |