]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * re_*exec and friends - match REs | |
3 | * | |
4 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. | |
5 | * | |
6 | * Development of this software was funded, in part, by Cray Research Inc., | |
7 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics | |
8 | * Corporation, none of whom are responsible for the results. The author | |
9 | * thanks all of them. | |
10 | * | |
11 | * Redistribution and use in source and binary forms -- with or without | |
12 | * modification -- are permitted for any purpose, provided that | |
13 | * redistributions in source form retain this entire copyright notice and | |
14 | * indicate the origin and nature of any modifications. | |
15 | * | |
16 | * I'd appreciate being given credit for this package in the documentation | |
17 | * of software which uses it, but that is not a requirement. | |
18 | * | |
19 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, | |
20 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |
21 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL | |
22 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
25 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
26 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
27 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
28 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | * | |
30 | * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regexec.c,v 1.23 2003/08/08 21:41:56 momjian Exp $ | |
31 | * | |
32 | */ | |
33 | ||
34 | #include "regguts.h" | |
35 | ||
36 | ||
37 | ||
38 | /* lazy-DFA representation */ | |
39 | struct arcp | |
40 | { /* "pointer" to an outarc */ | |
41 | struct sset *ss; | |
42 | color co; | |
43 | }; | |
44 | ||
45 | struct sset | |
46 | { /* state set */ | |
47 | unsigned *states; /* pointer to bitvector */ | |
48 | unsigned hash; /* hash of bitvector */ | |
49 | #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) | |
50 | #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ | |
51 | memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) | |
52 | int flags; | |
53 | #define STARTER 01 /* the initial state set */ | |
54 | #define POSTSTATE 02 /* includes the goal state */ | |
55 | #define LOCKED 04 /* locked in cache */ | |
56 | #define NOPROGRESS 010 /* zero-progress state set */ | |
57 | struct arcp ins; /* chain of inarcs pointing here */ | |
58 | chr *lastseen; /* last entered on arrival here */ | |
59 | struct sset **outs; /* outarc vector indexed by color */ | |
60 | struct arcp *inchain; /* chain-pointer vector for outarcs */ | |
61 | }; | |
62 | ||
63 | struct dfa | |
64 | { | |
65 | int nssets; /* size of cache */ | |
66 | int nssused; /* how many entries occupied yet */ | |
67 | int nstates; /* number of states */ | |
68 | int ncolors; /* length of outarc and inchain vectors */ | |
69 | int wordsper; /* length of state-set bitvectors */ | |
70 | struct sset *ssets; /* state-set cache */ | |
71 | unsigned *statesarea; /* bitvector storage */ | |
72 | unsigned *work; /* pointer to work area within statesarea */ | |
73 | struct sset **outsarea; /* outarc-vector storage */ | |
74 | struct arcp *incarea; /* inchain storage */ | |
75 | struct cnfa *cnfa; | |
76 | struct colormap *cm; | |
77 | chr *lastpost; /* location of last cache-flushed success */ | |
78 | chr *lastnopr; /* location of last cache-flushed | |
79 | * NOPROGRESS */ | |
80 | struct sset *search; /* replacement-search-pointer memory */ | |
81 | int cptsmalloced; /* were the areas individually malloced? */ | |
82 | char *mallocarea; /* self, or master malloced area, or NULL */ | |
83 | }; | |
84 | ||
85 | #define WORK 1 /* number of work bitvectors needed */ | |
86 | ||
87 | /* setup for non-malloc allocation for small cases */ | |
88 | #define FEWSTATES 20 /* must be less than UBITS */ | |
89 | #define FEWCOLORS 15 | |
90 | struct smalldfa | |
91 | { | |
92 | struct dfa dfa; | |
93 | struct sset ssets[FEWSTATES * 2]; | |
94 | unsigned statesarea[FEWSTATES * 2 + WORK]; | |
95 | struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS]; | |
96 | struct arcp incarea[FEWSTATES * 2 * FEWCOLORS]; | |
97 | }; | |
98 | ||
99 | #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ | |
100 | ||
101 | ||
102 | ||
103 | /* internal variables, bundled for easy passing around */ | |
104 | struct vars | |
105 | { | |
106 | regex_t *re; | |
107 | struct guts *g; | |
108 | int eflags; /* copies of arguments */ | |
109 | size_t nmatch; | |
110 | regmatch_t *pmatch; | |
111 | rm_detail_t *details; | |
112 | chr *start; /* start of string */ | |
113 | chr *stop; /* just past end of string */ | |
114 | int err; /* error code if any (0 none) */ | |
115 | regoff_t *mem; /* memory vector for backtracking */ | |
116 | struct smalldfa dfa1; | |
117 | struct smalldfa dfa2; | |
118 | }; | |
119 | ||
120 | #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ | |
121 | #define ISERR() VISERR(v) | |
122 | #define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e))) | |
123 | #define ERR(e) VERR(v, e) /* record an error */ | |
124 | #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return | |
125 | * it */ | |
126 | #define OFF(p) ((p) - v->start) | |
127 | #define LOFF(p) ((long)OFF(p)) | |
128 | ||
129 | ||
130 | ||
131 | /* | |
132 | * forward declarations | |
133 | */ | |
134 | /* === regexec.c === */ | |
135 | static int find(struct vars *, struct cnfa *, struct colormap *); | |
136 | static int cfind(struct vars *, struct cnfa *, struct colormap *); | |
137 | static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); | |
138 | static void zapsubs(regmatch_t *, size_t); | |
139 | static void zapmem(struct vars *, struct subre *); | |
140 | static void subset(struct vars *, struct subre *, chr *, chr *); | |
141 | static int dissect(struct vars *, struct subre *, chr *, chr *); | |
142 | static int condissect(struct vars *, struct subre *, chr *, chr *); | |
143 | static int altdissect(struct vars *, struct subre *, chr *, chr *); | |
144 | static int cdissect(struct vars *, struct subre *, chr *, chr *); | |
145 | static int ccondissect(struct vars *, struct subre *, chr *, chr *); | |
146 | static int crevdissect(struct vars *, struct subre *, chr *, chr *); | |
147 | static int cbrdissect(struct vars *, struct subre *, chr *, chr *); | |
148 | static int caltdissect(struct vars *, struct subre *, chr *, chr *); | |
149 | ||
150 | /* === rege_dfa.c === */ | |
151 | static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); | |
152 | static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); | |
153 | static chr *lastcold(struct vars *, struct dfa *); | |
154 | static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); | |
155 | static void freedfa(struct dfa *); | |
156 | static unsigned hash(unsigned *, int); | |
157 | static struct sset *initialize(struct vars *, struct dfa *, chr *); | |
158 | static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *); | |
159 | static int lacon(struct vars *, struct cnfa *, chr *, pcolor); | |
160 | static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); | |
161 | static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); | |
162 | ||
163 | ||
164 | /* | |
165 | * regexec - match regular expression | |
166 | */ | |
167 | int | |
168 | regexec(regex_t *re, | |
169 | const chr *string, | |
170 | size_t nmatch, | |
171 | regmatch_t pmatch[], | |
172 | int flags) | |
173 | { | |
174 | rm_detail_t det; | |
175 | size_t nLen = 0; | |
176 | chr* s2 = (chr*) string; | |
177 | ||
178 | if (string && *string) | |
179 | { | |
180 | while(*++s2); | |
181 | } | |
182 | ||
183 | nLen = ((s2 - string) / sizeof(chr)); | |
184 | ||
185 | return wx_regexec(re, string, nLen, &det, nmatch, pmatch, flags); | |
186 | } | |
187 | int | |
188 | wx_regexec(regex_t *re, | |
189 | const chr *string, | |
190 | size_t len, | |
191 | rm_detail_t *details, | |
192 | size_t nmatch, | |
193 | regmatch_t pmatch[], | |
194 | int flags) | |
195 | { | |
196 | struct vars var; | |
197 | register struct vars *v = &var; | |
198 | int st; | |
199 | size_t n; | |
200 | int backref; | |
201 | ||
202 | #define LOCALMAT 20 | |
203 | regmatch_t mat[LOCALMAT]; | |
204 | ||
205 | #define LOCALMEM 40 | |
206 | regoff_t mem[LOCALMEM]; | |
207 | ||
208 | /* sanity checks */ | |
209 | if (re == NULL || string == NULL || re->re_magic != REMAGIC) | |
210 | return REG_INVARG; | |
211 | if (re->re_csize != sizeof(chr)) | |
212 | return REG_MIXED; | |
213 | ||
214 | /* setup */ | |
215 | v->re = re; | |
216 | v->g = (struct guts *) re->re_guts; | |
217 | if ((v->g->cflags & REG_EXPECT) && details == NULL) | |
218 | return REG_INVARG; | |
219 | if (v->g->info & REG_UIMPOSSIBLE) | |
220 | return REG_NOMATCH; | |
221 | backref = (v->g->info & REG_UBACKREF) ? 1 : 0; | |
222 | v->eflags = flags; | |
223 | if (v->g->cflags & REG_NOSUB) | |
224 | nmatch = 0; /* override client */ | |
225 | v->nmatch = nmatch; | |
226 | if (backref) | |
227 | { | |
228 | /* need work area */ | |
229 | if (v->g->nsub + 1 <= LOCALMAT) | |
230 | v->pmatch = mat; | |
231 | else | |
232 | v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) * | |
233 | sizeof(regmatch_t)); | |
234 | if (v->pmatch == NULL) | |
235 | return REG_ESPACE; | |
236 | v->nmatch = v->g->nsub + 1; | |
237 | } | |
238 | else | |
239 | v->pmatch = pmatch; | |
240 | v->details = details; | |
241 | v->start = (chr *) string; | |
242 | v->stop = (chr *) string + len; | |
243 | v->err = 0; | |
244 | if (backref) | |
245 | { | |
246 | /* need retry memory */ | |
247 | assert(v->g->ntree >= 0); | |
248 | n = (size_t) v->g->ntree; | |
249 | if (n <= LOCALMEM) | |
250 | v->mem = mem; | |
251 | else | |
252 | v->mem = (regoff_t *) MALLOC(n * sizeof(regoff_t)); | |
253 | if (v->mem == NULL) | |
254 | { | |
255 | if (v->pmatch != pmatch && v->pmatch != mat) | |
256 | FREE(v->pmatch); | |
257 | return REG_ESPACE; | |
258 | } | |
259 | } | |
260 | else | |
261 | v->mem = NULL; | |
262 | ||
263 | /* do it */ | |
264 | assert(v->g->tree != NULL); | |
265 | if (backref) | |
266 | st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); | |
267 | else | |
268 | st = find(v, &v->g->tree->cnfa, &v->g->cmap); | |
269 | ||
270 | /* copy (portion of) match vector over if necessary */ | |
271 | if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) | |
272 | { | |
273 | zapsubs(pmatch, nmatch); | |
274 | n = (nmatch < v->nmatch) ? nmatch : v->nmatch; | |
275 | memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t)); | |
276 | } | |
277 | ||
278 | /* clean up */ | |
279 | if (v->pmatch != pmatch && v->pmatch != mat) | |
280 | FREE(v->pmatch); | |
281 | if (v->mem != NULL && v->mem != mem) | |
282 | FREE(v->mem); | |
283 | return st; | |
284 | } | |
285 | ||
286 | /* | |
287 | * find - find a match for the main NFA (no-complications case) | |
288 | */ | |
289 | static int | |
290 | find(struct vars * v, | |
291 | struct cnfa * cnfa, | |
292 | struct colormap * cm) | |
293 | { | |
294 | struct dfa *s; | |
295 | struct dfa *d; | |
296 | chr *begin; | |
297 | chr *end = NULL; | |
298 | chr *cold; | |
299 | chr *open; /* open and close of range of possible | |
300 | * starts */ | |
301 | chr *close; | |
302 | int hitend; | |
303 | int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0; | |
304 | ||
305 | /* first, a shot with the search RE */ | |
306 | s = newdfa(v, &v->g->search, cm, &v->dfa1); | |
307 | assert(!(ISERR() && s != NULL)); | |
308 | NOERR(); | |
309 | MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); | |
310 | cold = NULL; | |
311 | close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL); | |
312 | freedfa(s); | |
313 | NOERR(); | |
314 | if (v->g->cflags & REG_EXPECT) | |
315 | { | |
316 | assert(v->details != NULL); | |
317 | if (cold != NULL) | |
318 | v->details->rm_extend.rm_so = OFF(cold); | |
319 | else | |
320 | v->details->rm_extend.rm_so = OFF(v->stop); | |
321 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
322 | } | |
323 | if (close == NULL) /* not found */ | |
324 | return REG_NOMATCH; | |
325 | if (v->nmatch == 0) /* found, don't need exact location */ | |
326 | return REG_OKAY; | |
327 | ||
328 | /* find starting point and match */ | |
329 | assert(cold != NULL); | |
330 | open = cold; | |
331 | cold = NULL; | |
332 | MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); | |
333 | d = newdfa(v, cnfa, cm, &v->dfa1); | |
334 | assert(!(ISERR() && d != NULL)); | |
335 | NOERR(); | |
336 | for (begin = open; begin <= close; begin++) | |
337 | { | |
338 | MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); | |
339 | if (shorter) | |
340 | end = shortest(v, d, begin, begin, v->stop, | |
341 | (chr **) NULL, &hitend); | |
342 | else | |
343 | end = longest(v, d, begin, v->stop, &hitend); | |
344 | NOERR(); | |
345 | if (hitend && cold == NULL) | |
346 | cold = begin; | |
347 | if (end != NULL) | |
348 | break; /* NOTE BREAK OUT */ | |
349 | } | |
350 | assert(end != NULL); /* search RE succeeded so loop should */ | |
351 | freedfa(d); | |
352 | ||
353 | /* and pin down details */ | |
354 | assert(v->nmatch > 0); | |
355 | v->pmatch[0].rm_so = OFF(begin); | |
356 | v->pmatch[0].rm_eo = OFF(end); | |
357 | if (v->g->cflags & REG_EXPECT) | |
358 | { | |
359 | if (cold != NULL) | |
360 | v->details->rm_extend.rm_so = OFF(cold); | |
361 | else | |
362 | v->details->rm_extend.rm_so = OFF(v->stop); | |
363 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
364 | } | |
365 | if (v->nmatch == 1) /* no need for submatches */ | |
366 | return REG_OKAY; | |
367 | ||
368 | /* submatches */ | |
369 | zapsubs(v->pmatch, v->nmatch); | |
370 | return dissect(v, v->g->tree, begin, end); | |
371 | } | |
372 | ||
373 | /* | |
374 | * cfind - find a match for the main NFA (with complications) | |
375 | */ | |
376 | static int | |
377 | cfind(struct vars * v, | |
378 | struct cnfa * cnfa, | |
379 | struct colormap * cm) | |
380 | { | |
381 | struct dfa *s; | |
382 | struct dfa *d; | |
383 | chr *cold; | |
384 | int ret; | |
385 | ||
386 | s = newdfa(v, &v->g->search, cm, &v->dfa1); | |
387 | NOERR(); | |
388 | d = newdfa(v, cnfa, cm, &v->dfa2); | |
389 | if (ISERR()) | |
390 | { | |
391 | assert(d == NULL); | |
392 | freedfa(s); | |
393 | return v->err; | |
394 | } | |
395 | ||
396 | ret = cfindloop(v, cnfa, cm, d, s, &cold); | |
397 | ||
398 | freedfa(d); | |
399 | freedfa(s); | |
400 | NOERR(); | |
401 | if (v->g->cflags & REG_EXPECT) | |
402 | { | |
403 | assert(v->details != NULL); | |
404 | if (cold != NULL) | |
405 | v->details->rm_extend.rm_so = OFF(cold); | |
406 | else | |
407 | v->details->rm_extend.rm_so = OFF(v->stop); | |
408 | v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ | |
409 | } | |
410 | return ret; | |
411 | } | |
412 | ||
413 | /* | |
414 | * cfindloop - the heart of cfind | |
415 | */ | |
416 | static int | |
417 | cfindloop(struct vars * v, | |
418 | struct cnfa * cnfa, | |
419 | struct colormap * cm, | |
420 | struct dfa * d, | |
421 | struct dfa * s, | |
422 | chr **coldp) /* where to put coldstart pointer */ | |
423 | { | |
424 | chr *begin; | |
425 | chr *end; | |
426 | chr *cold; | |
427 | chr *open; /* open and close of range of possible | |
428 | * starts */ | |
429 | chr *close; | |
430 | chr *estart; | |
431 | chr *estop; | |
432 | int er; | |
433 | int shorter = v->g->tree->flags & SHORTER; | |
434 | int hitend; | |
435 | ||
436 | assert(d != NULL && s != NULL); | |
437 | cold = NULL; | |
438 | close = v->start; | |
439 | do | |
440 | { | |
441 | MDEBUG(("\ncsearch at %ld\n", LOFF(close))); | |
442 | close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL); | |
443 | if (close == NULL) | |
444 | break; /* NOTE BREAK */ | |
445 | assert(cold != NULL); | |
446 | open = cold; | |
447 | cold = NULL; | |
448 | MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); | |
449 | for (begin = open; begin <= close; begin++) | |
450 | { | |
451 | MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); | |
452 | estart = begin; | |
453 | estop = v->stop; | |
454 | for (;;) | |
455 | { | |
456 | if (shorter) | |
457 | end = shortest(v, d, begin, estart, | |
458 | estop, (chr **) NULL, &hitend); | |
459 | else | |
460 | end = longest(v, d, begin, estop, | |
461 | &hitend); | |
462 | if (hitend && cold == NULL) | |
463 | cold = begin; | |
464 | if (end == NULL) | |
465 | break; /* NOTE BREAK OUT */ | |
466 | MDEBUG(("tentative end %ld\n", LOFF(end))); | |
467 | zapsubs(v->pmatch, v->nmatch); | |
468 | zapmem(v, v->g->tree); | |
469 | er = cdissect(v, v->g->tree, begin, end); | |
470 | if (er == REG_OKAY) | |
471 | { | |
472 | if (v->nmatch > 0) | |
473 | { | |
474 | v->pmatch[0].rm_so = OFF(begin); | |
475 | v->pmatch[0].rm_eo = OFF(end); | |
476 | } | |
477 | *coldp = cold; | |
478 | return REG_OKAY; | |
479 | } | |
480 | if (er != REG_NOMATCH) | |
481 | { | |
482 | ERR(er); | |
483 | return er; | |
484 | } | |
485 | if ((shorter) ? end == estop : end == begin) | |
486 | { | |
487 | /* no point in trying again */ | |
488 | *coldp = cold; | |
489 | return REG_NOMATCH; | |
490 | } | |
491 | /* go around and try again */ | |
492 | if (shorter) | |
493 | estart = end + 1; | |
494 | else | |
495 | estop = end - 1; | |
496 | } | |
497 | } | |
498 | } while (close < v->stop); | |
499 | ||
500 | *coldp = cold; | |
501 | return REG_NOMATCH; | |
502 | } | |
503 | ||
504 | /* | |
505 | * zapsubs - initialize the subexpression matches to "no match" | |
506 | */ | |
507 | static void | |
508 | zapsubs(regmatch_t *p, | |
509 | size_t n) | |
510 | { | |
511 | size_t i; | |
512 | ||
513 | for (i = n - 1; i > 0; i--) | |
514 | { | |
515 | p[i].rm_so = -1; | |
516 | p[i].rm_eo = -1; | |
517 | } | |
518 | } | |
519 | ||
520 | /* | |
521 | * zapmem - initialize the retry memory of a subtree to zeros | |
522 | */ | |
523 | static void | |
524 | zapmem(struct vars * v, | |
525 | struct subre * t) | |
526 | { | |
527 | if (t == NULL) | |
528 | return; | |
529 | ||
530 | assert(v->mem != NULL); | |
531 | v->mem[t->retry] = 0; | |
532 | if (t->op == '(') | |
533 | { | |
534 | assert(t->subno > 0); | |
535 | v->pmatch[t->subno].rm_so = -1; | |
536 | v->pmatch[t->subno].rm_eo = -1; | |
537 | } | |
538 | ||
539 | if (t->left != NULL) | |
540 | zapmem(v, t->left); | |
541 | if (t->right != NULL) | |
542 | zapmem(v, t->right); | |
543 | } | |
544 | ||
545 | /* | |
546 | * subset - set any subexpression relevant to a successful subre | |
547 | */ | |
548 | static void | |
549 | subset(struct vars * v, | |
550 | struct subre * sub, | |
551 | chr *begin, | |
552 | chr *end) | |
553 | { | |
554 | int n = sub->subno; | |
555 | ||
556 | assert(n > 0); | |
557 | if ((size_t) n >= v->nmatch) | |
558 | return; | |
559 | ||
560 | MDEBUG(("setting %d\n", n)); | |
561 | v->pmatch[n].rm_so = OFF(begin); | |
562 | v->pmatch[n].rm_eo = OFF(end); | |
563 | } | |
564 | ||
565 | /* | |
566 | * dissect - determine subexpression matches (uncomplicated case) | |
567 | */ | |
568 | static int /* regexec return code */ | |
569 | dissect(struct vars * v, | |
570 | struct subre * t, | |
571 | chr *begin, /* beginning of relevant substring */ | |
572 | chr *end) /* end of same */ | |
573 | { | |
574 | assert(t != NULL); | |
575 | MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); | |
576 | ||
577 | switch (t->op) | |
578 | { | |
579 | case '=': /* terminal node */ | |
580 | assert(t->left == NULL && t->right == NULL); | |
581 | return REG_OKAY; /* no action, parent did the work */ | |
582 | break; | |
583 | case '|': /* alternation */ | |
584 | assert(t->left != NULL); | |
585 | return altdissect(v, t, begin, end); | |
586 | break; | |
587 | case 'b': /* back ref -- shouldn't be calling us! */ | |
588 | return REG_ASSERT; | |
589 | break; | |
590 | case '.': /* concatenation */ | |
591 | assert(t->left != NULL && t->right != NULL); | |
592 | return condissect(v, t, begin, end); | |
593 | break; | |
594 | case '(': /* capturing */ | |
595 | assert(t->left != NULL && t->right == NULL); | |
596 | assert(t->subno > 0); | |
597 | subset(v, t, begin, end); | |
598 | return dissect(v, t->left, begin, end); | |
599 | break; | |
600 | default: | |
601 | return REG_ASSERT; | |
602 | break; | |
603 | } | |
604 | } | |
605 | ||
606 | /* | |
607 | * condissect - determine concatenation subexpression matches (uncomplicated) | |
608 | */ | |
609 | static int /* regexec return code */ | |
610 | condissect(struct vars * v, | |
611 | struct subre * t, | |
612 | chr *begin, /* beginning of relevant substring */ | |
613 | chr *end) /* end of same */ | |
614 | { | |
615 | struct dfa *d; | |
616 | struct dfa *d2; | |
617 | chr *mid; | |
618 | int i; | |
619 | int shorter = (t->left->flags & SHORTER) ? 1 : 0; | |
620 | chr *stop = (shorter) ? end : begin; | |
621 | ||
622 | assert(t->op == '.'); | |
623 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
624 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
625 | ||
626 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); | |
627 | NOERR(); | |
628 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); | |
629 | if (ISERR()) | |
630 | { | |
631 | assert(d2 == NULL); | |
632 | freedfa(d); | |
633 | return v->err; | |
634 | } | |
635 | ||
636 | /* pick a tentative midpoint */ | |
637 | if (shorter) | |
638 | mid = shortest(v, d, begin, begin, end, (chr **) NULL, | |
639 | (int *) NULL); | |
640 | else | |
641 | mid = longest(v, d, begin, end, (int *) NULL); | |
642 | if (mid == NULL) | |
643 | { | |
644 | freedfa(d); | |
645 | freedfa(d2); | |
646 | return REG_ASSERT; | |
647 | } | |
648 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
649 | ||
650 | /* iterate until satisfaction or failure */ | |
651 | while (longest(v, d2, mid, end, (int *) NULL) != end) | |
652 | { | |
653 | /* that midpoint didn't work, find a new one */ | |
654 | if (mid == stop) | |
655 | { | |
656 | /* all possibilities exhausted! */ | |
657 | MDEBUG(("no midpoint!\n")); | |
658 | freedfa(d); | |
659 | freedfa(d2); | |
660 | return REG_ASSERT; | |
661 | } | |
662 | if (shorter) | |
663 | mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, | |
664 | (int *) NULL); | |
665 | else | |
666 | mid = longest(v, d, begin, mid - 1, (int *) NULL); | |
667 | if (mid == NULL) | |
668 | { | |
669 | /* failed to find a new one! */ | |
670 | MDEBUG(("failed midpoint!\n")); | |
671 | freedfa(d); | |
672 | freedfa(d2); | |
673 | return REG_ASSERT; | |
674 | } | |
675 | MDEBUG(("new midpoint %ld\n", LOFF(mid))); | |
676 | } | |
677 | ||
678 | /* satisfaction */ | |
679 | MDEBUG(("successful\n")); | |
680 | freedfa(d); | |
681 | freedfa(d2); | |
682 | i = dissect(v, t->left, begin, mid); | |
683 | if (i != REG_OKAY) | |
684 | return i; | |
685 | return dissect(v, t->right, mid, end); | |
686 | } | |
687 | ||
688 | /* | |
689 | * altdissect - determine alternative subexpression matches (uncomplicated) | |
690 | */ | |
691 | static int /* regexec return code */ | |
692 | altdissect(struct vars * v, | |
693 | struct subre * t, | |
694 | chr *begin, /* beginning of relevant substring */ | |
695 | chr *end) /* end of same */ | |
696 | { | |
697 | struct dfa *d; | |
698 | int i; | |
699 | ||
700 | assert(t != NULL); | |
701 | assert(t->op == '|'); | |
702 | ||
703 | for (i = 0; t != NULL; t = t->right, i++) | |
704 | { | |
705 | MDEBUG(("trying %dth\n", i)); | |
706 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
707 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); | |
708 | if (ISERR()) | |
709 | return v->err; | |
710 | if (longest(v, d, begin, end, (int *) NULL) == end) | |
711 | { | |
712 | MDEBUG(("success\n")); | |
713 | freedfa(d); | |
714 | return dissect(v, t->left, begin, end); | |
715 | } | |
716 | freedfa(d); | |
717 | } | |
718 | return REG_ASSERT; /* none of them matched?!? */ | |
719 | } | |
720 | ||
721 | /* | |
722 | * cdissect - determine subexpression matches (with complications) | |
723 | * The retry memory stores the offset of the trial midpoint from begin, | |
724 | * plus 1 so that 0 uniquely means "clean slate". | |
725 | */ | |
726 | static int /* regexec return code */ | |
727 | cdissect(struct vars * v, | |
728 | struct subre * t, | |
729 | chr *begin, /* beginning of relevant substring */ | |
730 | chr *end) /* end of same */ | |
731 | { | |
732 | int er; | |
733 | ||
734 | assert(t != NULL); | |
735 | MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); | |
736 | ||
737 | switch (t->op) | |
738 | { | |
739 | case '=': /* terminal node */ | |
740 | assert(t->left == NULL && t->right == NULL); | |
741 | return REG_OKAY; /* no action, parent did the work */ | |
742 | break; | |
743 | case '|': /* alternation */ | |
744 | assert(t->left != NULL); | |
745 | return caltdissect(v, t, begin, end); | |
746 | break; | |
747 | case 'b': /* back ref -- shouldn't be calling us! */ | |
748 | assert(t->left == NULL && t->right == NULL); | |
749 | return cbrdissect(v, t, begin, end); | |
750 | break; | |
751 | case '.': /* concatenation */ | |
752 | assert(t->left != NULL && t->right != NULL); | |
753 | return ccondissect(v, t, begin, end); | |
754 | break; | |
755 | case '(': /* capturing */ | |
756 | assert(t->left != NULL && t->right == NULL); | |
757 | assert(t->subno > 0); | |
758 | er = cdissect(v, t->left, begin, end); | |
759 | if (er == REG_OKAY) | |
760 | subset(v, t, begin, end); | |
761 | return er; | |
762 | break; | |
763 | default: | |
764 | return REG_ASSERT; | |
765 | break; | |
766 | } | |
767 | } | |
768 | ||
769 | /* | |
770 | * ccondissect - concatenation subexpression matches (with complications) | |
771 | * The retry memory stores the offset of the trial midpoint from begin, | |
772 | * plus 1 so that 0 uniquely means "clean slate". | |
773 | */ | |
774 | static int /* regexec return code */ | |
775 | ccondissect(struct vars * v, | |
776 | struct subre * t, | |
777 | chr *begin, /* beginning of relevant substring */ | |
778 | chr *end) /* end of same */ | |
779 | { | |
780 | struct dfa *d; | |
781 | struct dfa *d2; | |
782 | chr *mid; | |
783 | int er; | |
784 | ||
785 | assert(t->op == '.'); | |
786 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
787 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
788 | ||
789 | if (t->left->flags & SHORTER) /* reverse scan */ | |
790 | return crevdissect(v, t, begin, end); | |
791 | ||
792 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
793 | if (ISERR()) | |
794 | return v->err; | |
795 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); | |
796 | if (ISERR()) | |
797 | { | |
798 | freedfa(d); | |
799 | return v->err; | |
800 | } | |
801 | MDEBUG(("cconcat %d\n", t->retry)); | |
802 | ||
803 | /* pick a tentative midpoint */ | |
804 | if (v->mem[t->retry] == 0) | |
805 | { | |
806 | mid = longest(v, d, begin, end, (int *) NULL); | |
807 | if (mid == NULL) | |
808 | { | |
809 | freedfa(d); | |
810 | freedfa(d2); | |
811 | return REG_NOMATCH; | |
812 | } | |
813 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
814 | v->mem[t->retry] = (mid - begin) + 1; | |
815 | } | |
816 | else | |
817 | { | |
818 | mid = begin + (v->mem[t->retry] - 1); | |
819 | MDEBUG(("working midpoint %ld\n", LOFF(mid))); | |
820 | } | |
821 | ||
822 | /* iterate until satisfaction or failure */ | |
823 | for (;;) | |
824 | { | |
825 | /* try this midpoint on for size */ | |
826 | er = cdissect(v, t->left, begin, mid); | |
827 | if (er == REG_OKAY && | |
828 | longest(v, d2, mid, end, (int *) NULL) == end && | |
829 | (er = cdissect(v, t->right, mid, end)) == | |
830 | REG_OKAY) | |
831 | break; /* NOTE BREAK OUT */ | |
832 | if (er != REG_OKAY && er != REG_NOMATCH) | |
833 | { | |
834 | freedfa(d); | |
835 | freedfa(d2); | |
836 | return er; | |
837 | } | |
838 | ||
839 | /* that midpoint didn't work, find a new one */ | |
840 | if (mid == begin) | |
841 | { | |
842 | /* all possibilities exhausted */ | |
843 | MDEBUG(("%d no midpoint\n", t->retry)); | |
844 | freedfa(d); | |
845 | freedfa(d2); | |
846 | return REG_NOMATCH; | |
847 | } | |
848 | mid = longest(v, d, begin, mid - 1, (int *) NULL); | |
849 | if (mid == NULL) | |
850 | { | |
851 | /* failed to find a new one */ | |
852 | MDEBUG(("%d failed midpoint\n", t->retry)); | |
853 | freedfa(d); | |
854 | freedfa(d2); | |
855 | return REG_NOMATCH; | |
856 | } | |
857 | MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); | |
858 | v->mem[t->retry] = (mid - begin) + 1; | |
859 | zapmem(v, t->left); | |
860 | zapmem(v, t->right); | |
861 | } | |
862 | ||
863 | /* satisfaction */ | |
864 | MDEBUG(("successful\n")); | |
865 | freedfa(d); | |
866 | freedfa(d2); | |
867 | return REG_OKAY; | |
868 | } | |
869 | ||
870 | /* | |
871 | * crevdissect - determine backref shortest-first subexpression matches | |
872 | * The retry memory stores the offset of the trial midpoint from begin, | |
873 | * plus 1 so that 0 uniquely means "clean slate". | |
874 | */ | |
875 | static int /* regexec return code */ | |
876 | crevdissect(struct vars * v, | |
877 | struct subre * t, | |
878 | chr *begin, /* beginning of relevant substring */ | |
879 | chr *end) /* end of same */ | |
880 | { | |
881 | struct dfa *d; | |
882 | struct dfa *d2; | |
883 | chr *mid; | |
884 | int er; | |
885 | ||
886 | assert(t->op == '.'); | |
887 | assert(t->left != NULL && t->left->cnfa.nstates > 0); | |
888 | assert(t->right != NULL && t->right->cnfa.nstates > 0); | |
889 | assert(t->left->flags & SHORTER); | |
890 | ||
891 | /* concatenation -- need to split the substring between parts */ | |
892 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
893 | if (ISERR()) | |
894 | return v->err; | |
895 | d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); | |
896 | if (ISERR()) | |
897 | { | |
898 | freedfa(d); | |
899 | return v->err; | |
900 | } | |
901 | MDEBUG(("crev %d\n", t->retry)); | |
902 | ||
903 | /* pick a tentative midpoint */ | |
904 | if (v->mem[t->retry] == 0) | |
905 | { | |
906 | mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); | |
907 | if (mid == NULL) | |
908 | { | |
909 | freedfa(d); | |
910 | freedfa(d2); | |
911 | return REG_NOMATCH; | |
912 | } | |
913 | MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); | |
914 | v->mem[t->retry] = (mid - begin) + 1; | |
915 | } | |
916 | else | |
917 | { | |
918 | mid = begin + (v->mem[t->retry] - 1); | |
919 | MDEBUG(("working midpoint %ld\n", LOFF(mid))); | |
920 | } | |
921 | ||
922 | /* iterate until satisfaction or failure */ | |
923 | for (;;) | |
924 | { | |
925 | /* try this midpoint on for size */ | |
926 | er = cdissect(v, t->left, begin, mid); | |
927 | if (er == REG_OKAY && | |
928 | longest(v, d2, mid, end, (int *) NULL) == end && | |
929 | (er = cdissect(v, t->right, mid, end)) == | |
930 | REG_OKAY) | |
931 | break; /* NOTE BREAK OUT */ | |
932 | if (er != REG_OKAY && er != REG_NOMATCH) | |
933 | { | |
934 | freedfa(d); | |
935 | freedfa(d2); | |
936 | return er; | |
937 | } | |
938 | ||
939 | /* that midpoint didn't work, find a new one */ | |
940 | if (mid == end) | |
941 | { | |
942 | /* all possibilities exhausted */ | |
943 | MDEBUG(("%d no midpoint\n", t->retry)); | |
944 | freedfa(d); | |
945 | freedfa(d2); | |
946 | return REG_NOMATCH; | |
947 | } | |
948 | mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); | |
949 | if (mid == NULL) | |
950 | { | |
951 | /* failed to find a new one */ | |
952 | MDEBUG(("%d failed midpoint\n", t->retry)); | |
953 | freedfa(d); | |
954 | freedfa(d2); | |
955 | return REG_NOMATCH; | |
956 | } | |
957 | MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); | |
958 | v->mem[t->retry] = (mid - begin) + 1; | |
959 | zapmem(v, t->left); | |
960 | zapmem(v, t->right); | |
961 | } | |
962 | ||
963 | /* satisfaction */ | |
964 | MDEBUG(("successful\n")); | |
965 | freedfa(d); | |
966 | freedfa(d2); | |
967 | return REG_OKAY; | |
968 | } | |
969 | ||
970 | /* | |
971 | * cbrdissect - determine backref subexpression matches | |
972 | */ | |
973 | static int /* regexec return code */ | |
974 | cbrdissect(struct vars * v, | |
975 | struct subre * t, | |
976 | chr *begin, /* beginning of relevant substring */ | |
977 | chr *end) /* end of same */ | |
978 | { | |
979 | int i; | |
980 | int n = t->subno; | |
981 | size_t len; | |
982 | chr *paren; | |
983 | chr *p; | |
984 | chr *stop; | |
985 | int min = t->min; | |
986 | int max = t->max; | |
987 | ||
988 | assert(t != NULL); | |
989 | assert(t->op == 'b'); | |
990 | assert(n >= 0); | |
991 | assert((size_t) n < v->nmatch); | |
992 | ||
993 | MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); | |
994 | ||
995 | if (v->pmatch[n].rm_so == -1) | |
996 | return REG_NOMATCH; | |
997 | paren = v->start + v->pmatch[n].rm_so; | |
998 | len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; | |
999 | ||
1000 | /* no room to maneuver -- retries are pointless */ | |
1001 | if (v->mem[t->retry]) | |
1002 | return REG_NOMATCH; | |
1003 | v->mem[t->retry] = 1; | |
1004 | ||
1005 | /* special-case zero-length string */ | |
1006 | if (len == 0) | |
1007 | { | |
1008 | if (begin == end) | |
1009 | return REG_OKAY; | |
1010 | return REG_NOMATCH; | |
1011 | } | |
1012 | ||
1013 | /* and too-short string */ | |
1014 | assert(end >= begin); | |
1015 | if ((size_t) (end - begin) < len) | |
1016 | return REG_NOMATCH; | |
1017 | stop = end - len; | |
1018 | ||
1019 | /* count occurrences */ | |
1020 | i = 0; | |
1021 | for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) | |
1022 | { | |
1023 | if ((*v->g->compare) (paren, p, len) != 0) | |
1024 | break; | |
1025 | i++; | |
1026 | } | |
1027 | MDEBUG(("cbackref found %d\n", i)); | |
1028 | ||
1029 | /* and sort it out */ | |
1030 | if (p != end) /* didn't consume all of it */ | |
1031 | return REG_NOMATCH; | |
1032 | if (min <= i && (i <= max || max == INFINITY)) | |
1033 | return REG_OKAY; | |
1034 | return REG_NOMATCH; /* out of range */ | |
1035 | } | |
1036 | ||
1037 | /* | |
1038 | * caltdissect - determine alternative subexpression matches (w. complications) | |
1039 | */ | |
1040 | static int /* regexec return code */ | |
1041 | caltdissect(struct vars * v, | |
1042 | struct subre * t, | |
1043 | chr *begin, /* beginning of relevant substring */ | |
1044 | chr *end) /* end of same */ | |
1045 | { | |
1046 | struct dfa *d; | |
1047 | int er; | |
1048 | ||
1049 | #define UNTRIED 0 /* not yet tried at all */ | |
1050 | #define TRYING 1 /* top matched, trying submatches */ | |
1051 | #define TRIED 2 /* top didn't match or submatches | |
1052 | * exhausted */ | |
1053 | ||
1054 | if (t == NULL) | |
1055 | return REG_NOMATCH; | |
1056 | assert(t->op == '|'); | |
1057 | if (v->mem[t->retry] == TRIED) | |
1058 | return caltdissect(v, t->right, begin, end); | |
1059 | ||
1060 | MDEBUG(("calt n%d\n", t->retry)); | |
1061 | assert(t->left != NULL); | |
1062 | ||
1063 | if (v->mem[t->retry] == UNTRIED) | |
1064 | { | |
1065 | d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); | |
1066 | if (ISERR()) | |
1067 | return v->err; | |
1068 | if (longest(v, d, begin, end, (int *) NULL) != end) | |
1069 | { | |
1070 | freedfa(d); | |
1071 | v->mem[t->retry] = TRIED; | |
1072 | return caltdissect(v, t->right, begin, end); | |
1073 | } | |
1074 | freedfa(d); | |
1075 | MDEBUG(("calt matched\n")); | |
1076 | v->mem[t->retry] = TRYING; | |
1077 | } | |
1078 | ||
1079 | er = cdissect(v, t->left, begin, end); | |
1080 | if (er != REG_NOMATCH) | |
1081 | return er; | |
1082 | ||
1083 | v->mem[t->retry] = TRIED; | |
1084 | return caltdissect(v, t->right, begin, end); | |
1085 | } | |
1086 | ||
1087 | ||
1088 | ||
1089 | #include "rege_dfa.c" |