]>
Commit | Line | Data |
---|---|---|
ad3c9f2a A |
1 | /* |
2 | tre-match-backtrack.c - TRE backtracking regex matching engine | |
3 | ||
4 | This software is released under a BSD-style license. | |
5 | See the file LICENSE for details and copyright. | |
6 | ||
7 | */ | |
8 | ||
9 | /* | |
10 | This matcher is for regexps that use back referencing. Regexp matching | |
11 | with back referencing is an NP-complete problem on the number of back | |
12 | references. The easiest way to match them is to use a backtracking | |
13 | routine which basically goes through all possible paths in the TNFA | |
14 | and chooses the one which results in the best (leftmost and longest) | |
15 | match. This can be spectacularly expensive and may run out of stack | |
16 | space, but there really is no better known generic algorithm. Quoting | |
17 | Henry Spencer from comp.compilers: | |
18 | <URL: http://compilers.iecc.com/comparch/article/93-03-102> | |
19 | ||
20 | POSIX.2 REs require longest match, which is really exciting to | |
21 | implement since the obsolete ("basic") variant also includes | |
22 | \<digit>. I haven't found a better way of tackling this than doing | |
23 | a preliminary match using a DFA (or simulation) on a modified RE | |
24 | that just replicates subREs for \<digit>, and then doing a | |
25 | backtracking match to determine whether the subRE matches were | |
26 | right. This can be rather slow, but I console myself with the | |
27 | thought that people who use \<digit> deserve very slow execution. | |
28 | (Pun unintentional but very appropriate.) | |
29 | ||
30 | */ | |
31 | ||
32 | ||
33 | #ifdef HAVE_CONFIG_H | |
34 | #include <config.h> | |
35 | #endif /* HAVE_CONFIG_H */ | |
36 | ||
37 | /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state | |
38 | info while running */ | |
39 | #undef TRE_USE_ALLOCA | |
40 | ||
41 | #ifdef TRE_USE_ALLOCA | |
42 | /* AIX requires this to be the first thing in the file. */ | |
43 | #ifndef __GNUC__ | |
44 | # if HAVE_ALLOCA_H | |
45 | # include <alloca.h> | |
46 | # else | |
47 | # ifdef _AIX | |
48 | #pragma alloca | |
49 | # else | |
50 | # ifndef alloca /* predefined by HP cc +Olibcalls */ | |
51 | char *alloca (); | |
52 | # endif | |
53 | # endif | |
54 | # endif | |
55 | #endif | |
56 | #endif /* TRE_USE_ALLOCA */ | |
57 | ||
58 | #include <assert.h> | |
59 | #include <stdlib.h> | |
60 | #include <string.h> | |
61 | #ifdef HAVE_WCHAR_H | |
62 | #include <wchar.h> | |
63 | #endif /* HAVE_WCHAR_H */ | |
64 | #ifdef HAVE_WCTYPE_H | |
65 | #include <wctype.h> | |
66 | #endif /* HAVE_WCTYPE_H */ | |
67 | #ifndef TRE_WCHAR | |
68 | #include <ctype.h> | |
69 | #endif /* !TRE_WCHAR */ | |
70 | #ifdef HAVE_MALLOC_H | |
71 | #include <malloc.h> | |
72 | #endif /* HAVE_MALLOC_H */ | |
73 | ||
74 | #include "tre-internal.h" | |
75 | #include "tre-mem.h" | |
76 | #include "tre-match-utils.h" | |
77 | #include "tre.h" | |
78 | #include "xmalloc.h" | |
79 | ||
80 | typedef struct { | |
81 | int pos; | |
82 | unsigned int pos_add_next; | |
83 | const char *str_byte; | |
84 | #ifdef TRE_WCHAR | |
85 | const wchar_t *str_wide; | |
86 | #endif /* TRE_WCHAR */ | |
87 | tre_tnfa_transition_t *state; | |
88 | int state_id; | |
89 | int next_c; | |
90 | tre_tag_t *tags; | |
91 | #ifdef TRE_MBSTATE | |
92 | mbstate_t mbstate; | |
93 | #endif /* TRE_MBSTATE */ | |
94 | } tre_backtrack_item_t; | |
95 | ||
96 | typedef struct tre_backtrack_struct { | |
97 | tre_backtrack_item_t item; | |
98 | struct tre_backtrack_struct *prev; | |
99 | struct tre_backtrack_struct *next; | |
100 | } *tre_backtrack_t; | |
101 | ||
102 | #ifdef TRE_WCHAR | |
103 | #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide) | |
104 | #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide | |
105 | #else /* !TRE_WCHAR */ | |
106 | #define BT_STACK_WIDE_IN(_str_wide) | |
107 | #define BT_STACK_WIDE_OUT | |
108 | #endif /* !TRE_WCHAR */ | |
109 | ||
110 | #ifdef TRE_MBSTATE | |
111 | #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate) | |
112 | #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate | |
113 | #else /* !TRE_MBSTATE */ | |
114 | #define BT_STACK_MBSTATE_IN | |
115 | #define BT_STACK_MBSTATE_OUT | |
116 | #endif /* !TRE_MBSTATE */ | |
117 | ||
118 | ||
119 | #ifdef TRE_USE_ALLOCA | |
120 | #define tre_bt_mem_new tre_mem_newa | |
121 | #define tre_bt_mem_alloc tre_mem_alloca | |
122 | #define tre_bt_mem_destroy(obj) do { } while (0) | |
123 | #else /* !TRE_USE_ALLOCA */ | |
124 | #define tre_bt_mem_new tre_mem_new | |
125 | #define tre_bt_mem_alloc tre_mem_alloc | |
126 | #define tre_bt_mem_destroy tre_mem_destroy | |
127 | #endif /* !TRE_USE_ALLOCA */ | |
128 | ||
129 | ||
130 | #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ | |
131 | do \ | |
132 | { \ | |
133 | if (!stack->next) \ | |
134 | { \ | |
135 | tre_backtrack_t s; \ | |
136 | s = tre_bt_mem_alloc(mem, sizeof(*s)); \ | |
137 | if (!s) \ | |
138 | { \ | |
139 | tre_bt_mem_destroy(mem); \ | |
140 | if (tags) \ | |
141 | xfree(tags); \ | |
142 | if (pmatch) \ | |
143 | xfree(pmatch); \ | |
144 | if (states_seen) \ | |
145 | xfree(states_seen); \ | |
146 | return REG_ESPACE; \ | |
147 | } \ | |
148 | s->prev = stack; \ | |
149 | s->next = NULL; \ | |
150 | s->item.tags = tre_bt_mem_alloc(mem, \ | |
151 | num_tags * sizeof(*tags)); \ | |
152 | if (!s->item.tags) \ | |
153 | { \ | |
154 | tre_bt_mem_destroy(mem); \ | |
155 | if (tags) \ | |
156 | xfree(tags); \ | |
157 | if (pmatch) \ | |
158 | xfree(pmatch); \ | |
159 | if (states_seen) \ | |
160 | xfree(states_seen); \ | |
161 | return REG_ESPACE; \ | |
162 | } \ | |
163 | stack->next = s; \ | |
164 | stack = s; \ | |
165 | } \ | |
166 | else \ | |
167 | stack = stack->next; \ | |
168 | stack->item.pos = (_pos); \ | |
169 | stack->item.pos_add_next = (_pos_add_next); \ | |
170 | stack->item.str_byte = (_str_byte); \ | |
171 | BT_STACK_WIDE_IN(_str_wide); \ | |
172 | stack->item.state = (_state); \ | |
173 | stack->item.state_id = (_state_id); \ | |
174 | stack->item.next_c = (_next_c); \ | |
175 | memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \ | |
176 | BT_STACK_MBSTATE_IN; \ | |
177 | } \ | |
178 | while (/*CONSTCOND*/0) | |
179 | ||
180 | #ifdef TRE_STR_USER | |
181 | #define BT_STACK_POP() \ | |
182 | do \ | |
183 | { \ | |
184 | assert(stack->prev); \ | |
185 | pos = stack->item.pos; \ | |
186 | pos_add_next = stack->item.pos_add_next; \ | |
187 | if (type == STR_USER) \ | |
188 | str_source->rewind(pos + pos_add_next, str_source->context); \ | |
189 | str_byte = stack->item.str_byte; \ | |
190 | BT_STACK_WIDE_OUT; \ | |
191 | state = stack->item.state; \ | |
192 | next_c = stack->item.next_c; \ | |
193 | memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \ | |
194 | BT_STACK_MBSTATE_OUT; \ | |
195 | stack = stack->prev; \ | |
196 | } \ | |
197 | while (/*CONSTCOND*/0) | |
198 | #else /* !TRE_STR_USER */ | |
199 | #define BT_STACK_POP() \ | |
200 | do \ | |
201 | { \ | |
202 | assert(stack->prev); \ | |
203 | pos = stack->item.pos; \ | |
204 | pos_add_next = stack->item.pos_add_next; \ | |
205 | str_byte = stack->item.str_byte; \ | |
206 | BT_STACK_WIDE_OUT; \ | |
207 | state = stack->item.state; \ | |
208 | next_c = stack->item.next_c; \ | |
209 | memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \ | |
210 | BT_STACK_MBSTATE_OUT; \ | |
211 | stack = stack->prev; \ | |
212 | } \ | |
213 | while (/*CONSTCOND*/0) | |
214 | #endif /* !TRE_STR_USER */ | |
215 | ||
216 | #undef MIN | |
217 | #define MIN(a, b) ((a) <= (b) ? (a) : (b)) | |
218 | ||
219 | reg_errcode_t | |
220 | tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, | |
221 | int len, tre_str_type_t type, tre_tag_t *match_tags, | |
222 | int eflags, int *match_end_ofs) | |
223 | { | |
224 | /* State variables required by GET_NEXT_WCHAR. */ | |
225 | tre_char_t prev_c = 0, next_c = 0; | |
226 | const char *str_byte = string; | |
227 | int pos = 0; | |
228 | unsigned int pos_add_next = 1; | |
229 | #ifdef TRE_WCHAR | |
230 | const wchar_t *str_wide = string; | |
231 | #ifdef TRE_MBSTATE | |
232 | mbstate_t mbstate; | |
233 | #endif /* TRE_MBSTATE */ | |
234 | #endif /* TRE_WCHAR */ | |
235 | int reg_notbol = eflags & REG_NOTBOL; | |
236 | int reg_noteol = eflags & REG_NOTEOL; | |
237 | int reg_newline = tnfa->cflags & REG_NEWLINE; | |
238 | #ifdef TRE_STR_USER | |
239 | int str_user_end = 0; | |
240 | #endif /* TRE_STR_USER */ | |
241 | int i; | |
242 | ||
243 | /* These are used to remember the necessary values of the above | |
244 | variables to return to the position where the current search | |
245 | started from. */ | |
246 | int next_c_start; | |
247 | const char *str_byte_start; | |
248 | int pos_start = -1; | |
249 | #ifdef TRE_WCHAR | |
250 | const wchar_t *str_wide_start; | |
251 | #endif /* TRE_WCHAR */ | |
252 | #ifdef TRE_MBSTATE | |
253 | mbstate_t mbstate_start; | |
254 | #endif /* TRE_MBSTATE */ | |
255 | ||
256 | /* End offset of best match so far, or -1 if no match found yet. */ | |
257 | int match_eo = -1; | |
258 | /* Tag arrays. */ | |
259 | int *next_tags; | |
260 | tre_tag_t *tags = NULL; | |
261 | /* Current TNFA state. */ | |
262 | tre_tnfa_transition_t *state; | |
263 | int *states_seen = NULL; | |
264 | ||
265 | /* Memory allocator to for allocating the backtracking stack. */ | |
266 | tre_mem_t mem = tre_bt_mem_new(); | |
267 | ||
268 | /* The backtracking stack. */ | |
269 | tre_backtrack_t stack; | |
270 | ||
271 | tre_tnfa_transition_t *trans_i; | |
272 | regmatch_t *pmatch = NULL; | |
273 | reg_errcode_t ret; | |
274 | ||
275 | int num_tags = tnfa->num_tags; | |
276 | int touch = 1; | |
2acb8998 | 277 | char *buf = NULL; |
ad3c9f2a A |
278 | int tbytes; |
279 | ||
280 | #ifdef TRE_MBSTATE | |
281 | memset(&mbstate, '\0', sizeof(mbstate)); | |
282 | #endif /* TRE_MBSTATE */ | |
283 | ||
284 | if (!mem) | |
285 | return REG_ESPACE; | |
286 | stack = tre_bt_mem_alloc(mem, sizeof(*stack)); | |
287 | if (!stack) | |
288 | { | |
289 | ret = REG_ESPACE; | |
290 | goto error_exit; | |
291 | } | |
292 | stack->prev = NULL; | |
293 | stack->next = NULL; | |
294 | ||
295 | DPRINT(("tnfa_execute_backtrack, input type %d\n", type)); | |
296 | DPRINT(("len = %d\n", len)); | |
297 | ||
298 | { | |
299 | int pbytes, sbytes, total_bytes; | |
300 | char *tmp_buf; | |
301 | /* Compute the length of the block we need. */ | |
302 | tbytes = sizeof(*tags) * num_tags; | |
303 | pbytes = sizeof(*pmatch) * tnfa->num_submatches; | |
304 | sbytes = sizeof(*states_seen) * tnfa->num_states; | |
305 | total_bytes = | |
306 | (sizeof(long) - 1) * 2 /* for alignment paddings */ | |
307 | + tbytes + pbytes + sbytes; | |
308 | ||
309 | DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes)); | |
310 | /* Allocate the memory. */ | |
311 | #ifdef TRE_USE_ALLOCA | |
312 | buf = alloca(total_bytes); | |
313 | #else /* !TRE_USE_ALLOCA */ | |
314 | buf = xmalloc((unsigned)total_bytes); | |
315 | #endif /* !TRE_USE_ALLOCA */ | |
316 | if (buf == NULL) | |
317 | return REG_ESPACE; | |
318 | ||
319 | /* Get the various pointers within tmp_buf (properly aligned). */ | |
320 | tags = (void *)buf; | |
321 | tmp_buf = buf + tbytes; | |
322 | tmp_buf += ALIGN(tmp_buf, long); | |
323 | pmatch = (void *)tmp_buf; | |
324 | tmp_buf += pbytes; | |
325 | tmp_buf += ALIGN(tmp_buf, long); | |
326 | states_seen = (void *)tmp_buf; | |
327 | } | |
328 | ||
329 | retry: | |
330 | { | |
331 | memset(tags, 0, num_tags * sizeof(*tags)); | |
332 | if (match_tags) | |
333 | memset(match_tags, 0, num_tags * sizeof(*match_tags)); | |
334 | for (i = 0; i < tnfa->num_states; i++) | |
335 | states_seen[i] = 0; | |
336 | } | |
337 | ||
338 | state = NULL; | |
339 | pos = pos_start; | |
340 | #ifdef TRE_STR_USER | |
341 | if (type == STR_USER) | |
342 | str_source->rewind(pos + pos_add_next, str_source->context); | |
343 | #endif /* TRE_STR_USER */ | |
344 | GET_NEXT_WCHAR(); | |
345 | pos_start = pos; | |
346 | next_c_start = next_c; | |
347 | str_byte_start = str_byte; | |
348 | #ifdef TRE_WCHAR | |
349 | str_wide_start = str_wide; | |
350 | #endif /* TRE_WCHAR */ | |
351 | #ifdef TRE_MBSTATE | |
352 | mbstate_start = mbstate; | |
353 | #endif /* TRE_MBSTATE */ | |
354 | ||
355 | /* Handle initial states. */ | |
356 | next_tags = NULL; | |
357 | for (trans_i = tnfa->initial; trans_i->state; trans_i++) | |
358 | { | |
359 | DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c)); | |
360 | if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) | |
361 | { | |
362 | DPRINT(("assert failed\n")); | |
363 | continue; | |
364 | } | |
365 | if (state == NULL) | |
366 | { | |
367 | /* Start from this state. */ | |
368 | state = trans_i->state; | |
369 | next_tags = trans_i->tags; | |
370 | } | |
371 | else | |
372 | { | |
373 | /* Backtrack to this state. */ | |
374 | DPRINT(("saving state %d for backtracking\n", trans_i->state_id)); | |
375 | BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state, | |
376 | trans_i->state_id, next_c, tags, mbstate); | |
377 | { | |
378 | int *tmp = trans_i->tags; | |
379 | if (tmp) | |
380 | { | |
381 | while (*tmp >= 0) | |
382 | tre_tag_set(stack->item.tags, *tmp++, pos, touch); | |
383 | touch++; | |
384 | } | |
385 | } | |
386 | } | |
387 | } | |
388 | ||
389 | if (next_tags) | |
390 | { | |
391 | for (; *next_tags >= 0; next_tags++) | |
392 | tre_tag_set(tags, *next_tags, pos, touch); | |
393 | touch++; | |
394 | } | |
395 | ||
396 | ||
397 | DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte)); | |
398 | DPRINT(("pos:chr/code | state and tags\n")); | |
399 | DPRINT(("-------------+------------------------------------------------\n")); | |
400 | ||
401 | if (state == NULL) | |
402 | goto backtrack; | |
403 | ||
404 | while (/*CONSTCOND*/1) | |
405 | { | |
406 | tre_tnfa_transition_t *next_state; | |
407 | int empty_br_match; | |
408 | ||
409 | DPRINT(("start loop\n")); | |
410 | ||
411 | if (match_eo >= 0 && tnfa->num_minimals) | |
412 | { | |
413 | int skip = 0; | |
414 | #ifdef TRE_DEBUG | |
415 | DPRINT(("Checking minimal conditions: match_eo=%d match_tags=", | |
416 | match_eo)); | |
417 | tre_print_tags(match_tags, tnfa->num_tags); | |
418 | DPRINT(("\n")); | |
419 | #endif /* TRE_DEBUG */ | |
420 | for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) | |
421 | { | |
422 | int end = tnfa->minimal_tags[i]; | |
423 | int start = tnfa->minimal_tags[i + 1]; | |
424 | DPRINT((" Minimal start %d, end %d\n", start, end)); | |
425 | if (tre_minimal_tag_order(start, end, match_tags, tags) > 0) | |
426 | { | |
427 | skip = 1; | |
428 | break; | |
429 | } | |
430 | } | |
431 | if (!skip) | |
432 | { | |
433 | #ifdef TRE_DEBUG | |
434 | DPRINT((" Keeping tags=")); | |
435 | tre_print_tags(tags, tnfa->num_tags); | |
436 | DPRINT(("\n")); | |
437 | #endif /* TRE_DEBUG */ | |
438 | } | |
439 | else | |
440 | { | |
441 | #ifdef TRE_DEBUG | |
442 | DPRINT((" Throwing out tags=")); | |
443 | tre_print_tags(tags, tnfa->num_tags); | |
444 | DPRINT(("\n")); | |
445 | #endif /* TRE_DEBUG */ | |
446 | goto backtrack; | |
447 | } | |
448 | } | |
449 | ||
450 | if (state == tnfa->final) | |
451 | { | |
452 | DPRINT((" match found, match_eo=%d pos=%d\n", match_eo, pos)); | |
453 | ||
454 | if (match_eo >= 0 && tnfa->num_minimals) | |
455 | { | |
456 | int compare = 0; | |
457 | #ifdef TRE_DEBUG | |
458 | DPRINT(("Checking minimal conditions: match_eo=%d " | |
459 | "match_tags=", match_eo)); | |
460 | tre_print_tags(match_tags, tnfa->num_tags); | |
461 | DPRINT(("\n")); | |
462 | #endif /* TRE_DEBUG */ | |
463 | for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) | |
464 | { | |
465 | int end = tnfa->minimal_tags[i]; | |
466 | int start = tnfa->minimal_tags[i + 1]; | |
467 | DPRINT((" Minimal start %d, end %d\n", start, end)); | |
468 | if ((compare = tre_minimal_tag_order(start, end, | |
469 | match_tags, tags)) != 0) | |
470 | break; | |
471 | } | |
472 | if (compare > 0) | |
473 | { | |
474 | #ifdef TRE_DEBUG | |
475 | DPRINT((" Throwing out new match, tags=")); | |
476 | tre_print_tags(tags, tnfa->num_tags); | |
477 | DPRINT(("\n")); | |
478 | #endif /* TRE_DEBUG */ | |
479 | goto backtrack; | |
480 | } | |
481 | else if (compare < 0) | |
482 | { | |
483 | #ifdef TRE_DEBUG | |
484 | DPRINT((" Throwing out old match, tags=")); | |
485 | tre_print_tags(match_tags, tnfa->num_tags); | |
486 | DPRINT(("\n")); | |
487 | #endif /* TRE_DEBUG */ | |
488 | match_eo = -1; | |
489 | } | |
490 | } | |
491 | ||
492 | if (match_eo < pos | |
493 | || (match_eo == pos | |
494 | && match_tags | |
495 | && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, | |
496 | tags, match_tags))) | |
497 | { | |
498 | /* This match wins the previous match. */ | |
499 | #ifdef TRE_DEBUG | |
500 | DPRINT((" win previous tags=")); | |
501 | tre_print_tags(tags, tnfa->num_tags); | |
502 | DPRINT(("\n")); | |
503 | #endif /* TRE_DEBUG */ | |
504 | match_eo = pos; | |
505 | if (match_tags) | |
506 | memcpy(match_tags, tags, num_tags * sizeof(*tags)); | |
507 | } | |
508 | /* Our TNFAs never have transitions leaving from the final state, | |
509 | so we jump right to backtracking. */ | |
510 | goto backtrack; | |
511 | } | |
512 | ||
513 | #ifdef TRE_DEBUG | |
514 | DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c, | |
515 | state)); | |
516 | tre_print_tags(tags, tnfa->num_tags); | |
517 | DPRINT(("\n")); | |
518 | #endif /* TRE_DEBUG */ | |
519 | ||
520 | /* Go to the next character in the input string. */ | |
521 | empty_br_match = 0; | |
522 | trans_i = state; | |
523 | if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) | |
524 | { | |
525 | /* This is a back reference state. All transitions leaving from | |
526 | this state have the same back reference "assertion". Instead | |
527 | of reading the next character, we match the back reference. */ | |
528 | int so, eo, bt = trans_i->u.backref; | |
529 | int bt_len; | |
530 | int result; | |
531 | ||
532 | DPRINT((" should match back reference %d\n", bt)); | |
533 | /* Get the substring we need to match against. Remember to | |
534 | turn off REG_NOSUB temporarily. */ | |
535 | ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, | |
536 | tnfa, tags, pos); | |
537 | if (ret != REG_OK) goto error_exit; | |
538 | so = pmatch[bt].rm_so; | |
539 | eo = pmatch[bt].rm_eo; | |
540 | bt_len = eo - so; | |
541 | ||
542 | #ifdef TRE_DEBUG | |
543 | { | |
544 | int slen; | |
545 | if (len < 0) | |
546 | slen = bt_len; | |
547 | else | |
548 | slen = MIN(bt_len, len - pos); | |
549 | ||
550 | if (type == STR_BYTE) | |
551 | { | |
552 | DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n", | |
553 | bt_len, so, eo, bt_len, (char*)string + so)); | |
554 | DPRINT((" current string is '%.*s'\n", slen, str_byte - 1)); | |
555 | } | |
556 | #ifdef TRE_WCHAR | |
557 | else if (type == STR_WIDE) | |
558 | { | |
559 | DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF "'\n", | |
560 | bt_len, so, eo, bt_len, (wchar_t*)string + so)); | |
561 | DPRINT((" current string is '%.*" STRF "'\n", | |
562 | slen, str_wide - 1)); | |
563 | } | |
564 | #endif /* TRE_WCHAR */ | |
565 | } | |
566 | #endif | |
567 | ||
568 | if (so < 0) | |
569 | { | |
570 | result = 1; /* Back reference of nomatch doesn't match */ | |
571 | } | |
572 | else if (len < 0) | |
573 | { | |
574 | #ifdef TRE_STR_USER | |
575 | if (type == STR_USER) | |
576 | result = str_source->compare((unsigned)so, (unsigned)pos, | |
577 | (unsigned)bt_len, | |
578 | str_source->context); | |
579 | else | |
580 | #endif /* TRE_STR_USER */ | |
581 | #ifdef TRE_WCHAR | |
582 | if (type == STR_WIDE) | |
583 | result = wcsncmp((const wchar_t*)string + so, str_wide - 1, | |
584 | (size_t)bt_len); | |
585 | else | |
586 | #endif /* TRE_WCHAR */ | |
587 | result = strncmp((const char*)string + so, str_byte - 1, | |
588 | (size_t)bt_len); | |
589 | } | |
590 | else if (len - pos < bt_len) | |
591 | result = 1; | |
592 | #ifdef TRE_WCHAR | |
593 | else if (type == STR_WIDE) | |
594 | result = wmemcmp((const wchar_t*)string + so, str_wide - 1, | |
595 | (size_t)bt_len); | |
596 | #endif /* TRE_WCHAR */ | |
597 | else | |
598 | result = memcmp((const char*)string + so, str_byte - 1, | |
599 | (size_t)bt_len); | |
600 | ||
601 | if (result == 0) | |
602 | { | |
603 | /* Back reference matched. Check for infinite loop. */ | |
604 | if (bt_len == 0) | |
605 | empty_br_match = 1; | |
606 | if (empty_br_match && states_seen[trans_i->state_id]) | |
607 | { | |
608 | DPRINT((" avoid loop\n")); | |
609 | goto backtrack; | |
610 | } | |
611 | ||
612 | states_seen[trans_i->state_id] = empty_br_match; | |
613 | ||
614 | /* Advance in input string and resync `prev_c', `next_c' | |
615 | and pos. */ | |
616 | DPRINT((" back reference matched\n")); | |
617 | str_byte += bt_len - 1; | |
618 | #ifdef TRE_WCHAR | |
619 | str_wide += bt_len - 1; | |
620 | #endif /* TRE_WCHAR */ | |
621 | pos += bt_len - 1; | |
622 | GET_NEXT_WCHAR(); | |
623 | DPRINT((" pos now %d\n", pos)); | |
624 | } | |
625 | else | |
626 | { | |
627 | DPRINT((" back reference did not match\n")); | |
628 | goto backtrack; | |
629 | } | |
630 | } | |
631 | else | |
632 | { | |
633 | /* Check for end of string. */ | |
634 | if (len < 0) | |
635 | { | |
636 | #ifdef TRE_STR_USER | |
637 | if (type == STR_USER) | |
638 | { | |
639 | if (str_user_end) | |
640 | goto backtrack; | |
641 | } | |
642 | else | |
643 | #endif /* TRE_STR_USER */ | |
644 | if (next_c == L'\0') | |
645 | goto backtrack; | |
646 | } | |
647 | else | |
648 | { | |
649 | if (pos >= len) | |
650 | goto backtrack; | |
651 | } | |
652 | ||
653 | /* Read the next character. */ | |
654 | GET_NEXT_WCHAR(); | |
655 | } | |
656 | ||
657 | next_state = NULL; | |
658 | for (trans_i = state; trans_i->state; trans_i++) | |
659 | { | |
660 | DPRINT((" transition %d-%d (%c-%c) %d to %d\n", | |
661 | trans_i->code_min, trans_i->code_max, | |
662 | trans_i->code_min, trans_i->code_max, | |
663 | trans_i->assertions, trans_i->state_id)); | |
664 | if (trans_i->code_min <= (tre_cint_t)prev_c | |
665 | && trans_i->code_max >= (tre_cint_t)prev_c) | |
666 | { | |
667 | if (trans_i->assertions | |
668 | && (CHECK_ASSERTIONS(trans_i->assertions) | |
669 | || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) | |
670 | { | |
671 | DPRINT((" assertion failed\n")); | |
672 | continue; | |
673 | } | |
674 | ||
675 | if (next_state == NULL) | |
676 | { | |
677 | /* First matching transition. */ | |
678 | DPRINT((" Next state is %d\n", trans_i->state_id)); | |
679 | next_state = trans_i->state; | |
680 | next_tags = trans_i->tags; | |
681 | } | |
682 | else | |
683 | { | |
684 | /* Second matching transition. We may need to backtrack here | |
685 | to take this transition instead of the first one, so we | |
686 | push this transition in the backtracking stack so we can | |
687 | jump back here if needed. */ | |
688 | DPRINT((" saving state %d for backtracking\n", | |
689 | trans_i->state_id)); | |
690 | BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, | |
691 | trans_i->state, trans_i->state_id, next_c, | |
692 | tags, mbstate); | |
693 | { | |
694 | int *tmp; | |
695 | for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) | |
696 | tre_tag_set(stack->item.tags, *tmp, pos, touch); | |
697 | touch++; | |
698 | } | |
699 | #if 0 /* XXX - it's important not to look at all transitions here to keep | |
700 | the stack small! */ | |
701 | break; | |
702 | #endif | |
703 | } | |
704 | } | |
705 | } | |
706 | ||
707 | if (next_state != NULL) | |
708 | { | |
709 | /* Matching transitions were found. Take the first one. */ | |
710 | state = next_state; | |
711 | ||
712 | /* Update the tag values. */ | |
713 | if (next_tags) | |
714 | { | |
715 | while (*next_tags >= 0) | |
716 | tre_tag_set(tags, *next_tags++, pos, touch); | |
717 | touch++; | |
718 | } | |
719 | } | |
720 | else | |
721 | { | |
722 | backtrack: | |
723 | /* A matching transition was not found. Try to backtrack. */ | |
724 | if (stack->prev) | |
725 | { | |
726 | DPRINT((" backtracking\n")); | |
727 | if (stack->item.state->assertions & ASSERT_BACKREF) | |
728 | { | |
729 | DPRINT((" states_seen[%d] = 0\n", | |
730 | stack->item.state_id)); | |
731 | states_seen[stack->item.state_id] = 0; | |
732 | } | |
733 | ||
734 | BT_STACK_POP(); | |
735 | } | |
736 | else if (match_eo < 0) | |
737 | { | |
738 | /* Try starting from a later position in the input string. */ | |
739 | /* Check for end of string. */ | |
740 | if (pos == pos_start) | |
741 | { | |
742 | if (len < 0) | |
743 | { | |
744 | if (next_c == L'\0') | |
745 | { | |
746 | DPRINT(("end of string.\n")); | |
747 | break; | |
748 | } | |
749 | } | |
750 | else | |
751 | { | |
752 | if (pos >= len) | |
753 | { | |
754 | DPRINT(("end of string.\n")); | |
755 | break; | |
756 | } | |
757 | } | |
758 | } | |
759 | DPRINT(("restarting from next start position\n")); | |
760 | next_c = next_c_start; | |
761 | #ifdef TRE_MBSTATE | |
762 | mbstate = mbstate_start; | |
763 | #endif /* TRE_MBSTATE */ | |
764 | str_byte = str_byte_start; | |
765 | #ifdef TRE_WCHAR | |
766 | str_wide = str_wide_start; | |
767 | #endif /* TRE_WCHAR */ | |
768 | goto retry; | |
769 | } | |
770 | else | |
771 | { | |
772 | DPRINT(("finished\n")); | |
773 | break; | |
774 | } | |
775 | } | |
776 | } | |
777 | ||
778 | ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; | |
779 | *match_end_ofs = match_eo; | |
780 | ||
781 | error_exit: | |
782 | tre_bt_mem_destroy(mem); | |
783 | #ifndef TRE_USE_ALLOCA | |
784 | if (buf) | |
785 | xfree(buf); | |
786 | #endif /* !TRE_USE_ALLOCA */ | |
787 | ||
788 | return ret; | |
789 | } |