2 tre-match-backtrack.c - TRE backtracking regex matching engine
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This matcher is for regexps that use back referencing. Regexp matching
11 with back referencing is an NP-complete problem on the number of back
12 references. The easiest way to match them is to use a backtracking
13 routine which basically goes through all possible paths in the TNFA
14 and chooses the one which results in the best (leftmost and longest)
15 match. This can be spectacularly expensive and may run out of stack
16 space, but there really is no better known generic algorithm. Quoting
17 Henry Spencer from comp.compilers:
18 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
20 POSIX.2 REs require longest match, which is really exciting to
21 implement since the obsolete ("basic") variant also includes
22 \<digit>. I haven't found a better way of tackling this than doing
23 a preliminary match using a DFA (or simulation) on a modified RE
24 that just replicates subREs for \<digit>, and then doing a
25 backtracking match to determine whether the subRE matches were
26 right. This can be rather slow, but I console myself with the
27 thought that people who use \<digit> deserve very slow execution.
28 (Pun unintentional but very appropriate.)
35 #endif /* HAVE_CONFIG_H */
37 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
42 /* AIX requires this to be the first thing in the file. */
50 # ifndef alloca /* predefined by HP cc +Olibcalls */
56 #endif /* TRE_USE_ALLOCA */
63 #endif /* HAVE_WCHAR_H */
66 #endif /* HAVE_WCTYPE_H */
69 #endif /* !TRE_WCHAR */
72 #endif /* HAVE_MALLOC_H */
74 #include "tre-internal.h"
76 #include "tre-match-utils.h"
82 unsigned int pos_add_next
;
85 const wchar_t *str_wide
;
86 #endif /* TRE_WCHAR */
87 tre_tnfa_transition_t
*state
;
93 #endif /* TRE_MBSTATE */
94 } tre_backtrack_item_t
;
96 typedef struct tre_backtrack_struct
{
97 tre_backtrack_item_t item
;
98 struct tre_backtrack_struct
*prev
;
99 struct tre_backtrack_struct
*next
;
103 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
104 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
105 #else /* !TRE_WCHAR */
106 #define BT_STACK_WIDE_IN(_str_wide)
107 #define BT_STACK_WIDE_OUT
108 #endif /* !TRE_WCHAR */
111 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
112 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
113 #else /* !TRE_MBSTATE */
114 #define BT_STACK_MBSTATE_IN
115 #define BT_STACK_MBSTATE_OUT
116 #endif /* !TRE_MBSTATE */
119 #ifdef TRE_USE_ALLOCA
120 #define tre_bt_mem_new tre_mem_newa
121 #define tre_bt_mem_alloc tre_mem_alloca
122 #define tre_bt_mem_destroy(obj) do { } while (0)
123 #else /* !TRE_USE_ALLOCA */
124 #define tre_bt_mem_new tre_mem_new
125 #define tre_bt_mem_alloc tre_mem_alloc
126 #define tre_bt_mem_destroy tre_mem_destroy
127 #endif /* !TRE_USE_ALLOCA */
130 #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
136 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
139 tre_bt_mem_destroy(mem); \
145 xfree(states_seen); \
150 s->item.tags = tre_bt_mem_alloc(mem, \
151 num_tags * sizeof(*tags)); \
154 tre_bt_mem_destroy(mem); \
160 xfree(states_seen); \
167 stack = stack->next; \
168 stack->item.pos = (_pos); \
169 stack->item.pos_add_next = (_pos_add_next); \
170 stack->item.str_byte = (_str_byte); \
171 BT_STACK_WIDE_IN(_str_wide); \
172 stack->item.state = (_state); \
173 stack->item.state_id = (_state_id); \
174 stack->item.next_c = (_next_c); \
175 memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \
176 BT_STACK_MBSTATE_IN; \
178 while (/*CONSTCOND*/0)
181 #define BT_STACK_POP() \
184 assert(stack->prev); \
185 pos = stack->item.pos; \
186 pos_add_next = stack->item.pos_add_next; \
187 if (type == STR_USER) \
188 str_source->rewind(pos + pos_add_next, str_source->context); \
189 str_byte = stack->item.str_byte; \
191 state = stack->item.state; \
192 next_c = stack->item.next_c; \
193 memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \
194 BT_STACK_MBSTATE_OUT; \
195 stack = stack->prev; \
197 while (/*CONSTCOND*/0)
198 #else /* !TRE_STR_USER */
199 #define BT_STACK_POP() \
202 assert(stack->prev); \
203 pos = stack->item.pos; \
204 pos_add_next = stack->item.pos_add_next; \
205 str_byte = stack->item.str_byte; \
207 state = stack->item.state; \
208 next_c = stack->item.next_c; \
209 memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \
210 BT_STACK_MBSTATE_OUT; \
211 stack = stack->prev; \
213 while (/*CONSTCOND*/0)
214 #endif /* !TRE_STR_USER */
217 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
220 tre_tnfa_run_backtrack(const tre_tnfa_t
*tnfa
, const void *string
,
221 int len
, tre_str_type_t type
, tre_tag_t
*match_tags
,
222 int eflags
, int *match_end_ofs
)
224 /* State variables required by GET_NEXT_WCHAR. */
225 tre_char_t prev_c
= 0, next_c
= 0;
226 const char *str_byte
= string
;
228 unsigned int pos_add_next
= 1;
230 const wchar_t *str_wide
= string
;
233 #endif /* TRE_MBSTATE */
234 #endif /* TRE_WCHAR */
235 int reg_notbol
= eflags
& REG_NOTBOL
;
236 int reg_noteol
= eflags
& REG_NOTEOL
;
237 int reg_newline
= tnfa
->cflags
& REG_NEWLINE
;
239 int str_user_end
= 0;
240 #endif /* TRE_STR_USER */
243 /* These are used to remember the necessary values of the above
244 variables to return to the position where the current search
247 const char *str_byte_start
;
250 const wchar_t *str_wide_start
;
251 #endif /* TRE_WCHAR */
253 mbstate_t mbstate_start
;
254 #endif /* TRE_MBSTATE */
256 /* End offset of best match so far, or -1 if no match found yet. */
260 tre_tag_t
*tags
= NULL
;
261 /* Current TNFA state. */
262 tre_tnfa_transition_t
*state
;
263 int *states_seen
= NULL
;
265 /* Memory allocator to for allocating the backtracking stack. */
266 tre_mem_t mem
= tre_bt_mem_new();
268 /* The backtracking stack. */
269 tre_backtrack_t stack
;
271 tre_tnfa_transition_t
*trans_i
;
272 regmatch_t
*pmatch
= NULL
;
275 int num_tags
= tnfa
->num_tags
;
281 memset(&mbstate
, '\0', sizeof(mbstate
));
282 #endif /* TRE_MBSTATE */
286 stack
= tre_bt_mem_alloc(mem
, sizeof(*stack
));
295 DPRINT(("tnfa_execute_backtrack, input type %d\n", type
));
296 DPRINT(("len = %d\n", len
));
299 int pbytes
, sbytes
, total_bytes
;
301 /* Compute the length of the block we need. */
302 tbytes
= sizeof(*tags
) * num_tags
;
303 pbytes
= sizeof(*pmatch
) * tnfa
->num_submatches
;
304 sbytes
= sizeof(*states_seen
) * tnfa
->num_states
;
306 (sizeof(long) - 1) * 2 /* for alignment paddings */
307 + tbytes
+ pbytes
+ sbytes
;
309 DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes
));
310 /* Allocate the memory. */
311 #ifdef TRE_USE_ALLOCA
312 buf
= alloca(total_bytes
);
313 #else /* !TRE_USE_ALLOCA */
314 buf
= xmalloc((unsigned)total_bytes
);
315 #endif /* !TRE_USE_ALLOCA */
319 /* Get the various pointers within tmp_buf (properly aligned). */
321 tmp_buf
= buf
+ tbytes
;
322 tmp_buf
+= ALIGN(tmp_buf
, long);
323 pmatch
= (void *)tmp_buf
;
325 tmp_buf
+= ALIGN(tmp_buf
, long);
326 states_seen
= (void *)tmp_buf
;
331 memset(tags
, 0, num_tags
* sizeof(*tags
));
333 memset(match_tags
, 0, num_tags
* sizeof(*match_tags
));
334 for (i
= 0; i
< tnfa
->num_states
; i
++)
341 if (type
== STR_USER
)
342 str_source
->rewind(pos
+ pos_add_next
, str_source
->context
);
343 #endif /* TRE_STR_USER */
346 next_c_start
= next_c
;
347 str_byte_start
= str_byte
;
349 str_wide_start
= str_wide
;
350 #endif /* TRE_WCHAR */
352 mbstate_start
= mbstate
;
353 #endif /* TRE_MBSTATE */
355 /* Handle initial states. */
357 for (trans_i
= tnfa
->initial
; trans_i
->state
; trans_i
++)
359 DPRINT(("> init %p, prev_c %lc\n", trans_i
->state
, (tre_cint_t
)prev_c
));
360 if (trans_i
->assertions
&& CHECK_ASSERTIONS(trans_i
->assertions
))
362 DPRINT(("assert failed\n"));
367 /* Start from this state. */
368 state
= trans_i
->state
;
369 next_tags
= trans_i
->tags
;
373 /* Backtrack to this state. */
374 DPRINT(("saving state %d for backtracking\n", trans_i
->state_id
));
375 BT_STACK_PUSH(pos
, pos_add_next
, str_byte
, str_wide
, trans_i
->state
,
376 trans_i
->state_id
, next_c
, tags
, mbstate
);
378 int *tmp
= trans_i
->tags
;
382 tre_tag_set(stack
->item
.tags
, *tmp
++, pos
, touch
);
391 for (; *next_tags
>= 0; next_tags
++)
392 tre_tag_set(tags
, *next_tags
, pos
, touch
);
397 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos
, str_byte
));
398 DPRINT(("pos:chr/code | state and tags\n"));
399 DPRINT(("-------------+------------------------------------------------\n"));
404 while (/*CONSTCOND*/1)
406 tre_tnfa_transition_t
*next_state
;
409 DPRINT(("start loop\n"));
411 if (match_eo
>= 0 && tnfa
->num_minimals
)
415 DPRINT(("Checking minimal conditions: match_eo=%d match_tags=",
417 tre_print_tags(match_tags
, tnfa
->num_tags
);
419 #endif /* TRE_DEBUG */
420 for (i
= 0; tnfa
->minimal_tags
[i
] >= 0; i
+= 2)
422 int end
= tnfa
->minimal_tags
[i
];
423 int start
= tnfa
->minimal_tags
[i
+ 1];
424 DPRINT((" Minimal start %d, end %d\n", start
, end
));
425 if (tre_minimal_tag_order(start
, end
, match_tags
, tags
) > 0)
434 DPRINT((" Keeping tags="));
435 tre_print_tags(tags
, tnfa
->num_tags
);
437 #endif /* TRE_DEBUG */
442 DPRINT((" Throwing out tags="));
443 tre_print_tags(tags
, tnfa
->num_tags
);
445 #endif /* TRE_DEBUG */
450 if (state
== tnfa
->final
)
452 DPRINT((" match found, match_eo=%d pos=%d\n", match_eo
, pos
));
454 if (match_eo
>= 0 && tnfa
->num_minimals
)
458 DPRINT(("Checking minimal conditions: match_eo=%d "
459 "match_tags=", match_eo
));
460 tre_print_tags(match_tags
, tnfa
->num_tags
);
462 #endif /* TRE_DEBUG */
463 for (i
= 0; tnfa
->minimal_tags
[i
] >= 0; i
+= 2)
465 int end
= tnfa
->minimal_tags
[i
];
466 int start
= tnfa
->minimal_tags
[i
+ 1];
467 DPRINT((" Minimal start %d, end %d\n", start
, end
));
468 if ((compare
= tre_minimal_tag_order(start
, end
,
469 match_tags
, tags
)) != 0)
475 DPRINT((" Throwing out new match, tags="));
476 tre_print_tags(tags
, tnfa
->num_tags
);
478 #endif /* TRE_DEBUG */
481 else if (compare
< 0)
484 DPRINT((" Throwing out old match, tags="));
485 tre_print_tags(match_tags
, tnfa
->num_tags
);
487 #endif /* TRE_DEBUG */
495 && tre_tag_order(tnfa
->num_tags
, tnfa
->tag_directions
,
498 /* This match wins the previous match. */
500 DPRINT((" win previous tags="));
501 tre_print_tags(tags
, tnfa
->num_tags
);
503 #endif /* TRE_DEBUG */
506 memcpy(match_tags
, tags
, num_tags
* sizeof(*tags
));
508 /* Our TNFAs never have transitions leaving from the final state,
509 so we jump right to backtracking. */
514 DPRINT(("%3d:%2lc/%05d | %p ", pos
, (tre_cint_t
)next_c
, (int)next_c
,
516 tre_print_tags(tags
, tnfa
->num_tags
);
518 #endif /* TRE_DEBUG */
520 /* Go to the next character in the input string. */
523 if (trans_i
->state
&& trans_i
->assertions
& ASSERT_BACKREF
)
525 /* This is a back reference state. All transitions leaving from
526 this state have the same back reference "assertion". Instead
527 of reading the next character, we match the back reference. */
528 int so
, eo
, bt
= trans_i
->u
.backref
;
532 DPRINT((" should match back reference %d\n", bt
));
533 /* Get the substring we need to match against. Remember to
534 turn off REG_NOSUB temporarily. */
535 ret
= tre_fill_pmatch(bt
+ 1, pmatch
, tnfa
->cflags
& ~REG_NOSUB
,
537 if (ret
!= REG_OK
) goto error_exit
;
538 so
= pmatch
[bt
].rm_so
;
539 eo
= pmatch
[bt
].rm_eo
;
548 slen
= MIN(bt_len
, len
- pos
);
550 if (type
== STR_BYTE
)
552 DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n",
553 bt_len
, so
, eo
, bt_len
, (char*)string
+ so
));
554 DPRINT((" current string is '%.*s'\n", slen
, str_byte
- 1));
557 else if (type
== STR_WIDE
)
559 DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF
"'\n",
560 bt_len
, so
, eo
, bt_len
, (wchar_t*)string
+ so
));
561 DPRINT((" current string is '%.*" STRF
"'\n",
562 slen
, str_wide
- 1));
564 #endif /* TRE_WCHAR */
570 result
= 1; /* Back reference of nomatch doesn't match */
575 if (type
== STR_USER
)
576 result
= str_source
->compare((unsigned)so
, (unsigned)pos
,
578 str_source
->context
);
580 #endif /* TRE_STR_USER */
582 if (type
== STR_WIDE
)
583 result
= wcsncmp((const wchar_t*)string
+ so
, str_wide
- 1,
586 #endif /* TRE_WCHAR */
587 result
= strncmp((const char*)string
+ so
, str_byte
- 1,
590 else if (len
- pos
< bt_len
)
593 else if (type
== STR_WIDE
)
594 result
= wmemcmp((const wchar_t*)string
+ so
, str_wide
- 1,
596 #endif /* TRE_WCHAR */
598 result
= memcmp((const char*)string
+ so
, str_byte
- 1,
603 /* Back reference matched. Check for infinite loop. */
606 if (empty_br_match
&& states_seen
[trans_i
->state_id
])
608 DPRINT((" avoid loop\n"));
612 states_seen
[trans_i
->state_id
] = empty_br_match
;
614 /* Advance in input string and resync `prev_c', `next_c'
616 DPRINT((" back reference matched\n"));
617 str_byte
+= bt_len
- 1;
619 str_wide
+= bt_len
- 1;
620 #endif /* TRE_WCHAR */
623 DPRINT((" pos now %d\n", pos
));
627 DPRINT((" back reference did not match\n"));
633 /* Check for end of string. */
637 if (type
== STR_USER
)
643 #endif /* TRE_STR_USER */
653 /* Read the next character. */
658 for (trans_i
= state
; trans_i
->state
; trans_i
++)
660 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
661 trans_i
->code_min
, trans_i
->code_max
,
662 trans_i
->code_min
, trans_i
->code_max
,
663 trans_i
->assertions
, trans_i
->state_id
));
664 if (trans_i
->code_min
<= (tre_cint_t
)prev_c
665 && trans_i
->code_max
>= (tre_cint_t
)prev_c
)
667 if (trans_i
->assertions
668 && (CHECK_ASSERTIONS(trans_i
->assertions
)
669 || CHECK_CHAR_CLASSES(trans_i
, tnfa
, eflags
)))
671 DPRINT((" assertion failed\n"));
675 if (next_state
== NULL
)
677 /* First matching transition. */
678 DPRINT((" Next state is %d\n", trans_i
->state_id
));
679 next_state
= trans_i
->state
;
680 next_tags
= trans_i
->tags
;
684 /* Second matching transition. We may need to backtrack here
685 to take this transition instead of the first one, so we
686 push this transition in the backtracking stack so we can
687 jump back here if needed. */
688 DPRINT((" saving state %d for backtracking\n",
690 BT_STACK_PUSH(pos
, pos_add_next
, str_byte
, str_wide
,
691 trans_i
->state
, trans_i
->state_id
, next_c
,
695 for (tmp
= trans_i
->tags
; tmp
&& *tmp
>= 0; tmp
++)
696 tre_tag_set(stack
->item
.tags
, *tmp
, pos
, touch
);
699 #if 0 /* XXX - it's important not to look at all transitions here to keep
707 if (next_state
!= NULL
)
709 /* Matching transitions were found. Take the first one. */
712 /* Update the tag values. */
715 while (*next_tags
>= 0)
716 tre_tag_set(tags
, *next_tags
++, pos
, touch
);
723 /* A matching transition was not found. Try to backtrack. */
726 DPRINT((" backtracking\n"));
727 if (stack
->item
.state
->assertions
& ASSERT_BACKREF
)
729 DPRINT((" states_seen[%d] = 0\n",
730 stack
->item
.state_id
));
731 states_seen
[stack
->item
.state_id
] = 0;
736 else if (match_eo
< 0)
738 /* Try starting from a later position in the input string. */
739 /* Check for end of string. */
740 if (pos
== pos_start
)
746 DPRINT(("end of string.\n"));
754 DPRINT(("end of string.\n"));
759 DPRINT(("restarting from next start position\n"));
760 next_c
= next_c_start
;
762 mbstate
= mbstate_start
;
763 #endif /* TRE_MBSTATE */
764 str_byte
= str_byte_start
;
766 str_wide
= str_wide_start
;
767 #endif /* TRE_WCHAR */
772 DPRINT(("finished\n"));
778 ret
= match_eo
>= 0 ? REG_OK
: REG_NOMATCH
;
779 *match_end_ofs
= match_eo
;
782 tre_bt_mem_destroy(mem
);
783 #ifndef TRE_USE_ALLOCA
786 #endif /* !TRE_USE_ALLOCA */