2 tre_regexec.c - TRE POSIX compatible matching functions (and more).
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
11 #endif /* HAVE_CONFIG_H */
13 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
18 /* AIX requires this to be the first thing in the file. */
26 # ifndef alloca /* predefined by HP cc +Olibcalls */
32 #endif /* TRE_USE_ALLOCA */
39 #endif /* HAVE_WCHAR_H */
42 #endif /* HAVE_WCTYPE_H */
45 #endif /* !TRE_WCHAR */
48 #endif /* HAVE_MALLOC_H */
51 #include "tre-internal.h"
52 #include "tre-match-utils.h"
57 /* For each tre_last_matched_t in the lm array, find the last matched branch by
58 comparing the touch value of the cmp_tag's. For all other branches, reset
59 the corresponding tags. If reset_all is non-zero, reset all tags in all
60 branches. Recurse into the nested last matched structures, clearing tags as
63 tre_reset_last_matched_branches(tre_tag_t
*tags
, const tre_last_matched_t
*lm
,
64 int n
, int start_tag
, int reset_all
)
67 tre_last_matched_branch_t
*b
;
69 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
70 n
, start_tag
, reset_all
));
73 if (lm
->n_branches
== 1)
78 DPRINT((" b->cmp_tag=%d %d <? %d\n", b
->cmp_tag
,
79 tre_tag_touch_get(tags
, b
->cmp_tag
),
80 tre_tag_touch_get(tags
, start_tag
)));
81 reset
= (reset_all
|| tre_tag_touch_get(tags
, b
->cmp_tag
) <
82 tre_tag_touch_get(tags
, start_tag
));
91 for (i
= b
->n_tags
, t
= b
->tags
; i
> 0; i
--, t
++)
93 DPRINT((" Resetting t%d\n", *t
));
94 tre_tag_reset(tags
, *t
);
97 if (b
->n_last_matched
> 0)
98 tre_reset_last_matched_branches(tags
, b
->last_matched
,
100 lm
->start_tag
, reset
);
108 #endif /* TRE_DEBUG */
110 for (i
= lm
->n_branches
, b
= lm
->branches
; i
> 0; i
--, b
++)
113 int touch
= tre_tag_touch_get(tags
, t
);
119 #endif /* TRE_DEBUG */
122 DPRINT((" Last touched end tag t%d=%d\n", last
, max
));
125 for (i
= lm
->n_branches
, b
= lm
->branches
; i
> 0; i
--, b
++)
127 reset
= (reset_all
|| tre_tag_touch_get(tags
, b
->cmp_tag
) < max
);
133 for (j
= b
->n_tags
, t
= b
->tags
; j
> 0; j
--, t
++)
135 DPRINT((" Resetting t%d\n", *t
));
136 tre_tag_reset(tags
, *t
);
139 if (b
->n_last_matched
> 0)
140 tre_reset_last_matched_branches(tags
, b
->last_matched
,
142 lm
->start_tag
, reset
);
149 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
152 tre_fill_pmatch(size_t nmatch
, regmatch_t pmatch
[], int cflags
,
153 const tre_tnfa_t
*tnfa
, const tre_tag_t
*intags
, int match_eo
)
157 if (cflags
& REG_NOSUB
) return REG_OK
;
160 if (match_eo
>= 0 && intags
)
162 const tre_tag_t
*tags
= intags
;
163 tre_submatch_data_t
*submatch_data
;
165 if (tnfa
->last_matched_branch
&&
166 tnfa
->last_matched_branch
->n_last_matched
> 0)
169 #ifdef TRE_USE_ALLOCA
170 t
= alloca(sizeof(*t
) * tnfa
->num_tags
);
171 #else /* !TRE_USE_ALLOCA */
172 t
= xmalloc(sizeof(*t
) * tnfa
->num_tags
);
173 #endif /* !TRE_USE_ALLOCA */
174 if (!t
) return REG_ESPACE
;
175 memcpy(t
, intags
, tnfa
->num_tags
* sizeof(tre_tag_t
));
176 tre_reset_last_matched_branches(t
,
177 tnfa
->last_matched_branch
->last_matched
,
178 tnfa
->last_matched_branch
->n_last_matched
,
182 /* Construct submatch offsets from the tags. */
183 DPRINT(("end tag = t%d = %d\n", tnfa
->end_tag
, match_eo
));
184 submatch_data
= tnfa
->submatch_data
;
185 while (i
< tnfa
->num_submatches
&& i
< nmatch
)
187 if (submatch_data
[i
].so_tag
== tnfa
->end_tag
)
188 pmatch
[i
].rm_so
= match_eo
;
190 pmatch
[i
].rm_so
= tre_tag_get(tags
, submatch_data
[i
].so_tag
);
192 if (submatch_data
[i
].eo_tag
== tnfa
->end_tag
)
193 pmatch
[i
].rm_eo
= match_eo
;
195 pmatch
[i
].rm_eo
= tre_tag_get(tags
, submatch_data
[i
].eo_tag
);
197 /* If either of the endpoints were not used, this submatch
198 was not part of the match. */
199 if (pmatch
[i
].rm_so
== -1 || pmatch
[i
].rm_eo
== -1)
200 pmatch
[i
].rm_so
= pmatch
[i
].rm_eo
= -1;
202 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i
,
203 submatch_data
[i
].so_tag
, pmatch
[i
].rm_so
,
204 submatch_data
[i
].eo_tag
, pmatch
[i
].rm_eo
));
207 #ifndef TRE_USE_ALLOCA
208 if (tags
!= intags
) xfree((void*)tags
);
209 #endif /* !TRE_USE_ALLOCA */
214 pmatch
[i
].rm_so
= -1;
215 pmatch
[i
].rm_eo
= -1;
224 Wrapper functions for POSIX compatible regexp matching.
229 tre_have_backrefs(const regex_t
*preg
)
231 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
232 return tnfa
->have_backrefs
;
237 tre_have_approx(const regex_t
*preg
)
239 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
240 return tnfa
->have_approx
;
242 #endif /* TRE_APPROX */
243 #endif /* !__LIBC__ */
246 tre_match(const tre_tnfa_t
*tnfa
, const void *string
, size_t len
,
247 tre_str_type_t type
, size_t nmatch
, regmatch_t pmatch
[],
250 reg_errcode_t status
;
251 tre_tag_t
*tags
= NULL
;
253 size_t offset
= 0, count
= 0;
258 #endif /* TRE_STR_USER */
259 (eflags
& REG_STARTEND
) && pmatch
)
261 if (pmatch
->rm_so
< 0)
263 if (len
== (size_t)-1)
265 if (pmatch
->rm_eo
< 0 || pmatch
->rm_so
> pmatch
->rm_eo
)
267 len
= pmatch
->rm_eo
- pmatch
->rm_so
;
269 count
= offset
= pmatch
->rm_so
;
270 if (type
== STR_WIDE
) offset
*= sizeof(wchar_t);
273 if (tnfa
->num_tags
> 0 && nmatch
> 0)
275 #ifdef TRE_USE_ALLOCA
276 tags
= alloca(sizeof(*tags
) * tnfa
->num_tags
);
277 #else /* !TRE_USE_ALLOCA */
278 tags
= xmalloc(sizeof(*tags
) * tnfa
->num_tags
);
279 #endif /* !TRE_USE_ALLOCA */
284 /* Dispatch to the appropriate matcher. */
285 if (tnfa
->have_backrefs
|| eflags
& REG_BACKTRACKING_MATCHER
)
287 /* The regex has back references, use the backtracking matcher. */
289 if (type
== STR_USER
)
291 const tre_str_source
*source
= string
;
292 if (source
->rewind
== NULL
|| source
->compare
== NULL
)
293 /* The backtracking matcher requires rewind and compare
294 capabilities from the input stream. */
297 #endif /* TRE_STR_USER */
298 status
= tre_tnfa_run_backtrack(tnfa
, string
+ offset
, (int)len
, type
,
302 else if (tnfa
->have_approx
|| eflags
& REG_APPROX_MATCHER
)
304 /* The regex uses approximate matching, use the approximate matcher. */
307 tre_regaparams_default(¶ms
);
310 status
= tre_tnfa_run_approx(tnfa
, string
+ offset
, (int)len
, type
, tags
,
311 &match
, params
, eflags
, &eo
);
313 #endif /* TRE_APPROX */
316 /* Exact matching, no back references, use the parallel matcher. */
317 status
= tre_tnfa_run_parallel(tnfa
, string
+ offset
, (int)len
, type
,
321 if (status
== REG_OK
)
323 /* A match was found, so fill the submatch registers. */
324 status
= tre_fill_pmatch(nmatch
, pmatch
, tnfa
->cflags
, tnfa
, tags
, eo
);
325 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
326 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
327 tre_fill_pmatch itself). */
328 if (status
== REG_OK
&& !(tnfa
->cflags
& REG_NOSUB
) &&
331 #endif /* TRE_STR_USER */
332 (eflags
& REG_STARTEND
) && pmatch
&& nmatch
> 0)
336 for (i
= nmatch
, p
= pmatch
; i
> 0; p
++, i
--)
338 if (p
->rm_so
>= 0) p
->rm_so
+= count
;
339 if (p
->rm_eo
>= 0) p
->rm_eo
+= count
;
343 #ifndef TRE_USE_ALLOCA
346 #endif /* !TRE_USE_ALLOCA */
351 tre_regnexec(const regex_t
*preg
, const char *str
, size_t len
,
352 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
354 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
355 tre_str_type_t type
= (TRE_MB_CUR_MAX_L(tnfa
->loc
) == 1) ? STR_BYTE
: STR_MBS
;
357 #ifdef TRE_USE_SYSTEM_REGEX_H
358 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
359 #endif /* TRE_USE_SYSTEM_REGEX_H */
361 return tre_match(tnfa
, str
, len
, type
, nmatch
, pmatch
, eflags
);
365 tre_regexec(const regex_t
*preg
, const char *str
,
366 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
368 return tre_regnexec(preg
, str
, (size_t)-1, nmatch
, pmatch
, eflags
);
375 tre_regwnexec(const regex_t
*preg
, const wchar_t *str
, size_t len
,
376 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
378 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
380 #ifdef TRE_USE_SYSTEM_REGEX_H
381 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
382 #endif /* TRE_USE_SYSTEM_REGEX_H */
384 return tre_match(tnfa
, str
, len
, STR_WIDE
, nmatch
, pmatch
, eflags
);
388 tre_regwexec(const regex_t
*preg
, const wchar_t *str
,
389 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
391 return tre_regwnexec(preg
, str
, (size_t)-1, nmatch
, pmatch
, eflags
);
394 #endif /* TRE_WCHAR */
398 tre_reguexec(const regex_t
*preg
, const tre_str_source
*str
,
399 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
401 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
403 #ifdef TRE_USE_SYSTEM_REGEX_H
404 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
405 #endif /* TRE_USE_SYSTEM_REGEX_H */
407 return tre_match(tnfa
, str
, (size_t)-1, STR_USER
, nmatch
, pmatch
, eflags
);
409 #endif /* TRE_STR_USER */
415 Wrapper functions for approximate regexp matching.
419 tre_match_approx(const tre_tnfa_t
*tnfa
, const void *string
, size_t len
,
420 tre_str_type_t type
, regamatch_t
*match
, regaparams_t params
,
423 reg_errcode_t status
;
424 tre_tag_t
*tags
= NULL
;
426 size_t offset
= 0, count
= 0;
428 /* If the regexp does not use approximate matching features, the
429 maximum cost is zero, and the approximate matcher isn't forced,
430 use the exact matcher instead. */
431 if (params
.max_cost
== 0 && !tnfa
->have_approx
432 && !(eflags
& REG_APPROX_MATCHER
))
433 return tre_match(tnfa
, string
, len
, type
, match
->nmatch
, match
->pmatch
,
436 /* Back references are not supported by the approximate matcher. */
437 if (tnfa
->have_backrefs
)
440 if (tnfa
->num_tags
> 0 && match
->nmatch
> 0)
443 tags
= alloca(sizeof(*tags
) * tnfa
->num_tags
);
444 #else /* !TRE_USE_ALLOCA */
445 tags
= xmalloc(sizeof(*tags
) * tnfa
->num_tags
);
446 #endif /* !TRE_USE_ALLOCA */
454 #endif /* TRE_STR_USER */
455 (eflags
& REG_STARTEND
) && match
->pmatch
)
457 if (match
->pmatch
->rm_so
< 0)
459 if (len
== (size_t)-1)
461 if (match
->pmatch
->rm_eo
< 0 || match
->pmatch
->rm_so
>
462 match
->pmatch
->rm_eo
)
464 len
= match
->pmatch
->rm_eo
- match
->pmatch
->rm_so
;
466 count
= offset
= match
->pmatch
->rm_so
;
467 if (type
== STR_WIDE
) offset
*= sizeof(wchar_t);
470 status
= tre_tnfa_run_approx(tnfa
, string
, (int)len
, type
, tags
,
471 match
, params
, eflags
, &eo
);
472 if (status
== REG_OK
)
474 status
= tre_fill_pmatch(match
->nmatch
, match
->pmatch
, tnfa
->cflags
,
476 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
477 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
478 tre_fill_pmatch itself). */
479 if (status
== REG_OK
&& !(tnfa
->cflags
& REG_NOSUB
) &&
482 #endif /* TRE_STR_USER */
483 (eflags
& REG_STARTEND
) && match
->pmatch
&& match
->nmatch
> 0)
487 for (i
= match
->nmatch
, p
= match
->pmatch
; i
> 0; p
++, i
--)
489 if (p
->rm_so
>= 0) p
->rm_so
+= count
;
490 if (p
->rm_eo
>= 0) p
->rm_eo
+= count
;
494 #ifndef TRE_USE_ALLOCA
497 #endif /* !TRE_USE_ALLOCA */
502 tre_reganexec(const regex_t
*preg
, const char *str
, size_t len
,
503 regamatch_t
*match
, regaparams_t params
, int eflags
)
505 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
506 tre_str_type_t type
= (TRE_MB_CUR_MAX_L(tnfa
->loc
) == 1) ? STR_BYTE
: STR_MBS
;
508 #ifdef TRE_USE_SYSTEM_REGEX_H
509 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
510 #endif /* TRE_USE_SYSTEM_REGEX_H */
512 return tre_match_approx(tnfa
, str
, len
, type
, match
, params
, eflags
);
516 tre_regaexec(const regex_t
*preg
, const char *str
,
517 regamatch_t
*match
, regaparams_t params
, int eflags
)
519 return tre_reganexec(preg
, str
, (size_t)-1, match
, params
, eflags
);
525 tre_regawnexec(const regex_t
*preg
, const wchar_t *str
, size_t len
,
526 regamatch_t
*match
, regaparams_t params
, int eflags
)
528 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
530 #ifdef TRE_USE_SYSTEM_REGEX_H
531 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
532 #endif /* TRE_USE_SYSTEM_REGEX_H */
534 return tre_match_approx(tnfa
, str
, len
, STR_WIDE
,
535 match
, params
, eflags
);
539 tre_regawexec(const regex_t
*preg
, const wchar_t *str
,
540 regamatch_t
*match
, regaparams_t params
, int eflags
)
542 return tre_regawnexec(preg
, str
, (size_t)-1, match
, params
, eflags
);
545 #endif /* TRE_WCHAR */
548 tre_regaparams_default(regaparams_t
*params
)
550 memset(params
, 0, sizeof(*params
));
551 params
->cost_ins
= 1;
552 params
->cost_del
= 1;
553 params
->cost_subst
= 1;
554 params
->max_cost
= INT_MAX
;
555 params
->max_ins
= INT_MAX
;
556 params
->max_del
= INT_MAX
;
557 params
->max_subst
= INT_MAX
;
558 params
->max_err
= INT_MAX
;
561 #endif /* TRE_APPROX */