2 tre_regexec.c - TRE POSIX compatible matching functions (and more).
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
11 #endif /* HAVE_CONFIG_H */
13 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
18 /* AIX requires this to be the first thing in the file. */
26 # ifndef alloca /* predefined by HP cc +Olibcalls */
32 #endif /* TRE_USE_ALLOCA */
39 #endif /* HAVE_WCHAR_H */
42 #endif /* HAVE_WCTYPE_H */
45 #endif /* !TRE_WCHAR */
48 #endif /* HAVE_MALLOC_H */
51 #include "tre-internal.h"
52 #include "tre-match-utils.h"
57 /* For each tre_last_matched_t in the lm array, find the last matched branch by
58 comparing the touch value of the cmp_tag's. For all other branches, reset
59 the corresponding tags. If reset_all is non-zero, reset all tags in all
60 branches. Recurse into the nested last matched structures, clearing tags as
63 tre_reset_last_matched_branches(tre_tag_t
*tags
, const tre_last_matched_t
*lm
,
64 int n
, int start_tag
, int reset_all
)
67 tre_last_matched_branch_t
*b
;
69 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
70 n
, start_tag
, reset_all
));
73 if (lm
->n_branches
== 1)
78 DPRINT((" b->cmp_tag=%d %d <? %d\n", b
->cmp_tag
,
79 tre_tag_touch_get(tags
, b
->cmp_tag
),
80 tre_tag_touch_get(tags
, start_tag
)));
81 reset
= (reset_all
|| tre_tag_touch_get(tags
, b
->cmp_tag
) <
82 tre_tag_touch_get(tags
, start_tag
));
91 for (i
= b
->n_tags
, t
= b
->tags
; i
> 0; i
--, t
++)
93 DPRINT((" Resetting t%d\n", *t
));
94 tre_tag_reset(tags
, *t
);
97 if (b
->n_last_matched
> 0)
98 tre_reset_last_matched_branches(tags
, b
->last_matched
,
100 lm
->start_tag
, reset
);
108 #endif /* TRE_DEBUG */
110 for (i
= lm
->n_branches
, b
= lm
->branches
; i
> 0; i
--, b
++)
113 int touch
= tre_tag_touch_get(tags
, t
);
119 #endif /* TRE_DEBUG */
122 DPRINT((" Last touched end tag t%d=%d\n", last
, max
));
125 for (i
= lm
->n_branches
, b
= lm
->branches
; i
> 0; i
--, b
++)
127 reset
= (reset_all
|| tre_tag_touch_get(tags
, b
->cmp_tag
) < max
);
133 for (j
= b
->n_tags
, t
= b
->tags
; j
> 0; j
--, t
++)
135 DPRINT((" Resetting t%d\n", *t
));
136 tre_tag_reset(tags
, *t
);
139 if (b
->n_last_matched
> 0)
140 tre_reset_last_matched_branches(tags
, b
->last_matched
,
142 lm
->start_tag
, reset
);
149 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
152 tre_fill_pmatch(size_t nmatch
, regmatch_t pmatch
[], int cflags
,
153 const tre_tnfa_t
*tnfa
, const tre_tag_t
*intags
, int match_eo
)
157 if (cflags
& REG_NOSUB
) return REG_OK
;
160 if (match_eo
>= 0 && intags
)
162 const tre_tag_t
*tags
= intags
;
163 tre_submatch_data_t
*submatch_data
;
165 if (tnfa
->last_matched_branch
&&
166 tnfa
->last_matched_branch
->n_last_matched
> 0)
169 #ifdef TRE_USE_ALLOCA
170 t
= alloca(sizeof(*t
) * tnfa
->num_tags
);
171 #else /* !TRE_USE_ALLOCA */
172 t
= xmalloc(sizeof(*t
) * tnfa
->num_tags
);
173 #endif /* !TRE_USE_ALLOCA */
174 if (!t
) return REG_ESPACE
;
175 memcpy(t
, intags
, tnfa
->num_tags
* sizeof(tre_tag_t
));
176 tre_reset_last_matched_branches(t
,
177 tnfa
->last_matched_branch
->last_matched
,
178 tnfa
->last_matched_branch
->n_last_matched
,
182 /* Construct submatch offsets from the tags. */
183 DPRINT(("end tag = t%d = %d\n", tnfa
->end_tag
, match_eo
));
184 submatch_data
= tnfa
->submatch_data
;
185 while (i
< tnfa
->num_submatches
&& i
< nmatch
)
187 if (submatch_data
[i
].so_tag
== tnfa
->end_tag
)
188 pmatch
[i
].rm_so
= match_eo
;
190 pmatch
[i
].rm_so
= tre_tag_get(tags
, submatch_data
[i
].so_tag
);
192 if (submatch_data
[i
].eo_tag
== tnfa
->end_tag
)
193 pmatch
[i
].rm_eo
= match_eo
;
195 pmatch
[i
].rm_eo
= tre_tag_get(tags
, submatch_data
[i
].eo_tag
);
197 /* If either of the endpoints were not used, this submatch
198 was not part of the match. */
199 if (pmatch
[i
].rm_so
== -1 || pmatch
[i
].rm_eo
== -1)
200 pmatch
[i
].rm_so
= pmatch
[i
].rm_eo
= -1;
202 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i
,
203 submatch_data
[i
].so_tag
, pmatch
[i
].rm_so
,
204 submatch_data
[i
].eo_tag
, pmatch
[i
].rm_eo
));
207 #ifndef TRE_USE_ALLOCA
208 if (tags
!= intags
) xfree(tags
);
209 #endif /* !TRE_USE_ALLOCA */
214 pmatch
[i
].rm_so
= -1;
215 pmatch
[i
].rm_eo
= -1;
224 Wrapper functions for POSIX compatible regexp matching.
229 tre_have_backrefs(const regex_t
*preg
)
231 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
232 return tnfa
->have_backrefs
;
237 tre_have_approx(const regex_t
*preg
)
239 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
240 return tnfa
->have_approx
;
242 #endif /* TRE_APPROX */
243 #endif /* !__LIBC__ */
246 tre_match(const tre_tnfa_t
*tnfa
, const void *string
, size_t len
,
247 tre_str_type_t type
, size_t nmatch
, regmatch_t pmatch
[],
250 reg_errcode_t status
;
251 tre_tag_t
*tags
= NULL
;
253 size_t offset
= 0, count
= 0;
254 if (tnfa
->num_tags
> 0 && nmatch
> 0)
256 #ifdef TRE_USE_ALLOCA
257 tags
= alloca(sizeof(*tags
) * tnfa
->num_tags
);
258 #else /* !TRE_USE_ALLOCA */
259 tags
= xmalloc(sizeof(*tags
) * tnfa
->num_tags
);
260 #endif /* !TRE_USE_ALLOCA */
268 #endif /* TRE_STR_USER */
269 (eflags
& REG_STARTEND
) && pmatch
)
271 if (pmatch
->rm_so
< 0)
273 if (len
== (size_t)-1)
275 if (pmatch
->rm_eo
< 0 || pmatch
->rm_so
> pmatch
->rm_eo
)
277 len
= pmatch
->rm_eo
- pmatch
->rm_so
;
279 count
= offset
= pmatch
->rm_so
;
280 if (type
== STR_WIDE
) offset
*= sizeof(wchar_t);
283 /* Dispatch to the appropriate matcher. */
284 if (tnfa
->have_backrefs
|| eflags
& REG_BACKTRACKING_MATCHER
)
286 /* The regex has back references, use the backtracking matcher. */
288 if (type
== STR_USER
)
290 const tre_str_source
*source
= string
;
291 if (source
->rewind
== NULL
|| source
->compare
== NULL
)
292 /* The backtracking matcher requires rewind and compare
293 capabilities from the input stream. */
296 #endif /* TRE_STR_USER */
297 status
= tre_tnfa_run_backtrack(tnfa
, string
+ offset
, (int)len
, type
,
301 else if (tnfa
->have_approx
|| eflags
& REG_APPROX_MATCHER
)
303 /* The regex uses approximate matching, use the approximate matcher. */
306 tre_regaparams_default(¶ms
);
309 status
= tre_tnfa_run_approx(tnfa
, string
+ offset
, (int)len
, type
, tags
,
310 &match
, params
, eflags
, &eo
);
312 #endif /* TRE_APPROX */
315 /* Exact matching, no back references, use the parallel matcher. */
316 status
= tre_tnfa_run_parallel(tnfa
, string
+ offset
, (int)len
, type
,
320 if (status
== REG_OK
)
322 /* A match was found, so fill the submatch registers. */
323 status
= tre_fill_pmatch(nmatch
, pmatch
, tnfa
->cflags
, tnfa
, tags
, eo
);
324 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
325 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
326 tre_fill_pmatch itself). */
327 if (status
== REG_OK
&& !(tnfa
->cflags
& REG_NOSUB
) &&
330 #endif /* TRE_STR_USER */
331 (eflags
& REG_STARTEND
) && pmatch
&& nmatch
> 0)
335 for (i
= nmatch
, p
= pmatch
; i
> 0; p
++, i
--)
337 if (p
->rm_so
>= 0) p
->rm_so
+= count
;
338 if (p
->rm_eo
>= 0) p
->rm_eo
+= count
;
342 #ifndef TRE_USE_ALLOCA
345 #endif /* !TRE_USE_ALLOCA */
350 tre_regnexec(const regex_t
*preg
, const char *str
, size_t len
,
351 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
353 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
354 tre_str_type_t type
= (TRE_MB_CUR_MAX_L(tnfa
->loc
) == 1) ? STR_BYTE
: STR_MBS
;
356 #ifdef TRE_USE_SYSTEM_REGEX_H
357 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
358 #endif /* TRE_USE_SYSTEM_REGEX_H */
360 return tre_match(tnfa
, str
, len
, type
, nmatch
, pmatch
, eflags
);
364 tre_regexec(const regex_t
*preg
, const char *str
,
365 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
367 return tre_regnexec(preg
, str
, (size_t)-1, nmatch
, pmatch
, eflags
);
374 tre_regwnexec(const regex_t
*preg
, const wchar_t *str
, size_t len
,
375 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
377 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
379 #ifdef TRE_USE_SYSTEM_REGEX_H
380 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
381 #endif /* TRE_USE_SYSTEM_REGEX_H */
383 return tre_match(tnfa
, str
, len
, STR_WIDE
, nmatch
, pmatch
, eflags
);
387 tre_regwexec(const regex_t
*preg
, const wchar_t *str
,
388 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
390 return tre_regwnexec(preg
, str
, (size_t)-1, nmatch
, pmatch
, eflags
);
393 #endif /* TRE_WCHAR */
397 tre_reguexec(const regex_t
*preg
, const tre_str_source
*str
,
398 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
400 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
402 #ifdef TRE_USE_SYSTEM_REGEX_H
403 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
404 #endif /* TRE_USE_SYSTEM_REGEX_H */
406 return tre_match(tnfa
, str
, (size_t)-1, STR_USER
, nmatch
, pmatch
, eflags
);
408 #endif /* TRE_STR_USER */
414 Wrapper functions for approximate regexp matching.
418 tre_match_approx(const tre_tnfa_t
*tnfa
, const void *string
, size_t len
,
419 tre_str_type_t type
, regamatch_t
*match
, regaparams_t params
,
422 reg_errcode_t status
;
423 tre_tag_t
*tags
= NULL
;
425 size_t offset
= 0, count
= 0;
427 /* If the regexp does not use approximate matching features, the
428 maximum cost is zero, and the approximate matcher isn't forced,
429 use the exact matcher instead. */
430 if (params
.max_cost
== 0 && !tnfa
->have_approx
431 && !(eflags
& REG_APPROX_MATCHER
))
432 return tre_match(tnfa
, string
, len
, type
, match
->nmatch
, match
->pmatch
,
435 /* Back references are not supported by the approximate matcher. */
436 if (tnfa
->have_backrefs
)
439 if (tnfa
->num_tags
> 0 && match
->nmatch
> 0)
442 tags
= alloca(sizeof(*tags
) * tnfa
->num_tags
);
443 #else /* !TRE_USE_ALLOCA */
444 tags
= xmalloc(sizeof(*tags
) * tnfa
->num_tags
);
445 #endif /* !TRE_USE_ALLOCA */
453 #endif /* TRE_STR_USER */
454 (eflags
& REG_STARTEND
) && match
->pmatch
)
456 if (match
->pmatch
->rm_so
< 0)
458 if (len
== (size_t)-1)
460 if (match
->pmatch
->rm_eo
< 0 || match
->pmatch
->rm_so
>
461 match
->pmatch
->rm_eo
)
463 len
= match
->pmatch
->rm_eo
- match
->pmatch
->rm_so
;
465 count
= offset
= match
->pmatch
->rm_so
;
466 if (type
== STR_WIDE
) offset
*= sizeof(wchar_t);
469 status
= tre_tnfa_run_approx(tnfa
, string
, (int)len
, type
, tags
,
470 match
, params
, eflags
, &eo
);
471 if (status
== REG_OK
)
473 status
= tre_fill_pmatch(match
->nmatch
, match
->pmatch
, tnfa
->cflags
,
475 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
476 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
477 tre_fill_pmatch itself). */
478 if (status
== REG_OK
&& !(tnfa
->cflags
& REG_NOSUB
) &&
481 #endif /* TRE_STR_USER */
482 (eflags
& REG_STARTEND
) && match
->pmatch
&& match
->nmatch
> 0)
486 for (i
= match
->nmatch
, p
= match
->pmatch
; i
> 0; p
++, i
--)
488 if (p
->rm_so
>= 0) p
->rm_so
+= count
;
489 if (p
->rm_eo
>= 0) p
->rm_eo
+= count
;
493 #ifndef TRE_USE_ALLOCA
496 #endif /* !TRE_USE_ALLOCA */
501 tre_reganexec(const regex_t
*preg
, const char *str
, size_t len
,
502 regamatch_t
*match
, regaparams_t params
, int eflags
)
504 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
505 tre_str_type_t type
= (TRE_MB_CUR_MAX_L(tnfa
->loc
) == 1) ? STR_BYTE
: STR_MBS
;
507 #ifdef TRE_USE_SYSTEM_REGEX_H
508 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
509 #endif /* TRE_USE_SYSTEM_REGEX_H */
511 return tre_match_approx(tnfa
, str
, len
, type
, match
, params
, eflags
);
515 tre_regaexec(const regex_t
*preg
, const char *str
,
516 regamatch_t
*match
, regaparams_t params
, int eflags
)
518 return tre_reganexec(preg
, str
, (size_t)-1, match
, params
, eflags
);
524 tre_regawnexec(const regex_t
*preg
, const wchar_t *str
, size_t len
,
525 regamatch_t
*match
, regaparams_t params
, int eflags
)
527 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
529 #ifdef TRE_USE_SYSTEM_REGEX_H
530 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
531 #endif /* TRE_USE_SYSTEM_REGEX_H */
533 return tre_match_approx(tnfa
, str
, len
, STR_WIDE
,
534 match
, params
, eflags
);
538 tre_regawexec(const regex_t
*preg
, const wchar_t *str
,
539 regamatch_t
*match
, regaparams_t params
, int eflags
)
541 return tre_regawnexec(preg
, str
, (size_t)-1, match
, params
, eflags
);
544 #endif /* TRE_WCHAR */
547 tre_regaparams_default(regaparams_t
*params
)
549 memset(params
, 0, sizeof(*params
));
550 params
->cost_ins
= 1;
551 params
->cost_del
= 1;
552 params
->cost_subst
= 1;
553 params
->max_cost
= INT_MAX
;
554 params
->max_ins
= INT_MAX
;
555 params
->max_del
= INT_MAX
;
556 params
->max_subst
= INT_MAX
;
557 params
->max_err
= INT_MAX
;
560 #endif /* TRE_APPROX */