2 tre_regexec.c - TRE POSIX compatible matching functions (and more).
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
11 #endif /* HAVE_CONFIG_H */
14 /* AIX requires this to be the first thing in the file. */
22 # ifndef alloca /* predefined by HP cc +Olibcalls */
28 #endif /* TRE_USE_ALLOCA */
35 #endif /* HAVE_WCHAR_H */
38 #endif /* HAVE_WCTYPE_H */
41 #endif /* !TRE_WCHAR */
44 #endif /* HAVE_MALLOC_H */
47 #include "tre-internal.h"
48 #include "tre-match-utils.h"
53 /* For each tre_last_matched_t in the lm array, find the last matched branch by
54 comparing the touch value of the cmp_tag's. For all other branches, reset
55 the corresponding tags. If reset_all is non-zero, reset all tags in all
56 branches. Recurse into the nested last matched structures, clearing tags as
59 tre_reset_last_matched_branches(tre_tag_t
*tags
, const tre_last_matched_t
*lm
,
60 int n
, int start_tag
, int reset_all
)
63 tre_last_matched_branch_t
*b
;
65 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
66 n
, start_tag
, reset_all
));
69 if (lm
->n_branches
== 1)
74 DPRINT((" b->cmp_tag=%d %d <? %d\n", b
->cmp_tag
,
75 tre_tag_touch_get(tags
, b
->cmp_tag
),
76 tre_tag_touch_get(tags
, start_tag
)));
77 reset
= (reset_all
|| tre_tag_touch_get(tags
, b
->cmp_tag
) <
78 tre_tag_touch_get(tags
, start_tag
));
87 for (i
= b
->n_tags
, t
= b
->tags
; i
> 0; i
--, t
++)
89 DPRINT((" Resetting t%d\n", *t
));
90 tre_tag_reset(tags
, *t
);
93 if (b
->n_last_matched
> 0)
94 tre_reset_last_matched_branches(tags
, b
->last_matched
,
96 lm
->start_tag
, reset
);
104 #endif /* TRE_DEBUG */
106 for (i
= lm
->n_branches
, b
= lm
->branches
; i
> 0; i
--, b
++)
109 int touch
= tre_tag_touch_get(tags
, t
);
115 #endif /* TRE_DEBUG */
118 DPRINT((" Last touched end tag t%d=%d\n", last
, max
));
121 for (i
= lm
->n_branches
, b
= lm
->branches
; i
> 0; i
--, b
++)
123 reset
= (reset_all
|| tre_tag_touch_get(tags
, b
->cmp_tag
) < max
);
129 for (j
= b
->n_tags
, t
= b
->tags
; j
> 0; j
--, t
++)
131 DPRINT((" Resetting t%d\n", *t
));
132 tre_tag_reset(tags
, *t
);
135 if (b
->n_last_matched
> 0)
136 tre_reset_last_matched_branches(tags
, b
->last_matched
,
138 lm
->start_tag
, reset
);
145 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
148 tre_fill_pmatch(size_t nmatch
, regmatch_t pmatch
[], int cflags
,
149 const tre_tnfa_t
*tnfa
, const tre_tag_t
*intags
, int match_eo
)
153 if (cflags
& REG_NOSUB
) return REG_OK
;
156 if (match_eo
>= 0 && intags
)
158 const tre_tag_t
*tags
= intags
;
159 tre_submatch_data_t
*submatch_data
;
161 if (tnfa
->last_matched_branch
&&
162 tnfa
->last_matched_branch
->n_last_matched
> 0)
165 #ifdef TRE_USE_ALLOCA
166 t
= alloca(sizeof(*t
) * tnfa
->num_tags
);
167 #else /* !TRE_USE_ALLOCA */
168 t
= xmalloc(sizeof(*t
) * tnfa
->num_tags
);
169 #endif /* !TRE_USE_ALLOCA */
170 if (!t
) return REG_ESPACE
;
171 memcpy(t
, intags
, tnfa
->num_tags
* sizeof(tre_tag_t
));
172 tre_reset_last_matched_branches(t
,
173 tnfa
->last_matched_branch
->last_matched
,
174 tnfa
->last_matched_branch
->n_last_matched
,
178 /* Construct submatch offsets from the tags. */
179 DPRINT(("end tag = t%d = %d\n", tnfa
->end_tag
, match_eo
));
180 submatch_data
= tnfa
->submatch_data
;
181 while (i
< tnfa
->num_submatches
&& i
< nmatch
)
183 if (submatch_data
[i
].so_tag
== tnfa
->end_tag
)
184 pmatch
[i
].rm_so
= match_eo
;
186 pmatch
[i
].rm_so
= tre_tag_get(tags
, submatch_data
[i
].so_tag
);
188 if (submatch_data
[i
].eo_tag
== tnfa
->end_tag
)
189 pmatch
[i
].rm_eo
= match_eo
;
191 pmatch
[i
].rm_eo
= tre_tag_get(tags
, submatch_data
[i
].eo_tag
);
193 /* If either of the endpoints were not used, this submatch
194 was not part of the match. */
195 if (pmatch
[i
].rm_so
== -1 || pmatch
[i
].rm_eo
== -1)
196 pmatch
[i
].rm_so
= pmatch
[i
].rm_eo
= -1;
198 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i
,
199 submatch_data
[i
].so_tag
, pmatch
[i
].rm_so
,
200 submatch_data
[i
].eo_tag
, pmatch
[i
].rm_eo
));
203 #ifndef TRE_USE_ALLOCA
204 if (tags
!= intags
) xfree(tags
);
205 #endif /* !TRE_USE_ALLOCA */
210 pmatch
[i
].rm_so
= -1;
211 pmatch
[i
].rm_eo
= -1;
220 Wrapper functions for POSIX compatible regexp matching.
225 tre_have_backrefs(const regex_t
*preg
)
227 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
228 return tnfa
->have_backrefs
;
233 tre_have_approx(const regex_t
*preg
)
235 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
236 return tnfa
->have_approx
;
238 #endif /* TRE_APPROX */
239 #endif /* !__LIBC__ */
242 tre_match(const tre_tnfa_t
*tnfa
, const void *string
, size_t len
,
243 tre_str_type_t type
, size_t nmatch
, regmatch_t pmatch
[],
246 reg_errcode_t status
;
247 tre_tag_t
*tags
= NULL
;
249 size_t offset
= 0, count
= 0;
250 if (tnfa
->num_tags
> 0 && nmatch
> 0)
252 #ifdef TRE_USE_ALLOCA
253 tags
= alloca(sizeof(*tags
) * tnfa
->num_tags
);
254 #else /* !TRE_USE_ALLOCA */
255 tags
= xmalloc(sizeof(*tags
) * tnfa
->num_tags
);
256 #endif /* !TRE_USE_ALLOCA */
264 #endif /* TRE_STR_USER */
265 (eflags
& REG_STARTEND
) && pmatch
)
267 if (pmatch
->rm_so
< 0)
269 if (len
== (size_t)-1)
271 if (pmatch
->rm_eo
< 0 || pmatch
->rm_so
> pmatch
->rm_eo
)
273 len
= pmatch
->rm_eo
- pmatch
->rm_so
;
275 count
= offset
= pmatch
->rm_so
;
276 if (type
== STR_WIDE
) offset
*= sizeof(wchar_t);
279 /* Dispatch to the appropriate matcher. */
280 if (tnfa
->have_backrefs
|| eflags
& REG_BACKTRACKING_MATCHER
)
282 /* The regex has back references, use the backtracking matcher. */
284 if (type
== STR_USER
)
286 const tre_str_source
*source
= string
;
287 if (source
->rewind
== NULL
|| source
->compare
== NULL
)
288 /* The backtracking matcher requires rewind and compare
289 capabilities from the input stream. */
292 #endif /* TRE_STR_USER */
293 status
= tre_tnfa_run_backtrack(tnfa
, string
+ offset
, (int)len
, type
,
297 else if (tnfa
->have_approx
|| eflags
& REG_APPROX_MATCHER
)
299 /* The regex uses approximate matching, use the approximate matcher. */
302 tre_regaparams_default(¶ms
);
305 status
= tre_tnfa_run_approx(tnfa
, string
+ offset
, (int)len
, type
, tags
,
306 &match
, params
, eflags
, &eo
);
308 #endif /* TRE_APPROX */
311 /* Exact matching, no back references, use the parallel matcher. */
312 status
= tre_tnfa_run_parallel(tnfa
, string
+ offset
, (int)len
, type
,
316 if (status
== REG_OK
)
318 /* A match was found, so fill the submatch registers. */
319 status
= tre_fill_pmatch(nmatch
, pmatch
, tnfa
->cflags
, tnfa
, tags
, eo
);
320 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
321 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
322 tre_fill_pmatch itself). */
323 if (status
== REG_OK
&& !(tnfa
->cflags
& REG_NOSUB
) &&
326 #endif /* TRE_STR_USER */
327 (eflags
& REG_STARTEND
) && pmatch
&& nmatch
> 0)
331 for (i
= nmatch
, p
= pmatch
; i
> 0; p
++, i
--)
333 if (p
->rm_so
>= 0) p
->rm_so
+= count
;
334 if (p
->rm_eo
>= 0) p
->rm_eo
+= count
;
338 #ifndef TRE_USE_ALLOCA
341 #endif /* !TRE_USE_ALLOCA */
346 tre_regnexec(const regex_t
*preg
, const char *str
, size_t len
,
347 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
349 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
350 tre_str_type_t type
= (TRE_MB_CUR_MAX_L(tnfa
->loc
) == 1) ? STR_BYTE
: STR_MBS
;
352 #ifdef TRE_USE_SYSTEM_REGEX_H
353 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
354 #endif /* TRE_USE_SYSTEM_REGEX_H */
356 return tre_match(tnfa
, str
, len
, type
, nmatch
, pmatch
, eflags
);
360 tre_regexec(const regex_t
*preg
, const char *str
,
361 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
363 return tre_regnexec(preg
, str
, (size_t)-1, nmatch
, pmatch
, eflags
);
370 tre_regwnexec(const regex_t
*preg
, const wchar_t *str
, size_t len
,
371 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
373 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
375 #ifdef TRE_USE_SYSTEM_REGEX_H
376 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
377 #endif /* TRE_USE_SYSTEM_REGEX_H */
379 return tre_match(tnfa
, str
, len
, STR_WIDE
, nmatch
, pmatch
, eflags
);
383 tre_regwexec(const regex_t
*preg
, const wchar_t *str
,
384 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
386 return tre_regwnexec(preg
, str
, (size_t)-1, nmatch
, pmatch
, eflags
);
389 #endif /* TRE_WCHAR */
393 tre_reguexec(const regex_t
*preg
, const tre_str_source
*str
,
394 size_t nmatch
, regmatch_t pmatch
[], int eflags
)
396 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
398 #ifdef TRE_USE_SYSTEM_REGEX_H
399 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
400 #endif /* TRE_USE_SYSTEM_REGEX_H */
402 return tre_match(tnfa
, str
, (size_t)-1, STR_USER
, nmatch
, pmatch
, eflags
);
404 #endif /* TRE_STR_USER */
410 Wrapper functions for approximate regexp matching.
414 tre_match_approx(const tre_tnfa_t
*tnfa
, const void *string
, size_t len
,
415 tre_str_type_t type
, regamatch_t
*match
, regaparams_t params
,
418 reg_errcode_t status
;
419 tre_tag_t
*tags
= NULL
;
421 size_t offset
= 0, count
= 0;
423 /* If the regexp does not use approximate matching features, the
424 maximum cost is zero, and the approximate matcher isn't forced,
425 use the exact matcher instead. */
426 if (params
.max_cost
== 0 && !tnfa
->have_approx
427 && !(eflags
& REG_APPROX_MATCHER
))
428 return tre_match(tnfa
, string
, len
, type
, match
->nmatch
, match
->pmatch
,
431 /* Back references are not supported by the approximate matcher. */
432 if (tnfa
->have_backrefs
)
435 if (tnfa
->num_tags
> 0 && match
->nmatch
> 0)
438 tags
= alloca(sizeof(*tags
) * tnfa
->num_tags
);
439 #else /* !TRE_USE_ALLOCA */
440 tags
= xmalloc(sizeof(*tags
) * tnfa
->num_tags
);
441 #endif /* !TRE_USE_ALLOCA */
449 #endif /* TRE_STR_USER */
450 (eflags
& REG_STARTEND
) && match
->pmatch
)
452 if (match
->pmatch
->rm_so
< 0)
454 if (len
== (size_t)-1)
456 if (match
->pmatch
->rm_eo
< 0 || match
->pmatch
->rm_so
>
457 match
->pmatch
->rm_eo
)
459 len
= match
->pmatch
->rm_eo
- match
->pmatch
->rm_so
;
461 count
= offset
= match
->pmatch
->rm_so
;
462 if (type
== STR_WIDE
) offset
*= sizeof(wchar_t);
465 status
= tre_tnfa_run_approx(tnfa
, string
, (int)len
, type
, tags
,
466 match
, params
, eflags
, &eo
);
467 if (status
== REG_OK
)
469 status
= tre_fill_pmatch(match
->nmatch
, match
->pmatch
, tnfa
->cflags
,
471 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
472 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
473 tre_fill_pmatch itself). */
474 if (status
== REG_OK
&& !(tnfa
->cflags
& REG_NOSUB
) &&
477 #endif /* TRE_STR_USER */
478 (eflags
& REG_STARTEND
) && match
->pmatch
&& match
->nmatch
> 0)
482 for (i
= match
->nmatch
, p
= match
->pmatch
; i
> 0; p
++, i
--)
484 if (p
->rm_so
>= 0) p
->rm_so
+= count
;
485 if (p
->rm_eo
>= 0) p
->rm_eo
+= count
;
489 #ifndef TRE_USE_ALLOCA
492 #endif /* !TRE_USE_ALLOCA */
497 tre_reganexec(const regex_t
*preg
, const char *str
, size_t len
,
498 regamatch_t
*match
, regaparams_t params
, int eflags
)
500 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
501 tre_str_type_t type
= (TRE_MB_CUR_MAX_L(tnfa
->loc
) == 1) ? STR_BYTE
: STR_MBS
;
503 #ifdef TRE_USE_SYSTEM_REGEX_H
504 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
505 #endif /* TRE_USE_SYSTEM_REGEX_H */
507 return tre_match_approx(tnfa
, str
, len
, type
, match
, params
, eflags
);
511 tre_regaexec(const regex_t
*preg
, const char *str
,
512 regamatch_t
*match
, regaparams_t params
, int eflags
)
514 return tre_reganexec(preg
, str
, (size_t)-1, match
, params
, eflags
);
520 tre_regawnexec(const regex_t
*preg
, const wchar_t *str
, size_t len
,
521 regamatch_t
*match
, regaparams_t params
, int eflags
)
523 tre_tnfa_t
*tnfa
= (void *)preg
->TRE_REGEX_T_FIELD
;
525 #ifdef TRE_USE_SYSTEM_REGEX_H
526 if (preg
->re_magic
!= RE_MAGIC
) return REG_BADPAT
;
527 #endif /* TRE_USE_SYSTEM_REGEX_H */
529 return tre_match_approx(tnfa
, str
, len
, STR_WIDE
,
530 match
, params
, eflags
);
534 tre_regawexec(const regex_t
*preg
, const wchar_t *str
,
535 regamatch_t
*match
, regaparams_t params
, int eflags
)
537 return tre_regawnexec(preg
, str
, (size_t)-1, match
, params
, eflags
);
540 #endif /* TRE_WCHAR */
543 tre_regaparams_default(regaparams_t
*params
)
545 memset(params
, 0, sizeof(*params
));
546 params
->cost_ins
= 1;
547 params
->cost_del
= 1;
548 params
->cost_subst
= 1;
549 params
->max_cost
= INT_MAX
;
550 params
->max_ins
= INT_MAX
;
551 params
->max_del
= INT_MAX
;
552 params
->max_subst
= INT_MAX
;
553 params
->max_err
= INT_MAX
;
556 #endif /* TRE_APPROX */