2 tre-match-utils.h - TRE matcher helper definitions
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
9 #include "tre-internal.h"
/* Shorthand that reinterprets the variable named `string` at the point of
 * use as a `const tre_str_source *`.  The STR_USER branches of
 * GET_NEXT_WCHAR() use it to reach the source's get_next_char callback and
 * its context pointer.  Requires a variable called `string` to be in scope. */
11 #define str_source ((const tre_str_source*)string)
17 /* Wide character and multibyte support. */
20 #error TRE_STR_USER defined
21 #define GET_NEXT_WCHAR() \
24 if (type == STR_BYTE) \
27 if (len >= 0 && pos >= len) \
30 next_c = (unsigned char)(*str_byte++); \
32 else if (type == STR_WIDE) \
35 if (len >= 0 && pos >= len) \
38 next_c = *str_wide++; \
40 else if (type == STR_MBS) \
42 pos += pos_add_next; \
43 if (str_byte == NULL) \
60 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \
62 if (w == (size_t)-1 || w == (size_t)-2) \
64 if (w == 0 && len >= 0) \
78 else if (type == STR_USER) \
80 pos += pos_add_next; \
81 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
82 str_source->context); \
84 } while(/*CONSTCOND*/0)
85 #else /* !TRE_STR_USER */
87 * Because all multibyte encodings are exclusively single-shift encoding,
88 * with the shift codes having the high bit set, we can make an optimization
89 * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
90 * is detected, and just do a direct copy for ASCII characters.
92 #define GET_NEXT_WCHAR() \
99 if (len >= 0 && pos >= len) \
102 next_c = (unsigned char)(*str_byte++); \
106 if (len >= 0 && pos >= len) \
109 next_c = *str_wide++; \
112 pos += pos_add_next; \
113 if (__builtin_expect(len >= 0 && pos >= len, 0)) \
118 else if (__builtin_expect(!(*str_byte & 0x80), 1)) \
120 next_c = (unsigned char)(*str_byte++); \
131 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \
133 if (w == (size_t)-1 || w == (size_t)-2) \
135 if (w == 0 && len >= 0) \
149 } while(/*CONSTCOND*/0)
150 #endif /* !TRE_STR_USER */
152 #else /* !TRE_MULTIBYTE */
154 /* Wide character support, no multibyte support. */
155 #error TRE_MULTIBYTE undefined
158 #define GET_NEXT_WCHAR() \
161 if (type == STR_BYTE) \
164 if (len >= 0 && pos >= len) \
167 next_c = (unsigned char)(*str_byte++); \
169 else if (type == STR_WIDE) \
172 if (len >= 0 && pos >= len) \
175 next_c = *str_wide++; \
177 else if (type == STR_USER) \
179 pos += pos_add_next; \
180 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
181 str_source->context); \
183 } while(/*CONSTCOND*/0)
184 #else /* !TRE_STR_USER */
185 #define GET_NEXT_WCHAR() \
188 if (type == STR_BYTE) \
191 if (len >= 0 && pos >= len) \
194 next_c = (unsigned char)(*str_byte++); \
196 else if (type == STR_WIDE) \
199 if (len >= 0 && pos >= len) \
202 next_c = *str_wide++; \
204 } while(/*CONSTCOND*/0)
205 #endif /* !TRE_STR_USER */
207 #endif /* !TRE_MULTIBYTE */
209 #else /* !TRE_WCHAR */
211 /* No wide character or multibyte support. */
212 #error TRE_WCHAR undefined
215 #define GET_NEXT_WCHAR() \
218 if (type == STR_BYTE) \
221 if (len >= 0 && pos >= len) \
224 next_c = (unsigned char)(*str_byte++); \
226 else if (type == STR_USER) \
228 pos += pos_add_next; \
229 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
230 str_source->context); \
232 } while(/*CONSTCOND*/0)
233 #else /* !TRE_STR_USER */
234 #define GET_NEXT_WCHAR() \
237 if (type == STR_BYTE) \
240 if (len >= 0 && pos >= len) \
243 next_c = (unsigned char)(*str_byte++); \
245 } while(/*CONSTCOND*/0)
246 #endif /* !TRE_STR_USER */
248 #endif /* !TRE_WCHAR */
/*
 * IS_WORD_CHAR(c): nonzero when `c` is an underscore or is alphanumeric
 * according to tre_isalnum_l() in the TNFA's locale.
 *
 * Assumes a `tre_tnfa_t *tnfa` is in scope at the point of use.  `c` may be
 * evaluated twice, so it must not have side effects.
 */
#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))

/*
 * CHECK_ASSERTIONS(assertions): nonzero when at least one assertion bit set
 * in `assertions` does NOT hold at the current position; zero when every
 * requested assertion is satisfied (or none is requested).
 *
 * Reads these names from the expansion site: pos, prev_c, next_c,
 * reg_notbol, reg_noteol, reg_newline (and tnfa, through IS_WORD_CHAR).
 *
 * An assertion holds when:
 *   ASSERT_AT_BOL     pos == 0 and !reg_notbol, or prev_c is '\n' with
 *                     reg_newline set
 *   ASSERT_AT_EOL     next_c is '\0' and !reg_noteol, or next_c is '\n'
 *                     with reg_newline set
 *   ASSERT_AT_BOW     prev_c is not a word character and next_c is
 *   ASSERT_AT_EOW     prev_c is a word character and next_c is not
 *   ASSERT_AT_WB      pos == 0, or next_c is '\0', or prev_c and next_c
 *                     differ in word-ness
 *   ASSERT_AT_WB_NEG  pos != 0, next_c is not '\0', and prev_c and next_c
 *                     have the same word-ness
 *
 * The argument is parenthesized at every use so that a compound expression
 * such as `a | b` is masked as a whole (`&` binds tighter than `|`).
 */
#define CHECK_ASSERTIONS(assertions)                                  \
  ((((assertions) & ASSERT_AT_BOL)                                    \
    && (pos > 0 || reg_notbol)                                        \
    && (prev_c != L'\n' || !reg_newline))                             \
   || (((assertions) & ASSERT_AT_EOL)                                 \
       && (next_c != L'\0' || reg_noteol)                             \
       && (next_c != L'\n' || !reg_newline))                          \
   || (((assertions) & ASSERT_AT_BOW)                                 \
       && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))            \
   || (((assertions) & ASSERT_AT_EOW)                                 \
       && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))            \
   || (((assertions) & ASSERT_AT_WB)                                  \
       && (pos != 0 && next_c != L'\0'                                \
           && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))          \
   || (((assertions) & ASSERT_AT_WB_NEG)                              \
       && (pos == 0 || next_c == L'\0'                                \
           || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
273 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
274 ((trans_i->assertions & ASSERT_BRACKET_MATCH) \
275 && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \
282 tre_tag_get(const tre_tag_t
*tags
, int i
)
285 return tags
->count
> 0 ? tags
->value
: -1;
289 tre_tag_set(tre_tag_t
*tags
, int i
, int val
, int touch
)
292 if (tags
->count
++ == 0)
299 tre_tag_reset(tre_tag_t
*tags
, int i
)
305 tre_tag_touch_get(const tre_tag_t
*tags
, int i
)
307 return tags
[i
].touch
;
312 tre_print_tags(const tre_tag_t
*tags
, int num_tags
)
315 for (i
= 0; i
< num_tags
; i
++, tags
++)
320 DPRINT(("%d:(0,-1)", i
));
323 DPRINT(("%d:(1,%d)", i
, tags
->first
));
326 DPRINT(("%d:(%d,%d,%d)", i
, tags
->count
, tags
->first
,
330 if (i
< (num_tags
- 1))
336 tre_print_tags_all(const tre_tag_t
*tags
, int num_tags
)
339 for (i
= 0; i
< num_tags
; i
++, tags
++)
344 DPRINT(("%d:(0,-1)/%d", i
, tags
->touch
));
347 DPRINT(("%d:(1,%d)/%d", i
, tags
->first
, tags
->touch
));
350 DPRINT(("%d:(%d,%d,%d)/%d", i
, tags
->count
, tags
->first
,
351 tags
->value
, tags
->touch
));
354 if (i
< (num_tags
- 1))
358 #endif /* TRE_DEBUG */
360 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
361 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
363 tre_minimal_tag_order(int start
, int end
, const tre_tag_t
*tags1
,
364 const tre_tag_t
*tags2
)
366 const tre_tag_t
*t1
, *t2
;
370 /* We need both start tags to be set */
371 if (t1
->count
== 0 || t2
->count
== 0)
374 /* The start tags must be equal */
375 if (t1
->value
!= t2
->value
)
380 /* For the end tags, we prefer set over unset, because unset means that
381 * the end tag is still growing */
384 /* if t2 is set, t1 loses since it is unset */
388 /* if t2 not set, t1 wins since it is set */
389 else if (t2
->count
== 0)
392 /* least current value wins */
393 return t2
->value
- t1
->value
;
396 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
397 * (t1 loses if < 0, t1 wins if > 0) */
399 tre_tag_order_1(int i
, tre_tag_direction_t dir
, const tre_tag_t
*t1
,
408 case TRE_TAG_MINIMIZE
:
409 /* least current value wins (because tags are initialized to all zeros,
410 * unset wins over set; also, tre_minimal_tag_order() will have already
411 * been run, which checks for being unset) */
412 return t2
->value
- t1
->value
;
414 case TRE_TAG_MAXIMIZE
:
418 /* if neither t1 nor t2 is set, try next tag */
421 /* t2 is set, t1 loses since it is unset */
424 /* if t2 not set, t1 wins since it is set */
425 else if (t2
->count
== 0)
427 /* greatest initial value wins */
428 if ((diff
= t1
->first
- t2
->first
) != 0)
430 /* least number of times the tag was set, wins */
431 if ((diff
= t2
->count
- t1
->count
) != 0)
433 /* if the tags were only set once, they only have initial values */
436 /* greatest current value wins */
437 return t1
->value
- t2
->value
;
439 case TRE_TAG_LEFT_MAXIMIZE
:
443 /* if neither t1 nor t2 is set, try next tag */
446 /* t2 is set, t1 loses since it is unset */
449 /* if t2 not set, t1 wins since it is set */
450 else if (t2
->count
== 0)
452 /* least initial value wins */
453 if ((diff
= t2
->first
- t1
->first
) != 0)
455 /* least number of times the tag was set, wins */
456 if ((diff
= t2
->count
- t1
->count
) != 0)
458 /* if the tags were only set once, they only have initial values */
461 /* greatest current value wins */
462 return t1
->value
- t2
->value
;
465 /* Shouldn't happen: only assert if TRE_DEBUG defined */
473 #define _MORE_DEBUGGING
474 #endif /* TRE_DEBUG */
476 /* Returns 1 if `t1' wins over `t2', 0 otherwise. */
478 #ifdef _MORE_DEBUGGING
479 _tre_tag_order(int num_tags
, tre_tag_direction_t
*tag_directions
,
480 const tre_tag_t
*t1
, const tre_tag_t
*t2
)
481 #else /* !_MORE_DEBUGGING */
482 tre_tag_order(int num_tags
, tre_tag_direction_t
*tag_directions
,
483 const tre_tag_t
*t1
, const tre_tag_t
*t2
)
484 #endif /* !_MORE_DEBUGGING */
488 for (i
= 0; i
< num_tags
; i
++)
490 if ((ret
= tre_tag_order_1(i
, tag_directions
[i
], t1
, t2
)) != 0)
497 #ifdef _MORE_DEBUGGING
499 tre_tag_order(int num_tags
, tre_tag_direction_t
*tag_directions
,
500 const tre_tag_t
*t1
, const tre_tag_t
*t2
)
502 int ret
= _tre_tag_order(num_tags
, tag_directions
, t1
, t2
);
503 DPRINT(("tre_tag_order: "));
504 tre_print_tags(t1
, num_tags
);
505 DPRINT((" %s ", ret
? "wins" : "doesn't win"));
506 tre_print_tags(t2
, num_tags
);
510 #endif /* _MORE_DEBUGGING */
513 #include <xlocale_private.h>
514 #else /* !__LIBC__ */
516 #endif /* !__LIBC__ */
/* Collation helper; its definition is not in the visible part of this file.
 * tre_bracket_match() calls it with the TNFA's locale, the address of a
 * single character value and a length of 1, then compares the returned int
 * against bracket range bounds and equivalence-class values. */
518 int __collate_equiv_value(locale_t loc
, const wchar_t *str
, size_t len
);
521 tre_bracket_match(tre_bracket_match_list_t
* __restrict list
, tre_cint_t wc
,
522 const tre_tnfa_t
* __restrict tnfa
)
526 tre_bracket_match_t
*b
;
528 int we
, ue
, le
, got_equiv
= 0;
529 int icase
= ((tnfa
->cflags
& REG_ICASE
) != 0);
531 DPRINT(("tre_bracket_match: %p, %d, %d\n", list
, wc
, icase
));
534 if (tre_islower_l(wc
, tnfa
->loc
))
537 uc
= tre_toupper_l(wc
, tnfa
->loc
);
539 else if (tre_isupper_l(wc
, tnfa
->loc
))
542 lc
= tre_tolower_l(wc
, tnfa
->loc
);
549 for (i
= 0, b
= list
->bracket_matches
; i
< list
->num_bracket_matches
;
554 case TRE_BRACKET_MATCH_TYPE_CHAR
:
556 match
= (b
->value
== uc
|| b
->value
== lc
);
558 match
= (b
->value
== wc
);
560 case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN
:
562 tre_cint_t start
= b
->value
, end
;
563 if (++i
>= list
->num_bracket_matches
||
564 (++b
)->type
!= TRE_BRACKET_MATCH_TYPE_RANGE_END
)
566 DPRINT(("tre_bracket_match: no following range end\n"));
575 ue
= __collate_equiv_value(tnfa
->loc
, &uc
, 1);
576 le
= __collate_equiv_value(tnfa
->loc
, &lc
, 1);
579 we
= __collate_equiv_value(tnfa
->loc
, &wc
, 1);
583 match
= ((start
<= ue
&& ue
<= end
) ||
584 (start
<= le
&& le
<= end
));
586 match
= (start
<= we
&& we
<= end
);
589 case TRE_BRACKET_MATCH_TYPE_RANGE_END
:
590 DPRINT(("tre_bracket_match: range end without preceeding start\n"));
593 case TRE_BRACKET_MATCH_TYPE_CLASS
:
595 match
= (tre_isctype_l(uc
, b
->value
, tnfa
->loc
) ||
596 tre_isctype_l(lc
, b
->value
, tnfa
->loc
));
598 match
= (tre_isctype_l(wc
, b
->value
, tnfa
->loc
));
600 case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE
:
605 ue
= __collate_equiv_value(tnfa
->loc
, &uc
, 1);
606 le
= __collate_equiv_value(tnfa
->loc
, &lc
, 1);
609 we
= __collate_equiv_value(tnfa
->loc
, &wc
, 1);
613 match
= (b
->value
== ue
|| b
->value
== le
);
615 match
= (b
->value
== we
);
618 DPRINT(("tre_bracket_match: unknown type %d\n", b
->type
));
626 if (list
->flags
& TRE_BRACKET_MATCH_FLAG_NEGATE
) {
627 if ((tnfa
->cflags
& REG_NEWLINE
) && wc
== '\n') return 0;