]> git.saurik.com Git - apple/libc.git/blame - regex/TRE/lib/tre-match-utils.h
Libc-825.24.tar.gz
[apple/libc.git] / regex / TRE / lib / tre-match-utils.h
CommitLineData
ad3c9f2a
A
1/*
2 tre-match-utils.h - TRE matcher helper definitions
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7*/
8
9#include "tre-internal.h"
10
11#define str_source ((const tre_str_source*)string)
12
13#ifdef TRE_WCHAR
14
15#ifdef TRE_MULTIBYTE
16
17/* Wide character and multibyte support. */
18
19#ifdef TRE_STR_USER
20#define GET_NEXT_WCHAR() \
21 do { \
22 prev_c = next_c; \
23 if (type == STR_BYTE) \
24 { \
25 pos++; \
26 if (len >= 0 && pos >= len) \
27 next_c = '\0'; \
28 else \
29 next_c = (unsigned char)(*str_byte++); \
30 } \
31 else if (type == STR_WIDE) \
32 { \
33 pos++; \
34 if (len >= 0 && pos >= len) \
35 next_c = L'\0'; \
36 else \
37 next_c = *str_wide++; \
38 } \
39 else if (type == STR_MBS) \
40 { \
41 pos += pos_add_next; \
42 if (str_byte == NULL) \
43 next_c = L'\0'; \
44 else \
45 { \
46 size_t w; \
47 int max; \
48 if (len >= 0) \
49 max = len - pos; \
50 else \
51 max = 32; \
52 if (max <= 0) \
53 { \
54 next_c = L'\0'; \
55 pos_add_next = 1; \
56 } \
57 else \
58 { \
59 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \
60 tnfa->loc); \
61 if (w == (size_t)-1 || w == (size_t)-2) \
62 return REG_ILLSEQ; \
63 if (w == 0 && len >= 0) \
64 { \
65 pos_add_next = 1; \
66 next_c = 0; \
67 str_byte++; \
68 } \
69 else \
70 { \
71 pos_add_next = w; \
72 str_byte += w; \
73 } \
74 } \
75 } \
76 } \
77 else if (type == STR_USER) \
78 { \
79 pos += pos_add_next; \
80 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
81 str_source->context); \
82 } \
83 } while(/*CONSTCOND*/0)
84#else /* !TRE_STR_USER */
85#define GET_NEXT_WCHAR() \
86 do { \
87 prev_c = next_c; \
88 if (type == STR_BYTE) \
89 { \
90 pos++; \
91 if (len >= 0 && pos >= len) \
92 next_c = '\0'; \
93 else \
94 next_c = (unsigned char)(*str_byte++); \
95 } \
96 else if (type == STR_WIDE) \
97 { \
98 pos++; \
99 if (len >= 0 && pos >= len) \
100 next_c = L'\0'; \
101 else \
102 next_c = *str_wide++; \
103 } \
104 else if (type == STR_MBS) \
105 { \
106 pos += pos_add_next; \
107 if (str_byte == NULL) \
108 next_c = L'\0'; \
109 else \
110 { \
111 size_t w; \
112 int max; \
113 if (len >= 0) \
114 max = len - pos; \
115 else \
116 max = 32; \
117 if (max <= 0) \
118 { \
119 next_c = L'\0'; \
120 pos_add_next = 1; \
121 } \
122 else \
123 { \
124 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \
125 tnfa->loc); \
126 if (w == (size_t)-1 || w == (size_t)-2) \
127 return REG_ILLSEQ; \
128 if (w == 0 && len >= 0) \
129 { \
130 pos_add_next = 1; \
131 next_c = 0; \
132 str_byte++; \
133 } \
134 else \
135 { \
136 pos_add_next = w; \
137 str_byte += w; \
138 } \
139 } \
140 } \
141 } \
142 } while(/*CONSTCOND*/0)
143#endif /* !TRE_STR_USER */
144
145#else /* !TRE_MULTIBYTE */
146
147/* Wide character support, no multibyte support. */
148
149#ifdef TRE_STR_USER
150#define GET_NEXT_WCHAR() \
151 do { \
152 prev_c = next_c; \
153 if (type == STR_BYTE) \
154 { \
155 pos++; \
156 if (len >= 0 && pos >= len) \
157 next_c = '\0'; \
158 else \
159 next_c = (unsigned char)(*str_byte++); \
160 } \
161 else if (type == STR_WIDE) \
162 { \
163 pos++; \
164 if (len >= 0 && pos >= len) \
165 next_c = L'\0'; \
166 else \
167 next_c = *str_wide++; \
168 } \
169 else if (type == STR_USER) \
170 { \
171 pos += pos_add_next; \
172 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
173 str_source->context); \
174 } \
175 } while(/*CONSTCOND*/0)
176#else /* !TRE_STR_USER */
177#define GET_NEXT_WCHAR() \
178 do { \
179 prev_c = next_c; \
180 if (type == STR_BYTE) \
181 { \
182 pos++; \
183 if (len >= 0 && pos >= len) \
184 next_c = '\0'; \
185 else \
186 next_c = (unsigned char)(*str_byte++); \
187 } \
188 else if (type == STR_WIDE) \
189 { \
190 pos++; \
191 if (len >= 0 && pos >= len) \
192 next_c = L'\0'; \
193 else \
194 next_c = *str_wide++; \
195 } \
196 } while(/*CONSTCOND*/0)
197#endif /* !TRE_STR_USER */
198
199#endif /* !TRE_MULTIBYTE */
200
201#else /* !TRE_WCHAR */
202
203/* No wide character or multibyte support. */
204
205#ifdef TRE_STR_USER
206#define GET_NEXT_WCHAR() \
207 do { \
208 prev_c = next_c; \
209 if (type == STR_BYTE) \
210 { \
211 pos++; \
212 if (len >= 0 && pos >= len) \
213 next_c = '\0'; \
214 else \
215 next_c = (unsigned char)(*str_byte++); \
216 } \
217 else if (type == STR_USER) \
218 { \
219 pos += pos_add_next; \
220 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
221 str_source->context); \
222 } \
223 } while(/*CONSTCOND*/0)
224#else /* !TRE_STR_USER */
225#define GET_NEXT_WCHAR() \
226 do { \
227 prev_c = next_c; \
228 if (type == STR_BYTE) \
229 { \
230 pos++; \
231 if (len >= 0 && pos >= len) \
232 next_c = '\0'; \
233 else \
234 next_c = (unsigned char)(*str_byte++); \
235 } \
236 } while(/*CONSTCOND*/0)
237#endif /* !TRE_STR_USER */
238
239#endif /* !TRE_WCHAR */
240
241
242
243/* Assumes tre_tnfa_t *tnfa in scope */
244#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
245
246#define CHECK_ASSERTIONS(assertions) \
247 (((assertions & ASSERT_AT_BOL) \
248 && (pos > 0 || reg_notbol) \
249 && (prev_c != L'\n' || !reg_newline)) \
250 || ((assertions & ASSERT_AT_EOL) \
251 && (next_c != L'\0' || reg_noteol) \
252 && (next_c != L'\n' || !reg_newline)) \
253 || ((assertions & ASSERT_AT_BOW) \
254 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
255 || ((assertions & ASSERT_AT_EOW) \
256 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
257 || ((assertions & ASSERT_AT_WB) \
258 && (pos != 0 && next_c != L'\0' \
259 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
260 || ((assertions & ASSERT_AT_WB_NEG) \
261 && (pos == 0 || next_c == L'\0' \
262 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
263
264#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
265 ((trans_i->assertions & ASSERT_BRACKET_MATCH) \
266 && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \
267 tnfa))
268
269
270
271
272inline static int
273tre_tag_get(const tre_tag_t *tags, int i)
274{
275 tags += i;
276 return tags->count > 0 ? tags->value : -1;
277}
278
279inline static void
280tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
281{
282 tags += i;
283 if (tags->count++ == 0)
284 tags->first = val;
285 tags->value = val;
286 tags->touch = touch;
287}
288
289inline static void
290tre_tag_reset(tre_tag_t *tags, int i)
291{
292 tags[i].count = 0;
293}
294
295inline static int
296tre_tag_touch_get(const tre_tag_t *tags, int i)
297{
298 return tags[i].touch;
299}
300
301#ifdef TRE_DEBUG
302inline static void
303tre_print_tags(const tre_tag_t *tags, int num_tags)
304{
305 int i;
306 for (i = 0; i < num_tags; i++, tags++)
307 {
308 switch(tags->count)
309 {
310 case 0:
311 DPRINT(("%d:(0,-1)", i));
312 break;
313 case 1:
314 DPRINT(("%d:(1,%d)", i, tags->first));
315 break;
316 default:
317 DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
318 tags->value));
319 break;
320 }
321 if (i < (num_tags - 1))
322 DPRINT((" "));
323 }
324}
325
326inline static void
327tre_print_tags_all(const tre_tag_t *tags, int num_tags)
328{
329 int i;
330 for (i = 0; i < num_tags; i++, tags++)
331 {
332 switch(tags->count)
333 {
334 case 0:
335 DPRINT(("%d:(0,-1)/%d", i, tags->touch));
336 break;
337 case 1:
338 DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
339 break;
340 default:
341 DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
342 tags->value, tags->touch));
343 break;
344 }
345 if (i < (num_tags - 1))
346 DPRINT((" "));
347 }
348}
349#endif /* TRE_DEBUG */
350
351/* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
352 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
353inline static int
354tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
355 const tre_tag_t *tags2)
356{
357 const tre_tag_t *t1, *t2;
358
359 t1 = tags1 + start;
360 t2 = tags2 + start;
361 /* We need both start tags to be set */
362 if (t1->count == 0 || t2->count == 0)
363 return 0;
364
365 /* The start tags must be equal */
366 if (t1->value != t2->value)
367 return 0;
368
369 t1 = tags1 + end;
370 t2 = tags2 + end;
371 /* For the end tags, we prefer set over unset, because unset means that
372 * the end tag is still growing */
373 if (t1->count == 0)
374 {
375 /* if t2 is set, t1 loses since it is unset */
376 if (t2->count != 0)
377 return -1;
378 }
379 /* if t2 not set, t1 wins since it is set */
380 else if (t2->count == 0)
381 return 1;
382
383 /* least current value wins */
384 return t2->value - t1->value;
385}
386
387/* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
388 * (t1 loses if < 0, t1 wins if > 0) */
389inline static int
390tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
391 const tre_tag_t *t2)
392{
393 int diff;
394
395 t1 += i;
396 t2 += i;
397 switch (dir)
398 {
399 case TRE_TAG_MINIMIZE:
400 /* least current value wins (because tags are initialized to all zeros,
401 * unset wins over set; also, tre_minimal_tag_order() will have already
402 * been run, which checks for being unset) */
403 return t2->value - t1->value;
404
405 case TRE_TAG_MAXIMIZE:
406 /* prefer set */
407 if (t1->count == 0)
408 {
409 /* if neither t1 and t2 are set, try next tag */
410 if (t2->count == 0)
411 return 0;
412 /* t2 is set, t1 loses since it is unset */
413 return -1;
414 }
415 /* if t2 not set, t1 wins since it is set */
416 else if (t2->count == 0)
417 return 1;
418 /* greatest initial value wins */
419 if ((diff = t1->first - t2->first) != 0)
420 return diff;
421 /* least number of times the tag was set, wins */
422 if ((diff = t2->count - t1->count) != 0)
423 return diff;
424 /* if the tags were only set once, they only have initial values */
425 if (t1->count == 1)
426 return 0;
427 /* greatest current value wins */
428 return t1->value - t2->value;
429
430 case TRE_TAG_LEFT_MAXIMIZE:
431 /* prefer set */
432 if (t1->count == 0)
433 {
434 /* if neither t1 and t2 are set, try next tag */
435 if (t2->count == 0)
436 return 0;
437 /* t2 is set, t1 loses since it is unset */
438 return -1;
439 }
440 /* if t2 not set, t1 wins since it is set */
441 else if (t2->count == 0)
442 return 1;
443 /* least initial value wins */
444 if ((diff = t2->first - t1->first) != 0)
445 return diff;
446 /* least number of times the tag was set, wins */
447 if ((diff = t2->count - t1->count) != 0)
448 return diff;
449 /* if the tags were only set once, they only have initial values */
450 if (t1->count == 1)
451 return 0;
452 /* greatest current value wins */
453 return t1->value - t2->value;
454
455 default:
456 /* Shouldn't happen: only assert if TRE_DEBUG defined */
457 assert(0);
458 break;
459 }
460 return 0;
461}
462
463#ifdef TRE_DEBUG
464#define _MORE_DEBUGGING
465#endif /* TRE_DEBUG */
466
467/* Returns 1 if `t1' wins `t2', 0 otherwise. */
468inline static int
469#ifdef _MORE_DEBUGGING
470_tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
471 const tre_tag_t *t1, const tre_tag_t *t2)
472#else /* !_MORE_DEBUGGING */
473tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
474 const tre_tag_t *t1, const tre_tag_t *t2)
475#endif /* !_MORE_DEBUGGING */
476{
477 int i, ret;
478
479 for (i = 0; i < num_tags; i++)
480 {
481 if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
482 return (ret > 0);
483 }
484 /* assert(0);*/
485 return 0;
486}
487
488#ifdef _MORE_DEBUGGING
489inline static int
490tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
491 const tre_tag_t *t1, const tre_tag_t *t2)
492{
493 int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
494 DPRINT(("tre_tag_order: "));
495 tre_print_tags(t1, num_tags);
496 DPRINT((" %s ", ret ? "wins" : "doesn't win"));
497 tre_print_tags(t2, num_tags);
498 DPRINT(("\n"));
499 return ret;
500}
501#endif /* _MORE_DEBUGGING */
502
503#ifdef __LIBC__
504#include <xlocale_private.h>
505#else /* !__LIBC__ */
506#include <xlocale.h>
507#endif /* !__LIBC__ */
508
509int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
510
511inline static int
512tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
513 const tre_tnfa_t * __restrict tnfa)
514{
515 int match = 0;
516 int i;
517 tre_bracket_match_t *b;
518 tre_cint_t uc, lc;
519 int we, ue, le, got_equiv = 0;
520 int icase = ((tnfa->cflags & REG_ICASE) != 0);
521
522 DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
523 if (icase)
524 {
525 if (tre_islower_l(wc, tnfa->loc))
526 {
527 lc = wc;
528 uc = tre_toupper_l(wc, tnfa->loc);
529 }
530 else if (tre_isupper_l(wc, tnfa->loc))
531 {
532 uc = wc;
533 lc = tre_tolower_l(wc, tnfa->loc);
534 }
535 else
536 {
537 icase = 0;
538 }
539 }
540 for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
541 i++, b++)
542 {
543 switch (b->type)
544 {
545 case TRE_BRACKET_MATCH_TYPE_CHAR:
546 if (icase)
547 match = (b->value == uc || b->value == lc);
548 else
549 match = (b->value == wc);
550 break;
551 case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
552 {
553 tre_cint_t start = b->value, end;
554 if (++i >= list->num_bracket_matches ||
555 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
556 {
557 DPRINT(("tre_bracket_match: no following range end\n"));
558 assert(0);
559 goto error;
560 }
561 end = b->value;
562 if (!got_equiv)
563 {
564 if (icase)
565 {
566 ue = __collate_equiv_value(tnfa->loc, &uc, 1);
567 le = __collate_equiv_value(tnfa->loc, &lc, 1);
568 }
569 else
570 we = __collate_equiv_value(tnfa->loc, &wc, 1);
571 got_equiv = 1;
572 }
573 if (icase)
574 match = ((start <= ue && ue <= end) ||
575 (start <= le && le <= end));
576 else
577 match = (start <= we && we <= end);
578 break;
579 }
580 case TRE_BRACKET_MATCH_TYPE_RANGE_END:
581 DPRINT(("tre_bracket_match: range end without preceeding start\n"));
582 assert(0);
583 break;
584 case TRE_BRACKET_MATCH_TYPE_CLASS:
585 if (icase)
586 match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
587 tre_isctype_l(lc, b->value, tnfa->loc));
588 else
589 match = (tre_isctype_l(wc, b->value, tnfa->loc));
590 break;
591 case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
592 if (!got_equiv)
593 {
594 if (icase)
595 {
596 ue = __collate_equiv_value(tnfa->loc, &uc, 1);
597 le = __collate_equiv_value(tnfa->loc, &lc, 1);
598 }
599 else
600 we = __collate_equiv_value(tnfa->loc, &wc, 1);
601 got_equiv = 1;
602 }
603 if (icase)
604 match = (b->value == ue || b->value == le);
605 else
606 match = (b->value == we);
607 break;
608 default:
609 DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
610 assert(0);
611 break;
612 }
613 if (match)
614 break;
615 }
616error:
617 if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE)
618 match = !match;
619 return match;
620}