]> git.saurik.com Git - apple/libc.git/blob - regex/TRE/lib/tre-match-utils.h
Libc-1439.100.3.tar.gz
[apple/libc.git] / regex / TRE / lib / tre-match-utils.h
1 /*
2 tre-match-utils.h - TRE matcher helper definitions
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 #include "tre-internal.h"
10
11 #define str_source ((const tre_str_source*)string)
12
13 #ifdef TRE_WCHAR
14
15 #ifdef TRE_MULTIBYTE
16
17 /* Wide character and multibyte support. */
18
19 #ifdef TRE_STR_USER
20 #error TRE_STR_USER defined
21 #define GET_NEXT_WCHAR() \
22 do { \
23 prev_c = next_c; \
24 if (type == STR_BYTE) \
25 { \
26 pos++; \
27 if (len >= 0 && pos >= len) \
28 next_c = '\0'; \
29 else \
30 next_c = (unsigned char)(*str_byte++); \
31 } \
32 else if (type == STR_WIDE) \
33 { \
34 pos++; \
35 if (len >= 0 && pos >= len) \
36 next_c = L'\0'; \
37 else \
38 next_c = *str_wide++; \
39 } \
40 else if (type == STR_MBS) \
41 { \
42 pos += pos_add_next; \
43 if (str_byte == NULL) \
44 next_c = L'\0'; \
45 else \
46 { \
47 size_t w; \
48 int max; \
49 if (len >= 0) \
50 max = len - pos; \
51 else \
52 max = 32; \
53 if (max <= 0) \
54 { \
55 next_c = L'\0'; \
56 pos_add_next = 1; \
57 } \
58 else \
59 { \
60 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \
61 tnfa->loc); \
62 if (w == (size_t)-1 || w == (size_t)-2) \
63 return REG_ILLSEQ; \
64 if (w == 0 && len >= 0) \
65 { \
66 pos_add_next = 1; \
67 next_c = 0; \
68 str_byte++; \
69 } \
70 else \
71 { \
72 pos_add_next = w; \
73 str_byte += w; \
74 } \
75 } \
76 } \
77 } \
78 else if (type == STR_USER) \
79 { \
80 pos += pos_add_next; \
81 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
82 str_source->context); \
83 } \
84 } while(/*CONSTCOND*/0)
85 #else /* !TRE_STR_USER */
86 /*
87 * Because all multibyte encodings are exclusively single-shift encoding,
88 * with the shift codes having the high bit set, we can make an optimization
89 * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
90 * is detected, and just do a direct copy for ASCII characters.
91 */
92 #define GET_NEXT_WCHAR() \
93 do { \
94 prev_c = next_c; \
95 switch (type) \
96 { \
97 case STR_BYTE: \
98 pos++; \
99 if (len >= 0 && pos >= len) \
100 next_c = '\0'; \
101 else \
102 next_c = (unsigned char)(*str_byte++); \
103 break; \
104 case STR_WIDE: \
105 pos++; \
106 if (len >= 0 && pos >= len) \
107 next_c = L'\0'; \
108 else \
109 next_c = *str_wide++; \
110 break; \
111 case STR_MBS: \
112 pos += pos_add_next; \
113 if (__builtin_expect(len >= 0 && pos >= len, 0)) \
114 { \
115 next_c = L'\0'; \
116 pos_add_next = 1; \
117 } \
118 else if (__builtin_expect(!(*str_byte & 0x80), 1)) \
119 { \
120 next_c = (unsigned char)(*str_byte++); \
121 pos_add_next = 1; \
122 } \
123 else \
124 { \
125 size_t w; \
126 int max; \
127 if (len >= 0) \
128 max = len - pos; \
129 else \
130 max = 32; \
131 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \
132 tnfa->loc); \
133 if (w == (size_t)-1 || w == (size_t)-2) \
134 return REG_ILLSEQ; \
135 if (w == 0 && len >= 0) \
136 { \
137 pos_add_next = 1; \
138 next_c = 0; \
139 str_byte++; \
140 } \
141 else \
142 { \
143 pos_add_next = w; \
144 str_byte += w; \
145 } \
146 } \
147 break; \
148 } \
149 } while(/*CONSTCOND*/0)
150 #endif /* !TRE_STR_USER */
151
152 #else /* !TRE_MULTIBYTE */
153
154 /* Wide character support, no multibyte support. */
155 #error TRE_MULTIBYTE undefined
156
157 #ifdef TRE_STR_USER
158 #define GET_NEXT_WCHAR() \
159 do { \
160 prev_c = next_c; \
161 if (type == STR_BYTE) \
162 { \
163 pos++; \
164 if (len >= 0 && pos >= len) \
165 next_c = '\0'; \
166 else \
167 next_c = (unsigned char)(*str_byte++); \
168 } \
169 else if (type == STR_WIDE) \
170 { \
171 pos++; \
172 if (len >= 0 && pos >= len) \
173 next_c = L'\0'; \
174 else \
175 next_c = *str_wide++; \
176 } \
177 else if (type == STR_USER) \
178 { \
179 pos += pos_add_next; \
180 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
181 str_source->context); \
182 } \
183 } while(/*CONSTCOND*/0)
184 #else /* !TRE_STR_USER */
185 #define GET_NEXT_WCHAR() \
186 do { \
187 prev_c = next_c; \
188 if (type == STR_BYTE) \
189 { \
190 pos++; \
191 if (len >= 0 && pos >= len) \
192 next_c = '\0'; \
193 else \
194 next_c = (unsigned char)(*str_byte++); \
195 } \
196 else if (type == STR_WIDE) \
197 { \
198 pos++; \
199 if (len >= 0 && pos >= len) \
200 next_c = L'\0'; \
201 else \
202 next_c = *str_wide++; \
203 } \
204 } while(/*CONSTCOND*/0)
205 #endif /* !TRE_STR_USER */
206
207 #endif /* !TRE_MULTIBYTE */
208
209 #else /* !TRE_WCHAR */
210
211 /* No wide character or multibyte support. */
212 #error TRE_WCHAR undefined
213
214 #ifdef TRE_STR_USER
215 #define GET_NEXT_WCHAR() \
216 do { \
217 prev_c = next_c; \
218 if (type == STR_BYTE) \
219 { \
220 pos++; \
221 if (len >= 0 && pos >= len) \
222 next_c = '\0'; \
223 else \
224 next_c = (unsigned char)(*str_byte++); \
225 } \
226 else if (type == STR_USER) \
227 { \
228 pos += pos_add_next; \
229 str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
230 str_source->context); \
231 } \
232 } while(/*CONSTCOND*/0)
233 #else /* !TRE_STR_USER */
234 #define GET_NEXT_WCHAR() \
235 do { \
236 prev_c = next_c; \
237 if (type == STR_BYTE) \
238 { \
239 pos++; \
240 if (len >= 0 && pos >= len) \
241 next_c = '\0'; \
242 else \
243 next_c = (unsigned char)(*str_byte++); \
244 } \
245 } while(/*CONSTCOND*/0)
246 #endif /* !TRE_STR_USER */
247
248 #endif /* !TRE_WCHAR */
249
250
251
252 /* Assumes tre_tnfa_t *tnfa in scope */
253 #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
254
255 #define CHECK_ASSERTIONS(assertions) \
256 (((assertions & ASSERT_AT_BOL) \
257 && (pos > 0 || reg_notbol) \
258 && (prev_c != L'\n' || !reg_newline)) \
259 || ((assertions & ASSERT_AT_EOL) \
260 && (next_c != L'\0' || reg_noteol) \
261 && (next_c != L'\n' || !reg_newline)) \
262 || ((assertions & ASSERT_AT_BOW) \
263 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
264 || ((assertions & ASSERT_AT_EOW) \
265 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
266 || ((assertions & ASSERT_AT_WB) \
267 && (pos != 0 && next_c != L'\0' \
268 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
269 || ((assertions & ASSERT_AT_WB_NEG) \
270 && (pos == 0 || next_c == L'\0' \
271 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
272
273 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
274 ((trans_i->assertions & ASSERT_BRACKET_MATCH) \
275 && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \
276 tnfa))
277
278
279
280
281 inline static int
282 tre_tag_get(const tre_tag_t *tags, int i)
283 {
284 tags += i;
285 return tags->count > 0 ? tags->value : -1;
286 }
287
288 inline static void
289 tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
290 {
291 tags += i;
292 if (tags->count++ == 0)
293 tags->first = val;
294 tags->value = val;
295 tags->touch = touch;
296 }
297
298 inline static void
299 tre_tag_reset(tre_tag_t *tags, int i)
300 {
301 tags[i].count = 0;
302 }
303
304 inline static int
305 tre_tag_touch_get(const tre_tag_t *tags, int i)
306 {
307 return tags[i].touch;
308 }
309
310 #ifdef TRE_DEBUG
311 inline static void
312 tre_print_tags(const tre_tag_t *tags, int num_tags)
313 {
314 int i;
315 for (i = 0; i < num_tags; i++, tags++)
316 {
317 switch(tags->count)
318 {
319 case 0:
320 DPRINT(("%d:(0,-1)", i));
321 break;
322 case 1:
323 DPRINT(("%d:(1,%d)", i, tags->first));
324 break;
325 default:
326 DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
327 tags->value));
328 break;
329 }
330 if (i < (num_tags - 1))
331 DPRINT((" "));
332 }
333 }
334
335 inline static void
336 tre_print_tags_all(const tre_tag_t *tags, int num_tags)
337 {
338 int i;
339 for (i = 0; i < num_tags; i++, tags++)
340 {
341 switch(tags->count)
342 {
343 case 0:
344 DPRINT(("%d:(0,-1)/%d", i, tags->touch));
345 break;
346 case 1:
347 DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
348 break;
349 default:
350 DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
351 tags->value, tags->touch));
352 break;
353 }
354 if (i < (num_tags - 1))
355 DPRINT((" "));
356 }
357 }
358 #endif /* TRE_DEBUG */
359
360 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
361 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
362 inline static int
363 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
364 const tre_tag_t *tags2)
365 {
366 const tre_tag_t *t1, *t2;
367
368 t1 = tags1 + start;
369 t2 = tags2 + start;
370 /* We need both start tags to be set */
371 if (t1->count == 0 || t2->count == 0)
372 return 0;
373
374 /* The start tags must be equal */
375 if (t1->value != t2->value)
376 return 0;
377
378 t1 = tags1 + end;
379 t2 = tags2 + end;
380 /* For the end tags, we prefer set over unset, because unset means that
381 * the end tag is still growing */
382 if (t1->count == 0)
383 {
384 /* if t2 is set, t1 loses since it is unset */
385 if (t2->count != 0)
386 return -1;
387 }
388 /* if t2 not set, t1 wins since it is set */
389 else if (t2->count == 0)
390 return 1;
391
392 /* least current value wins */
393 return t2->value - t1->value;
394 }
395
396 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
397 * (t1 loses if < 0, t1 wins if > 0) */
398 inline static int
399 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
400 const tre_tag_t *t2)
401 {
402 int diff;
403
404 t1 += i;
405 t2 += i;
406 switch (dir)
407 {
408 case TRE_TAG_MINIMIZE:
409 /* least current value wins (because tags are initialized to all zeros,
410 * unset wins over set; also, tre_minimal_tag_order() will have already
411 * been run, which checks for being unset) */
412 return t2->value - t1->value;
413
414 case TRE_TAG_MAXIMIZE:
415 /* prefer set */
416 if (t1->count == 0)
417 {
418 /* if neither t1 and t2 are set, try next tag */
419 if (t2->count == 0)
420 return 0;
421 /* t2 is set, t1 loses since it is unset */
422 return -1;
423 }
424 /* if t2 not set, t1 wins since it is set */
425 else if (t2->count == 0)
426 return 1;
427 /* greatest initial value wins */
428 if ((diff = t1->first - t2->first) != 0)
429 return diff;
430 /* least number of times the tag was set, wins */
431 if ((diff = t2->count - t1->count) != 0)
432 return diff;
433 /* if the tags were only set once, they only have initial values */
434 if (t1->count == 1)
435 return 0;
436 /* greatest current value wins */
437 return t1->value - t2->value;
438
439 case TRE_TAG_LEFT_MAXIMIZE:
440 /* prefer set */
441 if (t1->count == 0)
442 {
443 /* if neither t1 and t2 are set, try next tag */
444 if (t2->count == 0)
445 return 0;
446 /* t2 is set, t1 loses since it is unset */
447 return -1;
448 }
449 /* if t2 not set, t1 wins since it is set */
450 else if (t2->count == 0)
451 return 1;
452 /* least initial value wins */
453 if ((diff = t2->first - t1->first) != 0)
454 return diff;
455 /* least number of times the tag was set, wins */
456 if ((diff = t2->count - t1->count) != 0)
457 return diff;
458 /* if the tags were only set once, they only have initial values */
459 if (t1->count == 1)
460 return 0;
461 /* greatest current value wins */
462 return t1->value - t2->value;
463
464 default:
465 /* Shouldn't happen: only assert if TRE_DEBUG defined */
466 assert(0);
467 break;
468 }
469 return 0;
470 }
471
472 #ifdef TRE_DEBUG
473 #define _MORE_DEBUGGING
474 #endif /* TRE_DEBUG */
475
476 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
477 inline static int
478 #ifdef _MORE_DEBUGGING
479 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
480 const tre_tag_t *t1, const tre_tag_t *t2)
481 #else /* !_MORE_DEBUGGING */
482 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
483 const tre_tag_t *t1, const tre_tag_t *t2)
484 #endif /* !_MORE_DEBUGGING */
485 {
486 int i, ret;
487
488 for (i = 0; i < num_tags; i++)
489 {
490 if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
491 return (ret > 0);
492 }
493 /* assert(0);*/
494 return 0;
495 }
496
497 #ifdef _MORE_DEBUGGING
498 inline static int
499 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
500 const tre_tag_t *t1, const tre_tag_t *t2)
501 {
502 int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
503 DPRINT(("tre_tag_order: "));
504 tre_print_tags(t1, num_tags);
505 DPRINT((" %s ", ret ? "wins" : "doesn't win"));
506 tre_print_tags(t2, num_tags);
507 DPRINT(("\n"));
508 return ret;
509 }
510 #endif /* _MORE_DEBUGGING */
511
512 #ifdef __LIBC__
513 #include <xlocale_private.h>
514 #else /* !__LIBC__ */
515 #include <xlocale.h>
516 #endif /* !__LIBC__ */
517
518 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
519
520 inline static int
521 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
522 const tre_tnfa_t * __restrict tnfa)
523 {
524 int match = 0;
525 int i;
526 tre_bracket_match_t *b;
527 tre_cint_t uc, lc;
528 int we, ue, le, got_equiv = 0;
529 int icase = ((tnfa->cflags & REG_ICASE) != 0);
530
531 DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
532 if (icase)
533 {
534 if (tre_islower_l(wc, tnfa->loc))
535 {
536 lc = wc;
537 uc = tre_toupper_l(wc, tnfa->loc);
538 }
539 else if (tre_isupper_l(wc, tnfa->loc))
540 {
541 uc = wc;
542 lc = tre_tolower_l(wc, tnfa->loc);
543 }
544 else
545 {
546 icase = 0;
547 }
548 }
549 for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
550 i++, b++)
551 {
552 switch (b->type)
553 {
554 case TRE_BRACKET_MATCH_TYPE_CHAR:
555 if (icase)
556 match = (b->value == uc || b->value == lc);
557 else
558 match = (b->value == wc);
559 break;
560 case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
561 {
562 tre_cint_t start = b->value, end;
563 if (++i >= list->num_bracket_matches ||
564 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
565 {
566 DPRINT(("tre_bracket_match: no following range end\n"));
567 assert(0);
568 goto error;
569 }
570 end = b->value;
571 if (!got_equiv)
572 {
573 if (icase)
574 {
575 ue = __collate_equiv_value(tnfa->loc, &uc, 1);
576 le = __collate_equiv_value(tnfa->loc, &lc, 1);
577 }
578 else
579 we = __collate_equiv_value(tnfa->loc, &wc, 1);
580 got_equiv = 1;
581 }
582 if (icase)
583 match = ((start <= ue && ue <= end) ||
584 (start <= le && le <= end));
585 else
586 match = (start <= we && we <= end);
587 break;
588 }
589 case TRE_BRACKET_MATCH_TYPE_RANGE_END:
590 DPRINT(("tre_bracket_match: range end without preceeding start\n"));
591 assert(0);
592 break;
593 case TRE_BRACKET_MATCH_TYPE_CLASS:
594 if (icase)
595 match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
596 tre_isctype_l(lc, b->value, tnfa->loc));
597 else
598 match = (tre_isctype_l(wc, b->value, tnfa->loc));
599 break;
600 case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
601 if (!got_equiv)
602 {
603 if (icase)
604 {
605 ue = __collate_equiv_value(tnfa->loc, &uc, 1);
606 le = __collate_equiv_value(tnfa->loc, &lc, 1);
607 }
608 else
609 we = __collate_equiv_value(tnfa->loc, &wc, 1);
610 got_equiv = 1;
611 }
612 if (icase)
613 match = (b->value == ue || b->value == le);
614 else
615 match = (b->value == we);
616 break;
617 default:
618 DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
619 assert(0);
620 break;
621 }
622 if (match)
623 break;
624 }
625 error:
626 if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
627 if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
628 match = !match;
629 }
630 return match;
631 }