]>
git.saurik.com Git - apple/libc.git/blob - regex/TRE/lib/tre-match-parallel.c
2 tre-match-parallel.c - TRE parallel regex matching engine
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This algorithm searches for matches basically by reading characters
11 in the searched string one by one, starting at the beginning. All
12 matching paths in the TNFA are traversed in parallel. When two or
13 more paths reach the same state, exactly one is chosen according to
14 tag ordering rules; if returning submatches is not required it does
15 not matter which path is chosen.
17 The worst case time required for finding the leftmost and longest
18 match, or determining that there is no match, is always linearly
19 dependent on the length of the text being searched.
21 This algorithm cannot handle TNFAs with back referencing nodes.
22 See `tre-match-backtrack.c'.
28 #endif /* HAVE_CONFIG_H */
30 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
35 /* AIX requires this to be the first thing in the file. */
43 # ifndef alloca /* predefined by HP cc +Olibcalls */
49 #endif /* TRE_USE_ALLOCA */
56 #endif /* HAVE_WCHAR_H */
59 #endif /* HAVE_WCTYPE_H */
62 #endif /* !TRE_WCHAR */
65 #endif /* HAVE_MALLOC_H */
67 #include "tre-internal.h"
68 #include "tre-match-utils.h"
75 tre_tnfa_transition_t
*state
;
87 tre_print_reach1(tre_tnfa_transition_t
*state
, tre_tag_t
*tags
, int num_tags
)
89 DPRINT((" %p", (void *)state
));
93 tre_print_tags(tags
, num_tags
);
98 tre_print_reach(const tre_tnfa_t
*tnfa
, tre_tnfa_reach_t
*reach
, int num_tags
)
100 while (reach
->state
!= NULL
)
102 tre_print_reach1(reach
->state
, reach
->tags
, num_tags
);
108 #endif /* TRE_DEBUG */
111 tre_tnfa_run_parallel(const tre_tnfa_t
*tnfa
, const void *string
, int len
,
112 tre_str_type_t type
, tre_tag_t
*match_tags
, int eflags
,
115 /* State variables required by GET_NEXT_WCHAR. */
116 tre_char_t prev_c
= 0, next_c
= 0;
117 const char *str_byte
= string
;
119 unsigned int pos_add_next
= 1;
121 const wchar_t *str_wide
= string
;
124 #endif /* TRE_MBSTATE */
125 #endif /* TRE_WCHAR */
126 int reg_notbol
= eflags
& REG_NOTBOL
;
127 int reg_noteol
= eflags
& REG_NOTEOL
;
128 int reg_newline
= tnfa
->cflags
& REG_NEWLINE
;
130 int str_user_end
= 0;
131 #endif /* TRE_STR_USER */
134 tre_tnfa_transition_t
*trans_i
;
135 tre_tnfa_reach_t
*reach
, *reach_next
, *reach_i
, *reach_next_i
;
136 tre_reach_pos_t
*reach_pos
;
140 int match_eo
= -1; /* end offset of match (-1 if no match found yet) */
143 #endif /* TRE_DEBUG */
144 tre_tag_t
*tmp_tags
= NULL
;
150 memset(&mbstate
, '\0', sizeof(mbstate
));
151 #endif /* TRE_MBSTATE */
153 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type
));
158 num_tags
= tnfa
->num_tags
;
160 /* Allocate memory for temporary data required for matching. This needs to
161 be done for every matching operation to be thread safe. This allocates
162 everything in a single large block from the stack frame using alloca()
163 or with malloc() if alloca is unavailable. */
165 size_t rbytes
, pbytes
, total_bytes
;
167 /* Compute the length of the block we need. */
168 tbytes
= sizeof(*tmp_tags
) * num_tags
;
169 rbytes
= sizeof(*reach_next
) * (tnfa
->num_states
+ 1);
170 pbytes
= sizeof(*reach_pos
) * tnfa
->num_states
;
172 (sizeof(long) - 1) * 4 /* for alignment paddings */
173 + (rbytes
+ tbytes
* tnfa
->num_states
) * 2 + tbytes
+ pbytes
;
175 DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes
));
176 /* Allocate the memory. */
177 #ifdef TRE_USE_ALLOCA
178 buf
= alloca(total_bytes
);
179 #else /* !TRE_USE_ALLOCA */
180 buf
= xmalloc(total_bytes
);
181 #endif /* !TRE_USE_ALLOCA */
184 memset(buf
, 0, total_bytes
);
186 /* Get the various pointers within tmp_buf (properly aligned). */
187 tmp_tags
= (void *)buf
;
188 tmp_buf
= buf
+ tbytes
;
189 tmp_buf
+= ALIGN(tmp_buf
, long);
190 reach_next
= (void *)tmp_buf
;
192 tmp_buf
+= ALIGN(tmp_buf
, long);
193 reach
= (void *)tmp_buf
;
195 tmp_buf
+= ALIGN(tmp_buf
, long);
196 reach_pos
= (void *)tmp_buf
;
198 tmp_buf
+= ALIGN(tmp_buf
, long);
199 for (i
= 0; i
< tnfa
->num_states
; i
++)
201 reach
[i
].tags
= (void *)tmp_buf
;
203 reach_next
[i
].tags
= (void *)tmp_buf
;
208 for (i
= 0; i
< tnfa
->num_states
; i
++)
209 reach_pos
[i
].pos
= -1;
211 /* If only one character can start a match, find it first. */
212 if (tnfa
->first_char
>= 0 && str_byte
)
214 const char *orig_str
= str_byte
;
215 int first
= tnfa
->first_char
;
216 int found_high_bit
= 0;
219 if (type
== STR_BYTE
)
222 str_byte
= memchr(orig_str
, first
, (size_t)len
);
224 str_byte
= strchr(orig_str
, first
);
226 else if (type
== STR_MBS
)
229 * If the match character is ASCII, try to match the character
230 * directly, but if a high bit character is found, we stop there.
237 for (i
= 0; ; str_byte
++, i
++)
244 if (*str_byte
== first
)
246 if (*str_byte
& 0x80)
262 if (*str_byte
== first
)
264 if (*str_byte
& 0x80)
277 for (i
= 0; ; str_byte
++, i
++)
284 if (*str_byte
& 0x80)
300 if (*str_byte
& 0x80)
309 if (str_byte
== NULL
)
311 #ifndef TRE_USE_ALLOCA
314 #endif /* !TRE_USE_ALLOCA */
317 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte
- orig_str
)));
320 if (str_byte
>= orig_str
+ 1)
321 prev_c
= (unsigned char)*(str_byte
- 1);
322 next_c
= (unsigned char)*str_byte
;
323 pos
= str_byte
- orig_str
;
324 if (len
< 0 || pos
< len
)
329 if (str_byte
== orig_str
)
330 goto no_first_optimization
;
332 * Back up one character, fix up the position, then call
333 * GET_NEXT_WCHAR() to process the multibyte character.
335 /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */
336 next_c
= (unsigned char)*(str_byte
- 1);
337 pos
= (str_byte
- 1) - orig_str
;
343 no_first_optimization
:
348 #ifdef USE_FIRSTPOS_CHARS /* not defined */
349 /* Skip over characters that cannot possibly be the first character
351 if (tnfa
->firstpos_chars
!= NULL
)
353 char *chars
= tnfa
->firstpos_chars
;
357 const char *orig_str
= str_byte
;
358 /* XXX - use strpbrk() and wcspbrk() because they might be
359 optimized for the target architecture. Try also strcspn()
360 and wcscspn() and compare the speeds. */
361 while (next_c
!= L
'\0' && !chars
[next_c
])
363 next_c
= *str_byte
++;
365 prev_c
= *(str_byte
- 2);
366 pos
+= str_byte
- orig_str
;
367 DPRINT(("skipped %d chars\n", str_byte
- orig_str
));
371 while (pos
<= len
&& !chars
[next_c
])
374 next_c
= (unsigned char)(*str_byte
++);
379 #endif /* USE_FIRSTPOS_CHARS */
381 DPRINT(("length: %d\n", len
));
382 DPRINT(("pos:chr/code | states and tags\n"));
383 DPRINT(("-------------+------------------------------------------------\n"));
385 reach_next_i
= reach_next
;
386 while (/*CONSTCOND*/1)
388 /* If no match found yet, add the initial states to `reach_next'. */
392 trans_i
= tnfa
->initial
;
393 while (trans_i
->state
!= NULL
)
395 if (reach_pos
[trans_i
->state_id
].pos
< pos
)
397 if (trans_i
->assertions
398 && CHECK_ASSERTIONS(trans_i
->assertions
))
400 DPRINT(("assertion failed\n"));
405 DPRINT((" %p", (void *)trans_i
->state
));
406 reach_next_i
->state
= trans_i
->state
;
407 memset(reach_next_i
->tags
, 0, tbytes
);
408 tag_i
= trans_i
->tags
;
413 if (*tag_i
< num_tags
)
414 tre_tag_set(reach_next_i
->tags
, *tag_i
, pos
, touch
);
419 if (reach_next_i
->state
== tnfa
->final
)
421 DPRINT((" found empty match\n"));
423 memcpy(match_tags
, reach_next_i
->tags
, tbytes
);
425 reach_pos
[trans_i
->state_id
].pos
= pos
;
426 reach_pos
[trans_i
->state_id
].tags
= &reach_next_i
->tags
;
432 reach_next_i
->state
= NULL
;
436 if (num_tags
== 0 || reach_next_i
== reach_next
)
437 /* We have found a match. */
441 /* Check for end of string. */
445 if (type
== STR_USER
)
451 #endif /* TRE_STR_USER */
464 DPRINT(("%3d:%2lc/%05d |", pos
- 1, (tre_cint_t
)prev_c
, (int)prev_c
));
465 tre_print_reach(tnfa
, reach_next
, num_tags
);
466 //DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
467 //tre_print_reach(tnfa, reach_next, num_tags);
468 #endif /* TRE_DEBUG */
470 /* Swap `reach' and `reach_next'. */
473 reach_next
= reach_i
;
477 #endif /* TRE_DEBUG */
479 /* For each state in `reach' see if there is a transition leaving with
480 the current input symbol to a state not yet in `reach_next', and
481 add the destination states to `reach_next'. */
482 reach_next_i
= reach_next
;
483 for (reach_i
= reach
; reach_i
->state
; reach_i
++)
485 for (trans_i
= reach_i
->state
; trans_i
->state
; trans_i
++)
487 /* Does this transition match the input symbol? */
488 if (trans_i
->code_min
<= (tre_cint_t
)prev_c
&&
489 trans_i
->code_max
>= (tre_cint_t
)prev_c
)
491 if (trans_i
->assertions
492 && (CHECK_ASSERTIONS(trans_i
->assertions
)
493 || CHECK_CHAR_CLASSES(trans_i
, tnfa
, eflags
)))
495 DPRINT(("assertion failed\n"));
499 /* Compute the tags after this transition. */
500 memcpy(tmp_tags
, reach_i
->tags
, tbytes
);
501 tag_i
= trans_i
->tags
;
506 if (*tag_i
< num_tags
)
507 tre_tag_set(tmp_tags
, *tag_i
, pos
, touch
);
513 /* For each new transition, weed out those that don't
514 fulfill the minimal matching conditions. */
515 if (tnfa
->num_minimals
&& match_eo
>= 0)
521 DPRINT(("Checking minimal conditions: match_eo=%d "
522 "match_tags=", match_eo
));
523 tre_print_tags(match_tags
, num_tags
);
527 #endif /* TRE_DEBUG */
528 for (i
= 0; tnfa
->minimal_tags
[i
] >= 0; i
+= 2)
530 int end
= tnfa
->minimal_tags
[i
];
531 int start
= tnfa
->minimal_tags
[i
+ 1];
532 DPRINT((" Minimal start %d, end %d\n", start
, end
));
533 if (tre_minimal_tag_order(start
, end
, match_tags
,
543 DPRINT((" Throwing out"));
544 tre_print_reach1(reach_i
->state
, tmp_tags
,
547 #endif /* TRE_DEBUG */
552 if (reach_pos
[trans_i
->state_id
].pos
< pos
)
554 /* Found an unvisited node. */
555 reach_next_i
->state
= trans_i
->state
;
556 tmp_iptr
= reach_next_i
->tags
;
557 reach_next_i
->tags
= tmp_tags
;
559 reach_pos
[trans_i
->state_id
].pos
= pos
;
560 reach_pos
[trans_i
->state_id
].tags
= &reach_next_i
->tags
;
562 if (reach_next_i
->state
== tnfa
->final
565 && tre_tag_get(reach_next_i
->tags
, 0) <=
566 tre_tag_get(match_tags
, 0))))
569 DPRINT((" found match"));
570 tre_print_reach1(trans_i
->state
, reach_next_i
->tags
, num_tags
);
572 #endif /* TRE_DEBUG */
574 memcpy(match_tags
, reach_next_i
->tags
, tbytes
);
581 assert(reach_pos
[trans_i
->state_id
].pos
== pos
);
582 /* Another path has also reached this state. We choose
583 the winner by examining the tag values for both
585 if (tre_tag_order(num_tags
, tnfa
->tag_directions
,
587 *reach_pos
[trans_i
->state_id
].tags
))
589 /* The new path wins. */
590 tmp_iptr
= *reach_pos
[trans_i
->state_id
].tags
;
591 *reach_pos
[trans_i
->state_id
].tags
= tmp_tags
;
592 if (trans_i
->state
== tnfa
->final
)
595 DPRINT((" found better match"));
596 tre_print_reach1(trans_i
->state
, tmp_tags
, num_tags
);
598 #endif /* TRE_DEBUG */
600 memcpy(match_tags
, tmp_tags
, tbytes
);
608 reach_next_i
->state
= NULL
;
611 DPRINT(("match end offset = %d\n", match_eo
));
613 *match_end_ofs
= match_eo
;
617 DPRINT(("Winning tags="));
618 tre_print_tags_all(match_tags
, num_tags
);
619 DPRINT((" touch=%d\n", touch
));
621 #endif /* TRE_DEBUG */
623 #ifndef TRE_USE_ALLOCA
626 #endif /* !TRE_USE_ALLOCA */
628 return match_eo
>= 0 ? REG_OK
: REG_NOMATCH
;