reach_pos[i].pos = -1;
/* If only one character can start a match, find it first. */
- if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
+ if (tnfa->first_char >= 0 && str_byte)
{
const char *orig_str = str_byte;
int first = tnfa->first_char;
+ int found_high_bit = 0;
- if (len >= 0)
- str_byte = memchr(orig_str, first, (size_t)len);
- else
- str_byte = strchr(orig_str, first);
+
+ if (type == STR_BYTE)
+ {
+ if (len >= 0)
+ str_byte = memchr(orig_str, first, (size_t)len);
+ else
+ str_byte = strchr(orig_str, first);
+ }
+ else if (type == STR_MBS)
+ {
+ /*
+ * For an ASCII first character, scan the bytes for it directly, but stop
+ * at the first high-bit byte so GET_NEXT_WCHAR() can process that character.
+ */
+ if (first < 0x80)
+ {
+ if (len >= 0)
+ {
+ int i;
+ for (i = 0; ; str_byte++, i++)
+ {
+ if (i >= len)
+ {
+ str_byte = NULL;
+ break;
+ }
+ if (*str_byte == first)
+ break;
+ if (*str_byte & 0x80)
+ {
+ found_high_bit = 1;
+ break;
+ }
+ }
+ }
+ else
+ {
+ for (; ; str_byte++)
+ {
+ if (!*str_byte)
+ {
+ str_byte = NULL;
+ break;
+ }
+ if (*str_byte == first)
+ break;
+ if (*str_byte & 0x80)
+ {
+ found_high_bit = 1;
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (len >= 0)
+ {
+ int i;
+ for (i = 0; ; str_byte++, i++)
+ {
+ if (i >= len)
+ {
+ str_byte = NULL;
+ break;
+ }
+ if (*str_byte & 0x80)
+ {
+ found_high_bit = 1;
+ break;
+ }
+ }
+ }
+ else
+ {
+ for (; ; str_byte++)
+ {
+ if (!*str_byte)
+ {
+ str_byte = NULL;
+ break;
+ }
+ if (*str_byte & 0x80)
+ {
+ found_high_bit = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
if (str_byte == NULL)
{
#ifndef TRE_USE_ALLOCA
return REG_NOMATCH;
}
DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
- if (str_byte >= orig_str + 1)
- prev_c = (unsigned char)*(str_byte - 1);
- next_c = (unsigned char)*str_byte;
- pos = str_byte - orig_str;
- if (len < 0 || pos < len)
- str_byte++;
+ if (!found_high_bit)
+ {
+ if (str_byte >= orig_str + 1)
+ prev_c = (unsigned char)*(str_byte - 1);
+ next_c = (unsigned char)*str_byte;
+ pos = str_byte - orig_str;
+ if (len < 0 || pos < len)
+ str_byte++;
+ }
+ else
+ {
+ if (str_byte == orig_str)
+ goto no_first_optimization;
+ /*
+ * Back up one character, fix up the position, then call
+ * GET_NEXT_WCHAR() to process the multibyte character.
+ */
+ /* No need to set prev_c here: GET_NEXT_WCHAR() will overwrite it. */
+ next_c = (unsigned char)*(str_byte - 1);
+ pos = (str_byte - 1) - orig_str;
+ GET_NEXT_WCHAR();
+ }
}
else
{
+no_first_optimization:
GET_NEXT_WCHAR();
pos = 0;
}
-#if 0
+#ifdef USE_FIRSTPOS_CHARS /* not defined */
/* Skip over characters that cannot possibly be the first character
of a match. */
if (tnfa->firstpos_chars != NULL)
}
}
}
-#endif
+#endif /* USE_FIRSTPOS_CHARS */
DPRINT(("length: %d\n", len));
DPRINT(("pos:chr/code | states and tags\n"));