+void RegexTest::Bug10459() {
+    // Regression test for bug 10459. For a regular expression created from a
+    // UText pattern, calling uregex_group() before any matching operation used
+    // to segfault. The call is expected to fail cleanly instead, setting
+    // U_REGEX_INVALID_STATE and returning a length of zero.
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString patternString("(txt)");
+    UnicodeString txtString("txt");
+
+    // Wrap both the pattern and the subject text in UText instances.
+    UText *patternText = utext_openUnicodeString(nullptr, &patternString, &status);
+    REGEX_CHECK_STATUS;
+    UText *subjectText = utext_openUnicodeString(nullptr, &txtString, &status);
+    REGEX_CHECK_STATUS;
+
+    URegularExpression *regex = uregex_openUText(patternText, 0, nullptr, &status);
+    REGEX_CHECK_STATUS;
+    uregex_setUText(regex, subjectText, &status);
+    REGEX_CHECK_STATUS;
+
+    // Deliberately no find()/matches()/lookingAt() call before asking for group 0.
+    UChar groupBuffer[100];
+    int32_t len = uregex_group(regex, 0, groupBuffer, UPRV_LENGTHOF(groupBuffer), &status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
+    REGEX_ASSERT(len == 0);
+
+    uregex_close(regex);
+    utext_close(patternText);
+    utext_close(subjectText);
+}
+
+void RegexTest::TestCaseInsensitiveStarters() {
+    // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
+    // become stale because of new Unicode characters.
+    // If it is stale, rerun the generation tool
+    //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
+    // and replace the embedded data in i18n/regexcmp.cpp
+    //
+    // Method: for every case-sensitive code point cp, take its case-insensitive
+    // closure. For each multi-character string in that closure, the starter set
+    // computed for the string's first character must contain cp.
+    // The test stops at the first discrepancy found.
+
+    for (UChar32 cp=0; cp<=0x10ffff; cp++) {
+        if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
+            continue;
+        }
+        UnicodeSet s(cp, cp);
+        s.closeOver(USET_CASE_INSENSITIVE);
+        UnicodeSetIterator setIter(s);
+        while (setIter.next()) {
+            // Only the string members of the closure are of interest here.
+            if (!setIter.isString()) {
+                continue;
+            }
+            const UnicodeString &str = setIter.getString();
+            UChar32 firstChar = str.char32At(0);
+            UnicodeSet starters;
+            RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
+            if (!starters.contains(cp)) {
+                // The starter set was computed for firstChar, and cp is the
+                // code point found missing from it; report them in that order.
+                errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", firstChar, cp);
+                return;
+            }
+        }
+    }
+}
+
+
+void RegexTest::TestBug11049() {
+    // Original bug report: pattern with match start consisting of one of several individual characters,
+    // and the text being matched ending with a supplementary character. find() would read past the
+    // end of the input text when searching for potential match starting points.
+
+    // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
+    // detect the bad read.
+
+    // Each row records its own source line so a failure can be traced to its case.
+    const struct {
+        const char *pattern;
+        const char *text;
+        UBool       expectMatch;
+        int32_t     line;
+    } cases[] = {
+        {"A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__},
+        {"A|B|C", "string matches at end C", TRUE, __LINE__},
+
+        // The same texts with a pattern starting with a single character,
+        // which takes a different code path than starting with an OR expression,
+        // but with similar logic.
+        {"C", "a string \\ud800\\udc00", FALSE, __LINE__},
+        {"C", "string matches at end C", TRUE, __LINE__},
+    };
+
+    for (const auto &testCase : cases) {
+        TestCase11049(testCase.pattern, testCase.text, testCase.expectMatch, testCase.line);
+    }
+}
+
+// Run a single test case from TestBug11049(). Internal function.
+//   pattern      regex pattern; backslash escapes are expanded with unescape().
+//   data         text to search; backslash escapes are expanded with unescape().
+//   expectMatch  the result that find() is expected to return.
+//   lineNumber   source line of the calling test case, used in failure messages.
+// The search is run twice: over UTF-16 text held in a heap buffer sized exactly
+// to the text, and then over the same text converted to UTF-8.
+void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString patternString = UnicodeString(pattern).unescape();
+    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
+    REGEX_CHECK_STATUS;
+    // REGEX_CHECK_STATUS is defined elsewhere and may or may not return on failure,
+    // so bail out explicitly rather than dereference a null compiled pattern.
+    if (U_FAILURE(status) || compiledPat.isNull()) {
+        return;
+    }
+
+    // Copy the text into a heap buffer with no spare capacity, so that memory
+    // checkers can detect any read past the end of the text.
+    // All resources below are held by RAII wrappers so every exit path releases them.
+    UnicodeString dataString = UnicodeString(data).unescape();
+    const int32_t dataLength = dataString.length();
+    LocalArray<UChar> exactBuffer(new UChar[dataLength]);
+    dataString.extract(exactBuffer.getAlias(), dataLength, status);
+    LocalUTextPointer ut(utext_openUChars(nullptr, exactBuffer.getAlias(), dataLength, &status));
+
+    LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
+    REGEX_CHECK_STATUS;
+    if (U_FAILURE(status)) {
+        return;
+    }
+    matcher->reset(ut.getAlias());
+    UBool result = matcher->find();
+    if (result != expectMatch) {
+        errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
+              __FILE__, lineNumber, expectMatch, result, pattern, data);
+    }
+
+    // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
+    // off-by-one on find() with match at the last code point.
+    // Size of the original char * data (invariant charset) will be >= the size of the
+    // equivalent UTF-8 because string.unescape() will only shrink it, so a buffer of
+    // strlen(data)+1 bytes always has room for the UTF-8 text and its terminating NUL.
+    const int32_t utf8Capacity = static_cast<int32_t>(uprv_strlen(data)) + 1;
+    LocalArray<char> utf8Buffer(new char[utf8Capacity]);
+    u_strToUTF8(utf8Buffer.getAlias(), utf8Capacity, nullptr, dataString.getBuffer(), dataLength, &status);
+    REGEX_CHECK_STATUS;
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // Re-open the existing UText in place over the UTF-8 buffer; ut keeps ownership.
+    utext_openUTF8(ut.getAlias(), utf8Buffer.getAlias(), -1, &status);
+    REGEX_CHECK_STATUS;
+    if (U_FAILURE(status)) {
+        return;
+    }
+    matcher->reset(ut.getAlias());
+    result = matcher->find();
+    if (result != expectMatch) {
+        errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
+              __FILE__, lineNumber, expectMatch, result, pattern, data);
+    }
+}
+
+
+void RegexTest::TestBug11371() {
+    // Builds three very large patterns and checks that compiling each one is
+    // rejected with U_REGEX_PATTERN_TOO_BIG. Skipped when running in quick mode.
+    if (quick) {
+        logln("Skipping test. Runs in exhuastive mode only.");
+        return;
+    }
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString patternString;
+
+    // Case 1: eight million empty capture groups.
+    const UnicodeString emptyGroup("()");
+    for (int groupCount = 0; groupCount != 8000000; ++groupCount) {
+        patternString.append(emptyGroup);
+    }
+    LocalPointer<RegexPattern> manyGroupsPat(RegexPattern::compile(patternString, 0, status));
+    if (status != U_REGEX_PATTERN_TOO_BIG) {
+        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
+              __FILE__, __LINE__, u_errorName(status));
+    }
+
+    // Case 2: twenty million possessive "A++" items inside a single group.
+    status = U_ZERO_ERROR;
+    patternString = "(";
+    const UnicodeString possessiveItem("A++");
+    for (int itemCount = 0; itemCount != 20000000; ++itemCount) {
+        patternString.append(possessiveItem);
+    }
+    patternString.append(UnicodeString("){0}B++"));
+    LocalPointer<RegexPattern> manyItemsPat(RegexPattern::compile(patternString, 0, status));
+    if (status != U_REGEX_PATTERN_TOO_BIG) {
+        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
+              __FILE__, __LINE__, u_errorName(status));
+    }
+
+    // Case 3: pattern with too much string data, such that string indexes overflow
+    // the operand data field size in a compiled instruction.
+    status = U_ZERO_ERROR;
+    patternString = "";
+    const UnicodeString filler("stuff and things dont you know, these are a few of my favorite strings\n");
+    while (patternString.length() < 0x00ffffff) {
+        patternString.append(filler);
+    }
+    patternString.append(UnicodeString("X? trailing string"));
+    LocalPointer<RegexPattern> longLiteralPat(RegexPattern::compile(patternString, 0, status));
+    if (status != U_REGEX_PATTERN_TOO_BIG) {
+        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
+              __FILE__, __LINE__, u_errorName(status));
+    }
+}
+
+void RegexTest::TestBug11480() {
+    // C API, get capture group of a group that does not participate in the match.
+    //        (Returns a zero length string, with nul termination,
+    //         indistinguishable from a group with a zero length match.)
+
+    UErrorCode status = U_ZERO_ERROR;
+    URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
+    REGEX_CHECK_STATUS;
+    UnicodeString text = UNICODE_STRING_SIMPLE("A");
+    uregex_setText(re, text.getBuffer(), text.length(), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
+    // buf[0] and buf[2] act as guard values around the one element (buf[1])
+    // that the nul terminator of the empty group is expected to overwrite.
+    UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
+    int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
+    REGEX_ASSERT(length == 0);
+    REGEX_ASSERT(buf[0] == 13);
+    REGEX_ASSERT(buf[1] == 0);
+    REGEX_ASSERT(buf[2] == 13);
+    uregex_close(re);
+
+    // UText C++ API, length of match is 0 for non-participating matches.
+    UText ut = UTEXT_INITIALIZER;
+    utext_openUnicodeString(&ut, &text, &status);
+    RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
+    REGEX_CHECK_STATUS;
+    matcher.reset(&ut);
+    REGEX_ASSERT(matcher.lookingAt(0, status));
+
+    // UText C++ API, Capture group 1 matches "A", position 0, length 1.
+    int64_t groupLen = -666;   // Sentinel; group() is expected to overwrite it.
+    UText group = UTEXT_INITIALIZER;
+    matcher.group(1, &group, groupLen, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(groupLen == 1);
+    REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
+
+    // Capture group 2, the (B), does not participate in the match.
+    matcher.group(2, &group, groupLen, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(groupLen == 0);
+    REGEX_ASSERT(matcher.start(2, status) == -1);
+    REGEX_CHECK_STATUS;
+
+    // Stack-allocated UTexts must still be closed once opened, so that anything
+    // the text provider acquired on their behalf is released.
+    utext_close(&group);
+    utext_close(&ut);
+}
+
+void RegexTest::TestBug12884() {
+    // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}.
+    // Each find() below runs under a time limit of 5 and is expected to end with
+    // U_REGEX_TIME_OUT.
+    UErrorCode status = U_ZERO_ERROR;
+    const UnicodeString text(u"hello");
+
+    // Greedy loops over UTF-16 text.
+    const UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
+    RegexMatcher greedyMatcher(pattern, text, 0, status);
+    REGEX_CHECK_STATUS;
+    greedyMatcher.setTimeLimit(5, status);
+    greedyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+
+    // Non-greedy loops. They take a different code path during matching.
+    status = U_ZERO_ERROR;
+    const UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
+    RegexMatcher lazyMatcher(ngPattern, text, 0, status);
+    REGEX_CHECK_STATUS;
+    lazyMatcher.setTimeLimit(5, status);
+    lazyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+
+    // UText, wrapping non-UTF-16 text, also takes a different execution path.
+    // Both matchers are re-pointed at the same UTF-8 backed UText.
+    const char *text8 = u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
+                          "carácter, sin importar la plataforma, sin importar el programa,"
+                          "sin importar el idioma.";
+    status = U_ZERO_ERROR;
+    LocalUTextPointer utf8Text(utext_openUTF8(NULL, text8, -1, &status));
+    REGEX_CHECK_STATUS;
+
+    greedyMatcher.reset(utf8Text.getAlias());
+    greedyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+
+    status = U_ZERO_ERROR;
+    lazyMatcher.reset(utf8Text.getAlias());
+    lazyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+}
+
+// Bug 13631. A find() of a pattern with a zero length look-behind assertion
+// can cause a read past the end of the input text.
+// The failure is seen when running this test with Clang's Address Sanitizer.
+
+void RegexTest::TestBug13631() {
+    // Negative and positive look-behind on start-of-line; each is run to
+    // exhaustion over a one-character text.
+    const UChar *lookBehindPatterns[] = { u"(?<!^)",
+                                          u"(?<=^)"
+                                        };
+    for (const UChar *patternChars : lookBehindPatterns) {
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeString upat(patternChars);
+        RegexMatcher matcher(upat, 0, status);
+
+        // The text is a single UChar with nothing after it.
+        const UChar s = u'a';
+        UText *ut = utext_openUChars(nullptr, &s, 1, &status);
+        REGEX_CHECK_STATUS;
+        matcher.reset(ut);
+
+        // Only the memory accesses made by find() matter; results are discarded.
+        for (;;) {
+            if (!matcher.find()) {
+                break;
+            }
+        }
+        utext_close(ut);
+    }
+}
+
+// Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
+// where a following group specification would be expected.
+// Failure shows when running the test under Clang's Address Sanitizer.
+
+void RegexTest::TestBug13632() {
+    // Replace every space in the subject using a replacement string whose last
+    // character is '$', and check that the call reports
+    // U_REGEX_INVALID_CAPTURE_GROUP_NAME.
+    UErrorCode status = U_ZERO_ERROR;
+    URegularExpression *regex = uregex_openC(" ", 0, nullptr, &status);
+
+    const char16_t *subject = u"Hello, world.";
+    uregex_setText(regex, subject, u_strlen(subject), &status);
+
+    // Not a nul terminated string: the '$' is the very last element of the array.
+    const char16_t danglingDollar[] = {u'x', u'$'};
+    char16_t output[20] = {};
+    uregex_replaceAll(regex, danglingDollar, UPRV_LENGTHOF(danglingDollar),
+                      output, UPRV_LENGTHOF(output), &status);
+
+    assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+    uregex_close(regex);
+}
+
+#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */