+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 2002-2015, International Business Machines Corporation and
+ * Copyright (c) 2002-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
#include "intltest.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
#include "unicode/localpointer.h"
#include "unicode/regex.h"
#include "unicode/uchar.h"
#include "unicode/uregex.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
+#include "unicode/utext.h"
+#include "unicode/utf16.h"
+#include "cstr.h"
#include "regextst.h"
#include "regexcmp.h"
#include "uvector.h"
#include "util.h"
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
#include "cmemory.h"
#include "cstring.h"
#include "uinvchar.h"
void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
if (exec) logln("TestSuite RegexTest: ");
- switch (index) {
- 
- case 0: name = "Basic";
- if (exec) Basic();
- break;
- case 1: name = "API_Match";
- if (exec) API_Match();
- break;
- case 2: name = "API_Replace";
- if (exec) API_Replace();
- break;
- case 3: name = "API_Pattern";
- if (exec) API_Pattern();
- break;
- case 4:
+ // Test registration table: one TESTCASE_AUTO(...) entry per RegexTest test method.
+ TESTCASE_AUTO_BEGIN;
+ TESTCASE_AUTO(Basic);
+ TESTCASE_AUTO(API_Match);
+ TESTCASE_AUTO(API_Replace);
+ TESTCASE_AUTO(API_Pattern);
#if !UCONFIG_NO_FILE_IO
- name = "Extended";
- if (exec) Extended();
-#else
- name = "skip";
+ // Extended is only listed when file I/O support is compiled in (UCONFIG_NO_FILE_IO is 0).
+ TESTCASE_AUTO(Extended);
#endif
- break;
- case 5: name = "Errors";
- if (exec) Errors();
- break;
- case 6: name = "PerlTests";
- if (exec) PerlTests();
- break;
- case 7: name = "Callbacks";
- if (exec) Callbacks();
- break;
- case 8: name = "FindProgressCallbacks";
- if (exec) FindProgressCallbacks();
- break;
- case 9: name = "Bug 6149";
- if (exec) Bug6149();
- break;
- case 10: name = "UTextBasic";
- if (exec) UTextBasic();
- break;
- case 11: name = "API_Match_UTF8";
- if (exec) API_Match_UTF8();
- break;
- case 12: name = "API_Replace_UTF8";
- if (exec) API_Replace_UTF8();
- break;
- case 13: name = "API_Pattern_UTF8";
- if (exec) API_Pattern_UTF8();
- break;
- case 14: name = "PerlTestsUTF8";
- if (exec) PerlTestsUTF8();
- break;
- case 15: name = "PreAllocatedUTextCAPI";
- if (exec) PreAllocatedUTextCAPI();
- break;
- case 16: name = "Bug 7651";
- if (exec) Bug7651();
- break;
- case 17: name = "Bug 7740";
- if (exec) Bug7740();
- break;
- case 18: name = "Bug 8479";
- if (exec) Bug8479();
- break;
- case 19: name = "Bug 7029";
- if (exec) Bug7029();
- break;
- case 20: name = "CheckInvBufSize";
- if (exec) CheckInvBufSize();
- break;
- case 21: name = "Bug 9283";
- if (exec) Bug9283();
- break;
- case 22: name = "Bug10459";
- if (exec) Bug10459();
- break;
- case 23: name = "TestCaseInsensitiveStarters";
- if (exec) TestCaseInsensitiveStarters();
- break;
- case 24: name = "TestBug11049";
- if (exec) TestBug11049();
- break;
- case 25: name = "TestBug11371";
- if (exec) TestBug11371();
- break;
- case 26: name = "TestBug11480";
- if (exec) TestBug11480();
- break;
- case 27: name = "NamedCapture";
- if (exec) NamedCapture();
- break;
- case 28: name = "NamedCaptureLimits";
- if (exec) NamedCaptureLimits();
- break;
- default: name = "";
- break; //needed to end loop
- }
+ TESTCASE_AUTO(Errors);
+ TESTCASE_AUTO(PerlTests);
+ TESTCASE_AUTO(Callbacks);
+ TESTCASE_AUTO(FindProgressCallbacks);
+ TESTCASE_AUTO(Bug6149);
+ TESTCASE_AUTO(UTextBasic);
+ TESTCASE_AUTO(API_Match_UTF8);
+ TESTCASE_AUTO(API_Replace_UTF8);
+ TESTCASE_AUTO(API_Pattern_UTF8);
+ TESTCASE_AUTO(PerlTestsUTF8);
+ TESTCASE_AUTO(PreAllocatedUTextCAPI);
+ TESTCASE_AUTO(Bug7651);
+ TESTCASE_AUTO(Bug7740);
+ TESTCASE_AUTO(Bug8479);
+ TESTCASE_AUTO(Bug7029);
+ TESTCASE_AUTO(CheckInvBufSize);
+ TESTCASE_AUTO(Bug9283);
+ TESTCASE_AUTO(Bug10459);
+ TESTCASE_AUTO(TestCaseInsensitiveStarters);
+ TESTCASE_AUTO(TestBug11049);
+ TESTCASE_AUTO(TestBug11371);
+ TESTCASE_AUTO(TestBug11480);
+ TESTCASE_AUTO(NamedCapture);
+ TESTCASE_AUTO(NamedCaptureLimits);
+ TESTCASE_AUTO(TestBug12884);
+ TESTCASE_AUTO(TestBug13631);
+ TESTCASE_AUTO(TestBug13632);
+ TESTCASE_AUTO_END;
}
-
/**
* Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
* into ASCII.
return ASSERT_BUF;
}
+// Logs a UText for debugging: utextToPrintable() renders it into a 200-char local buffer,
+// which is then written with logln() together with file, line and the argument's source spelling.
-#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
+#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
+// Checks the variable named `status` in the calling scope: on failure, reports file, line and
+// the error name through dataerrln() and returns from the enclosing (void) function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
__FILE__, __LINE__, u_errorName(status)); return;}}
if (!testUTextEqual(&expectedText, actual)) {
char buf[201 /*21*/];
char expectedBuf[201];
- utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
- utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
+ utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
+ utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
}
utext_close(&expectedText);
if (!testUTextEqual(&expectedText, actual)) {
char buf[201 /*21*/];
char expectedBuf[201];
- utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
- utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
+ utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
+ utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
}
utext_close(&expectedText);
if (UTF8Matcher == NULL) {
// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
- logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
+ logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
status = U_ZERO_ERROR;
}
}
// Generate native indices for UTF8 versions of region and capture group info
//
if (UTF8Matcher != NULL) {
+ if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
+ UTF8Matcher->setTrace(TRUE);
+ }
if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
}
}
matcher->setTrace(FALSE);
+ if (UTF8Matcher) {
+ UTF8Matcher->setTrace(FALSE);
+ }
if (U_FAILURE(status)) {
- errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
+ // The format has two conversions (%d and %s); pass `line` so each one has a matching argument.
+ errln("Error at line %d. ICU ErrorCode is %s", line, u_errorName(status));
}
failed = TRUE;
goto cleanupAndReturn;
}
+ if (isMatch && groupStarts.size() == 0) {
+ errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
+ failed = TRUE;
+ }
+ if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
+ errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
+ failed = TRUE;
+ }
if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
// Only check for match / no match. Don't check capture groups.
- if (isMatch && groupStarts.size() == 0) {
- errln("Error at line %d: No match expected, but one found.", line);
- failed = TRUE;
- } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
- errln("Error at line %d: No match expected, but one found. (UTF8)", line);
- failed = TRUE;
- }
goto cleanupAndReturn;
}
void RegexTest::TestBug11480() {
// C API, get capture group of a group that does not participate in the match.
// (Returns a zero length string, with nul termination,
- // indistinguishable from a group with a zero lenght match.)
+ // indistinguishable from a group with a zero length match.)
UErrorCode status = U_ZERO_ERROR;
URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
REGEX_ASSERT(buf[1] == 0);
REGEX_ASSERT(buf[2] == 13);
uregex_close(re);
+
+ // UText C++ API, length of match is 0 for non-participating matches.
+ UText ut = UTEXT_INITIALIZER;
+ utext_openUnicodeString(&ut, &text, &status);
+ RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
+ REGEX_CHECK_STATUS;
+ matcher.reset(&ut);
+ REGEX_ASSERT(matcher.lookingAt(0, status));
+
+ // UText C++ API, Capture group 1 matches "A", position 0, length 1.
+ int64_t groupLen = -666;
+ UText group = UTEXT_INITIALIZER;
+ matcher.group(1, &group, groupLen, status);
+ REGEX_CHECK_STATUS;
+ REGEX_ASSERT(groupLen == 1);
+ REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
+
+ // Capture group 2, the (B), does not participate in the match.
+ matcher.group(2, &group, groupLen, status);
+ REGEX_CHECK_STATUS;
+ REGEX_ASSERT(groupLen == 0);
+ REGEX_ASSERT(matcher.start(2, status) == -1);
+ REGEX_CHECK_STATUS;
+
+ // Stack-allocated UTexts still have to be closed once they have been opened.
+ // (The early returns taken by REGEX_CHECK_STATUS on a failing test skip this.)
+ utext_close(&group);
+ utext_close(&ut);
+}
+
+void RegexTest::TestBug12884() {
+    // Regression check: setTimeLimit() used to have no effect when an empty
+    // sub-pattern was repeated with large {minimum counts}. Every find() below
+    // is expected to end with U_REGEX_TIME_OUT.
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString subject(u"hello");
+
+    // Greedy loops.
+    UnicodeString greedyPattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
+    RegexMatcher greedyMatcher(greedyPattern, subject, 0, status);
+    REGEX_CHECK_STATUS;
+    greedyMatcher.setTimeLimit(5, status);
+    greedyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+
+    // Non-greedy loops; matching takes a different code path for these.
+    status = U_ZERO_ERROR;
+    UnicodeString lazyPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
+    RegexMatcher lazyMatcher(lazyPattern, subject, 0, status);
+    REGEX_CHECK_STATUS;
+    lazyMatcher.setTimeLimit(5, status);
+    lazyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+
+    // Run both matchers again over a UText that wraps UTF-8 (non-UTF-16) text,
+    // which is yet another execution path. The time limits set above still apply.
+    const char *utf8Input = u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
+                            "carácter, sin importar la plataforma, sin importar el programa,"
+                            "sin importar el idioma.";
+    status = U_ZERO_ERROR;
+    LocalUTextPointer utf8Text(utext_openUTF8(NULL, utf8Input, -1, &status));
+    REGEX_CHECK_STATUS;
+    UText *utf8Alias = utf8Text.getAlias();
+
+    greedyMatcher.reset(utf8Alias);
+    greedyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+
+    status = U_ZERO_ERROR;
+    lazyMatcher.reset(utf8Alias);
+    lazyMatcher.find(status);
+    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
+}
+
+// Bug 13631. A find() of a pattern with a zero length look-behind assertion
+// can cause a read past the end of the input text.
+// The failure is seen when running this test with Clang's Address Sanitizer.
+
+void RegexTest::TestBug13631() {
+    // Look-behind patterns to exercise; the list is terminated by nullptr.
+    const UChar *patterns[] = { u"(?<!^)", u"(?<=^)", nullptr };
+
+    for (int32_t i = 0; patterns[i] != nullptr; ++i) {
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeString patternText(patterns[i]);
+        RegexMatcher matcher(patternText, 0, status);
+
+        // The input is a single code unit passed with an explicit length of 1.
+        const UChar input = u'a';
+        UText *inputText = utext_openUChars(nullptr, &input, 1, &status);
+        REGEX_CHECK_STATUS;
+        matcher.reset(inputText);
+
+        // Step through every match. Nothing is asserted on the results; the loop
+        // only has to run so that a memory checker can observe the reads.
+        UBool found;
+        do {
+            found = matcher.find();
+        } while (found);
+
+        utext_close(inputText);
+    }
}
+// Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
+// where a following group specification would be expected.
+// Failure shows when running the test under Clang's Address Sanitizer.
+
+void RegexTest::TestBug13632() {
+    // Replacement text "x$": two code units, passed by explicit length, no nul terminator.
+    const char16_t replacement[] = {u'x', u'$'};
+    const char16_t *input = u"Hello, world.";
+    const int32_t destCap = 20;
+    char16_t dest[destCap] = {};
+
+    UErrorCode status = U_ZERO_ERROR;
+    URegularExpression *regex = uregex_openC(" ", 0, nullptr, &status);
+    uregex_setText(regex, input, u_strlen(input), &status);
+    uregex_replaceAll(regex, replacement, 2, dest, destCap, &status);
+
+    // The trailing '$' has no group specification after it, so the replace
+    // is expected to report an invalid capture group name.
+    assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
+    uregex_close(regex);
+}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */