X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/4388f060552cc537e71e957d32f35e9d75a61233..4f1e1a09ce4daed860e35d359ce2fceccb0764e8:/icuSources/test/intltest/regextst.cpp diff --git a/icuSources/test/intltest/regextst.cpp b/icuSources/test/intltest/regextst.cpp index 1e197a90..4b0a2f43 100644 --- a/icuSources/test/intltest/regextst.cpp +++ b/icuSources/test/intltest/regextst.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2012, International Business Machines Corporation and + * Copyright (c) 2002-2016, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -23,17 +25,26 @@ #include "intltest.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS +#include +#include +#include + +#include "unicode/localpointer.h" #include "unicode/regex.h" #include "unicode/uchar.h" #include "unicode/ucnv.h" #include "unicode/uniset.h" +#include "unicode/uregex.h" +#include "unicode/usetiter.h" #include "unicode/ustring.h" +#include "unicode/utext.h" +#include "unicode/utf16.h" +#include "cstr.h" #include "regextst.h" +#include "regexcmp.h" #include "uvector.h" #include "util.h" -#include -#include -#include +#include "cmemory.h" #include "cstring.h" #include "uinvchar.h" @@ -58,90 +69,48 @@ RegexTest::~RegexTest() void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) { if (exec) logln("TestSuite RegexTest: "); - switch (index) { - - case 0: name = "Basic"; - if (exec) Basic(); - break; - case 1: name = "API_Match"; - if (exec) API_Match(); - break; - case 2: name = "API_Replace"; - if (exec) API_Replace(); - break; - case 3: name = "API_Pattern"; - if (exec) API_Pattern(); - break; - case 4: + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(Basic); + TESTCASE_AUTO(API_Match); + TESTCASE_AUTO(API_Replace); + TESTCASE_AUTO(API_Pattern); #if !UCONFIG_NO_FILE_IO - name = "Extended"; - if (exec) Extended(); -#else - name = "skip"; + TESTCASE_AUTO(Extended); #endif - break; - case 5: name = "Errors"; - if (exec) Errors(); - break; - case 6: name = "PerlTests"; - if (exec) PerlTests(); - break; - case 7: name = "Callbacks"; - if (exec) Callbacks(); - break; - case 8: name = "FindProgressCallbacks"; - if (exec) FindProgressCallbacks(); - break; - case 9: name = "Bug 6149"; - if (exec) Bug6149(); - break; - case 10: name = "UTextBasic"; - if (exec) UTextBasic(); - break; - case 11: name = "API_Match_UTF8"; - if (exec) API_Match_UTF8(); - break; - case 12: name = "API_Replace_UTF8"; - if (exec) API_Replace_UTF8(); - break; - case 13: name = "API_Pattern_UTF8"; - if (exec) API_Pattern_UTF8(); - break; - case 14: name = "PerlTestsUTF8"; - if (exec) PerlTestsUTF8(); - break; - case 15: name = "PreAllocatedUTextCAPI"; - if (exec) PreAllocatedUTextCAPI(); - break; - case 16: name = "Bug 7651"; - if (exec) Bug7651(); - break; - case 17: name = "Bug 7740"; - if (exec) Bug7740(); - break; - case 18: name = "Bug 8479"; - if (exec) Bug8479(); - break; - case 19: name = "Bug 7029"; - if (exec) Bug7029(); - break; - case 20: name = "CheckInvBufSize"; - if (exec) CheckInvBufSize(); - break; - case 21: name = "Bug 9283"; - if (exec) Bug9283(); - break; - - default: name = ""; - break; //needed to end loop - } + TESTCASE_AUTO(Errors); + TESTCASE_AUTO(PerlTests); + TESTCASE_AUTO(Callbacks); + TESTCASE_AUTO(FindProgressCallbacks); + TESTCASE_AUTO(Bug6149); + TESTCASE_AUTO(UTextBasic); + TESTCASE_AUTO(API_Match_UTF8); + TESTCASE_AUTO(API_Replace_UTF8); + TESTCASE_AUTO(API_Pattern_UTF8); + TESTCASE_AUTO(PerlTestsUTF8); + TESTCASE_AUTO(PreAllocatedUTextCAPI); + TESTCASE_AUTO(Bug7651); + TESTCASE_AUTO(Bug7740); + TESTCASE_AUTO(Bug8479); + TESTCASE_AUTO(Bug7029); + TESTCASE_AUTO(CheckInvBufSize); + TESTCASE_AUTO(Bug9283); + TESTCASE_AUTO(Bug10459); + TESTCASE_AUTO(TestCaseInsensitiveStarters); + TESTCASE_AUTO(TestBug11049); + TESTCASE_AUTO(TestBug11371); + TESTCASE_AUTO(TestBug11480); + TESTCASE_AUTO(NamedCapture); + TESTCASE_AUTO(NamedCaptureLimits); + TESTCASE_AUTO(TestBug12884); + TESTCASE_AUTO(TestBug13631); + TESTCASE_AUTO(TestBug13632); + TESTCASE_AUTO_END; } - /** * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage - * into ASCII. + * into ASCII. * @see utext_openUTF8 */ static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); @@ -207,8 +176,7 @@ const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { return ASSERT_BUF; } - -#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} +#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \ __FILE__, __LINE__, u_errorName(status)); return;}} @@ -225,7 +193,12 @@ if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status= #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} -#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};} +// expected: const char * , restricted to invariant characters. +// actual: const UnicodeString & +#define REGEX_ASSERT_UNISTR(expected, actual) { \ + if (UnicodeString(expected, -1, US_INV) != (actual)) { \ + errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \ + __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} static UBool testUTextEqual(UText *uta, UText *utb) { @@ -263,8 +236,8 @@ void RegexTest::assertUText(const char *expected, UText *actual, const char *fil if (!testUTextEqual(&expectedText, actual)) { char buf[201 /*21*/]; char expectedBuf[201]; - utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); - utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); + utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); + utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); } utext_close(&expectedText); @@ -285,19 +258,19 @@ void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const if (!testUTextEqual(&expectedText, actual)) { char buf[201 /*21*/]; char expectedBuf[201]; - utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); - utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); + utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); + utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); } utext_close(&expectedText); } /** - * Assumes utf-8 input + * Assumes utf-8 input */ #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) /** - * Assumes Invariant input + * Assumes Invariant input */ #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__) @@ -305,11 +278,11 @@ void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const * This buffer ( inv_buf ) is used to hold the UTF-8 strings * passed into utext_openUTF8. An error will be given if * INV_BUFSIZ is too small. It's only used on EBCDIC systems. - */ + */ #define INV_BUFSIZ 2048 /* increase this if too small */ -static int32_t inv_next=0; +static int64_t inv_next=0; #if U_CHARSET_FAMILY!=U_ASCII_FAMILY static char inv_buf[INV_BUFSIZ]; @@ -373,7 +346,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, line, u_errorName(status)); return FALSE; } - if (line==376) { RegexPatternDump(REPattern);} + if (line==376) { REPattern->dumpPattern();} UnicodeString inputString(inputText); UnicodeString unEscapedInput = inputString.unescape(); @@ -409,7 +382,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, } if (retVal == FALSE) { - RegexPatternDump(REPattern); + REPattern->dumpPattern(); } delete REPattern; @@ -436,12 +409,12 @@ UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool look line, u_errorName(status)); return FALSE; } - + UnicodeString inputString(text, -1, US_INV); UnicodeString unEscapedInput = inputString.unescape(); LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); - + inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { // UTF-8 does not allow unpaired surrogates, so this could actually happen @@ -452,7 +425,7 @@ UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool look textChars = new char[inputUTF8Length+1]; unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); - + REMatcher = &REPattern->matcher(status)->reset(&inputText); if (U_FAILURE(status)) { errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", @@ -485,7 +458,7 @@ UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool look } if (retVal == FALSE) { - RegexPatternDump(REPattern); + REPattern->dumpPattern(); } delete REPattern; @@ -551,7 +524,7 @@ void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, } } } - + delete callerPattern; utext_close(&patternText); } @@ -578,7 +551,7 @@ void RegexTest::Basic() { UErrorCode status = U_ZERO_ERROR; RegexPattern *pattern; pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status); - RegexPatternDump(pattern); + pattern->dumpPattern(); RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status); UBool result = m->find(); printf("result = %d\n", result); @@ -726,18 +699,18 @@ void RegexTest::UTextBasic() { utext_openUTF8(&pattern, str_abc, -1, &status); RegexMatcher matcher(&pattern, 0, status); REGEX_CHECK_STATUS; - + UText input = UTEXT_INITIALIZER; utext_openUTF8(&input, str_abc, -1, &status); REGEX_CHECK_STATUS; matcher.reset(&input); REGEX_CHECK_STATUS; REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); - + matcher.reset(matcher.inputText()); REGEX_CHECK_STATUS; REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); - + utext_close(&pattern); utext_close(&input); } @@ -1114,7 +1087,7 @@ void RegexTest::API_Match() { delete m; delete p; } - + // // Regions // @@ -1127,34 +1100,34 @@ void RegexTest::API_Match() { REGEX_ASSERT(m.regionEnd() == testString.length()); REGEX_ASSERT(m.hasTransparentBounds() == FALSE); REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); - + m.region(2,4, status); REGEX_CHECK_STATUS; REGEX_ASSERT(m.matches(status)); REGEX_ASSERT(m.start(status)==2); REGEX_ASSERT(m.end(status)==4); REGEX_CHECK_STATUS; - + m.reset(); REGEX_ASSERT(m.regionStart() == 0); REGEX_ASSERT(m.regionEnd() == testString.length()); - + UnicodeString shorterString("short"); m.reset(shorterString); REGEX_ASSERT(m.regionStart() == 0); REGEX_ASSERT(m.regionEnd() == shorterString.length()); - + REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); REGEX_ASSERT(&m == &m.reset()); REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); - + REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); REGEX_ASSERT(&m == &m.reset()); REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); - + REGEX_ASSERT(m.hasTransparentBounds() == FALSE); REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); REGEX_ASSERT(m.hasTransparentBounds() == TRUE); @@ -1165,9 +1138,9 @@ void RegexTest::API_Match() { REGEX_ASSERT(m.hasTransparentBounds() == FALSE); REGEX_ASSERT(&m == &m.reset()); REGEX_ASSERT(m.hasTransparentBounds() == FALSE); - + } - + // // hitEnd() and requireEnd() // @@ -1179,7 +1152,7 @@ void RegexTest::API_Match() { REGEX_ASSERT(m1.hitEnd() == TRUE); REGEX_ASSERT(m1.requireEnd() == FALSE); REGEX_CHECK_STATUS; - + status = U_ZERO_ERROR; RegexMatcher m2("a*", testString, 0, status); REGEX_ASSERT(m2.lookingAt(status) == TRUE); @@ -1217,7 +1190,7 @@ void RegexTest::API_Match() { #endif // - // Time Outs. + // Time Outs. // Note: These tests will need to be changed when the regexp engine is // able to detect and cut short the exponential time behavior on // this type of match. @@ -1245,22 +1218,22 @@ void RegexTest::API_Match() { REGEX_ASSERT(matcher.lookingAt(status) == FALSE); REGEX_CHECK_STATUS; } - + // // Stack Limits // { UErrorCode status = U_ZERO_ERROR; UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' - + // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations // of the '+', and makes the stack frames larger. RegexMatcher matcher("(A)+A$", testString, 0, status); - + // With the default stack, this match should fail to run REGEX_ASSERT(matcher.lookingAt(status) == FALSE); REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); - + // With unlimited stack, it should run status = U_ZERO_ERROR; matcher.setStackLimit(0, status); @@ -1276,7 +1249,7 @@ void RegexTest::API_Match() { REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); REGEX_ASSERT(matcher.getStackLimit() == 10000); } - + // A pattern that doesn't save state should work with // a minimal sized stack { @@ -1289,7 +1262,7 @@ void RegexTest::API_Match() { REGEX_ASSERT(matcher.matches(status) == TRUE); REGEX_CHECK_STATUS; REGEX_ASSERT(matcher.getStackLimit() == 30); - + // Negative stack sizes should fail status = U_ZERO_ERROR; matcher.setStackLimit(1000, status); @@ -1298,7 +1271,7 @@ void RegexTest::API_Match() { REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); REGEX_ASSERT(matcher.getStackLimit() == 1000); } - + } @@ -1409,8 +1382,8 @@ void RegexTest::API_Replace() { REGEX_ASSERT(dest == "The value of $1 is bc.defg"); dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); - REGEX_CHECK_STATUS; - REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); + REGEX_ASSERT(U_FAILURE(status)); + status = U_ZERO_ERROR; UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); replacement = replacement.unescape(); @@ -1847,7 +1820,7 @@ void RegexTest::API_Match_UTF8() { regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); REGEX_VERBOSE_TEXT(&input2); utext_openUChars(&empty, NULL, 0, &status); - + int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ int32_t input2Len = strlen("not abc"); @@ -1957,7 +1930,7 @@ void RegexTest::API_Match_UTF8() { delete m1; delete pat2; - + utext_close(&re); utext_close(&input1); utext_close(&input2); @@ -1978,10 +1951,10 @@ void RegexTest::API_Match_UTF8() { UText re=UTEXT_INITIALIZER; const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ utext_openUTF8(&re, str_01234567_pat, -1, &status); - + RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); REGEX_CHECK_STATUS; - + UText input = UTEXT_INITIALIZER; const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ utext_openUTF8(&input, str_0123456789, -1, &status); @@ -2016,13 +1989,13 @@ void RegexTest::API_Match_UTF8() { REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); matcher->lookingAt(status); - + UnicodeString dest; UText destText = UTEXT_INITIALIZER; utext_openUnicodeString(&destText, &dest, &status); UText *result; //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ - // Test shallow-clone API + // Test shallow-clone API int64_t group_len; result = matcher->group((UText *)NULL, group_len, status); REGEX_CHECK_STATUS; @@ -2035,54 +2008,79 @@ void RegexTest::API_Match_UTF8() { // destText is now immutable, reopen it utext_close(&destText); utext_openUnicodeString(&destText, &dest, &status); - - result = matcher->group(0, NULL, status); + + int64_t length; + result = matcher->group(0, NULL, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); utext_close(result); - result = matcher->group(0, &destText, status); + result = matcher->group(0, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); - - result = matcher->group(1, NULL, status); + REGEX_ASSERT(utext_getNativeIndex(result) == 0); + REGEX_ASSERT(length == 10); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + + // Capture Group 1 == "234567" + result = matcher->group(1, NULL, length, status); REGEX_CHECK_STATUS; - const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */ - REGEX_ASSERT_UTEXT_UTF8(str_234567, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 2); + REGEX_ASSERT(length == 6); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(1, &destText, status); + + result = matcher->group(1, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_234567, result); - - result = matcher->group(2, NULL, status); + REGEX_ASSERT(utext_getNativeIndex(result) == 2); + REGEX_ASSERT(length == 6); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); + + // Capture Group 2 == "45" + result = matcher->group(2, NULL, length, status); REGEX_CHECK_STATUS; - const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ - REGEX_ASSERT_UTEXT_UTF8(str_45, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 4); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(2, &destText, status); + + result = matcher->group(2, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_45, result); - - result = matcher->group(3, NULL, status); + REGEX_ASSERT(utext_getNativeIndex(result) == 4); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); + + // Capture Group 3 == "89" + result = matcher->group(3, NULL, length, status); REGEX_CHECK_STATUS; - const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ - REGEX_ASSERT_UTEXT_UTF8(str_89, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 8); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(3, &destText, status); + + result = matcher->group(3, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_89, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 8); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); + // Capture Group number out of range. + status = U_ZERO_ERROR; REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; matcher->reset(); REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); delete matcher; delete pat; - + utext_close(&destText); utext_close(&input); utext_close(&re); @@ -2143,7 +2141,7 @@ void RegexTest::API_Match_UTF8() { delete matcher; delete pat; - + utext_close(&input); utext_close(&re); } @@ -2161,7 +2159,7 @@ void RegexTest::API_Match_UTF8() { utext_openUTF8(&re, str_Gabcabc, -1, &status); RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); - + REGEX_CHECK_STATUS; UText input = UTEXT_INITIALIZER; const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ @@ -2183,7 +2181,7 @@ void RegexTest::API_Match_UTF8() { delete matcher; delete pat; - + utext_close(&input); utext_close(&re); } @@ -2223,7 +2221,7 @@ void RegexTest::API_Match_UTF8() { REGEX_ASSERT(m.end(status) == i); } REGEX_ASSERT(i==20); - + utext_close(&s); } { @@ -2245,7 +2243,7 @@ void RegexTest::API_Match_UTF8() { REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); } REGEX_ASSERT(i==5); - + utext_close(&s); } @@ -2273,7 +2271,7 @@ void RegexTest::API_Match_UTF8() { delete m; delete p; } - + // // Regions // @@ -2285,42 +2283,42 @@ void RegexTest::API_Match_UTF8() { REGEX_VERBOSE_TEXT(&testPattern); regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); REGEX_VERBOSE_TEXT(&testText); - + RegexMatcher m(&testPattern, &testText, 0, status); REGEX_CHECK_STATUS; REGEX_ASSERT(m.regionStart() == 0); REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); REGEX_ASSERT(m.hasTransparentBounds() == FALSE); REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); - + m.region(2,4, status); REGEX_CHECK_STATUS; REGEX_ASSERT(m.matches(status)); REGEX_ASSERT(m.start(status)==2); REGEX_ASSERT(m.end(status)==4); REGEX_CHECK_STATUS; - + m.reset(); REGEX_ASSERT(m.regionStart() == 0); REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); - + regextst_openUTF8FromInvariant(&testText, "short", -1, &status); REGEX_VERBOSE_TEXT(&testText); m.reset(&testText); REGEX_ASSERT(m.regionStart() == 0); REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); - + REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); REGEX_ASSERT(&m == &m.reset()); REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); - + REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); REGEX_ASSERT(&m == &m.reset()); REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); - + REGEX_ASSERT(m.hasTransparentBounds() == FALSE); REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); REGEX_ASSERT(m.hasTransparentBounds() == TRUE); @@ -2331,11 +2329,11 @@ void RegexTest::API_Match_UTF8() { REGEX_ASSERT(m.hasTransparentBounds() == FALSE); REGEX_ASSERT(&m == &m.reset()); REGEX_ASSERT(m.hasTransparentBounds() == FALSE); - + utext_close(&testText); utext_close(&testPattern); } - + // // hitEnd() and requireEnd() // @@ -2347,13 +2345,13 @@ void RegexTest::API_Match_UTF8() { const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ utext_openUTF8(&testPattern, str_, -1, &status); utext_openUTF8(&testText, str_aabb, -1, &status); - + RegexMatcher m1(&testPattern, &testText, 0, status); REGEX_ASSERT(m1.lookingAt(status) == TRUE); REGEX_ASSERT(m1.hitEnd() == TRUE); REGEX_ASSERT(m1.requireEnd() == FALSE); REGEX_CHECK_STATUS; - + status = U_ZERO_ERROR; const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ utext_openUTF8(&testPattern, str_a, -1, &status); @@ -2371,7 +2369,7 @@ void RegexTest::API_Match_UTF8() { REGEX_ASSERT(m3.hitEnd() == TRUE); REGEX_ASSERT(m3.requireEnd() == TRUE); REGEX_CHECK_STATUS; - + utext_close(&testText); utext_close(&testPattern); } @@ -2397,7 +2395,7 @@ void RegexTest::API_Replace_UTF8() { REGEX_VERBOSE_TEXT(&re); RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); REGEX_CHECK_STATUS; - + char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ // 012345678901234567 UText dataText = UTEXT_INITIALIZER; @@ -2413,9 +2411,9 @@ void RegexTest::API_Replace_UTF8() { UText destText = UTEXT_INITIALIZER; utext_openUnicodeString(&destText, &dest, &status); UText *result; - + UText replText = UTEXT_INITIALIZER; - + const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */ utext_openUTF8(&replText, str_yz, -1, &status); REGEX_VERBOSE_TEXT(&replText); @@ -2447,7 +2445,7 @@ void RegexTest::API_Replace_UTF8() { const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */ utext_openUTF8(&dataText, str_abxabxabx, -1, &status); matcher->reset(&dataText); - + result = matcher->replaceFirst(&replText, NULL, status); REGEX_CHECK_STATUS; REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); @@ -2472,7 +2470,7 @@ void RegexTest::API_Replace_UTF8() { // utext_openUTF8(&dataText, NULL, 0, &status); matcher->reset(&dataText); - + result = matcher->replaceFirst(&replText, NULL, status); REGEX_CHECK_STATUS; REGEX_ASSERT_UTEXT_UTF8("", result); @@ -2496,7 +2494,7 @@ void RegexTest::API_Replace_UTF8() { // utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." matcher->reset(&dataText); - + utext_openUTF8(&replText, NULL, 0, &status); result = matcher->replaceFirst(&replText, NULL, status); REGEX_CHECK_STATUS; @@ -2560,7 +2558,7 @@ void RegexTest::API_Replace_UTF8() { utext_openUTF8(&dataText, str_abcdefg, -1, &status); RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText); REGEX_CHECK_STATUS; - + const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */ utext_openUTF8(&replText, str_11, -1, &status); result = matcher2->replaceFirst(&replText, NULL, status); @@ -2573,8 +2571,8 @@ void RegexTest::API_Replace_UTF8() { REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); - - const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */ + + const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */ utext_openUTF8(&replText, str_v, -1, &status); REGEX_VERBOSE_TEXT(&replText); result = matcher2->replaceFirst(&replText, NULL, status); @@ -2587,8 +2585,10 @@ void RegexTest::API_Replace_UTF8() { REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); - - const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */ + + const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, + 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */ utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); result = matcher2->replaceFirst(&replText, NULL, status); REGEX_CHECK_STATUS; @@ -2609,7 +2609,7 @@ void RegexTest::API_Replace_UTF8() { supplDigitChars[24] = 0x9F; supplDigitChars[25] = 0x8F; utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); - + result = matcher2->replaceFirst(&replText, NULL, status); REGEX_CHECK_STATUS; const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */ @@ -2639,7 +2639,7 @@ void RegexTest::API_Replace_UTF8() { utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status); utext_openUTF8(&replText, str_u0043, -1, &status); matcher->reset(&dataText); - + result = matcher->replaceAll(&replText, NULL, status); REGEX_CHECK_STATUS; const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */ @@ -2659,7 +2659,7 @@ void RegexTest::API_Replace_UTF8() { matcher->reset(&dataText); unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A" - // 0123456789 + // 0123456789 expected[2] = 0xF0; expected[3] = 0x90; expected[4] = 0x80; @@ -2687,10 +2687,10 @@ const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ utext_openUTF8(&re, str_ssee, -1, &status); utext_openUTF8(&dataText, str_blah, -1, &status); utext_openUTF8(&replText, str_ooh, -1, &status); - + RegexMatcher m(&re, 0, status); REGEX_CHECK_STATUS; - + UnicodeString result; UText resultText = UTEXT_INITIALIZER; utext_openUnicodeString(&resultText, &result, &status); @@ -2731,7 +2731,7 @@ const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ m.appendTail(&resultText, status); const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */ REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText); - + utext_close(&resultText); } @@ -2739,7 +2739,7 @@ const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ delete pat2; delete matcher; delete pat; - + utext_close(&dataText); utext_close(&replText); utext_close(&destText); @@ -2764,7 +2764,7 @@ void RegexTest::API_Pattern_UTF8() { UText re2 = UTEXT_INITIALIZER; UErrorCode status = U_ZERO_ERROR; UParseError pe; - + const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */ const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */ utext_openUTF8(&re1, str_abcalmz, -1, &status); @@ -2813,7 +2813,7 @@ void RegexTest::API_Pattern_UTF8() { delete pat1a; delete pat1; delete pat2; - + utext_close(&re1); utext_close(&re2); @@ -2827,13 +2827,13 @@ void RegexTest::API_Pattern_UTF8() { UText pattern = UTEXT_INITIALIZER; const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ utext_openUTF8(&pattern, str_pL, -1, &status); - + RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); RegexPattern *pClone = pSource->clone(); delete pSource; RegexMatcher *mFromClone = pClone->matcher(status); REGEX_CHECK_STATUS; - + UText input = UTEXT_INITIALIZER; const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ utext_openUTF8(&input, str_HelloWorld, -1, &status); @@ -2845,7 +2845,7 @@ void RegexTest::API_Pattern_UTF8() { REGEX_ASSERT(mFromClone->find() == FALSE); delete mFromClone; delete pClone; - + utext_close(&input); utext_close(&pattern); } @@ -2857,7 +2857,7 @@ void RegexTest::API_Pattern_UTF8() { UErrorCode status = U_ZERO_ERROR; UText pattern = UTEXT_INITIALIZER; UText input = UTEXT_INITIALIZER; - + const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ utext_openUTF8(&input, str_randominput, -1, &status); @@ -2865,17 +2865,17 @@ void RegexTest::API_Pattern_UTF8() { utext_openUTF8(&pattern, str_dotstar, -1, &status); REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); REGEX_CHECK_STATUS; - + const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ utext_openUTF8(&pattern, str_abc, -1, &status); REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); REGEX_CHECK_STATUS; - + const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ utext_openUTF8(&pattern, str_nput, -1, &status); REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); REGEX_CHECK_STATUS; - + utext_openUTF8(&pattern, str_randominput, -1, &status); REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); REGEX_CHECK_STATUS; @@ -2884,13 +2884,13 @@ void RegexTest::API_Pattern_UTF8() { utext_openUTF8(&pattern, str_u, -1, &status); REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); REGEX_CHECK_STATUS; - + utext_openUTF8(&input, str_abc, -1, &status); utext_openUTF8(&pattern, str_abc, -1, &status); status = U_INDEX_OUTOFBOUNDS_ERROR; REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); - + utext_close(&input); utext_close(&pattern); } @@ -3054,6 +3054,37 @@ void RegexTest::API_Pattern_UTF8() { delete pat1; + // + // split of a UText based string, with library allocating output UTexts. + // + { + status = U_ZERO_ERROR; + RegexMatcher matcher(UnicodeString("(:)"), 0, status); + UnicodeString stringToSplit("first:second:third"); + UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status); + REGEX_CHECK_STATUS; + + UText *splits[10] = {NULL}; + int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(numFields == 5); + REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); + REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); + REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); + REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); + REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); + REGEX_ASSERT(splits[5] == NULL); + + for (int i=0; ipattern(),"(Hello, world)*"); + REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); delete pat1; @@ -3131,7 +3162,7 @@ void RegexTest::Extended() { UnicodeString matchString; // The marked up string to be used as input if (U_FAILURE(status)){ - dataerrln("Construct RegexMatcher() error."); + dataerrln("Construct RegexMatcher() error - %s", u_errorName(status)); delete [] testData; return; } @@ -3281,7 +3312,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, int32_t line) { UnicodeString unEscapedInput; UnicodeString deTaggedInput; - + int32_t patternUTF8Length, inputUTF8Length; char *patternChars = NULL, *inputChars = NULL; UText patternText = UTEXT_INITIALIZER; @@ -3308,7 +3339,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, int32_t regionEnd = -1; int32_t regionStartUTF8 = -1; int32_t regionEndUTF8 = -1; - + // // Compile the caller's pattern @@ -3326,7 +3357,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag bflags |= UREGEX_MULTILINE; } - + if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; } @@ -3362,16 +3393,16 @@ void RegexTest::regex_find(const UnicodeString &pattern, UTF8Converter = ucnv_open("UTF8", &status); ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); - + patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); status = U_ZERO_ERROR; // buffer overflow patternChars = new char[patternUTF8Length+1]; pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); - + if (status == U_ZERO_ERROR) { UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); - + if (status != U_ZERO_ERROR) { #if UCONFIG_NO_BREAK_ITERATION==1 // 'v' test flag means that the test pattern should not compile if ICU was configured @@ -3393,7 +3424,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, } } } - + if (UTF8Pattern == NULL) { // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); @@ -3401,7 +3432,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, } if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag - RegexPatternDump(callerPattern); + callerPattern->dumpPattern(); } if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag @@ -3423,7 +3454,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, numFinds = i; } } - + // 'M' flag. Use matches() instead of find() if (flags.indexOf((UChar)0x4d) >= 0) { useMatchesFunc = TRUE; @@ -3478,7 +3509,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag matcher->setTrace(TRUE); } - + if (UTF8Pattern != NULL) { inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); status = U_ZERO_ERROR; // buffer overflow @@ -3490,10 +3521,10 @@ void RegexTest::regex_find(const UnicodeString &pattern, UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText); REGEX_CHECK_STATUS_L(line); } - + if (UTF8Matcher == NULL) { // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine - logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); + logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); status = U_ZERO_ERROR; } } @@ -3502,9 +3533,12 @@ void RegexTest::regex_find(const UnicodeString &pattern, // Generate native indices for UTF8 versions of region and capture group info // if (UTF8Matcher != NULL) { + if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag + UTF8Matcher->setTrace(TRUE); + } if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); - + // Fill out the native index UVector info. // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() for (i=0; i= 0) { @@ -3554,8 +3588,8 @@ void RegexTest::regex_find(const UnicodeString &pattern, UTF8Matcher->useTransparentBounds(TRUE); } } - - + + // // Do a find on the de-tagged input using the caller's pattern @@ -3581,6 +3615,12 @@ void RegexTest::regex_find(const UnicodeString &pattern, } } matcher->setTrace(FALSE); + if (UTF8Matcher) { + UTF8Matcher->setTrace(FALSE); + } + if (U_FAILURE(status)) { + errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); + } // // Match up the groups from the find() with the groups from the tags @@ -3599,16 +3639,17 @@ void RegexTest::regex_find(const UnicodeString &pattern, failed = TRUE; goto cleanupAndReturn; } + if (isMatch && groupStarts.size() == 0) { + errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status)); + failed = TRUE; + } + if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) { + errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status)); + failed = TRUE; + } if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { // Only check for match / no match. Don't check capture groups. - if (isMatch && groupStarts.size() == 0) { - errln("Error at line %d: No match expected, but one found.", line); - failed = TRUE; - } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { - errln("Error at line %d: No match expected, but one found. (UTF8)", line); - failed = TRUE; - } goto cleanupAndReturn; } @@ -3627,7 +3668,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, failed = TRUE; goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. } - + int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); if (matcher->end(i, status) != expectedEnd) { @@ -3664,7 +3705,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); failed = TRUE; } - + if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true matcher->requireEnd() == FALSE) { errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); @@ -3674,7 +3715,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); failed = TRUE; } - + if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false matcher->hitEnd() == TRUE) { errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); @@ -3684,7 +3725,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line); failed = TRUE; } - + if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true matcher->hitEnd() == FALSE) { errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); @@ -3708,7 +3749,7 @@ cleanupAndReturn: delete UTF8Pattern; delete matcher; delete callerPattern; - + utext_close(&inputText); delete[] inputChars; utext_close(&patternText); @@ -3767,7 +3808,7 @@ void RegexTest::Errors() { REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); - REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan + REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); @@ -3784,7 +3825,7 @@ void RegexTest::Errors() { //------------------------------------------------------------------------------- -// +// // Read a text data file, convert it to UChars, and return the data // in one big UChar * buffer, which the caller must delete. // @@ -4127,7 +4168,7 @@ void RegexTest::PerlTests() { lineNum, expected?"":"no ", found?"":"no " ); continue; } - + // Don't try to check expected results if there is no match. // (Some have stuff in the expected fields) if (!found) { @@ -4425,7 +4466,7 @@ void RegexTest::PerlTestsUTF8() { if (flagStr.indexOf(UChar_x) != -1) { flags |= UREGEX_COMMENTS; } - + // // Put the pattern in a UTF-8 UText // @@ -4522,7 +4563,7 @@ void RegexTest::PerlTestsUTF8() { lineNum, expected?"":"no ", found?"":"no " ); continue; } - + // Don't try to check expected results if there is no match. // (Some have stuff in the expected fields) if (!found) { @@ -4665,10 +4706,10 @@ void RegexTest::PerlTestsUTF8() { delete fieldPat; delete [] testData; - + utext_close(&patternText); utext_close(&inputText); - + delete [] patternChars; delete [] inputChars; @@ -4682,13 +4723,14 @@ void RegexTest::PerlTestsUTF8() { // // Bug6149 Verify limits to heap expansion for backtrack stack. // Use this pattern, -// "(a?){1,}" -// The zero-length match will repeat forever. -// (That this goes into a loop is another bug) +// "(a?){1,8000000}" +// Note: was an unbounded upperbounds, but that now has loop-breaking enabled. +// This test is likely to be fragile, as further optimizations stop +// more cases of pointless looping in the match engine. // //--------------------------------------------------------------- void RegexTest::Bug6149() { - UnicodeString pattern("(a?){1,}"); + UnicodeString pattern("(a?){1,8000000}"); UnicodeString s("xyz"); uint32_t flags = 0; UErrorCode status = U_ZERO_ERROR; @@ -4731,12 +4773,12 @@ U_CDECL_END void RegexTest::Callbacks() { { // Getter returns NULLs if no callback has been set - + // The variables that the getter will fill in. // Init to non-null values so that the action of the getter can be seen. const void *returnedContext = &returnedContext; URegexMatchCallback *returnedFn = &testCallBackFn; - + UErrorCode status = U_ZERO_ERROR; RegexMatcher matcher("x", 0, status); REGEX_CHECK_STATUS; @@ -4745,7 +4787,7 @@ void RegexTest::Callbacks() { REGEX_ASSERT(returnedFn == NULL); REGEX_ASSERT(returnedContext == NULL); } - + { // Set and Get work callBackContext cbInfo = {this, 0, 0, 0}; @@ -4760,7 +4802,7 @@ void RegexTest::Callbacks() { REGEX_CHECK_STATUS; REGEX_ASSERT(returnedFn == testCallBackFn); REGEX_ASSERT(returnedContext == &cbInfo); - + // A short-running match shouldn't invoke the callback status = U_ZERO_ERROR; cbInfo.reset(1); @@ -4769,7 +4811,7 @@ void RegexTest::Callbacks() { REGEX_ASSERT(matcher.matches(status)); REGEX_CHECK_STATUS; REGEX_ASSERT(cbInfo.numCalls == 0); - + // A medium-length match that runs long enough to invoke the // callback, but not so long that the callback aborts it. status = U_ZERO_ERROR; @@ -4779,7 +4821,7 @@ void RegexTest::Callbacks() { REGEX_ASSERT(matcher.matches(status)==FALSE); REGEX_CHECK_STATUS; REGEX_ASSERT(cbInfo.numCalls > 0); - + // A longer running match that the callback function will abort. status = U_ZERO_ERROR; cbInfo.reset(4); @@ -4788,8 +4830,17 @@ void RegexTest::Callbacks() { REGEX_ASSERT(matcher.matches(status)==FALSE); REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); REGEX_ASSERT(cbInfo.numCalls == 4); + + // A longer running find that the callback function will abort. + status = U_ZERO_ERROR; + cbInfo.reset(4); + s = "aaaaaaaaaaaaaaaaaaaaaaab"; + matcher.reset(s); + REGEX_ASSERT(matcher.find(status)==FALSE); + REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); + REGEX_ASSERT(cbInfo.numCalls == 4); } - + } @@ -4809,6 +4860,9 @@ struct progressCallBackContext { void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; }; +// call-back function for find(). +// Return TRUE to continue the find(). +// Return FALSE to stop the find(). U_CDECL_BEGIN static UBool U_CALLCONV testProgressCallBackFn(const void *context, int64_t matchIndex) { @@ -4823,12 +4877,12 @@ U_CDECL_END void RegexTest::FindProgressCallbacks() { { // Getter returns NULLs if no callback has been set - + // The variables that the getter will fill in. // Init to non-null values so that the action of the getter can be seen. const void *returnedContext = &returnedContext; URegexFindProgressCallback *returnedFn = &testProgressCallBackFn; - + UErrorCode status = U_ZERO_ERROR; RegexMatcher matcher("x", 0, status); REGEX_CHECK_STATUS; @@ -4837,14 +4891,14 @@ void RegexTest::FindProgressCallbacks() { REGEX_ASSERT(returnedFn == NULL); REGEX_ASSERT(returnedContext == NULL); } - + { // Set and Get work progressCallBackContext cbInfo = {this, 0, 0, 0}; const void *returnedContext; URegexFindProgressCallback *returnedFn; UErrorCode status = U_ZERO_ERROR; - RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. + RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status); REGEX_CHECK_STATUS; matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status); REGEX_CHECK_STATUS; @@ -4852,11 +4906,11 @@ void RegexTest::FindProgressCallbacks() { REGEX_CHECK_STATUS; REGEX_ASSERT(returnedFn == testProgressCallBackFn); REGEX_ASSERT(returnedContext == &cbInfo); - - // A short-running match should NOT invoke the callback. + + // A find that matches on the initial position does NOT invoke the callback. status = U_ZERO_ERROR; cbInfo.reset(100); - UnicodeString s = "abxxx"; + UnicodeString s = "aaxxx"; matcher.reset(s); #if 0 matcher.setTrace(TRUE); @@ -4864,8 +4918,9 @@ void RegexTest::FindProgressCallbacks() { REGEX_ASSERT(matcher.find(0, status)); REGEX_CHECK_STATUS; REGEX_ASSERT(cbInfo.numCalls == 0); - - // A medium running match that causes matcher.find() to invoke our callback for each index. + + // A medium running find() that causes matcher.find() to invoke our callback for each index, + // but not so many times that we interrupt the operation. status = U_ZERO_ERROR; s = "aaaaaaaaaaaaaaaaaaab"; cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string @@ -4873,31 +4928,30 @@ void RegexTest::FindProgressCallbacks() { REGEX_ASSERT(matcher.find(0, status)==FALSE); REGEX_CHECK_STATUS; REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25); - + // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point. status = U_ZERO_ERROR; UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab"; cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string matcher.reset(s1); REGEX_ASSERT(matcher.find(0, status)==FALSE); - REGEX_CHECK_STATUS; + REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5); -#if 0 // Now a match that will succeed, but after an interruption status = U_ZERO_ERROR; UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx"; cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string matcher.reset(s2); REGEX_ASSERT(matcher.find(0, status)==FALSE); - REGEX_CHECK_STATUS; + REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); // Now retry the match from where left off cbInfo.maxCalls = 100; // No callback limit + status = U_ZERO_ERROR; REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status)); REGEX_CHECK_STATUS; -#endif } - + } @@ -4916,7 +4970,7 @@ void RegexTest::PreAllocatedUTextCAPI () { UText patternText = UTEXT_INITIALIZER; UnicodeString buffer; UText bufferText = UTEXT_INITIALIZER; - + utext_openUnicodeString(&bufferText, &buffer, &status); /* @@ -4933,7 +4987,7 @@ void RegexTest::PreAllocatedUTextCAPI () { regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); utext_openUChars(&text2, text2Chars, -1, &status); - + regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); re = uregex_openUText(&patternText, 0, NULL, &status); @@ -4945,7 +4999,7 @@ void RegexTest::PreAllocatedUTextCAPI () { utext_setNativeIndex(resultText, 0); utext_setNativeIndex(&text1, 0); REGEX_ASSERT(testUTextEqual(resultText, &text1)); - + resultText = uregex_getUText(re, &bufferText, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(resultText == &bufferText); @@ -4961,7 +5015,7 @@ void RegexTest::PreAllocatedUTextCAPI () { utext_setNativeIndex(resultText, 0); utext_setNativeIndex(&text2, 0); REGEX_ASSERT(testUTextEqual(resultText, &text2)); - + uregex_close(re); utext_close(&text1); utext_close(&text2); @@ -4974,7 +5028,11 @@ void RegexTest::PreAllocatedUTextCAPI () { UChar text1[80]; UText *actual; UBool result; - u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); + int64_t length = 0; + + u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); + // 012345678901234567890123456789012345678901234567 + // 0 1 2 3 4 status = U_ZERO_ERROR; re = uregex_openC("abc(.*?)def", 0, NULL, &status); @@ -4984,30 +5042,33 @@ void RegexTest::PreAllocatedUTextCAPI () { result = uregex_find(re, 0, &status); REGEX_ASSERT(result==TRUE); - /* Capture Group 0, the full match. Should succeed. */ + /* Capture Group 0, the full match. Should succeed. "abc interior def" */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); + actual = uregex_groupUText(re, 0, &bufferText, &length, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(actual == &bufferText); - REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); + REGEX_ASSERT(utext_getNativeIndex(actual) == 6); + REGEX_ASSERT(length == 16); + REGEX_ASSERT(utext_nativeLength(actual) == 47); - /* Capture group #1. Should succeed. */ + /* Capture group #1. Should succeed, matching " interior ". */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); + actual = uregex_groupUText(re, 1, &bufferText, &length, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(actual == &bufferText); - REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); + REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior " + REGEX_ASSERT(length == 10); + REGEX_ASSERT(utext_nativeLength(actual) == 47); /* Capture group out of range. Error. */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); + actual = uregex_groupUText(re, 2, &bufferText, &length, &status); REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); REGEX_ASSERT(actual == &bufferText); - uregex_close(re); } - + /* * replaceFirst() */ @@ -5016,10 +5077,12 @@ void RegexTest::PreAllocatedUTextCAPI () { UChar text2[80]; UText replText = UTEXT_INITIALIZER; UText *result; - status = U_ZERO_ERROR; - u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); - u_uastrncpy(text2, "No match here.", sizeof(text2)/2); + utext_openUnicodeString(&bufferText, &buffer, &status); + + status = U_ZERO_ERROR; + u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); + u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2); regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); re = uregex_openC("x(.*?)x", 0, NULL, &status); @@ -5027,7 +5090,9 @@ void RegexTest::PreAllocatedUTextCAPI () { /* Normal case, with match */ uregex_setText(re, text1, -1, &status); + REGEX_CHECK_STATUS; utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); + REGEX_CHECK_STATUS; result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &bufferText); @@ -5040,10 +5105,10 @@ void RegexTest::PreAllocatedUTextCAPI () { REGEX_CHECK_STATUS; REGEX_ASSERT(result == &bufferText); REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); - + /* Unicode escapes */ uregex_setText(re, text1, -1, &status); - regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status); + regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status); utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); REGEX_CHECK_STATUS; @@ -5097,11 +5162,281 @@ void RegexTest::PreAllocatedUTextCAPI () { * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, * so we don't need to test it here. */ - + utext_close(&bufferText); utext_close(&patternText); } + +//-------------------------------------------------------------- +// +// NamedCapture Check basic named capture group functionality +// +//-------------------------------------------------------------- +void RegexTest::NamedCapture() { + UErrorCode status = U_ZERO_ERROR; + RegexPattern *pat = RegexPattern::compile(UnicodeString( + "abc()()(?xyz)(de)(?hmm)(?oh)f\\k"), 0, status); + REGEX_CHECK_STATUS; + int32_t group = pat->groupNumberFromName("five", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(5 == group); + group = pat->groupNumberFromName("three", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(3 == group); + + status = U_ZERO_ERROR; + group = pat->groupNumberFromName(UnicodeString("six"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(6 == group); + + status = U_ZERO_ERROR; + group = pat->groupNumberFromName(UnicodeString("nosuch"), status); + U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + + // After copying a pattern, named capture should still work in the copy. + RegexPattern *copiedPat = new RegexPattern(*pat); + REGEX_ASSERT(*copiedPat == *pat); + delete pat; pat = NULL; // Delete original, copy should have no references back to it. + + group = copiedPat->groupNumberFromName("five", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(5 == group); + group = copiedPat->groupNumberFromName("three", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(3 == group); + delete copiedPat; + + // ReplaceAll with named capture group. + status = U_ZERO_ERROR; + UnicodeString text("Substitution of <> for <>"); + RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?.+?)>>"), text, 0, status); + REGEX_CHECK_STATUS; + // m.pattern().dumpPattern(); + UnicodeString replacedText = m->replaceAll("'${mid}'", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText); + delete m; + + // ReplaceAll, allowed capture group numbers. + text = UnicodeString("abcmxyz"); + m = new RegexMatcher(UnicodeString("..(?m)(.)(.)"), text, 0, status); + REGEX_CHECK_STATUS; + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$3>"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$4>"), status); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0, + REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through. + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits + REGEX_CHECK_STATUS; // that push group num out of range. + REGEX_ASSERT(UnicodeString("az") == replacedText); // This is group 1. + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${one"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("$not a capture group"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + delete m; + + // Repeat the above replaceAll() tests using the plain C API, which + // has a separate implementation internally. + // TODO: factor out the test data. + + status = U_ZERO_ERROR; + URegularExpression *re = uregex_openC("..(?m)(.)(.)", 0, NULL, &status); + REGEX_CHECK_STATUS; + text = UnicodeString("abcmxyz"); + uregex_setText(re, text.getBuffer(), text.length(), &status); + REGEX_CHECK_STATUS; + + UChar resultBuf[100]; + int32_t resultLength; + UnicodeString repl; + + status = U_ZERO_ERROR; + repl = UnicodeString("<$0>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$1>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${one}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$2>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$3>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$4>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$04>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$000016>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$3$2$1${one}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("$3$2$1${one}"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${noSuchName}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${invalid-name}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${one"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + repl = UnicodeString("$not a capture group"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + uregex_close(re); +} + +//-------------------------------------------------------------- +// +// NamedCaptureLimits Patterns with huge numbers of named capture groups. +// The point is not so much what the exact limit is, +// but that a largish number doesn't hit bad non-linear performance, +// and that exceeding the limit fails cleanly. +// +//-------------------------------------------------------------- +void RegexTest::NamedCaptureLimits() { + if (quick) { + logln("Skipping test. Runs in exhuastive mode only."); + return; + } + const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully. + const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile. + char nnbuf[100]; + UnicodeString pattern; + int32_t nn; + + for (nn=1; nn)", nn); + pattern.append(UnicodeString(nnbuf, -1, US_INV)); + } + UErrorCode status = U_ZERO_ERROR; + RegexPattern *pat = RegexPattern::compile(pattern, 0, status); + REGEX_CHECK_STATUS; + for (nn=1; nngroupNumberFromName(nnbuf, -1, status); + REGEX_ASSERT(nn == groupNum); + if (nn != groupNum) { + break; + } + } + delete pat; + + pattern.remove(); + for (nn=1; nn)", nn); + pattern.append(UnicodeString(nnbuf, -1, US_INV)); + } + status = U_ZERO_ERROR; + pat = RegexPattern::compile(pattern, 0, status); + REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG); + delete pat; +} + + //-------------------------------------------------------------- // // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. @@ -5172,7 +5507,7 @@ void RegexTest::Bug8479() { delete pMatcher; } } - + // Bug 7029 void RegexTest::Bug7029() { @@ -5190,16 +5525,17 @@ void RegexTest::Bug7029() { // Bug 9283 // This test is checking for the existance of any supplemental characters that case-fold -// to a bmp character. +// to a bmp character. // -// At the time of this writing there are none. If any should appear in a subsequent release -// of Unicode, the code in regular expressions compilation that determines the longest -// posssible match for a literal string will need to be enhanced. +// At the time of this writing there are none. If any should appear in a subsequent release +// of Unicode, the code in regular expressions compilation that determines the longest +// posssible match for a literal string will need to be enhanced. // // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength() // for details on what to do in case of a failure of this test. // void RegexTest::Bug9283() { +#if !UCONFIG_NO_NORMALIZATION UErrorCode status = U_ZERO_ERROR; UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status); REGEX_CHECK_STATUS; @@ -5213,6 +5549,7 @@ void RegexTest::Bug9283() { UnicodeString cf = UnicodeString(c).foldCase(); REGEX_ASSERT(cf.length() >= 2); } +#endif /* #if !UCONFIG_NO_NORMALIZATION */ } @@ -5225,5 +5562,293 @@ void RegexTest::CheckInvBufSize() { } } -#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ +void RegexTest::Bug10459() { + UErrorCode status = U_ZERO_ERROR; + UnicodeString patternString("(txt)"); + UnicodeString txtString("txt"); + + UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status); + REGEX_CHECK_STATUS; + UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status); + REGEX_CHECK_STATUS; + + URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status); + REGEX_CHECK_STATUS; + + uregex_setUText(icu_re, utext_txt, &status); + REGEX_CHECK_STATUS; + + // The bug was that calling uregex_group() before doing a matching operation + // was causing a segfault. Only for Regular Expressions created from UText. + // It should set an U_REGEX_INVALID_STATE. + + UChar buf[100]; + int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_STATE); + REGEX_ASSERT(len == 0); + + uregex_close(icu_re); + utext_close(utext_pat); + utext_close(utext_txt); +} + +void RegexTest::TestCaseInsensitiveStarters() { + // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't + // become stale because of new Unicode characters. + // If it is stale, rerun the generation tool + // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing + // and replace the embedded data in i18n/regexcmp.cpp + + for (UChar32 cp=0; cp<=0x10ffff; cp++) { + if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) { + continue; + } + UnicodeSet s(cp, cp); + s.closeOver(USET_CASE_INSENSITIVE); + UnicodeSetIterator setIter(s); + while (setIter.next()) { + if (!setIter.isString()) { + continue; + } + const UnicodeString &str = setIter.getString(); + UChar32 firstChar = str.char32At(0); + UnicodeSet starters; + RegexCompile::findCaseInsensitiveStarters(firstChar, &starters); + if (!starters.contains(cp)) { + errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar); + return; + } + } + } +} + + +void RegexTest::TestBug11049() { + // Original bug report: pattern with match start consisting of one of several individual characters, + // and the text being matched ending with a supplementary character. find() would read past the + // end of the input text when searching for potential match starting points. + + // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will + // detect the bad read. + + TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__); + TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__); + + // Test again with a pattern starting with a single character, + // which takes a different code path than starting with an OR expression, + // but with similar logic. + TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__); + TestCase11049("C", "string matches at end C", TRUE, __LINE__); +} + +// Run a single test case from TestBug11049(). Internal function. +void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) { + UErrorCode status = U_ZERO_ERROR; + UnicodeString patternString = UnicodeString(pattern).unescape(); + LocalPointer compiledPat(RegexPattern::compile(patternString, 0, status)); + + UnicodeString dataString = UnicodeString(data).unescape(); + UChar *exactBuffer = new UChar[dataString.length()]; + dataString.extract(exactBuffer, dataString.length(), status); + UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status); + + LocalPointer matcher(compiledPat->matcher(status)); + REGEX_CHECK_STATUS; + matcher->reset(ut); + UBool result = matcher->find(); + if (result != expectMatch) { + errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"", + __FILE__, lineNumber, expectMatch, result, pattern, data); + } + + // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see + // off-by-one on find() with match at the last code point. + // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8 + // because string.unescape() will only shrink it. + char * utf8Buffer = new char[uprv_strlen(data)+1]; + u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status); + REGEX_CHECK_STATUS; + ut = utext_openUTF8(ut, utf8Buffer, -1, &status); + REGEX_CHECK_STATUS; + matcher->reset(ut); + result = matcher->find(); + if (result != expectMatch) { + errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"", + __FILE__, lineNumber, expectMatch, result, pattern, data); + } + delete [] utf8Buffer; + + utext_close(ut); + delete [] exactBuffer; +} + + +void RegexTest::TestBug11371() { + if (quick) { + logln("Skipping test. Runs in exhuastive mode only."); + return; + } + UErrorCode status = U_ZERO_ERROR; + UnicodeString patternString; + + for (int i=0; i<8000000; i++) { + patternString.append(UnicodeString("()")); + } + LocalPointer compiledPat(RegexPattern::compile(patternString, 0, status)); + if (status != U_REGEX_PATTERN_TOO_BIG) { + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", + __FILE__, __LINE__, u_errorName(status)); + } + + status = U_ZERO_ERROR; + patternString = "("; + for (int i=0; i<20000000; i++) { + patternString.append(UnicodeString("A++")); + } + patternString.append(UnicodeString("){0}B++")); + LocalPointer compiledPat2(RegexPattern::compile(patternString, 0, status)); + if (status != U_REGEX_PATTERN_TOO_BIG) { + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", + __FILE__, __LINE__, u_errorName(status)); + } + + // Pattern with too much string data, such that string indexes overflow operand data field size + // in compiled instruction. + status = U_ZERO_ERROR; + patternString = ""; + while (patternString.length() < 0x00ffffff) { + patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n")); + } + patternString.append(UnicodeString("X? trailing string")); + LocalPointer compiledPat3(RegexPattern::compile(patternString, 0, status)); + if (status != U_REGEX_PATTERN_TOO_BIG) { + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", + __FILE__, __LINE__, u_errorName(status)); + } +} + +void RegexTest::TestBug11480() { + // C API, get capture group of a group that does not participate in the match. + // (Returns a zero length string, with nul termination, + // indistinguishable from a group with a zero length match.) + + UErrorCode status = U_ZERO_ERROR; + URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status); + REGEX_CHECK_STATUS; + UnicodeString text = UNICODE_STRING_SIMPLE("A"); + uregex_setText(re, text.getBuffer(), text.length(), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(uregex_lookingAt(re, 0, &status)); + UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13}; + int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status); + REGEX_ASSERT(length == 0); + REGEX_ASSERT(buf[0] == 13); + REGEX_ASSERT(buf[1] == 0); + REGEX_ASSERT(buf[2] == 13); + uregex_close(re); + + // UText C++ API, length of match is 0 for non-participating matches. + UText ut = UTEXT_INITIALIZER; + utext_openUnicodeString(&ut, &text, &status); + RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status); + REGEX_CHECK_STATUS; + matcher.reset(&ut); + REGEX_ASSERT(matcher.lookingAt(0, status)); + + // UText C++ API, Capture group 1 matches "A", position 0, length 1. + int64_t groupLen = -666; + UText group = UTEXT_INITIALIZER; + matcher.group(1, &group, groupLen, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(groupLen == 1); + REGEX_ASSERT(utext_getNativeIndex(&group) == 0); + + // Capture group 2, the (B), does not participate in the match. + matcher.group(2, &group, groupLen, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(groupLen == 0); + REGEX_ASSERT(matcher.start(2, status) == -1); + REGEX_CHECK_STATUS; +} + +void RegexTest::TestBug12884() { + // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts} + UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}"); + UnicodeString text(u"hello"); + UErrorCode status = U_ZERO_ERROR; + RegexMatcher m(pattern, text, 0, status); + REGEX_CHECK_STATUS; + m.setTimeLimit(5, status); + m.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); + + // Non-greedy loops. They take a different code path during matching. + UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?"); + status = U_ZERO_ERROR; + RegexMatcher ngM(ngPattern, text, 0, status); + REGEX_CHECK_STATUS; + ngM.setTimeLimit(5, status); + ngM.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); + + // UText, wrapping non-UTF-16 text, also takes a different execution path. + const char *text8 = u8"¿Qué es Unicode? Unicode proporciona un número único para cada" + "carácter, sin importar la plataforma, sin importar el programa," + "sin importar el idioma."; + status = U_ZERO_ERROR; + LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status)); + REGEX_CHECK_STATUS; + m.reset(ut.getAlias()); + m.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); + + status = U_ZERO_ERROR; + ngM.reset(ut.getAlias()); + ngM.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); +} + +// Bug 13631. A find() of a pattern with a zero length look-behind assertions +// can cause a read past the end of the input text. +// The failure is seen when running this test with Clang's Addresss Sanitizer. + +void RegexTest::TestBug13631() { + const UChar *pats[] = { u"(?