X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/57a6839dcb3bba09e8228b822b290604668416fe..4f1e1a09ce4daed860e35d359ce2fceccb0764e8:/icuSources/test/intltest/regextst.cpp diff --git a/icuSources/test/intltest/regextst.cpp b/icuSources/test/intltest/regextst.cpp index 9c09d4f6..4b0a2f43 100644 --- a/icuSources/test/intltest/regextst.cpp +++ b/icuSources/test/intltest/regextst.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2014, International Business Machines Corporation and + * Copyright (c) 2002-2016, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -23,18 +25,26 @@ #include "intltest.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS +#include +#include +#include + +#include "unicode/localpointer.h" #include "unicode/regex.h" #include "unicode/uchar.h" #include "unicode/ucnv.h" #include "unicode/uniset.h" #include "unicode/uregex.h" +#include "unicode/usetiter.h" #include "unicode/ustring.h" +#include "unicode/utext.h" +#include "unicode/utf16.h" +#include "cstr.h" #include "regextst.h" +#include "regexcmp.h" #include "uvector.h" #include "util.h" -#include -#include -#include +#include "cmemory.h" #include "cstring.h" #include "uinvchar.h" @@ -59,90 +69,45 @@ RegexTest::~RegexTest() void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) { if (exec) logln("TestSuite RegexTest: "); - switch (index) { - - case 0: name = "Basic"; - if (exec) Basic(); - break; - case 1: name = "API_Match"; - if (exec) API_Match(); - break; - case 2: name = "API_Replace"; - if (exec) API_Replace(); - break; - case 3: name = "API_Pattern"; - if (exec) API_Pattern(); - break; - case 4: + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(Basic); + TESTCASE_AUTO(API_Match); + TESTCASE_AUTO(API_Replace); + TESTCASE_AUTO(API_Pattern); #if !UCONFIG_NO_FILE_IO - name = "Extended"; - if (exec) Extended(); -#else - name = "skip"; + TESTCASE_AUTO(Extended); #endif - break; - case 5: name = "Errors"; - if (exec) Errors(); - break; - case 6: name = "PerlTests"; - if (exec) PerlTests(); - break; - case 7: name = "Callbacks"; - if (exec) Callbacks(); - break; - case 8: name = "FindProgressCallbacks"; - if (exec) FindProgressCallbacks(); - break; - case 9: name = "Bug 6149"; - if (exec) Bug6149(); - break; - case 10: name = "UTextBasic"; - if (exec) UTextBasic(); - break; - case 11: name = "API_Match_UTF8"; - if (exec) API_Match_UTF8(); - break; - case 12: name = "API_Replace_UTF8"; - if (exec) API_Replace_UTF8(); - break; - case 13: name = "API_Pattern_UTF8"; - if (exec) API_Pattern_UTF8(); - break; - case 14: name = "PerlTestsUTF8"; - if (exec) PerlTestsUTF8(); - break; - case 15: name = "PreAllocatedUTextCAPI"; - if (exec) PreAllocatedUTextCAPI(); - break; - case 16: name = "Bug 7651"; - if (exec) Bug7651(); - break; - case 17: name = "Bug 7740"; - if (exec) Bug7740(); - break; - case 18: name = "Bug 8479"; - if (exec) Bug8479(); - break; - case 19: name = "Bug 7029"; - if (exec) Bug7029(); - break; - case 20: name = "CheckInvBufSize"; - if (exec) CheckInvBufSize(); - break; - case 21: name = "Bug 9283"; - if (exec) Bug9283(); - break; - case 22: name = "Bug10459"; - if (exec) Bug10459(); - break; - - default: name = ""; - break; //needed to end loop - } + TESTCASE_AUTO(Errors); + TESTCASE_AUTO(PerlTests); + TESTCASE_AUTO(Callbacks); + TESTCASE_AUTO(FindProgressCallbacks); + TESTCASE_AUTO(Bug6149); + TESTCASE_AUTO(UTextBasic); + TESTCASE_AUTO(API_Match_UTF8); + TESTCASE_AUTO(API_Replace_UTF8); + TESTCASE_AUTO(API_Pattern_UTF8); + TESTCASE_AUTO(PerlTestsUTF8); + TESTCASE_AUTO(PreAllocatedUTextCAPI); + TESTCASE_AUTO(Bug7651); + TESTCASE_AUTO(Bug7740); + TESTCASE_AUTO(Bug8479); + TESTCASE_AUTO(Bug7029); + TESTCASE_AUTO(CheckInvBufSize); + TESTCASE_AUTO(Bug9283); + TESTCASE_AUTO(Bug10459); + TESTCASE_AUTO(TestCaseInsensitiveStarters); + TESTCASE_AUTO(TestBug11049); + TESTCASE_AUTO(TestBug11371); + TESTCASE_AUTO(TestBug11480); + TESTCASE_AUTO(NamedCapture); + TESTCASE_AUTO(NamedCaptureLimits); + TESTCASE_AUTO(TestBug12884); + TESTCASE_AUTO(TestBug13631); + TESTCASE_AUTO(TestBug13632); + TESTCASE_AUTO_END; } - /** * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage * into ASCII. @@ -211,9 +176,7 @@ const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { return ASSERT_BUF; } -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - -#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} +#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \ __FILE__, __LINE__, u_errorName(status)); return;}} @@ -230,7 +193,12 @@ if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status= #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} -#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};} +// expected: const char * , restricted to invariant characters. +// actual: const UnicodeString & +#define REGEX_ASSERT_UNISTR(expected, actual) { \ + if (UnicodeString(expected, -1, US_INV) != (actual)) { \ + errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \ + __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} static UBool testUTextEqual(UText *uta, UText *utb) { @@ -268,8 +236,8 @@ void RegexTest::assertUText(const char *expected, UText *actual, const char *fil if (!testUTextEqual(&expectedText, actual)) { char buf[201 /*21*/]; char expectedBuf[201]; - utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); - utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); + utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); + utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); } utext_close(&expectedText); @@ -290,8 +258,8 @@ void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const if (!testUTextEqual(&expectedText, actual)) { char buf[201 /*21*/]; char expectedBuf[201]; - utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); - utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); + utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); + utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); } utext_close(&expectedText); @@ -1414,8 +1382,8 @@ void RegexTest::API_Replace() { REGEX_ASSERT(dest == "The value of $1 is bc.defg"); dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); - REGEX_CHECK_STATUS; - REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); + REGEX_ASSERT(U_FAILURE(status)); + status = U_ZERO_ERROR; UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); replacement = replacement.unescape(); @@ -2027,7 +1995,7 @@ void RegexTest::API_Match_UTF8() { utext_openUnicodeString(&destText, &dest, &status); UText *result; //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ - // Test shallow-clone API + // Test shallow-clone API int64_t group_len; result = matcher->group((UText *)NULL, group_len, status); REGEX_CHECK_STATUS; @@ -2041,47 +2009,72 @@ void RegexTest::API_Match_UTF8() { utext_close(&destText); utext_openUnicodeString(&destText, &dest, &status); - result = matcher->group(0, NULL, status); + int64_t length; + result = matcher->group(0, NULL, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); utext_close(result); - result = matcher->group(0, &destText, status); + result = matcher->group(0, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 0); + REGEX_ASSERT(length == 10); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); - result = matcher->group(1, NULL, status); + // Capture Group 1 == "234567" + result = matcher->group(1, NULL, length, status); REGEX_CHECK_STATUS; - const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */ - REGEX_ASSERT_UTEXT_UTF8(str_234567, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 2); + REGEX_ASSERT(length == 6); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(1, &destText, status); + + result = matcher->group(1, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_234567, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 2); + REGEX_ASSERT(length == 6); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); - result = matcher->group(2, NULL, status); + // Capture Group 2 == "45" + result = matcher->group(2, NULL, length, status); REGEX_CHECK_STATUS; - const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ - REGEX_ASSERT_UTEXT_UTF8(str_45, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 4); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(2, &destText, status); + + result = matcher->group(2, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_45, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 4); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); - result = matcher->group(3, NULL, status); + // Capture Group 3 == "89" + result = matcher->group(3, NULL, length, status); REGEX_CHECK_STATUS; - const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ - REGEX_ASSERT_UTEXT_UTF8(str_89, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 8); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(3, &destText, status); + + result = matcher->group(3, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_89, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 8); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); + // Capture Group number out of range. + status = U_ZERO_ERROR; REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; matcher->reset(); REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); @@ -2593,7 +2586,9 @@ void RegexTest::API_Replace_UTF8() { REGEX_ASSERT(result == &destText); REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); - const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */ + const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, + 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */ utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); result = matcher2->replaceFirst(&replText, NULL, status); REGEX_CHECK_STATUS; @@ -3059,6 +3054,37 @@ void RegexTest::API_Pattern_UTF8() { delete pat1; + // + // split of a UText based string, with library allocating output UTexts. + // + { + status = U_ZERO_ERROR; + RegexMatcher matcher(UnicodeString("(:)"), 0, status); + UnicodeString stringToSplit("first:second:third"); + UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status); + REGEX_CHECK_STATUS; + + UText *splits[10] = {NULL}; + int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(numFields == 5); + REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); + REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); + REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); + REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); + REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); + REGEX_ASSERT(splits[5] == NULL); + + for (int i=0; ipattern(),"(Hello, world)*"); + REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); delete pat1; @@ -3498,7 +3524,7 @@ void RegexTest::regex_find(const UnicodeString &pattern, if (UTF8Matcher == NULL) { // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine - logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); + logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); status = U_ZERO_ERROR; } } @@ -3507,6 +3533,9 @@ void RegexTest::regex_find(const UnicodeString &pattern, // Generate native indices for UTF8 versions of region and capture group info // if (UTF8Matcher != NULL) { + if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag + UTF8Matcher->setTrace(TRUE); + } if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); @@ -3586,6 +3615,9 @@ void RegexTest::regex_find(const UnicodeString &pattern, } } matcher->setTrace(FALSE); + if (UTF8Matcher) { + UTF8Matcher->setTrace(FALSE); + } if (U_FAILURE(status)) { errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); } @@ -3607,16 +3639,17 @@ void RegexTest::regex_find(const UnicodeString &pattern, failed = TRUE; goto cleanupAndReturn; } + if (isMatch && groupStarts.size() == 0) { + errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status)); + failed = TRUE; + } + if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) { + errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status)); + failed = TRUE; + } if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { // Only check for match / no match. Don't check capture groups. - if (isMatch && groupStarts.size() == 0) { - errln("Error at line %d: No match expected, but one found.", line); - failed = TRUE; - } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { - errln("Error at line %d: No match expected, but one found. (UTF8)", line); - failed = TRUE; - } goto cleanupAndReturn; } @@ -3775,7 +3808,7 @@ void RegexTest::Errors() { REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); - REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan + REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); @@ -4797,6 +4830,15 @@ void RegexTest::Callbacks() { REGEX_ASSERT(matcher.matches(status)==FALSE); REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); REGEX_ASSERT(cbInfo.numCalls == 4); + + // A longer running find that the callback function will abort. + status = U_ZERO_ERROR; + cbInfo.reset(4); + s = "aaaaaaaaaaaaaaaaaaaaaaab"; + matcher.reset(s); + REGEX_ASSERT(matcher.find(status)==FALSE); + REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); + REGEX_ASSERT(cbInfo.numCalls == 4); } @@ -4818,6 +4860,9 @@ struct progressCallBackContext { void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; }; +// call-back function for find(). +// Return TRUE to continue the find(). +// Return FALSE to stop the find(). U_CDECL_BEGIN static UBool U_CALLCONV testProgressCallBackFn(const void *context, int64_t matchIndex) { @@ -4853,7 +4898,7 @@ void RegexTest::FindProgressCallbacks() { const void *returnedContext; URegexFindProgressCallback *returnedFn; UErrorCode status = U_ZERO_ERROR; - RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. + RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status); REGEX_CHECK_STATUS; matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status); REGEX_CHECK_STATUS; @@ -4862,10 +4907,10 @@ void RegexTest::FindProgressCallbacks() { REGEX_ASSERT(returnedFn == testProgressCallBackFn); REGEX_ASSERT(returnedContext == &cbInfo); - // A short-running match should NOT invoke the callback. + // A find that matches on the initial position does NOT invoke the callback. status = U_ZERO_ERROR; cbInfo.reset(100); - UnicodeString s = "abxxx"; + UnicodeString s = "aaxxx"; matcher.reset(s); #if 0 matcher.setTrace(TRUE); @@ -4874,7 +4919,8 @@ void RegexTest::FindProgressCallbacks() { REGEX_CHECK_STATUS; REGEX_ASSERT(cbInfo.numCalls == 0); - // A medium running match that causes matcher.find() to invoke our callback for each index. + // A medium running find() that causes matcher.find() to invoke our callback for each index, + // but not so many times that we interrupt the operation. status = U_ZERO_ERROR; s = "aaaaaaaaaaaaaaaaaaab"; cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string @@ -4889,22 +4935,21 @@ void RegexTest::FindProgressCallbacks() { cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string matcher.reset(s1); REGEX_ASSERT(matcher.find(0, status)==FALSE); - REGEX_CHECK_STATUS; + REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5); -#if 0 // Now a match that will succeed, but after an interruption status = U_ZERO_ERROR; UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx"; cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string matcher.reset(s2); REGEX_ASSERT(matcher.find(0, status)==FALSE); - REGEX_CHECK_STATUS; + REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); // Now retry the match from where left off cbInfo.maxCalls = 100; // No callback limit + status = U_ZERO_ERROR; REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status)); REGEX_CHECK_STATUS; -#endif } @@ -4983,7 +5028,11 @@ void RegexTest::PreAllocatedUTextCAPI () { UChar text1[80]; UText *actual; UBool result; - u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); + int64_t length = 0; + + u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); + // 012345678901234567890123456789012345678901234567 + // 0 1 2 3 4 status = U_ZERO_ERROR; re = uregex_openC("abc(.*?)def", 0, NULL, &status); @@ -4993,26 +5042,29 @@ void RegexTest::PreAllocatedUTextCAPI () { result = uregex_find(re, 0, &status); REGEX_ASSERT(result==TRUE); - /* Capture Group 0, the full match. Should succeed. */ + /* Capture Group 0, the full match. Should succeed. "abc interior def" */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); + actual = uregex_groupUText(re, 0, &bufferText, &length, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(actual == &bufferText); - REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); + REGEX_ASSERT(utext_getNativeIndex(actual) == 6); + REGEX_ASSERT(length == 16); + REGEX_ASSERT(utext_nativeLength(actual) == 47); - /* Capture group #1. Should succeed. */ + /* Capture group #1. Should succeed, matching " interior ". */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); + actual = uregex_groupUText(re, 1, &bufferText, &length, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(actual == &bufferText); - REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); + REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior " + REGEX_ASSERT(length == 10); + REGEX_ASSERT(utext_nativeLength(actual) == 47); /* Capture group out of range. Error. */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); + actual = uregex_groupUText(re, 2, &bufferText, &length, &status); REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); REGEX_ASSERT(actual == &bufferText); - uregex_close(re); } @@ -5025,10 +5077,12 @@ void RegexTest::PreAllocatedUTextCAPI () { UChar text2[80]; UText replText = UTEXT_INITIALIZER; UText *result; + status = U_ZERO_ERROR; + utext_openUnicodeString(&bufferText, &buffer, &status); status = U_ZERO_ERROR; - u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); - u_uastrncpy(text2, "No match here.", sizeof(text2)/2); + u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); + u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2); regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); re = uregex_openC("x(.*?)x", 0, NULL, &status); @@ -5036,7 +5090,9 @@ void RegexTest::PreAllocatedUTextCAPI () { /* Normal case, with match */ uregex_setText(re, text1, -1, &status); + REGEX_CHECK_STATUS; utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); + REGEX_CHECK_STATUS; result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &bufferText); @@ -5052,7 +5108,7 @@ void RegexTest::PreAllocatedUTextCAPI () { /* Unicode escapes */ uregex_setText(re, text1, -1, &status); - regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status); + regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status); utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); REGEX_CHECK_STATUS; @@ -5111,6 +5167,276 @@ void RegexTest::PreAllocatedUTextCAPI () { utext_close(&patternText); } + +//-------------------------------------------------------------- +// +// NamedCapture Check basic named capture group functionality +// +//-------------------------------------------------------------- +void RegexTest::NamedCapture() { + UErrorCode status = U_ZERO_ERROR; + RegexPattern *pat = RegexPattern::compile(UnicodeString( + "abc()()(?xyz)(de)(?hmm)(?oh)f\\k"), 0, status); + REGEX_CHECK_STATUS; + int32_t group = pat->groupNumberFromName("five", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(5 == group); + group = pat->groupNumberFromName("three", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(3 == group); + + status = U_ZERO_ERROR; + group = pat->groupNumberFromName(UnicodeString("six"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(6 == group); + + status = U_ZERO_ERROR; + group = pat->groupNumberFromName(UnicodeString("nosuch"), status); + U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + + // After copying a pattern, named capture should still work in the copy. + RegexPattern *copiedPat = new RegexPattern(*pat); + REGEX_ASSERT(*copiedPat == *pat); + delete pat; pat = NULL; // Delete original, copy should have no references back to it. + + group = copiedPat->groupNumberFromName("five", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(5 == group); + group = copiedPat->groupNumberFromName("three", -1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(3 == group); + delete copiedPat; + + // ReplaceAll with named capture group. + status = U_ZERO_ERROR; + UnicodeString text("Substitution of <> for <>"); + RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?.+?)>>"), text, 0, status); + REGEX_CHECK_STATUS; + // m.pattern().dumpPattern(); + UnicodeString replacedText = m->replaceAll("'${mid}'", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText); + delete m; + + // ReplaceAll, allowed capture group numbers. + text = UnicodeString("abcmxyz"); + m = new RegexMatcher(UnicodeString("..(?m)(.)(.)"), text, 0, status); + REGEX_CHECK_STATUS; + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2. + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$3>"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$4>"), status); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0, + REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through. + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits + REGEX_CHECK_STATUS; // that push group num out of range. + REGEX_ASSERT(UnicodeString("az") == replacedText); // This is group 1. + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("<${one"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + replacedText = m->replaceAll(UnicodeString("$not a capture group"), status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + delete m; + + // Repeat the above replaceAll() tests using the plain C API, which + // has a separate implementation internally. + // TODO: factor out the test data. + + status = U_ZERO_ERROR; + URegularExpression *re = uregex_openC("..(?m)(.)(.)", 0, NULL, &status); + REGEX_CHECK_STATUS; + text = UnicodeString("abcmxyz"); + uregex_setText(re, text.getBuffer(), text.length(), &status); + REGEX_CHECK_STATUS; + + UChar resultBuf[100]; + int32_t resultLength; + UnicodeString repl; + + status = U_ZERO_ERROR; + repl = UnicodeString("<$0>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$1>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${one}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$2>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$3>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$4>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$04>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$000016>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<$3$2$1${one}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("az") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("$3$2$1${one}"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength)); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${noSuchName}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${invalid-name}>"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + repl = UnicodeString("<${one"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + status = U_ZERO_ERROR; + repl = UnicodeString("$not a capture group"); + resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); + REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); + + uregex_close(re); +} + +//-------------------------------------------------------------- +// +// NamedCaptureLimits Patterns with huge numbers of named capture groups. +// The point is not so much what the exact limit is, +// but that a largish number doesn't hit bad non-linear performance, +// and that exceeding the limit fails cleanly. +// +//-------------------------------------------------------------- +void RegexTest::NamedCaptureLimits() { + if (quick) { + logln("Skipping test. Runs in exhuastive mode only."); + return; + } + const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully. + const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile. + char nnbuf[100]; + UnicodeString pattern; + int32_t nn; + + for (nn=1; nn)", nn); + pattern.append(UnicodeString(nnbuf, -1, US_INV)); + } + UErrorCode status = U_ZERO_ERROR; + RegexPattern *pat = RegexPattern::compile(pattern, 0, status); + REGEX_CHECK_STATUS; + for (nn=1; nngroupNumberFromName(nnbuf, -1, status); + REGEX_ASSERT(nn == groupNum); + if (nn != groupNum) { + break; + } + } + delete pat; + + pattern.remove(); + for (nn=1; nn)", nn); + pattern.append(UnicodeString(nnbuf, -1, US_INV)); + } + status = U_ZERO_ERROR; + pat = RegexPattern::compile(pattern, 0, status); + REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG); + delete pat; +} + + //-------------------------------------------------------------- // // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. @@ -5258,7 +5584,7 @@ void RegexTest::Bug10459() { // It should set an U_REGEX_INVALID_STATE. UChar buf[100]; - int32_t len = uregex_group(icu_re, 0, buf, LENGTHOF(buf), &status); + int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status); REGEX_ASSERT(status == U_REGEX_INVALID_STATE); REGEX_ASSERT(len == 0); @@ -5267,5 +5593,262 @@ void RegexTest::Bug10459() { utext_close(utext_txt); } -#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ +void RegexTest::TestCaseInsensitiveStarters() { + // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't + // become stale because of new Unicode characters. + // If it is stale, rerun the generation tool + // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing + // and replace the embedded data in i18n/regexcmp.cpp + for (UChar32 cp=0; cp<=0x10ffff; cp++) { + if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) { + continue; + } + UnicodeSet s(cp, cp); + s.closeOver(USET_CASE_INSENSITIVE); + UnicodeSetIterator setIter(s); + while (setIter.next()) { + if (!setIter.isString()) { + continue; + } + const UnicodeString &str = setIter.getString(); + UChar32 firstChar = str.char32At(0); + UnicodeSet starters; + RegexCompile::findCaseInsensitiveStarters(firstChar, &starters); + if (!starters.contains(cp)) { + errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar); + return; + } + } + } +} + + +void RegexTest::TestBug11049() { + // Original bug report: pattern with match start consisting of one of several individual characters, + // and the text being matched ending with a supplementary character. find() would read past the + // end of the input text when searching for potential match starting points. + + // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will + // detect the bad read. + + TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__); + TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__); + + // Test again with a pattern starting with a single character, + // which takes a different code path than starting with an OR expression, + // but with similar logic. + TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__); + TestCase11049("C", "string matches at end C", TRUE, __LINE__); +} + +// Run a single test case from TestBug11049(). Internal function. +void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) { + UErrorCode status = U_ZERO_ERROR; + UnicodeString patternString = UnicodeString(pattern).unescape(); + LocalPointer compiledPat(RegexPattern::compile(patternString, 0, status)); + + UnicodeString dataString = UnicodeString(data).unescape(); + UChar *exactBuffer = new UChar[dataString.length()]; + dataString.extract(exactBuffer, dataString.length(), status); + UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status); + + LocalPointer matcher(compiledPat->matcher(status)); + REGEX_CHECK_STATUS; + matcher->reset(ut); + UBool result = matcher->find(); + if (result != expectMatch) { + errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"", + __FILE__, lineNumber, expectMatch, result, pattern, data); + } + + // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see + // off-by-one on find() with match at the last code point. + // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8 + // because string.unescape() will only shrink it. + char * utf8Buffer = new char[uprv_strlen(data)+1]; + u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status); + REGEX_CHECK_STATUS; + ut = utext_openUTF8(ut, utf8Buffer, -1, &status); + REGEX_CHECK_STATUS; + matcher->reset(ut); + result = matcher->find(); + if (result != expectMatch) { + errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"", + __FILE__, lineNumber, expectMatch, result, pattern, data); + } + delete [] utf8Buffer; + + utext_close(ut); + delete [] exactBuffer; +} + + +void RegexTest::TestBug11371() { + if (quick) { + logln("Skipping test. Runs in exhuastive mode only."); + return; + } + UErrorCode status = U_ZERO_ERROR; + UnicodeString patternString; + + for (int i=0; i<8000000; i++) { + patternString.append(UnicodeString("()")); + } + LocalPointer compiledPat(RegexPattern::compile(patternString, 0, status)); + if (status != U_REGEX_PATTERN_TOO_BIG) { + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", + __FILE__, __LINE__, u_errorName(status)); + } + + status = U_ZERO_ERROR; + patternString = "("; + for (int i=0; i<20000000; i++) { + patternString.append(UnicodeString("A++")); + } + patternString.append(UnicodeString("){0}B++")); + LocalPointer compiledPat2(RegexPattern::compile(patternString, 0, status)); + if (status != U_REGEX_PATTERN_TOO_BIG) { + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", + __FILE__, __LINE__, u_errorName(status)); + } + + // Pattern with too much string data, such that string indexes overflow operand data field size + // in compiled instruction. + status = U_ZERO_ERROR; + patternString = ""; + while (patternString.length() < 0x00ffffff) { + patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n")); + } + patternString.append(UnicodeString("X? trailing string")); + LocalPointer compiledPat3(RegexPattern::compile(patternString, 0, status)); + if (status != U_REGEX_PATTERN_TOO_BIG) { + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", + __FILE__, __LINE__, u_errorName(status)); + } +} + +void RegexTest::TestBug11480() { + // C API, get capture group of a group that does not participate in the match. + // (Returns a zero length string, with nul termination, + // indistinguishable from a group with a zero length match.) + + UErrorCode status = U_ZERO_ERROR; + URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status); + REGEX_CHECK_STATUS; + UnicodeString text = UNICODE_STRING_SIMPLE("A"); + uregex_setText(re, text.getBuffer(), text.length(), &status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(uregex_lookingAt(re, 0, &status)); + UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13}; + int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status); + REGEX_ASSERT(length == 0); + REGEX_ASSERT(buf[0] == 13); + REGEX_ASSERT(buf[1] == 0); + REGEX_ASSERT(buf[2] == 13); + uregex_close(re); + + // UText C++ API, length of match is 0 for non-participating matches. + UText ut = UTEXT_INITIALIZER; + utext_openUnicodeString(&ut, &text, &status); + RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status); + REGEX_CHECK_STATUS; + matcher.reset(&ut); + REGEX_ASSERT(matcher.lookingAt(0, status)); + + // UText C++ API, Capture group 1 matches "A", position 0, length 1. + int64_t groupLen = -666; + UText group = UTEXT_INITIALIZER; + matcher.group(1, &group, groupLen, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(groupLen == 1); + REGEX_ASSERT(utext_getNativeIndex(&group) == 0); + + // Capture group 2, the (B), does not participate in the match. + matcher.group(2, &group, groupLen, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(groupLen == 0); + REGEX_ASSERT(matcher.start(2, status) == -1); + REGEX_CHECK_STATUS; +} + +void RegexTest::TestBug12884() { + // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts} + UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}"); + UnicodeString text(u"hello"); + UErrorCode status = U_ZERO_ERROR; + RegexMatcher m(pattern, text, 0, status); + REGEX_CHECK_STATUS; + m.setTimeLimit(5, status); + m.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); + + // Non-greedy loops. They take a different code path during matching. + UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?"); + status = U_ZERO_ERROR; + RegexMatcher ngM(ngPattern, text, 0, status); + REGEX_CHECK_STATUS; + ngM.setTimeLimit(5, status); + ngM.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); + + // UText, wrapping non-UTF-16 text, also takes a different execution path. + const char *text8 = u8"¿Qué es Unicode? Unicode proporciona un número único para cada" + "carácter, sin importar la plataforma, sin importar el programa," + "sin importar el idioma."; + status = U_ZERO_ERROR; + LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status)); + REGEX_CHECK_STATUS; + m.reset(ut.getAlias()); + m.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); + + status = U_ZERO_ERROR; + ngM.reset(ut.getAlias()); + ngM.find(status); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); +} + +// Bug 13631. A find() of a pattern with a zero length look-behind assertions +// can cause a read past the end of the input text. +// The failure is seen when running this test with Clang's Addresss Sanitizer. + +void RegexTest::TestBug13631() { + const UChar *pats[] = { u"(?