git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/regextst.cpp

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/********************************************************************
	4	* COPYRIGHT:
	5	* Copyright (c) 2002-2016, International Business Machines Corporation and
	6	* others. All Rights Reserved.
	7	********************************************************************/
	8
	9	//
	10	// regextst.cpp
	11	//
	12	// ICU Regular Expressions test, part of intltest.
	13	//
	14
	15	/*
	16	NOTE!!
	17
	18	PLEASE be careful about ASCII assumptions in this test.
	19	This test is one of the worst repeat offenders.
	20	If you have questions, contact someone on the ICU PMC
	21	who has access to an EBCDIC system.
	22
	23	*/
	24
	25	#include "intltest.h"
	26	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
	27
	28	#include <stdlib.h>
	29	#include <stdio.h>
	30	#include <string.h>
	31
	32	#include "unicode/localpointer.h"
	33	#include "unicode/regex.h"
	34	#include "unicode/uchar.h"
	35	#include "unicode/ucnv.h"
	36	#include "unicode/uniset.h"
	37	#include "unicode/uregex.h"
	38	#include "unicode/usetiter.h"
	39	#include "unicode/ustring.h"
	40	#include "unicode/utext.h"
	41	#include "unicode/utf16.h"
	42	#include "cstr.h"
	43	#include "regextst.h"
	44	#include "regexcmp.h"
	45	#include "uvector.h"
	46	#include "util.h"
	47	#include "cmemory.h"
	48	#include "cstring.h"
	49	#include "uinvchar.h"
	50
	51	#define SUPPORT_MUTATING_INPUT_STRING 0
	52
	53	//---------------------------------------------------------------------------
	54	//
	55	// Test class boilerplate
	56	//
	57	//---------------------------------------------------------------------------
	58	RegexTest::RegexTest()
	59	{
	60	}
	61
	62
	63	RegexTest::~RegexTest()
	64	{
	65	}
	66
	67
	68
	69	void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /par/ )
	70	{
	71	if (exec) logln("TestSuite RegexTest: ");
	72	TESTCASE_AUTO_BEGIN;
	73	TESTCASE_AUTO(Basic);
	74	TESTCASE_AUTO(API_Match);
	75	TESTCASE_AUTO(API_Replace);
	76	TESTCASE_AUTO(API_Pattern);
	77	#if !UCONFIG_NO_FILE_IO
	78	TESTCASE_AUTO(Extended);
	79	#endif
	80	TESTCASE_AUTO(Errors);
	81	TESTCASE_AUTO(PerlTests);
	82	TESTCASE_AUTO(Callbacks);
	83	TESTCASE_AUTO(FindProgressCallbacks);
	84	TESTCASE_AUTO(Bug6149);
	85	TESTCASE_AUTO(UTextBasic);
	86	TESTCASE_AUTO(API_Match_UTF8);
	87	TESTCASE_AUTO(API_Replace_UTF8);
	88	TESTCASE_AUTO(API_Pattern_UTF8);
	89	TESTCASE_AUTO(PerlTestsUTF8);
	90	TESTCASE_AUTO(PreAllocatedUTextCAPI);
	91	TESTCASE_AUTO(Bug7651);
	92	TESTCASE_AUTO(Bug7740);
	93	TESTCASE_AUTO(Bug8479);
	94	TESTCASE_AUTO(Bug7029);
	95	TESTCASE_AUTO(CheckInvBufSize);
	96	TESTCASE_AUTO(Bug9283);
	97	TESTCASE_AUTO(Bug10459);
	98	TESTCASE_AUTO(TestCaseInsensitiveStarters);
	99	TESTCASE_AUTO(TestBug11049);
	100	TESTCASE_AUTO(TestBug11371);
	101	TESTCASE_AUTO(TestBug11480);
	102	TESTCASE_AUTO(NamedCapture);
	103	TESTCASE_AUTO(NamedCaptureLimits);
	104	TESTCASE_AUTO(TestBug12884);
	105	TESTCASE_AUTO(TestBug13631);
	106	TESTCASE_AUTO(TestBug13632);
	107	TESTCASE_AUTO(TestBug20359);
	108	TESTCASE_AUTO(TestBug20863);
	109	TESTCASE_AUTO_END;
	110	}
	111
	112
	113	/**
	114	* Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
	115	* into ASCII.
	116	* @see utext_openUTF8
	117	*/
	118	static UText* regextst_openUTF8FromInvariant(UText* ut, const char inv, int64_t length, UErrorCode status);
	119
	120	//---------------------------------------------------------------------------
	121	//
	122	// Error Checking / Reporting macros used in all of the tests.
	123	//
	124	//---------------------------------------------------------------------------
	125
	126	static void utextToPrintable(char buf, int32_t bufLen, UText text) {
	127	int64_t oldIndex = utext_getNativeIndex(text);
	128	utext_setNativeIndex(text, 0);
	129	char *bufPtr = buf;
	130	UChar32 c = utext_next32From(text, 0);
	131	while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
	132	if (0x000020<=c && c<0x00007e) {
	133	*bufPtr = c;
	134	} else {
	135	#if 0
	136	sprintf(bufPtr,"U+%04X", c);
	137	bufPtr+= strlen(bufPtr)-1;
	138	#else
	139	*bufPtr = '%';
	140	#endif
	141	}
	142	bufPtr++;
	143	c = UTEXT_NEXT32(text);
	144	}
	145	*bufPtr = 0;
	146	#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
	147	char ebuf = (char)malloc(bufLen);
	148	uprv_eastrncpy((unsigned char)ebuf, (const unsigned char)buf, bufLen);
	149	uprv_strncpy(buf, ebuf, bufLen);
	150	free((void*)ebuf);
	151	#endif
	152	utext_setNativeIndex(text, oldIndex);
	153	}
	154
	155
	156	static char ASSERT_BUF[1024];
	157
	158	const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
	159	if(message.length()==0) {
	160	strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
	161	} else {
	162	UnicodeString buf;
	163	IntlTest::prettify(message,buf);
	164	if(buf.length()==0) {
	165	strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
	166	} else {
	167	buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
	168	if(ASSERT_BUF[0]==0) {
	169	ASSERT_BUF[0]=0;
	170	for(int32_t i=0;i<buf.length();i++) {
	171	UChar ch = buf[i];
	172	sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
	173	}
	174	}
	175	}
	176	}
	177	ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
	178	return ASSERT_BUF;
	179	}
	180
	181	#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
	182	char buf[200]; \
	183	utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
	184	logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
	185	} UPRV_BLOCK_MACRO_END
	186
	187	#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
	188	if (U_FAILURE(status)) { \
	189	dataerrln("%s:%d: RegexTest failure. status=%s", \
	190	__FILE__, __LINE__, u_errorName(status)); \
	191	return; \
	192	} \
	193	} UPRV_BLOCK_MACRO_END
	194
	195	#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
	196	if ((expr)==FALSE) { \
	197	errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
	198	} \
	199	} UPRV_BLOCK_MACRO_END
	200
	201	#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
	202	UErrorCode status=U_ZERO_ERROR; \
	203	(expr); \
	204	if (status!=errcode) { \
	205	dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
	206	__LINE__, u_errorName(errcode), u_errorName(status)); \
	207	} \
	208	} UPRV_BLOCK_MACRO_END
	209
	210	#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
	211	if (U_FAILURE(status)) { \
	212	errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
	213	} \
	214	} UPRV_BLOCK_MACRO_END
	215
	216	#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
	217	if ((expr)==FALSE) { \
	218	errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
	219	return; \
	220	} \
	221	} UPRV_BLOCK_MACRO_END
	222
	223	// expected: const char * , restricted to invariant characters.
	224	// actual: const UnicodeString &
	225	#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
	226	if (UnicodeString(expected, -1, US_INV) != (actual)) { \
	227	errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
	228	__FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
	229	} \
	230	} UPRV_BLOCK_MACRO_END
	231
	232
	233	static UBool testUTextEqual(UText uta, UText utb) {
	234	UChar32 ca = 0;
	235	UChar32 cb = 0;
	236	utext_setNativeIndex(uta, 0);
	237	utext_setNativeIndex(utb, 0);
	238	do {
	239	ca = utext_next32(uta);
	240	cb = utext_next32(utb);
	241	if (ca != cb) {
	242	break;
	243	}
	244	} while (ca != U_SENTINEL);
	245	return ca == cb;
	246	}
	247
	248
	249	/**
	250	* @param expected expected text in UTF-8 (not platform) codepage
	251	*/
	252	void RegexTest::assertUText(const char expected, UText actual, const char *file, int line) {
	253	UErrorCode status = U_ZERO_ERROR;
	254	UText expectedText = UTEXT_INITIALIZER;
	255	utext_openUTF8(&expectedText, expected, -1, &status);
	256	if(U_FAILURE(status)) {
	257	errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
	258	return;
	259	}
	260	if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
	261	errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
	262	return;
	263	}
	264	utext_setNativeIndex(actual, 0);
	265	if (!testUTextEqual(&expectedText, actual)) {
	266	char buf[201 /21/];
	267	char expectedBuf[201];
	268	utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
	269	utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
	270	errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
	271	}
	272	utext_close(&expectedText);
	273	}
	274	/**
	275	* @param expected invariant (platform local text) input
	276	*/
	277
	278	void RegexTest::assertUTextInvariant(const char expected, UText actual, const char *file, int line) {
	279	UErrorCode status = U_ZERO_ERROR;
	280	UText expectedText = UTEXT_INITIALIZER;
	281	regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
	282	if(U_FAILURE(status)) {
	283	errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
	284	return;
	285	}
	286	utext_setNativeIndex(actual, 0);
	287	if (!testUTextEqual(&expectedText, actual)) {
	288	char buf[201 /21/];
	289	char expectedBuf[201];
	290	utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
	291	utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
	292	errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
	293	}
	294	utext_close(&expectedText);
	295	}
	296
	297	/**
	298	* Assumes utf-8 input
	299	*/
	300	#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
	301	/**
	302	* Assumes Invariant input
	303	*/
	304	#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
	305
	306	/**
	307	* This buffer ( inv_buf ) is used to hold the UTF-8 strings
	308	* passed into utext_openUTF8. An error will be given if
	309	* INV_BUFSIZ is too small. It's only used on EBCDIC systems.
	310	*/
	311
	312	#define INV_BUFSIZ 2048 /* increase this if too small */
	313
	314	static int64_t inv_next=0;
	315
	316	#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
	317	static char inv_buf[INV_BUFSIZ];
	318	#endif
	319
	320	static UText* regextst_openUTF8FromInvariant(UText ut, const char inv, int64_t length, UErrorCode *status) {
	321	if(length==-1) length=strlen(inv);
	322	#if U_CHARSET_FAMILY==U_ASCII_FAMILY
	323	inv_next+=length;
	324	return utext_openUTF8(ut, inv, length, status);
	325	#else
	326	if(inv_next+length+1>INV_BUFSIZ) {
	327	fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
	328	__FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
	329	*status = U_MEMORY_ALLOCATION_ERROR;
	330	return NULL;
	331	}
	332
	333	unsigned char buf = (unsigned char)inv_buf+inv_next;
	334	uprv_aestrncpy(buf, (const uint8_t*)inv, length);
	335	inv_next+=length;
	336
	337	#if 0
	338	fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
	339	#endif
	340
	341	return utext_openUTF8(ut, (const char*)buf, length, status);
	342	#endif
	343	}
	344
	345
	346	//---------------------------------------------------------------------------
	347	//
	348	// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
	349	// for the LookingAt() and Match() functions.
	350	//
	351	// usage:
	352	// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
	353	//
	354	// The expected results are UBool - TRUE or FALSE.
	355	// The input text is unescaped. The pattern is not.
	356	//
	357	//
	358	//---------------------------------------------------------------------------
	359
	360	#define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
	361	doRegexLMTest(pat, text, looking, match, __LINE__); \
	362	doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
	363	} UPRV_BLOCK_MACRO_END
	364
	365	UBool RegexTest::doRegexLMTest(const char pat, const char text, UBool looking, UBool match, int32_t line) {
	366	const UnicodeString pattern(pat, -1, US_INV);
	367	const UnicodeString inputText(text, -1, US_INV);
	368	UErrorCode status = U_ZERO_ERROR;
	369	UParseError pe;
	370	RegexPattern *REPattern = NULL;
	371	RegexMatcher *REMatcher = NULL;
	372	UBool retVal = TRUE;
	373
	374	UnicodeString patString(pat, -1, US_INV);
	375	REPattern = RegexPattern::compile(patString, 0, pe, status);
	376	if (U_FAILURE(status)) {
	377	dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
	378	line, u_errorName(status));
	379	return FALSE;
	380	}
	381	if (line==376) { REPattern->dumpPattern();}
	382
	383	UnicodeString inputString(inputText);
	384	UnicodeString unEscapedInput = inputString.unescape();
	385	REMatcher = REPattern->matcher(unEscapedInput, status);
	386	if (U_FAILURE(status)) {
	387	errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
	388	line, u_errorName(status));
	389	return FALSE;
	390	}
	391
	392	UBool actualmatch;
	393	actualmatch = REMatcher->lookingAt(status);
	394	if (U_FAILURE(status)) {
	395	errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
	396	line, u_errorName(status));
	397	retVal = FALSE;
	398	}
	399	if (actualmatch != looking) {
	400	errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
	401	retVal = FALSE;
	402	}
	403
	404	status = U_ZERO_ERROR;
	405	actualmatch = REMatcher->matches(status);
	406	if (U_FAILURE(status)) {
	407	errln("RegexTest failure in matches() at line %d. Status = %s\n",
	408	line, u_errorName(status));
	409	retVal = FALSE;
	410	}
	411	if (actualmatch != match) {
	412	errln("RegexTest: wrong return from matches() at line %d.\n", line);
	413	retVal = FALSE;
	414	}
	415
	416	if (retVal == FALSE) {
	417	REPattern->dumpPattern();
	418	}
	419
	420	delete REPattern;
	421	delete REMatcher;
	422	return retVal;
	423	}
	424
	425
	426	UBool RegexTest::doRegexLMTestUTF8(const char pat, const char text, UBool looking, UBool match, int32_t line) {
	427	UText pattern = UTEXT_INITIALIZER;
	428	int32_t inputUTF8Length;
	429	char *textChars = NULL;
	430	UText inputText = UTEXT_INITIALIZER;
	431	UErrorCode status = U_ZERO_ERROR;
	432	UParseError pe;
	433	RegexPattern *REPattern = NULL;
	434	RegexMatcher *REMatcher = NULL;
	435	UBool retVal = TRUE;
	436
	437	regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
	438	REPattern = RegexPattern::compile(&pattern, 0, pe, status);
	439	if (U_FAILURE(status)) {
	440	dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
	441	line, u_errorName(status));
	442	return FALSE;
	443	}
	444
	445	UnicodeString inputString(text, -1, US_INV);
	446	UnicodeString unEscapedInput = inputString.unescape();
	447	LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
	448	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	449
	450	inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
	451	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
	452	// UTF-8 does not allow unpaired surrogates, so this could actually happen
	453	logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
	454	return TRUE; // not a failure of the Regex engine
	455	}
	456	status = U_ZERO_ERROR; // buffer overflow
	457	textChars = new char[inputUTF8Length+1];
	458	unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
	459	utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
	460
	461	REMatcher = &REPattern->matcher(status)->reset(&inputText);
	462	if (U_FAILURE(status)) {
	463	errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
	464	line, u_errorName(status));
	465	return FALSE;
	466	}
	467
	468	UBool actualmatch;
	469	actualmatch = REMatcher->lookingAt(status);
	470	if (U_FAILURE(status)) {
	471	errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
	472	line, u_errorName(status));
	473	retVal = FALSE;
	474	}
	475	if (actualmatch != looking) {
	476	errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
	477	retVal = FALSE;
	478	}
	479
	480	status = U_ZERO_ERROR;
	481	actualmatch = REMatcher->matches(status);
	482	if (U_FAILURE(status)) {
	483	errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
	484	line, u_errorName(status));
	485	retVal = FALSE;
	486	}
	487	if (actualmatch != match) {
	488	errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
	489	retVal = FALSE;
	490	}
	491
	492	if (retVal == FALSE) {
	493	REPattern->dumpPattern();
	494	}
	495
	496	delete REPattern;
	497	delete REMatcher;
	498	utext_close(&inputText);
	499	utext_close(&pattern);
	500	delete[] textChars;
	501	return retVal;
	502	}
	503
	504
	505
	506	//---------------------------------------------------------------------------
	507	//
	508	// REGEX_ERR Macro + invocation function to simplify writing tests
	509	// regex tests for incorrect patterns
	510	//
	511	// usage:
	512	// REGEX_ERR("pattern", expected error line, column, expected status);
	513	//
	514	//---------------------------------------------------------------------------
	515	#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
	516
	517	void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
	518	UErrorCode expectedStatus, int32_t line) {
	519	UnicodeString pattern(pat);
	520
	521	UErrorCode status = U_ZERO_ERROR;
	522	UParseError pe;
	523	RegexPattern *callerPattern = NULL;
	524
	525	//
	526	// Compile the caller's pattern
	527	//
	528	UnicodeString patString(pat);
	529	callerPattern = RegexPattern::compile(patString, 0, pe, status);
	530	if (status != expectedStatus) {
	531	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
	532	} else {
	533	if (status != U_ZERO_ERROR) {
	534	if (pe.line != errLine \|\| pe.offset != errCol) {
	535	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
	536	line, errLine, errCol, pe.line, pe.offset);
	537	}
	538	}
	539	}
	540
	541	delete callerPattern;
	542
	543	//
	544	// Compile again, using a UTF-8-based UText
	545	//
	546	UText patternText = UTEXT_INITIALIZER;
	547	regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
	548	callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
	549	if (status != expectedStatus) {
	550	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
	551	} else {
	552	if (status != U_ZERO_ERROR) {
	553	if (pe.line != errLine \|\| pe.offset != errCol) {
	554	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
	555	line, errLine, errCol, pe.line, pe.offset);
	556	}
	557	}
	558	}
	559
	560	delete callerPattern;
	561	utext_close(&patternText);
	562	}
	563
	564
	565
	566	//---------------------------------------------------------------------------
	567	//
	568	// Basic Check for basic functionality of regex pattern matching.
	569	// Avoid the use of REGEX_FIND test macro, which has
	570	// substantial dependencies on basic Regex functionality.
	571	//
	572	//---------------------------------------------------------------------------
	573	void RegexTest::Basic() {
	574
	575
	576	//
	577	// Debug - slide failing test cases early
	578	//
	579	#if 0
	580	{
	581	// REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
	582	UParseError pe;
	583	UErrorCode status = U_ZERO_ERROR;
	584	RegexPattern *pattern;
	585	pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
	586	pattern->dumpPattern();
	587	RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
	588	UBool result = m->find();
	589	printf("result = %d\n", result);
	590	// REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
	591	// REGEX_FIND("(X([abc=X]+)+X)\|(y[abc=]+)", "=XX====================");
	592	}
	593	exit(1);
	594	#endif
	595
	596
	597	//
	598	// Pattern with parentheses
	599	//
	600	REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
	601	REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
	602	REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
	603
	604	//
	605	// Patterns with *
	606	//
	607	REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
	608	REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
	609	REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
	610	REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
	611	REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
	612
	613	REGEX_TESTLM("a*", "", TRUE, TRUE);
	614	REGEX_TESTLM("a*", "b", TRUE, FALSE);
	615
	616
	617	//
	618	// Patterns with "."
	619	//
	620	REGEX_TESTLM(".", "abc", TRUE, FALSE);
	621	REGEX_TESTLM("...", "abc", TRUE, TRUE);
	622	REGEX_TESTLM("....", "abc", FALSE, FALSE);
	623	REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
	624	REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
	625	REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
	626	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
	627	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
	628
	629	//
	630	// Patterns with * applied to chars at end of literal string
	631	//
	632	REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
	633	REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
	634
	635	//
	636	// Supplemental chars match as single chars, not a pair of surrogates.
	637	//
	638	REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
	639	REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
	640	REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
	641
	642
	643	//
	644	// UnicodeSets in the pattern
	645	//
	646	REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
	647	REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
	648	REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
	649	REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
	650	REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
	651	REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
	652
	653	REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
	654	REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
	655	REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
	656	REGEX_TESTLM("[\\p{Nd}]", "a123456", TRUE, FALSE); // note that matches 0 occurences.
	657	REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
	658
	659	//
	660	// OR operator in patterns
	661	//
	662	REGEX_TESTLM("(a\|b)", "a", TRUE, TRUE);
	663	REGEX_TESTLM("(a\|b)", "b", TRUE, TRUE);
	664	REGEX_TESTLM("(a\|b)", "c", FALSE, FALSE);
	665	REGEX_TESTLM("a\|b", "b", TRUE, TRUE);
	666
	667	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabc", TRUE, TRUE);
	668	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabdc", TRUE, FALSE);
	669	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "ac", TRUE, TRUE);
	670	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "123", TRUE, TRUE);
	671	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "123", TRUE, TRUE);
	672	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "222211111czzzzw", TRUE, FALSE);
	673
	674	//
	675	// +
	676	//
	677	REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
	678	REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
	679	REGEX_TESTLM("b+", "", FALSE, FALSE);
	680	REGEX_TESTLM("(abc\|def)+", "defabc", TRUE, TRUE);
	681	REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
	682	REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
	683
	684	//
	685	// ?
	686	//
	687	REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
	688	REGEX_TESTLM("ab?", "a", TRUE, TRUE);
	689	REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
	690	REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
	691	REGEX_TESTLM("a(b\|c)?d", "abd", TRUE, TRUE);
	692	REGEX_TESTLM("a(b\|c)?d", "acd", TRUE, TRUE);
	693	REGEX_TESTLM("a(b\|c)?d", "ad", TRUE, TRUE);
	694	REGEX_TESTLM("a(b\|c)?d", "abcd", FALSE, FALSE);
	695	REGEX_TESTLM("a(b\|c)?d", "ab", FALSE, FALSE);
	696
	697	//
	698	// Escape sequences that become single literal chars, handled internally
	699	// by ICU's Unescape.
	700	//
	701
	702	// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
	703	REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
	704	REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
	705	REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
	706	REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
	707	REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
	708	REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
	709	REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
	710	REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
	711	REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
	712
	713	REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
	714	REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
	715
	716	// Escape of special chars in patterns
	717	REGEX_TESTLM("\\\\\\\|\$\$\\[\\{\\~\\$\\\\+\\?\\.", "\\\\\|()[{~$+?.", TRUE, TRUE);
	718	}
	719
	720
	721	//---------------------------------------------------------------------------
	722	//
	723	// UTextBasic Check for quirks that are specific to the UText
	724	// implementation.
	725	//
	726	//---------------------------------------------------------------------------
	727	void RegexTest::UTextBasic() {
	728	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	729	UErrorCode status = U_ZERO_ERROR;
	730	UText pattern = UTEXT_INITIALIZER;
	731	utext_openUTF8(&pattern, str_abc, -1, &status);
	732	RegexMatcher matcher(&pattern, 0, status);
	733	REGEX_CHECK_STATUS;
	734
	735	UText input = UTEXT_INITIALIZER;
	736	utext_openUTF8(&input, str_abc, -1, &status);
	737	REGEX_CHECK_STATUS;
	738	matcher.reset(&input);
	739	REGEX_CHECK_STATUS;
	740	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
	741
	742	matcher.reset(matcher.inputText());
	743	REGEX_CHECK_STATUS;
	744	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
	745
	746	utext_close(&pattern);
	747	utext_close(&input);
	748	}
	749
	750
	751	//---------------------------------------------------------------------------
	752	//
	753	// API_Match Test that the API for class RegexMatcher
	754	// is present and nominally working, but excluding functions
	755	// implementing replace operations.
	756	//
	757	//---------------------------------------------------------------------------
	758	void RegexTest::API_Match() {
	759	UParseError pe;
	760	UErrorCode status=U_ZERO_ERROR;
	761	int32_t flags = 0;
	762
	763	//
	764	// Debug - slide failing test cases early
	765	//
	766	#if 0
	767	{
	768	}
	769	return;
	770	#endif
	771
	772	//
	773	// Simple pattern compilation
	774	//
	775	{
	776	UnicodeString re("abc");
	777	RegexPattern *pat2;
	778	pat2 = RegexPattern::compile(re, flags, pe, status);
	779	REGEX_CHECK_STATUS;
	780
	781	UnicodeString inStr1 = "abcdef this is a test";
	782	UnicodeString instr2 = "not abc";
	783	UnicodeString empty = "";
	784
	785
	786	//
	787	// Matcher creation and reset.
	788	//
	789	RegexMatcher *m1 = pat2->matcher(inStr1, status);
	790	REGEX_CHECK_STATUS;
	791	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	792	REGEX_ASSERT(m1->input() == inStr1);
	793	m1->reset(instr2);
	794	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	795	REGEX_ASSERT(m1->input() == instr2);
	796	m1->reset(inStr1);
	797	REGEX_ASSERT(m1->input() == inStr1);
	798	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	799	m1->reset(empty);
	800	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	801	REGEX_ASSERT(m1->input() == empty);
	802	REGEX_ASSERT(&m1->pattern() == pat2);
	803
	804	//
	805	// reset(pos, status)
	806	//
	807	m1->reset(inStr1);
	808	m1->reset(4, status);
	809	REGEX_CHECK_STATUS;
	810	REGEX_ASSERT(m1->input() == inStr1);
	811	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	812
	813	m1->reset(-1, status);
	814	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	815	status = U_ZERO_ERROR;
	816
	817	m1->reset(0, status);
	818	REGEX_CHECK_STATUS;
	819	status = U_ZERO_ERROR;
	820
	821	int32_t len = m1->input().length();
	822	m1->reset(len-1, status);
	823	REGEX_CHECK_STATUS;
	824	status = U_ZERO_ERROR;
	825
	826	m1->reset(len, status);
	827	REGEX_CHECK_STATUS;
	828	status = U_ZERO_ERROR;
	829
	830	m1->reset(len+1, status);
	831	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	832	status = U_ZERO_ERROR;
	833
	834	//
	835	// match(pos, status)
	836	//
	837	m1->reset(instr2);
	838	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	839	m1->reset();
	840	REGEX_ASSERT(m1->matches(3, status) == FALSE);
	841	m1->reset();
	842	REGEX_ASSERT(m1->matches(5, status) == FALSE);
	843	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	844	REGEX_ASSERT(m1->matches(-1, status) == FALSE);
	845	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	846
	847	// Match() at end of string should fail, but should not
	848	// be an error.
	849	status = U_ZERO_ERROR;
	850	len = m1->input().length();
	851	REGEX_ASSERT(m1->matches(len, status) == FALSE);
	852	REGEX_CHECK_STATUS;
	853
	854	// Match beyond end of string should fail with an error.
	855	status = U_ZERO_ERROR;
	856	REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
	857	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	858
	859	// Successful match at end of string.
	860	{
	861	status = U_ZERO_ERROR;
	862	RegexMatcher m("A?", 0, status); // will match zero length string.
	863	REGEX_CHECK_STATUS;
	864	m.reset(inStr1);
	865	len = inStr1.length();
	866	REGEX_ASSERT(m.matches(len, status) == TRUE);
	867	REGEX_CHECK_STATUS;
	868	m.reset(empty);
	869	REGEX_ASSERT(m.matches(0, status) == TRUE);
	870	REGEX_CHECK_STATUS;
	871	}
	872
	873
	874	//
	875	// lookingAt(pos, status)
	876	//
	877	status = U_ZERO_ERROR;
	878	m1->reset(instr2); // "not abc"
	879	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	880	REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
	881	REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
	882	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	883	REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
	884	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	885	status = U_ZERO_ERROR;
	886	len = m1->input().length();
	887	REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
	888	REGEX_CHECK_STATUS;
	889	REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
	890	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	891
	892	delete m1;
	893	delete pat2;
	894	}
	895
	896
	897	//
	898	// Capture Group.
	899	// RegexMatcher::start();
	900	// RegexMatcher::end();
	901	// RegexMatcher::groupCount();
	902	//
	903	{
	904	int32_t flags=0;
	905	UParseError pe;
	906	UErrorCode status=U_ZERO_ERROR;
	907
	908	UnicodeString re("01(23(45)67)(.*)");
	909	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	910	REGEX_CHECK_STATUS;
	911	UnicodeString data = "0123456789";
	912
	913	RegexMatcher *matcher = pat->matcher(data, status);
	914	REGEX_CHECK_STATUS;
	915	REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
	916	static const int32_t matchStarts[] = {0, 2, 4, 8};
	917	static const int32_t matchEnds[] = {10, 8, 6, 10};
	918	int32_t i;
	919	for (i=0; i<4; i++) {
	920	int32_t actualStart = matcher->start(i, status);
	921	REGEX_CHECK_STATUS;
	922	if (actualStart != matchStarts[i]) {
	923	errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
	924	__LINE__, i, matchStarts[i], actualStart);
	925	}
	926	int32_t actualEnd = matcher->end(i, status);
	927	REGEX_CHECK_STATUS;
	928	if (actualEnd != matchEnds[i]) {
	929	errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
	930	__LINE__, i, matchEnds[i], actualEnd);
	931	}
	932	}
	933
	934	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
	935	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
	936
	937	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	938	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	939	matcher->reset();
	940	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
	941
	942	matcher->lookingAt(status);
	943	REGEX_ASSERT(matcher->group(status) == "0123456789");
	944	REGEX_ASSERT(matcher->group(0, status) == "0123456789");
	945	REGEX_ASSERT(matcher->group(1, status) == "234567" );
	946	REGEX_ASSERT(matcher->group(2, status) == "45" );
	947	REGEX_ASSERT(matcher->group(3, status) == "89" );
	948	REGEX_CHECK_STATUS;
	949	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	950	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	951	matcher->reset();
	952	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
	953
	954	delete matcher;
	955	delete pat;
	956
	957	}
	958
	959	//
	960	// find
	961	//
	962	{
	963	int32_t flags=0;
	964	UParseError pe;
	965	UErrorCode status=U_ZERO_ERROR;
	966
	967	UnicodeString re("abc");
	968	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	969	REGEX_CHECK_STATUS;
	970	UnicodeString data = ".abc..abc...abc..";
	971	// 012345678901234567
	972
	973	RegexMatcher *matcher = pat->matcher(data, status);
	974	REGEX_CHECK_STATUS;
	975	REGEX_ASSERT(matcher->find());
	976	REGEX_ASSERT(matcher->start(status) == 1);
	977	REGEX_ASSERT(matcher->find());
	978	REGEX_ASSERT(matcher->start(status) == 6);
	979	REGEX_ASSERT(matcher->find());
	980	REGEX_ASSERT(matcher->start(status) == 12);
	981	REGEX_ASSERT(matcher->find() == FALSE);
	982	REGEX_ASSERT(matcher->find() == FALSE);
	983
	984	matcher->reset();
	985	REGEX_ASSERT(matcher->find());
	986	REGEX_ASSERT(matcher->start(status) == 1);
	987
	988	REGEX_ASSERT(matcher->find(0, status));
	989	REGEX_ASSERT(matcher->start(status) == 1);
	990	REGEX_ASSERT(matcher->find(1, status));
	991	REGEX_ASSERT(matcher->start(status) == 1);
	992	REGEX_ASSERT(matcher->find(2, status));
	993	REGEX_ASSERT(matcher->start(status) == 6);
	994	REGEX_ASSERT(matcher->find(12, status));
	995	REGEX_ASSERT(matcher->start(status) == 12);
	996	REGEX_ASSERT(matcher->find(13, status) == FALSE);
	997	REGEX_ASSERT(matcher->find(16, status) == FALSE);
	998	REGEX_ASSERT(matcher->find(17, status) == FALSE);
	999	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
	1000
	1001	status = U_ZERO_ERROR;
	1002	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	1003	status = U_ZERO_ERROR;
	1004	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
	1005
	1006	REGEX_ASSERT(matcher->groupCount() == 0);
	1007
	1008	delete matcher;
	1009	delete pat;
	1010	}
	1011
	1012
	1013	//
	1014	// find, with \G in pattern (true if at the end of a previous match).
	1015	//
	1016	{
	1017	int32_t flags=0;
	1018	UParseError pe;
	1019	UErrorCode status=U_ZERO_ERROR;
	1020
	1021	UnicodeString re(".*?(?:(\\Gabc)\|(abc))", -1, US_INV);
	1022	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	1023	REGEX_CHECK_STATUS;
	1024	UnicodeString data = ".abcabc.abc..";
	1025	// 012345678901234567
	1026
	1027	RegexMatcher *matcher = pat->matcher(data, status);
	1028	REGEX_CHECK_STATUS;
	1029	REGEX_ASSERT(matcher->find());
	1030	REGEX_ASSERT(matcher->start(status) == 0);
	1031	REGEX_ASSERT(matcher->start(1, status) == -1);
	1032	REGEX_ASSERT(matcher->start(2, status) == 1);
	1033
	1034	REGEX_ASSERT(matcher->find());
	1035	REGEX_ASSERT(matcher->start(status) == 4);
	1036	REGEX_ASSERT(matcher->start(1, status) == 4);
	1037	REGEX_ASSERT(matcher->start(2, status) == -1);
	1038	REGEX_CHECK_STATUS;
	1039
	1040	delete matcher;
	1041	delete pat;
	1042	}
	1043
	1044	//
	1045	// find with zero length matches, match position should bump ahead
	1046	// to prevent loops.
	1047	//
	1048	{
	1049	int32_t i;
	1050	UErrorCode status=U_ZERO_ERROR;
	1051	RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
	1052	// using an always-true look-ahead.
	1053	REGEX_CHECK_STATUS;
	1054	UnicodeString s(" ");
	1055	m.reset(s);
	1056	for (i=0; ; i++) {
	1057	if (m.find() == FALSE) {
	1058	break;
	1059	}
	1060	REGEX_ASSERT(m.start(status) == i);
	1061	REGEX_ASSERT(m.end(status) == i);
	1062	}
	1063	REGEX_ASSERT(i==5);
	1064
	1065	// Check that the bump goes over surrogate pairs OK
	1066	s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
	1067	s = s.unescape();
	1068	m.reset(s);
	1069	for (i=0; ; i+=2) {
	1070	if (m.find() == FALSE) {
	1071	break;
	1072	}
	1073	REGEX_ASSERT(m.start(status) == i);
	1074	REGEX_ASSERT(m.end(status) == i);
	1075	}
	1076	REGEX_ASSERT(i==10);
	1077	}
	1078	{
	1079	// find() loop breaking test.
	1080	// with pattern of /.?/, should see a series of one char matches, then a single
	1081	// match of zero length at the end of the input string.
	1082	int32_t i;
	1083	UErrorCode status=U_ZERO_ERROR;
	1084	RegexMatcher m(".?", 0, status);
	1085	REGEX_CHECK_STATUS;
	1086	UnicodeString s(" ");
	1087	m.reset(s);
	1088	for (i=0; ; i++) {
	1089	if (m.find() == FALSE) {
	1090	break;
	1091	}
	1092	REGEX_ASSERT(m.start(status) == i);
	1093	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
	1094	}
	1095	REGEX_ASSERT(i==5);
	1096	}
	1097
	1098
	1099	//
	1100	// Matchers with no input string behave as if they had an empty input string.
	1101	//
	1102
	1103	{
	1104	UErrorCode status = U_ZERO_ERROR;
	1105	RegexMatcher m(".?", 0, status);
	1106	REGEX_CHECK_STATUS;
	1107	REGEX_ASSERT(m.find());
	1108	REGEX_ASSERT(m.start(status) == 0);
	1109	REGEX_ASSERT(m.input() == "");
	1110	}
	1111	{
	1112	UErrorCode status = U_ZERO_ERROR;
	1113	RegexPattern *p = RegexPattern::compile(".", 0, status);
	1114	RegexMatcher *m = p->matcher(status);
	1115	REGEX_CHECK_STATUS;
	1116
	1117	REGEX_ASSERT(m->find() == FALSE);
	1118	REGEX_ASSERT(m->input() == "");
	1119	delete m;
	1120	delete p;
	1121	}
	1122
	1123	//
	1124	// Regions
	1125	//
	1126	{
	1127	UErrorCode status = U_ZERO_ERROR;
	1128	UnicodeString testString("This is test data");
	1129	RegexMatcher m(".*", testString, 0, status);
	1130	REGEX_CHECK_STATUS;
	1131	REGEX_ASSERT(m.regionStart() == 0);
	1132	REGEX_ASSERT(m.regionEnd() == testString.length());
	1133	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1134	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1135
	1136	m.region(2,4, status);
	1137	REGEX_CHECK_STATUS;
	1138	REGEX_ASSERT(m.matches(status));
	1139	REGEX_ASSERT(m.start(status)==2);
	1140	REGEX_ASSERT(m.end(status)==4);
	1141	REGEX_CHECK_STATUS;
	1142
	1143	m.reset();
	1144	REGEX_ASSERT(m.regionStart() == 0);
	1145	REGEX_ASSERT(m.regionEnd() == testString.length());
	1146
	1147	UnicodeString shorterString("short");
	1148	m.reset(shorterString);
	1149	REGEX_ASSERT(m.regionStart() == 0);
	1150	REGEX_ASSERT(m.regionEnd() == shorterString.length());
	1151
	1152	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1153	REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
	1154	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	1155	REGEX_ASSERT(&m == &m.reset());
	1156	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	1157
	1158	REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
	1159	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1160	REGEX_ASSERT(&m == &m.reset());
	1161	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1162
	1163	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1164	REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
	1165	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	1166	REGEX_ASSERT(&m == &m.reset());
	1167	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	1168
	1169	REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
	1170	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1171	REGEX_ASSERT(&m == &m.reset());
	1172	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1173
	1174	}
	1175
	1176	//
	1177	// hitEnd() and requireEnd()
	1178	//
	1179	{
	1180	UErrorCode status = U_ZERO_ERROR;
	1181	UnicodeString testString("aabb");
	1182	RegexMatcher m1(".*", testString, 0, status);
	1183	REGEX_ASSERT(m1.lookingAt(status) == TRUE);
	1184	REGEX_ASSERT(m1.hitEnd() == TRUE);
	1185	REGEX_ASSERT(m1.requireEnd() == FALSE);
	1186	REGEX_CHECK_STATUS;
	1187
	1188	status = U_ZERO_ERROR;
	1189	RegexMatcher m2("a*", testString, 0, status);
	1190	REGEX_ASSERT(m2.lookingAt(status) == TRUE);
	1191	REGEX_ASSERT(m2.hitEnd() == FALSE);
	1192	REGEX_ASSERT(m2.requireEnd() == FALSE);
	1193	REGEX_CHECK_STATUS;
	1194
	1195	status = U_ZERO_ERROR;
	1196	RegexMatcher m3(".*$", testString, 0, status);
	1197	REGEX_ASSERT(m3.lookingAt(status) == TRUE);
	1198	REGEX_ASSERT(m3.hitEnd() == TRUE);
	1199	REGEX_ASSERT(m3.requireEnd() == TRUE);
	1200	REGEX_CHECK_STATUS;
	1201	}
	1202
	1203
	1204	//
	1205	// Compilation error on reset with UChar *
	1206	// These were a hazard that people were stumbling over with runtime errors.
	1207	// Changed them to compiler errors by adding private methods that more closely
	1208	// matched the incorrect use of the functions.
	1209	//
	1210	#if 0
	1211	{
	1212	UErrorCode status = U_ZERO_ERROR;
	1213	UChar ucharString[20];
	1214	RegexMatcher m(".", 0, status);
	1215	m.reset(ucharString); // should not compile.
	1216
	1217	RegexPattern *p = RegexPattern::compile(".", 0, status);
	1218	RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
	1219
	1220	RegexMatcher m3(".", ucharString, 0, status); // Should not compile
	1221	}
	1222	#endif
	1223
	1224	//
	1225	// Time Outs.
	1226	// Note: These tests will need to be changed when the regexp engine is
	1227	// able to detect and cut short the exponential time behavior on
	1228	// this type of match.
	1229	//
	1230	{
	1231	UErrorCode status = U_ZERO_ERROR;
	1232	// Enough 'a's in the string to cause the match to time out.
	1233	// (Each additional 'a' doubles the time)
	1234	UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
	1235	RegexMatcher matcher("(a+)+b", testString, 0, status);
	1236	REGEX_CHECK_STATUS;
	1237	REGEX_ASSERT(matcher.getTimeLimit() == 0);
	1238	matcher.setTimeLimit(100, status);
	1239	REGEX_ASSERT(matcher.getTimeLimit() == 100);
	1240	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1241	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
	1242	}
	1243	{
	1244	UErrorCode status = U_ZERO_ERROR;
	1245	// Few enough 'a's to slip in under the time limit.
	1246	UnicodeString testString("aaaaaaaaaaaaaaaaaa");
	1247	RegexMatcher matcher("(a+)+b", testString, 0, status);
	1248	REGEX_CHECK_STATUS;
	1249	matcher.setTimeLimit(100, status);
	1250	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1251	REGEX_CHECK_STATUS;
	1252	}
	1253
	1254	//
	1255	// Stack Limits
	1256	//
	1257	{
	1258	UErrorCode status = U_ZERO_ERROR;
	1259	UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
	1260
	1261	// Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
	1262	// of the '+', and makes the stack frames larger.
	1263	RegexMatcher matcher("(A)+A$", testString, 0, status);
	1264
	1265	// With the default stack, this match should fail to run
	1266	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1267	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
	1268
	1269	// With unlimited stack, it should run
	1270	status = U_ZERO_ERROR;
	1271	matcher.setStackLimit(0, status);
	1272	REGEX_CHECK_STATUS;
	1273	REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
	1274	REGEX_CHECK_STATUS;
	1275	REGEX_ASSERT(matcher.getStackLimit() == 0);
	1276
	1277	// With a limited stack, the match should fail
	1278	status = U_ZERO_ERROR;
	1279	matcher.setStackLimit(10000, status);
	1280	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1281	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
	1282	REGEX_ASSERT(matcher.getStackLimit() == 10000);
	1283	}
	1284
	1285	// A pattern that doesn't save state should work with
	1286	// a minimal sized stack
	1287	{
	1288	UErrorCode status = U_ZERO_ERROR;
	1289	UnicodeString testString = "abc";
	1290	RegexMatcher matcher("abc", testString, 0, status);
	1291	REGEX_CHECK_STATUS;
	1292	matcher.setStackLimit(30, status);
	1293	REGEX_CHECK_STATUS;
	1294	REGEX_ASSERT(matcher.matches(status) == TRUE);
	1295	REGEX_CHECK_STATUS;
	1296	REGEX_ASSERT(matcher.getStackLimit() == 30);
	1297
	1298	// Negative stack sizes should fail
	1299	status = U_ZERO_ERROR;
	1300	matcher.setStackLimit(1000, status);
	1301	REGEX_CHECK_STATUS;
	1302	matcher.setStackLimit(-1, status);
	1303	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	1304	REGEX_ASSERT(matcher.getStackLimit() == 1000);
	1305	}
	1306
	1307
	1308	}
	1309
	1310
	1311
	1312
	1313
	1314
	1315	//---------------------------------------------------------------------------
	1316	//
	1317	// API_Replace API test for class RegexMatcher, testing the
	1318	// Replace family of functions.
	1319	//
	1320	//---------------------------------------------------------------------------
	1321	void RegexTest::API_Replace() {
	1322	//
	1323	// Replace
	1324	//
	1325	int32_t flags=0;
	1326	UParseError pe;
	1327	UErrorCode status=U_ZERO_ERROR;
	1328
	1329	UnicodeString re("abc");
	1330	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	1331	REGEX_CHECK_STATUS;
	1332	UnicodeString data = ".abc..abc...abc..";
	1333	// 012345678901234567
	1334	RegexMatcher *matcher = pat->matcher(data, status);
	1335
	1336	//
	1337	// Plain vanilla matches.
	1338	//
	1339	UnicodeString dest;
	1340	dest = matcher->replaceFirst("yz", status);
	1341	REGEX_CHECK_STATUS;
	1342	REGEX_ASSERT(dest == ".yz..abc...abc..");
	1343
	1344	dest = matcher->replaceAll("yz", status);
	1345	REGEX_CHECK_STATUS;
	1346	REGEX_ASSERT(dest == ".yz..yz...yz..");
	1347
	1348	//
	1349	// Plain vanilla non-matches.
	1350	//
	1351	UnicodeString d2 = ".abx..abx...abx..";
	1352	matcher->reset(d2);
	1353	dest = matcher->replaceFirst("yz", status);
	1354	REGEX_CHECK_STATUS;
	1355	REGEX_ASSERT(dest == ".abx..abx...abx..");
	1356
	1357	dest = matcher->replaceAll("yz", status);
	1358	REGEX_CHECK_STATUS;
	1359	REGEX_ASSERT(dest == ".abx..abx...abx..");
	1360
	1361	//
	1362	// Empty source string
	1363	//
	1364	UnicodeString d3 = "";
	1365	matcher->reset(d3);
	1366	dest = matcher->replaceFirst("yz", status);
	1367	REGEX_CHECK_STATUS;
	1368	REGEX_ASSERT(dest == "");
	1369
	1370	dest = matcher->replaceAll("yz", status);
	1371	REGEX_CHECK_STATUS;
	1372	REGEX_ASSERT(dest == "");
	1373
	1374	//
	1375	// Empty substitution string
	1376	//
	1377	matcher->reset(data); // ".abc..abc...abc.."
	1378	dest = matcher->replaceFirst("", status);
	1379	REGEX_CHECK_STATUS;
	1380	REGEX_ASSERT(dest == "...abc...abc..");
	1381
	1382	dest = matcher->replaceAll("", status);
	1383	REGEX_CHECK_STATUS;
	1384	REGEX_ASSERT(dest == "........");
	1385
	1386	//
	1387	// match whole string
	1388	//
	1389	UnicodeString d4 = "abc";
	1390	matcher->reset(d4);
	1391	dest = matcher->replaceFirst("xyz", status);
	1392	REGEX_CHECK_STATUS;
	1393	REGEX_ASSERT(dest == "xyz");
	1394
	1395	dest = matcher->replaceAll("xyz", status);
	1396	REGEX_CHECK_STATUS;
	1397	REGEX_ASSERT(dest == "xyz");
	1398
	1399	//
	1400	// Capture Group, simple case
	1401	//
	1402	UnicodeString re2("a(..)");
	1403	RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
	1404	REGEX_CHECK_STATUS;
	1405	UnicodeString d5 = "abcdefg";
	1406	RegexMatcher *matcher2 = pat2->matcher(d5, status);
	1407	REGEX_CHECK_STATUS;
	1408	dest = matcher2->replaceFirst("$1$1", status);
	1409	REGEX_CHECK_STATUS;
	1410	REGEX_ASSERT(dest == "bcbcdefg");
	1411
	1412	dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
	1413	REGEX_CHECK_STATUS;
	1414	REGEX_ASSERT(dest == "The value of $1 is bc.defg");
	1415
	1416	dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
	1417	REGEX_ASSERT(U_FAILURE(status));
	1418	status = U_ZERO_ERROR;
	1419
	1420	UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
	1421	replacement = replacement.unescape();
	1422	dest = matcher2->replaceFirst(replacement, status);
	1423	REGEX_CHECK_STATUS;
	1424	REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
	1425
	1426	REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
	1427
	1428
	1429	//
	1430	// Replacement String with \u hex escapes
	1431	//
	1432	{
	1433	UnicodeString src = "abc 1 abc 2 abc 3";
	1434	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
	1435	matcher->reset(src);
	1436	UnicodeString result = matcher->replaceAll(substitute, status);
	1437	REGEX_CHECK_STATUS;
	1438	REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
	1439	}
	1440	{
	1441	UnicodeString src = "abc !";
	1442	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
	1443	matcher->reset(src);
	1444	UnicodeString result = matcher->replaceAll(substitute, status);
	1445	REGEX_CHECK_STATUS;
	1446	UnicodeString expected = UnicodeString("--");
	1447	expected.append((UChar32)0x10000);
	1448	expected.append("-- !");
	1449	REGEX_ASSERT(result == expected);
	1450	}
	1451	// TODO: need more through testing of capture substitutions.
	1452
	1453	// Bug 4057
	1454	//
	1455	{
	1456	status = U_ZERO_ERROR;
	1457	UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
	1458	RegexMatcher m("ss(.*?)ee", 0, status);
	1459	REGEX_CHECK_STATUS;
	1460	UnicodeString result;
	1461
	1462	// Multiple finds do NOT bump up the previous appendReplacement postion.
	1463	m.reset(s);
	1464	m.find();
	1465	m.find();
	1466	m.appendReplacement(result, "ooh", status);
	1467	REGEX_CHECK_STATUS;
	1468	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1469
	1470	// After a reset into the interior of a string, appendReplacemnt still starts at beginning.
	1471	status = U_ZERO_ERROR;
	1472	result.truncate(0);
	1473	m.reset(10, status);
	1474	m.find();
	1475	m.find();
	1476	m.appendReplacement(result, "ooh", status);
	1477	REGEX_CHECK_STATUS;
	1478	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1479
	1480	// find() at interior of string, appendReplacemnt still starts at beginning.
	1481	status = U_ZERO_ERROR;
	1482	result.truncate(0);
	1483	m.reset();
	1484	m.find(10, status);
	1485	m.find();
	1486	m.appendReplacement(result, "ooh", status);
	1487	REGEX_CHECK_STATUS;
	1488	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1489
	1490	m.appendTail(result);
	1491	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
	1492
	1493	}
	1494
	1495	delete matcher2;
	1496	delete pat2;
	1497	delete matcher;
	1498	delete pat;
	1499	}
	1500
	1501
	1502	//---------------------------------------------------------------------------
	1503	//
	1504	// API_Pattern Test that the API for class RegexPattern is
	1505	// present and nominally working.
	1506	//
	1507	//---------------------------------------------------------------------------
	1508	void RegexTest::API_Pattern() {
	1509	RegexPattern pata; // Test default constructor to not crash.
	1510	RegexPattern patb;
	1511
	1512	REGEX_ASSERT(pata == patb);
	1513	REGEX_ASSERT(pata == pata);
	1514
	1515	UnicodeString re1("abc[a-l][m-z]");
	1516	UnicodeString re2("def");
	1517	UErrorCode status = U_ZERO_ERROR;
	1518	UParseError pe;
	1519
	1520	RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
	1521	RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
	1522	REGEX_CHECK_STATUS;
	1523	REGEX_ASSERT(pat1 == pat1);
	1524	REGEX_ASSERT(*pat1 != pata);
	1525
	1526	// Assign
	1527	patb = *pat1;
	1528	REGEX_ASSERT(patb == *pat1);
	1529
	1530	// Copy Construct
	1531	RegexPattern patc(*pat1);
	1532	REGEX_ASSERT(patc == *pat1);
	1533	REGEX_ASSERT(patb == patc);
	1534	REGEX_ASSERT(pat1 != pat2);
	1535	patb = *pat2;
	1536	REGEX_ASSERT(patb != patc);
	1537	REGEX_ASSERT(patb == *pat2);
	1538
	1539	// Compile with no flags.
	1540	RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
	1541	REGEX_ASSERT(pat1a == pat1);
	1542
	1543	REGEX_ASSERT(pat1a->flags() == 0);
	1544
	1545	// Compile with different flags should be not equal
	1546	RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
	1547	REGEX_CHECK_STATUS;
	1548
	1549	REGEX_ASSERT(pat1b != pat1a);
	1550	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
	1551	REGEX_ASSERT(pat1a->flags() == 0);
	1552	delete pat1b;
	1553
	1554	// clone
	1555	RegexPattern *pat1c = pat1->clone();
	1556	REGEX_ASSERT(pat1c == pat1);
	1557	REGEX_ASSERT(pat1c != pat2);
	1558
	1559	delete pat1c;
	1560	delete pat1a;
	1561	delete pat1;
	1562	delete pat2;
	1563
	1564
	1565	//
	1566	// Verify that a matcher created from a cloned pattern works.
	1567	// (Jitterbug 3423)
	1568	//
	1569	{
	1570	UErrorCode status = U_ZERO_ERROR;
	1571	RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
	1572	RegexPattern *pClone = pSource->clone();
	1573	delete pSource;
	1574	RegexMatcher *mFromClone = pClone->matcher(status);
	1575	REGEX_CHECK_STATUS;
	1576	UnicodeString s = "Hello World";
	1577	mFromClone->reset(s);
	1578	REGEX_ASSERT(mFromClone->find() == TRUE);
	1579	REGEX_ASSERT(mFromClone->group(status) == "Hello");
	1580	REGEX_ASSERT(mFromClone->find() == TRUE);
	1581	REGEX_ASSERT(mFromClone->group(status) == "World");
	1582	REGEX_ASSERT(mFromClone->find() == FALSE);
	1583	delete mFromClone;
	1584	delete pClone;
	1585	}
	1586
	1587	//
	1588	// matches convenience API
	1589	//
	1590	REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
	1591	REGEX_CHECK_STATUS;
	1592	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
	1593	REGEX_CHECK_STATUS;
	1594	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
	1595	REGEX_CHECK_STATUS;
	1596	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
	1597	REGEX_CHECK_STATUS;
	1598	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
	1599	REGEX_CHECK_STATUS;
	1600	status = U_INDEX_OUTOFBOUNDS_ERROR;
	1601	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
	1602	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1603
	1604
	1605	//
	1606	// Split()
	1607	//
	1608	status = U_ZERO_ERROR;
	1609	pat1 = RegexPattern::compile(" +", pe, status);
	1610	REGEX_CHECK_STATUS;
	1611	UnicodeString fields[10];
	1612
	1613	int32_t n;
	1614	n = pat1->split("Now is the time", fields, 10, status);
	1615	REGEX_CHECK_STATUS;
	1616	REGEX_ASSERT(n==4);
	1617	REGEX_ASSERT(fields[0]=="Now");
	1618	REGEX_ASSERT(fields[1]=="is");
	1619	REGEX_ASSERT(fields[2]=="the");
	1620	REGEX_ASSERT(fields[3]=="time");
	1621	REGEX_ASSERT(fields[4]=="");
	1622
	1623	n = pat1->split("Now is the time", fields, 2, status);
	1624	REGEX_CHECK_STATUS;
	1625	REGEX_ASSERT(n==2);
	1626	REGEX_ASSERT(fields[0]=="Now");
	1627	REGEX_ASSERT(fields[1]=="is the time");
	1628	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
	1629
	1630	fields[1] = "*";
	1631	status = U_ZERO_ERROR;
	1632	n = pat1->split("Now is the time", fields, 1, status);
	1633	REGEX_CHECK_STATUS;
	1634	REGEX_ASSERT(n==1);
	1635	REGEX_ASSERT(fields[0]=="Now is the time");
	1636	REGEX_ASSERT(fields[1]=="*");
	1637	status = U_ZERO_ERROR;
	1638
	1639	n = pat1->split(" Now is the time ", fields, 10, status);
	1640	REGEX_CHECK_STATUS;
	1641	REGEX_ASSERT(n==6);
	1642	REGEX_ASSERT(fields[0]=="");
	1643	REGEX_ASSERT(fields[1]=="Now");
	1644	REGEX_ASSERT(fields[2]=="is");
	1645	REGEX_ASSERT(fields[3]=="the");
	1646	REGEX_ASSERT(fields[4]=="time");
	1647	REGEX_ASSERT(fields[5]=="");
	1648
	1649	n = pat1->split(" ", fields, 10, status);
	1650	REGEX_CHECK_STATUS;
	1651	REGEX_ASSERT(n==2);
	1652	REGEX_ASSERT(fields[0]=="");
	1653	REGEX_ASSERT(fields[1]=="");
	1654
	1655	fields[0] = "foo";
	1656	n = pat1->split("", fields, 10, status);
	1657	REGEX_CHECK_STATUS;
	1658	REGEX_ASSERT(n==0);
	1659	REGEX_ASSERT(fields[0]=="foo");
	1660
	1661	delete pat1;
	1662
	1663	// split, with a pattern with (capture)
	1664	pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
	1665	REGEX_CHECK_STATUS;
	1666
	1667	status = U_ZERO_ERROR;
	1668	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
	1669	REGEX_CHECK_STATUS;
	1670	REGEX_ASSERT(n==7);
	1671	REGEX_ASSERT(fields[0]=="");
	1672	REGEX_ASSERT(fields[1]=="a");
	1673	REGEX_ASSERT(fields[2]=="Now is ");
	1674	REGEX_ASSERT(fields[3]=="b");
	1675	REGEX_ASSERT(fields[4]=="the time");
	1676	REGEX_ASSERT(fields[5]=="c");
	1677	REGEX_ASSERT(fields[6]=="");
	1678	REGEX_ASSERT(status==U_ZERO_ERROR);
	1679
	1680	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
	1681	REGEX_CHECK_STATUS;
	1682	REGEX_ASSERT(n==7);
	1683	REGEX_ASSERT(fields[0]==" ");
	1684	REGEX_ASSERT(fields[1]=="a");
	1685	REGEX_ASSERT(fields[2]=="Now is ");
	1686	REGEX_ASSERT(fields[3]=="b");
	1687	REGEX_ASSERT(fields[4]=="the time");
	1688	REGEX_ASSERT(fields[5]=="c");
	1689	REGEX_ASSERT(fields[6]=="");
	1690
	1691	status = U_ZERO_ERROR;
	1692	fields[6] = "foo";
	1693	n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
	1694	REGEX_CHECK_STATUS;
	1695	REGEX_ASSERT(n==6);
	1696	REGEX_ASSERT(fields[0]==" ");
	1697	REGEX_ASSERT(fields[1]=="a");
	1698	REGEX_ASSERT(fields[2]=="Now is ");
	1699	REGEX_ASSERT(fields[3]=="b");
	1700	REGEX_ASSERT(fields[4]=="the time");
	1701	REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
	1702	REGEX_ASSERT(fields[6]=="foo");
	1703
	1704	status = U_ZERO_ERROR;
	1705	fields[5] = "foo";
	1706	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
	1707	REGEX_CHECK_STATUS;
	1708	REGEX_ASSERT(n==5);
	1709	REGEX_ASSERT(fields[0]==" ");
	1710	REGEX_ASSERT(fields[1]=="a");
	1711	REGEX_ASSERT(fields[2]=="Now is ");
	1712	REGEX_ASSERT(fields[3]=="b");
	1713	REGEX_ASSERT(fields[4]=="the time<c>");
	1714	REGEX_ASSERT(fields[5]=="foo");
	1715
	1716	status = U_ZERO_ERROR;
	1717	fields[5] = "foo";
	1718	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
	1719	REGEX_CHECK_STATUS;
	1720	REGEX_ASSERT(n==5);
	1721	REGEX_ASSERT(fields[0]==" ");
	1722	REGEX_ASSERT(fields[1]=="a");
	1723	REGEX_ASSERT(fields[2]=="Now is ");
	1724	REGEX_ASSERT(fields[3]=="b");
	1725	REGEX_ASSERT(fields[4]=="the time");
	1726	REGEX_ASSERT(fields[5]=="foo");
	1727
	1728	status = U_ZERO_ERROR;
	1729	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
	1730	REGEX_CHECK_STATUS;
	1731	REGEX_ASSERT(n==4);
	1732	REGEX_ASSERT(fields[0]==" ");
	1733	REGEX_ASSERT(fields[1]=="a");
	1734	REGEX_ASSERT(fields[2]=="Now is ");
	1735	REGEX_ASSERT(fields[3]=="the time<c>");
	1736	status = U_ZERO_ERROR;
	1737	delete pat1;
	1738
	1739	pat1 = RegexPattern::compile("([-,])", pe, status);
	1740	REGEX_CHECK_STATUS;
	1741	n = pat1->split("1-10,20", fields, 10, status);
	1742	REGEX_CHECK_STATUS;
	1743	REGEX_ASSERT(n==5);
	1744	REGEX_ASSERT(fields[0]=="1");
	1745	REGEX_ASSERT(fields[1]=="-");
	1746	REGEX_ASSERT(fields[2]=="10");
	1747	REGEX_ASSERT(fields[3]==",");
	1748	REGEX_ASSERT(fields[4]=="20");
	1749	delete pat1;
	1750
	1751	// Test split of string with empty trailing fields
	1752	pat1 = RegexPattern::compile(",", pe, status);
	1753	REGEX_CHECK_STATUS;
	1754	n = pat1->split("a,b,c,", fields, 10, status);
	1755	REGEX_CHECK_STATUS;
	1756	REGEX_ASSERT(n==4);
	1757	REGEX_ASSERT(fields[0]=="a");
	1758	REGEX_ASSERT(fields[1]=="b");
	1759	REGEX_ASSERT(fields[2]=="c");
	1760	REGEX_ASSERT(fields[3]=="");
	1761
	1762	n = pat1->split("a,,,", fields, 10, status);
	1763	REGEX_CHECK_STATUS;
	1764	REGEX_ASSERT(n==4);
	1765	REGEX_ASSERT(fields[0]=="a");
	1766	REGEX_ASSERT(fields[1]=="");
	1767	REGEX_ASSERT(fields[2]=="");
	1768	REGEX_ASSERT(fields[3]=="");
	1769	delete pat1;
	1770
	1771	// Split Separator with zero length match.
	1772	pat1 = RegexPattern::compile(":?", pe, status);
	1773	REGEX_CHECK_STATUS;
	1774	n = pat1->split("abc", fields, 10, status);
	1775	REGEX_CHECK_STATUS;
	1776	REGEX_ASSERT(n==5);
	1777	REGEX_ASSERT(fields[0]=="");
	1778	REGEX_ASSERT(fields[1]=="a");
	1779	REGEX_ASSERT(fields[2]=="b");
	1780	REGEX_ASSERT(fields[3]=="c");
	1781	REGEX_ASSERT(fields[4]=="");
	1782
	1783	delete pat1;
	1784
	1785	//
	1786	// RegexPattern::pattern()
	1787	//
	1788	pat1 = new RegexPattern();
	1789	REGEX_ASSERT(pat1->pattern() == "");
	1790	delete pat1;
	1791
	1792	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
	1793	REGEX_CHECK_STATUS;
	1794	REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
	1795	delete pat1;
	1796
	1797
	1798	//
	1799	// classID functions
	1800	//
	1801	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
	1802	REGEX_CHECK_STATUS;
	1803	REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
	1804	REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
	1805	UnicodeString Hello("Hello, world.");
	1806	RegexMatcher *m = pat1->matcher(Hello, status);
	1807	REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
	1808	REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
	1809	REGEX_ASSERT(m->getDynamicClassID() != NULL);
	1810	delete m;
	1811	delete pat1;
	1812
	1813	}
	1814
	1815	//---------------------------------------------------------------------------
	1816	//
	1817	// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
	1818	// is present and working, but excluding functions
	1819	// implementing replace operations.
	1820	//
	1821	//---------------------------------------------------------------------------
	1822	void RegexTest::API_Match_UTF8() {
	1823	UParseError pe;
	1824	UErrorCode status=U_ZERO_ERROR;
	1825	int32_t flags = 0;
	1826
	1827	//
	1828	// Debug - slide failing test cases early
	1829	//
	1830	#if 0
	1831	{
	1832	}
	1833	return;
	1834	#endif
	1835
	1836	//
	1837	// Simple pattern compilation
	1838	//
	1839	{
	1840	UText re = UTEXT_INITIALIZER;
	1841	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
	1842	REGEX_VERBOSE_TEXT(&re);
	1843	RegexPattern *pat2;
	1844	pat2 = RegexPattern::compile(&re, flags, pe, status);
	1845	REGEX_CHECK_STATUS;
	1846
	1847	UText input1 = UTEXT_INITIALIZER;
	1848	UText input2 = UTEXT_INITIALIZER;
	1849	UText empty = UTEXT_INITIALIZER;
	1850	regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
	1851	REGEX_VERBOSE_TEXT(&input1);
	1852	regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
	1853	REGEX_VERBOSE_TEXT(&input2);
	1854	utext_openUChars(&empty, NULL, 0, &status);
	1855
	1856	int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
	1857	int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
	1858
	1859
	1860	//
	1861	// Matcher creation and reset.
	1862	//
	1863	RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
	1864	REGEX_CHECK_STATUS;
	1865	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1866	const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
	1867	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1868	m1->reset(&input2);
	1869	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	1870	const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
	1871	REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
	1872	m1->reset(&input1);
	1873	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1874	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1875	m1->reset(&empty);
	1876	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	1877	REGEX_ASSERT(utext_nativeLength(&empty) == 0);
	1878
	1879	//
	1880	// reset(pos, status)
	1881	//
	1882	m1->reset(&input1);
	1883	m1->reset(4, status);
	1884	REGEX_CHECK_STATUS;
	1885	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1886	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1887
	1888	m1->reset(-1, status);
	1889	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1890	status = U_ZERO_ERROR;
	1891
	1892	m1->reset(0, status);
	1893	REGEX_CHECK_STATUS;
	1894	status = U_ZERO_ERROR;
	1895
	1896	m1->reset(input1Len-1, status);
	1897	REGEX_CHECK_STATUS;
	1898	status = U_ZERO_ERROR;
	1899
	1900	m1->reset(input1Len, status);
	1901	REGEX_CHECK_STATUS;
	1902	status = U_ZERO_ERROR;
	1903
	1904	m1->reset(input1Len+1, status);
	1905	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1906	status = U_ZERO_ERROR;
	1907
	1908	//
	1909	// match(pos, status)
	1910	//
	1911	m1->reset(&input2);
	1912	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	1913	m1->reset();
	1914	REGEX_ASSERT(m1->matches(3, status) == FALSE);
	1915	m1->reset();
	1916	REGEX_ASSERT(m1->matches(5, status) == FALSE);
	1917	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	1918	REGEX_ASSERT(m1->matches(-1, status) == FALSE);
	1919	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1920
	1921	// Match() at end of string should fail, but should not
	1922	// be an error.
	1923	status = U_ZERO_ERROR;
	1924	REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
	1925	REGEX_CHECK_STATUS;
	1926
	1927	// Match beyond end of string should fail with an error.
	1928	status = U_ZERO_ERROR;
	1929	REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
	1930	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1931
	1932	// Successful match at end of string.
	1933	{
	1934	status = U_ZERO_ERROR;
	1935	RegexMatcher m("A?", 0, status); // will match zero length string.
	1936	REGEX_CHECK_STATUS;
	1937	m.reset(&input1);
	1938	REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
	1939	REGEX_CHECK_STATUS;
	1940	m.reset(&empty);
	1941	REGEX_ASSERT(m.matches(0, status) == TRUE);
	1942	REGEX_CHECK_STATUS;
	1943	}
	1944
	1945
	1946	//
	1947	// lookingAt(pos, status)
	1948	//
	1949	status = U_ZERO_ERROR;
	1950	m1->reset(&input2); // "not abc"
	1951	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	1952	REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
	1953	REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
	1954	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	1955	REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
	1956	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1957	status = U_ZERO_ERROR;
	1958	REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
	1959	REGEX_CHECK_STATUS;
	1960	REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
	1961	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1962
	1963	delete m1;
	1964	delete pat2;
	1965
	1966	utext_close(&re);
	1967	utext_close(&input1);
	1968	utext_close(&input2);
	1969	utext_close(&empty);
	1970	}
	1971
	1972
	1973	//
	1974	// Capture Group.
	1975	// RegexMatcher::start();
	1976	// RegexMatcher::end();
	1977	// RegexMatcher::groupCount();
	1978	//
	1979	{
	1980	int32_t flags=0;
	1981	UParseError pe;
	1982	UErrorCode status=U_ZERO_ERROR;
	1983	UText re=UTEXT_INITIALIZER;
	1984	const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
	1985	utext_openUTF8(&re, str_01234567_pat, -1, &status);
	1986
	1987	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	1988	REGEX_CHECK_STATUS;
	1989
	1990	UText input = UTEXT_INITIALIZER;
	1991	const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
	1992	utext_openUTF8(&input, str_0123456789, -1, &status);
	1993
	1994	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	1995	REGEX_CHECK_STATUS;
	1996	REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
	1997	static const int32_t matchStarts[] = {0, 2, 4, 8};
	1998	static const int32_t matchEnds[] = {10, 8, 6, 10};
	1999	int32_t i;
	2000	for (i=0; i<4; i++) {
	2001	int32_t actualStart = matcher->start(i, status);
	2002	REGEX_CHECK_STATUS;
	2003	if (actualStart != matchStarts[i]) {
	2004	errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
	2005	__FILE__, __LINE__, i, matchStarts[i], actualStart);
	2006	}
	2007	int32_t actualEnd = matcher->end(i, status);
	2008	REGEX_CHECK_STATUS;
	2009	if (actualEnd != matchEnds[i]) {
	2010	errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
	2011	__FILE__, __LINE__, i, matchEnds[i], actualEnd);
	2012	}
	2013	}
	2014
	2015	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
	2016	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
	2017
	2018	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2019	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2020	matcher->reset();
	2021	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
	2022
	2023	matcher->lookingAt(status);
	2024
	2025	UnicodeString dest;
	2026	UText destText = UTEXT_INITIALIZER;
	2027	utext_openUnicodeString(&destText, &dest, &status);
	2028	UText *result;
	2029	//const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
	2030	// Test shallow-clone API
	2031	int64_t group_len;
	2032	result = matcher->group((UText *)NULL, group_len, status);
	2033	REGEX_CHECK_STATUS;
	2034	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2035	utext_close(result);
	2036	result = matcher->group(0, &destText, group_len, status);
	2037	REGEX_CHECK_STATUS;
	2038	REGEX_ASSERT(result == &destText);
	2039	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2040	// destText is now immutable, reopen it
	2041	utext_close(&destText);
	2042	utext_openUnicodeString(&destText, &dest, &status);
	2043
	2044	int64_t length;
	2045	result = matcher->group(0, NULL, length, status);
	2046	REGEX_CHECK_STATUS;
	2047	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2048	utext_close(result);
	2049	result = matcher->group(0, &destText, length, status);
	2050	REGEX_CHECK_STATUS;
	2051	REGEX_ASSERT(result == &destText);
	2052	REGEX_ASSERT(utext_getNativeIndex(result) == 0);
	2053	REGEX_ASSERT(length == 10);
	2054	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2055
	2056	// Capture Group 1 == "234567"
	2057	result = matcher->group(1, NULL, length, status);
	2058	REGEX_CHECK_STATUS;
	2059	REGEX_ASSERT(utext_getNativeIndex(result) == 2);
	2060	REGEX_ASSERT(length == 6);
	2061	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2062	utext_close(result);
	2063
	2064	result = matcher->group(1, &destText, length, status);
	2065	REGEX_CHECK_STATUS;
	2066	REGEX_ASSERT(result == &destText);
	2067	REGEX_ASSERT(utext_getNativeIndex(result) == 2);
	2068	REGEX_ASSERT(length == 6);
	2069	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2070	utext_close(result);
	2071
	2072	// Capture Group 2 == "45"
	2073	result = matcher->group(2, NULL, length, status);
	2074	REGEX_CHECK_STATUS;
	2075	REGEX_ASSERT(utext_getNativeIndex(result) == 4);
	2076	REGEX_ASSERT(length == 2);
	2077	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2078	utext_close(result);
	2079
	2080	result = matcher->group(2, &destText, length, status);
	2081	REGEX_CHECK_STATUS;
	2082	REGEX_ASSERT(result == &destText);
	2083	REGEX_ASSERT(utext_getNativeIndex(result) == 4);
	2084	REGEX_ASSERT(length == 2);
	2085	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2086	utext_close(result);
	2087
	2088	// Capture Group 3 == "89"
	2089	result = matcher->group(3, NULL, length, status);
	2090	REGEX_CHECK_STATUS;
	2091	REGEX_ASSERT(utext_getNativeIndex(result) == 8);
	2092	REGEX_ASSERT(length == 2);
	2093	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2094	utext_close(result);
	2095
	2096	result = matcher->group(3, &destText, length, status);
	2097	REGEX_CHECK_STATUS;
	2098	REGEX_ASSERT(result == &destText);
	2099	REGEX_ASSERT(utext_getNativeIndex(result) == 8);
	2100	REGEX_ASSERT(length == 2);
	2101	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2102	utext_close(result);
	2103
	2104	// Capture Group number out of range.
	2105	status = U_ZERO_ERROR;
	2106	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2107	status = U_ZERO_ERROR;
	2108	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2109	status = U_ZERO_ERROR;
	2110	matcher->reset();
	2111	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
	2112
	2113	delete matcher;
	2114	delete pat;
	2115
	2116	utext_close(&destText);
	2117	utext_close(&input);
	2118	utext_close(&re);
	2119	}
	2120
	2121	//
	2122	// find
	2123	//
	2124	{
	2125	int32_t flags=0;
	2126	UParseError pe;
	2127	UErrorCode status=U_ZERO_ERROR;
	2128	UText re=UTEXT_INITIALIZER;
	2129	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2130	utext_openUTF8(&re, str_abc, -1, &status);
	2131
	2132	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2133	REGEX_CHECK_STATUS;
	2134	UText input = UTEXT_INITIALIZER;
	2135	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
	2136	utext_openUTF8(&input, str_abcabcabc, -1, &status);
	2137	// 012345678901234567
	2138
	2139	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	2140	REGEX_CHECK_STATUS;
	2141	REGEX_ASSERT(matcher->find());
	2142	REGEX_ASSERT(matcher->start(status) == 1);
	2143	REGEX_ASSERT(matcher->find());
	2144	REGEX_ASSERT(matcher->start(status) == 6);
	2145	REGEX_ASSERT(matcher->find());
	2146	REGEX_ASSERT(matcher->start(status) == 12);
	2147	REGEX_ASSERT(matcher->find() == FALSE);
	2148	REGEX_ASSERT(matcher->find() == FALSE);
	2149
	2150	matcher->reset();
	2151	REGEX_ASSERT(matcher->find());
	2152	REGEX_ASSERT(matcher->start(status) == 1);
	2153
	2154	REGEX_ASSERT(matcher->find(0, status));
	2155	REGEX_ASSERT(matcher->start(status) == 1);
	2156	REGEX_ASSERT(matcher->find(1, status));
	2157	REGEX_ASSERT(matcher->start(status) == 1);
	2158	REGEX_ASSERT(matcher->find(2, status));
	2159	REGEX_ASSERT(matcher->start(status) == 6);
	2160	REGEX_ASSERT(matcher->find(12, status));
	2161	REGEX_ASSERT(matcher->start(status) == 12);
	2162	REGEX_ASSERT(matcher->find(13, status) == FALSE);
	2163	REGEX_ASSERT(matcher->find(16, status) == FALSE);
	2164	REGEX_ASSERT(matcher->find(17, status) == FALSE);
	2165	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
	2166
	2167	status = U_ZERO_ERROR;
	2168	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2169	status = U_ZERO_ERROR;
	2170	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2171
	2172	REGEX_ASSERT(matcher->groupCount() == 0);
	2173
	2174	delete matcher;
	2175	delete pat;
	2176
	2177	utext_close(&input);
	2178	utext_close(&re);
	2179	}
	2180
	2181
	2182	//
	2183	// find, with \G in pattern (true if at the end of a previous match).
	2184	//
	2185	{
	2186	int32_t flags=0;
	2187	UParseError pe;
	2188	UErrorCode status=U_ZERO_ERROR;
	2189	UText re=UTEXT_INITIALIZER;
	2190	const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
	2191	utext_openUTF8(&re, str_Gabcabc, -1, &status);
	2192
	2193	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2194
	2195	REGEX_CHECK_STATUS;
	2196	UText input = UTEXT_INITIALIZER;
	2197	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
	2198	utext_openUTF8(&input, str_abcabcabc, -1, &status);
	2199	// 012345678901234567
	2200
	2201	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	2202	REGEX_CHECK_STATUS;
	2203	REGEX_ASSERT(matcher->find());
	2204	REGEX_ASSERT(matcher->start(status) == 0);
	2205	REGEX_ASSERT(matcher->start(1, status) == -1);
	2206	REGEX_ASSERT(matcher->start(2, status) == 1);
	2207
	2208	REGEX_ASSERT(matcher->find());
	2209	REGEX_ASSERT(matcher->start(status) == 4);
	2210	REGEX_ASSERT(matcher->start(1, status) == 4);
	2211	REGEX_ASSERT(matcher->start(2, status) == -1);
	2212	REGEX_CHECK_STATUS;
	2213
	2214	delete matcher;
	2215	delete pat;
	2216
	2217	utext_close(&input);
	2218	utext_close(&re);
	2219	}
	2220
	2221	//
	2222	// find with zero length matches, match position should bump ahead
	2223	// to prevent loops.
	2224	//
	2225	{
	2226	int32_t i;
	2227	UErrorCode status=U_ZERO_ERROR;
	2228	RegexMatcher m("(?= ?)", 0, status); // This pattern will produce zero-length matches anywhere,
	2229	// using an always-true look-ahead.
	2230	REGEX_CHECK_STATUS;
	2231	UText s = UTEXT_INITIALIZER;
	2232	utext_openUTF8(&s, " ", -1, &status);
	2233	m.reset(&s);
	2234	for (i=0; ; i++) {
	2235	if (m.find() == FALSE) {
	2236	break;
	2237	}
	2238	REGEX_ASSERT(m.start(status) == i);
	2239	REGEX_ASSERT(m.end(status) == i);
	2240	}
	2241	REGEX_ASSERT(i==5);
	2242
	2243	// Check that the bump goes over characters outside the BMP OK
	2244	// "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
	2245	unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
	2246	utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
	2247	m.reset(&s);
	2248	for (i=0; ; i+=4) {
	2249	if (m.find() == FALSE) {
	2250	break;
	2251	}
	2252	REGEX_ASSERT(m.start(status) == i);
	2253	REGEX_ASSERT(m.end(status) == i);
	2254	}
	2255	REGEX_ASSERT(i==20);
	2256
	2257	utext_close(&s);
	2258	}
	2259	{
	2260	// find() loop breaking test.
	2261	// with pattern of /.?/, should see a series of one char matches, then a single
	2262	// match of zero length at the end of the input string.
	2263	int32_t i;
	2264	UErrorCode status=U_ZERO_ERROR;
	2265	RegexMatcher m(".?", 0, status);
	2266	REGEX_CHECK_STATUS;
	2267	UText s = UTEXT_INITIALIZER;
	2268	utext_openUTF8(&s, " ", -1, &status);
	2269	m.reset(&s);
	2270	for (i=0; ; i++) {
	2271	if (m.find() == FALSE) {
	2272	break;
	2273	}
	2274	REGEX_ASSERT(m.start(status) == i);
	2275	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
	2276	}
	2277	REGEX_ASSERT(i==5);
	2278
	2279	utext_close(&s);
	2280	}
	2281
	2282
	2283	//
	2284	// Matchers with no input string behave as if they had an empty input string.
	2285	//
	2286
	2287	{
	2288	UErrorCode status = U_ZERO_ERROR;
	2289	RegexMatcher m(".?", 0, status);
	2290	REGEX_CHECK_STATUS;
	2291	REGEX_ASSERT(m.find());
	2292	REGEX_ASSERT(m.start(status) == 0);
	2293	REGEX_ASSERT(m.input() == "");
	2294	}
	2295	{
	2296	UErrorCode status = U_ZERO_ERROR;
	2297	RegexPattern *p = RegexPattern::compile(".", 0, status);
	2298	RegexMatcher *m = p->matcher(status);
	2299	REGEX_CHECK_STATUS;
	2300
	2301	REGEX_ASSERT(m->find() == FALSE);
	2302	REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
	2303	delete m;
	2304	delete p;
	2305	}
	2306
	2307	//
	2308	// Regions
	2309	//
	2310	{
	2311	UErrorCode status = U_ZERO_ERROR;
	2312	UText testPattern = UTEXT_INITIALIZER;
	2313	UText testText = UTEXT_INITIALIZER;
	2314	regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
	2315	REGEX_VERBOSE_TEXT(&testPattern);
	2316	regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
	2317	REGEX_VERBOSE_TEXT(&testText);
	2318
	2319	RegexMatcher m(&testPattern, &testText, 0, status);
	2320	REGEX_CHECK_STATUS;
	2321	REGEX_ASSERT(m.regionStart() == 0);
	2322	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
	2323	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2324	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2325
	2326	m.region(2,4, status);
	2327	REGEX_CHECK_STATUS;
	2328	REGEX_ASSERT(m.matches(status));
	2329	REGEX_ASSERT(m.start(status)==2);
	2330	REGEX_ASSERT(m.end(status)==4);
	2331	REGEX_CHECK_STATUS;
	2332
	2333	m.reset();
	2334	REGEX_ASSERT(m.regionStart() == 0);
	2335	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
	2336
	2337	regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
	2338	REGEX_VERBOSE_TEXT(&testText);
	2339	m.reset(&testText);
	2340	REGEX_ASSERT(m.regionStart() == 0);
	2341	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
	2342
	2343	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2344	REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
	2345	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	2346	REGEX_ASSERT(&m == &m.reset());
	2347	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	2348
	2349	REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
	2350	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2351	REGEX_ASSERT(&m == &m.reset());
	2352	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2353
	2354	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2355	REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
	2356	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	2357	REGEX_ASSERT(&m == &m.reset());
	2358	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	2359
	2360	REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
	2361	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2362	REGEX_ASSERT(&m == &m.reset());
	2363	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2364
	2365	utext_close(&testText);
	2366	utext_close(&testPattern);
	2367	}
	2368
	2369	//
	2370	// hitEnd() and requireEnd()
	2371	//
	2372	{
	2373	UErrorCode status = U_ZERO_ERROR;
	2374	UText testPattern = UTEXT_INITIALIZER;
	2375	UText testText = UTEXT_INITIALIZER;
	2376	const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
	2377	const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
	2378	utext_openUTF8(&testPattern, str_, -1, &status);
	2379	utext_openUTF8(&testText, str_aabb, -1, &status);
	2380
	2381	RegexMatcher m1(&testPattern, &testText, 0, status);
	2382	REGEX_ASSERT(m1.lookingAt(status) == TRUE);
	2383	REGEX_ASSERT(m1.hitEnd() == TRUE);
	2384	REGEX_ASSERT(m1.requireEnd() == FALSE);
	2385	REGEX_CHECK_STATUS;
	2386
	2387	status = U_ZERO_ERROR;
	2388	const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
	2389	utext_openUTF8(&testPattern, str_a, -1, &status);
	2390	RegexMatcher m2(&testPattern, &testText, 0, status);
	2391	REGEX_ASSERT(m2.lookingAt(status) == TRUE);
	2392	REGEX_ASSERT(m2.hitEnd() == FALSE);
	2393	REGEX_ASSERT(m2.requireEnd() == FALSE);
	2394	REGEX_CHECK_STATUS;
	2395
	2396	status = U_ZERO_ERROR;
	2397	const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
	2398	utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
	2399	RegexMatcher m3(&testPattern, &testText, 0, status);
	2400	REGEX_ASSERT(m3.lookingAt(status) == TRUE);
	2401	REGEX_ASSERT(m3.hitEnd() == TRUE);
	2402	REGEX_ASSERT(m3.requireEnd() == TRUE);
	2403	REGEX_CHECK_STATUS;
	2404
	2405	utext_close(&testText);
	2406	utext_close(&testPattern);
	2407	}
	2408	}
	2409
	2410
	2411	//---------------------------------------------------------------------------
	2412	//
	2413	// API_Replace_UTF8 API test for class RegexMatcher, testing the
	2414	// Replace family of functions.
	2415	//
	2416	//---------------------------------------------------------------------------
	2417	void RegexTest::API_Replace_UTF8() {
	2418	//
	2419	// Replace
	2420	//
	2421	int32_t flags=0;
	2422	UParseError pe;
	2423	UErrorCode status=U_ZERO_ERROR;
	2424
	2425	UText re=UTEXT_INITIALIZER;
	2426	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
	2427	REGEX_VERBOSE_TEXT(&re);
	2428	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2429	REGEX_CHECK_STATUS;
	2430
	2431	char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
	2432	// 012345678901234567
	2433	UText dataText = UTEXT_INITIALIZER;
	2434	utext_openUTF8(&dataText, data, -1, &status);
	2435	REGEX_CHECK_STATUS;
	2436	REGEX_VERBOSE_TEXT(&dataText);
	2437	RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
	2438
	2439	//
	2440	// Plain vanilla matches.
	2441	//
	2442	UnicodeString dest;
	2443	UText destText = UTEXT_INITIALIZER;
	2444	utext_openUnicodeString(&destText, &dest, &status);
	2445	UText *result;
	2446
	2447	UText replText = UTEXT_INITIALIZER;
	2448
	2449	const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
	2450	utext_openUTF8(&replText, str_yz, -1, &status);
	2451	REGEX_VERBOSE_TEXT(&replText);
	2452	result = matcher->replaceFirst(&replText, NULL, status);
	2453	REGEX_CHECK_STATUS;
	2454	const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
	2455	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
	2456	utext_close(result);
	2457	result = matcher->replaceFirst(&replText, &destText, status);
	2458	REGEX_CHECK_STATUS;
	2459	REGEX_ASSERT(result == &destText);
	2460	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
	2461
	2462	result = matcher->replaceAll(&replText, NULL, status);
	2463	REGEX_CHECK_STATUS;
	2464	const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
	2465	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
	2466	utext_close(result);
	2467
	2468	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2469	result = matcher->replaceAll(&replText, &destText, status);
	2470	REGEX_CHECK_STATUS;
	2471	REGEX_ASSERT(result == &destText);
	2472	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
	2473
	2474	//
	2475	// Plain vanilla non-matches.
	2476	//
	2477	const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
	2478	utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
	2479	matcher->reset(&dataText);
	2480
	2481	result = matcher->replaceFirst(&replText, NULL, status);
	2482	REGEX_CHECK_STATUS;
	2483	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2484	utext_close(result);
	2485	result = matcher->replaceFirst(&replText, &destText, status);
	2486	REGEX_CHECK_STATUS;
	2487	REGEX_ASSERT(result == &destText);
	2488	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2489
	2490	result = matcher->replaceAll(&replText, NULL, status);
	2491	REGEX_CHECK_STATUS;
	2492	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2493	utext_close(result);
	2494	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2495	result = matcher->replaceAll(&replText, &destText, status);
	2496	REGEX_CHECK_STATUS;
	2497	REGEX_ASSERT(result == &destText);
	2498	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2499
	2500	//
	2501	// Empty source string
	2502	//
	2503	utext_openUTF8(&dataText, NULL, 0, &status);
	2504	matcher->reset(&dataText);
	2505
	2506	result = matcher->replaceFirst(&replText, NULL, status);
	2507	REGEX_CHECK_STATUS;
	2508	REGEX_ASSERT_UTEXT_UTF8("", result);
	2509	utext_close(result);
	2510	result = matcher->replaceFirst(&replText, &destText, status);
	2511	REGEX_CHECK_STATUS;
	2512	REGEX_ASSERT(result == &destText);
	2513	REGEX_ASSERT_UTEXT_UTF8("", result);
	2514
	2515	result = matcher->replaceAll(&replText, NULL, status);
	2516	REGEX_CHECK_STATUS;
	2517	REGEX_ASSERT_UTEXT_UTF8("", result);
	2518	utext_close(result);
	2519	result = matcher->replaceAll(&replText, &destText, status);
	2520	REGEX_CHECK_STATUS;
	2521	REGEX_ASSERT(result == &destText);
	2522	REGEX_ASSERT_UTEXT_UTF8("", result);
	2523
	2524	//
	2525	// Empty substitution string
	2526	//
	2527	utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
	2528	matcher->reset(&dataText);
	2529
	2530	utext_openUTF8(&replText, NULL, 0, &status);
	2531	result = matcher->replaceFirst(&replText, NULL, status);
	2532	REGEX_CHECK_STATUS;
	2533	const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
	2534	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
	2535	utext_close(result);
	2536	result = matcher->replaceFirst(&replText, &destText, status);
	2537	REGEX_CHECK_STATUS;
	2538	REGEX_ASSERT(result == &destText);
	2539	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
	2540
	2541	result = matcher->replaceAll(&replText, NULL, status);
	2542	REGEX_CHECK_STATUS;
	2543	const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
	2544	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
	2545	utext_close(result);
	2546	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2547	result = matcher->replaceAll(&replText, &destText, status);
	2548	REGEX_CHECK_STATUS;
	2549	REGEX_ASSERT(result == &destText);
	2550	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
	2551
	2552	//
	2553	// match whole string
	2554	//
	2555	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2556	utext_openUTF8(&dataText, str_abc, -1, &status);
	2557	matcher->reset(&dataText);
	2558
	2559	const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
	2560	utext_openUTF8(&replText, str_xyz, -1, &status);
	2561	result = matcher->replaceFirst(&replText, NULL, status);
	2562	REGEX_CHECK_STATUS;
	2563	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2564	utext_close(result);
	2565	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2566	result = matcher->replaceFirst(&replText, &destText, status);
	2567	REGEX_CHECK_STATUS;
	2568	REGEX_ASSERT(result == &destText);
	2569	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2570
	2571	result = matcher->replaceAll(&replText, NULL, status);
	2572	REGEX_CHECK_STATUS;
	2573	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2574	utext_close(result);
	2575	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2576	result = matcher->replaceAll(&replText, &destText, status);
	2577	REGEX_CHECK_STATUS;
	2578	REGEX_ASSERT(result == &destText);
	2579	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2580
	2581	//
	2582	// Capture Group, simple case
	2583	//
	2584	const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
	2585	utext_openUTF8(&re, str_add, -1, &status);
	2586	RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
	2587	REGEX_CHECK_STATUS;
	2588
	2589	const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
	2590	utext_openUTF8(&dataText, str_abcdefg, -1, &status);
	2591	RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
	2592	REGEX_CHECK_STATUS;
	2593
	2594	const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
	2595	utext_openUTF8(&replText, str_11, -1, &status);
	2596	result = matcher2->replaceFirst(&replText, NULL, status);
	2597	REGEX_CHECK_STATUS;
	2598	const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
	2599	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
	2600	utext_close(result);
	2601	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2602	result = matcher2->replaceFirst(&replText, &destText, status);
	2603	REGEX_CHECK_STATUS;
	2604	REGEX_ASSERT(result == &destText);
	2605	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
	2606
	2607	const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
	2608	utext_openUTF8(&replText, str_v, -1, &status);
	2609	REGEX_VERBOSE_TEXT(&replText);
	2610	result = matcher2->replaceFirst(&replText, NULL, status);
	2611	REGEX_CHECK_STATUS;
	2612	const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
	2613	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
	2614	utext_close(result);
	2615	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2616	result = matcher2->replaceFirst(&replText, &destText, status);
	2617	REGEX_CHECK_STATUS;
	2618	REGEX_ASSERT(result == &destText);
	2619	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
	2620
	2621	const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
	2622	0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
	2623	0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
	2624	utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
	2625	result = matcher2->replaceFirst(&replText, NULL, status);
	2626	REGEX_CHECK_STATUS;
	2627	const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
	2628	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
	2629	utext_close(result);
	2630	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2631	result = matcher2->replaceFirst(&replText, &destText, status);
	2632	REGEX_CHECK_STATUS;
	2633	REGEX_ASSERT(result == &destText);
	2634	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
	2635
	2636	unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
	2637	//unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
	2638	// 012345678901234567890123456
	2639	supplDigitChars[22] = 0xF0;
	2640	supplDigitChars[23] = 0x9D;
	2641	supplDigitChars[24] = 0x9F;
	2642	supplDigitChars[25] = 0x8F;
	2643	utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
	2644
	2645	result = matcher2->replaceFirst(&replText, NULL, status);
	2646	REGEX_CHECK_STATUS;
	2647	const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
	2648	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
	2649	utext_close(result);
	2650	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2651	result = matcher2->replaceFirst(&replText, &destText, status);
	2652	REGEX_CHECK_STATUS;
	2653	REGEX_ASSERT(result == &destText);
	2654	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
	2655	const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
	2656	utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
	2657	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
	2658	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
	2659	utext_close(result);
	2660	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2661	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
	2662	REGEX_ASSERT(result == &destText);
	2663	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
	2664
	2665	//
	2666	// Replacement String with \u hex escapes
	2667	//
	2668	{
	2669	const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
	2670	const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
	2671	utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
	2672	utext_openUTF8(&replText, str_u0043, -1, &status);
	2673	matcher->reset(&dataText);
	2674
	2675	result = matcher->replaceAll(&replText, NULL, status);
	2676	REGEX_CHECK_STATUS;
	2677	const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
	2678	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
	2679	utext_close(result);
	2680	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2681	result = matcher->replaceAll(&replText, &destText, status);
	2682	REGEX_CHECK_STATUS;
	2683	REGEX_ASSERT(result == &destText);
	2684	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
	2685	}
	2686	{
	2687	const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
	2688	utext_openUTF8(&dataText, str_abc, -1, &status);
	2689	const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
	2690	utext_openUTF8(&replText, str_U00010000, -1, &status);
	2691	matcher->reset(&dataText);
	2692
	2693	unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
	2694	// 0123456789
	2695	expected[2] = 0xF0;
	2696	expected[3] = 0x90;
	2697	expected[4] = 0x80;
	2698	expected[5] = 0x80;
	2699
	2700	result = matcher->replaceAll(&replText, NULL, status);
	2701	REGEX_CHECK_STATUS;
	2702	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
	2703	utext_close(result);
	2704	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2705	result = matcher->replaceAll(&replText, &destText, status);
	2706	REGEX_CHECK_STATUS;
	2707	REGEX_ASSERT(result == &destText);
	2708	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
	2709	}
	2710	// TODO: need more through testing of capture substitutions.
	2711
	2712	// Bug 4057
	2713	//
	2714	{
	2715	status = U_ZERO_ERROR;
	2716	const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.?)ee /
	2717	const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
	2718	const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
	2719	utext_openUTF8(&re, str_ssee, -1, &status);
	2720	utext_openUTF8(&dataText, str_blah, -1, &status);
	2721	utext_openUTF8(&replText, str_ooh, -1, &status);
	2722
	2723	RegexMatcher m(&re, 0, status);
	2724	REGEX_CHECK_STATUS;
	2725
	2726	UnicodeString result;
	2727	UText resultText = UTEXT_INITIALIZER;
	2728	utext_openUnicodeString(&resultText, &result, &status);
	2729
	2730	// Multiple finds do NOT bump up the previous appendReplacement postion.
	2731	m.reset(&dataText);
	2732	m.find();
	2733	m.find();
	2734	m.appendReplacement(&resultText, &replText, status);
	2735	REGEX_CHECK_STATUS;
	2736	const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2737	REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
	2738
	2739	// After a reset into the interior of a string, appendReplacement still starts at beginning.
	2740	status = U_ZERO_ERROR;
	2741	result.truncate(0);
	2742	utext_openUnicodeString(&resultText, &result, &status);
	2743	m.reset(10, status);
	2744	m.find();
	2745	m.find();
	2746	m.appendReplacement(&resultText, &replText, status);
	2747	REGEX_CHECK_STATUS;
	2748	const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2749	REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
	2750
	2751	// find() at interior of string, appendReplacement still starts at beginning.
	2752	status = U_ZERO_ERROR;
	2753	result.truncate(0);
	2754	utext_openUnicodeString(&resultText, &result, &status);
	2755	m.reset();
	2756	m.find(10, status);
	2757	m.find();
	2758	m.appendReplacement(&resultText, &replText, status);
	2759	REGEX_CHECK_STATUS;
	2760	const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2761	REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
	2762
	2763	m.appendTail(&resultText, status);
	2764	const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
	2765	REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
	2766
	2767	utext_close(&resultText);
	2768	}
	2769
	2770	delete matcher2;
	2771	delete pat2;
	2772	delete matcher;
	2773	delete pat;
	2774
	2775	utext_close(&dataText);
	2776	utext_close(&replText);
	2777	utext_close(&destText);
	2778	utext_close(&re);
	2779	}
	2780
	2781
	2782	//---------------------------------------------------------------------------
	2783	//
	2784	// API_Pattern_UTF8 Test that the API for class RegexPattern is
	2785	// present and nominally working.
	2786	//
	2787	//---------------------------------------------------------------------------
	2788	void RegexTest::API_Pattern_UTF8() {
	2789	RegexPattern pata; // Test default constructor to not crash.
	2790	RegexPattern patb;
	2791
	2792	REGEX_ASSERT(pata == patb);
	2793	REGEX_ASSERT(pata == pata);
	2794
	2795	UText re1 = UTEXT_INITIALIZER;
	2796	UText re2 = UTEXT_INITIALIZER;
	2797	UErrorCode status = U_ZERO_ERROR;
	2798	UParseError pe;
	2799
	2800	const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
	2801	const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
	2802	utext_openUTF8(&re1, str_abcalmz, -1, &status);
	2803	utext_openUTF8(&re2, str_def, -1, &status);
	2804
	2805	RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
	2806	RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
	2807	REGEX_CHECK_STATUS;
	2808	REGEX_ASSERT(pat1 == pat1);
	2809	REGEX_ASSERT(*pat1 != pata);
	2810
	2811	// Assign
	2812	patb = *pat1;
	2813	REGEX_ASSERT(patb == *pat1);
	2814
	2815	// Copy Construct
	2816	RegexPattern patc(*pat1);
	2817	REGEX_ASSERT(patc == *pat1);
	2818	REGEX_ASSERT(patb == patc);
	2819	REGEX_ASSERT(pat1 != pat2);
	2820	patb = *pat2;
	2821	REGEX_ASSERT(patb != patc);
	2822	REGEX_ASSERT(patb == *pat2);
	2823
	2824	// Compile with no flags.
	2825	RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
	2826	REGEX_ASSERT(pat1a == pat1);
	2827
	2828	REGEX_ASSERT(pat1a->flags() == 0);
	2829
	2830	// Compile with different flags should be not equal
	2831	RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
	2832	REGEX_CHECK_STATUS;
	2833
	2834	REGEX_ASSERT(pat1b != pat1a);
	2835	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
	2836	REGEX_ASSERT(pat1a->flags() == 0);
	2837	delete pat1b;
	2838
	2839	// clone
	2840	RegexPattern *pat1c = pat1->clone();
	2841	REGEX_ASSERT(pat1c == pat1);
	2842	REGEX_ASSERT(pat1c != pat2);
	2843
	2844	delete pat1c;
	2845	delete pat1a;
	2846	delete pat1;
	2847	delete pat2;
	2848
	2849	utext_close(&re1);
	2850	utext_close(&re2);
	2851
	2852
	2853	//
	2854	// Verify that a matcher created from a cloned pattern works.
	2855	// (Jitterbug 3423)
	2856	//
	2857	{
	2858	UErrorCode status = U_ZERO_ERROR;
	2859	UText pattern = UTEXT_INITIALIZER;
	2860	const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
	2861	utext_openUTF8(&pattern, str_pL, -1, &status);
	2862
	2863	RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
	2864	RegexPattern *pClone = pSource->clone();
	2865	delete pSource;
	2866	RegexMatcher *mFromClone = pClone->matcher(status);
	2867	REGEX_CHECK_STATUS;
	2868
	2869	UText input = UTEXT_INITIALIZER;
	2870	const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
	2871	utext_openUTF8(&input, str_HelloWorld, -1, &status);
	2872	mFromClone->reset(&input);
	2873	REGEX_ASSERT(mFromClone->find() == TRUE);
	2874	REGEX_ASSERT(mFromClone->group(status) == "Hello");
	2875	REGEX_ASSERT(mFromClone->find() == TRUE);
	2876	REGEX_ASSERT(mFromClone->group(status) == "World");
	2877	REGEX_ASSERT(mFromClone->find() == FALSE);
	2878	delete mFromClone;
	2879	delete pClone;
	2880
	2881	utext_close(&input);
	2882	utext_close(&pattern);
	2883	}
	2884
	2885	//
	2886	// matches convenience API
	2887	//
	2888	{
	2889	UErrorCode status = U_ZERO_ERROR;
	2890	UText pattern = UTEXT_INITIALIZER;
	2891	UText input = UTEXT_INITIALIZER;
	2892
	2893	const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
	2894	utext_openUTF8(&input, str_randominput, -1, &status);
	2895
	2896	const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
	2897	utext_openUTF8(&pattern, str_dotstar, -1, &status);
	2898	REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
	2899	REGEX_CHECK_STATUS;
	2900
	2901	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2902	utext_openUTF8(&pattern, str_abc, -1, &status);
	2903	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
	2904	REGEX_CHECK_STATUS;
	2905
	2906	const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .nput /
	2907	utext_openUTF8(&pattern, str_nput, -1, &status);
	2908	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
	2909	REGEX_CHECK_STATUS;
	2910
	2911	utext_openUTF8(&pattern, str_randominput, -1, &status);
	2912	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
	2913	REGEX_CHECK_STATUS;
	2914
	2915	const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .u /
	2916	utext_openUTF8(&pattern, str_u, -1, &status);
	2917	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
	2918	REGEX_CHECK_STATUS;
	2919
	2920	utext_openUTF8(&input, str_abc, -1, &status);
	2921	utext_openUTF8(&pattern, str_abc, -1, &status);
	2922	status = U_INDEX_OUTOFBOUNDS_ERROR;
	2923	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
	2924	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	2925
	2926	utext_close(&input);
	2927	utext_close(&pattern);
	2928	}
	2929
	2930
	2931	//
	2932	// Split()
	2933	//
	2934	status = U_ZERO_ERROR;
	2935	const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
	2936	utext_openUTF8(&re1, str_spaceplus, -1, &status);
	2937	pat1 = RegexPattern::compile(&re1, pe, status);
	2938	REGEX_CHECK_STATUS;
	2939	UnicodeString fields[10];
	2940
	2941	int32_t n;
	2942	n = pat1->split("Now is the time", fields, 10, status);
	2943	REGEX_CHECK_STATUS;
	2944	REGEX_ASSERT(n==4);
	2945	REGEX_ASSERT(fields[0]=="Now");
	2946	REGEX_ASSERT(fields[1]=="is");
	2947	REGEX_ASSERT(fields[2]=="the");
	2948	REGEX_ASSERT(fields[3]=="time");
	2949	REGEX_ASSERT(fields[4]=="");
	2950
	2951	n = pat1->split("Now is the time", fields, 2, status);
	2952	REGEX_CHECK_STATUS;
	2953	REGEX_ASSERT(n==2);
	2954	REGEX_ASSERT(fields[0]=="Now");
	2955	REGEX_ASSERT(fields[1]=="is the time");
	2956	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
	2957
	2958	fields[1] = "*";
	2959	status = U_ZERO_ERROR;
	2960	n = pat1->split("Now is the time", fields, 1, status);
	2961	REGEX_CHECK_STATUS;
	2962	REGEX_ASSERT(n==1);
	2963	REGEX_ASSERT(fields[0]=="Now is the time");
	2964	REGEX_ASSERT(fields[1]=="*");
	2965	status = U_ZERO_ERROR;
	2966
	2967	n = pat1->split(" Now is the time ", fields, 10, status);
	2968	REGEX_CHECK_STATUS;
	2969	REGEX_ASSERT(n==6);
	2970	REGEX_ASSERT(fields[0]=="");
	2971	REGEX_ASSERT(fields[1]=="Now");
	2972	REGEX_ASSERT(fields[2]=="is");
	2973	REGEX_ASSERT(fields[3]=="the");
	2974	REGEX_ASSERT(fields[4]=="time");
	2975	REGEX_ASSERT(fields[5]=="");
	2976	REGEX_ASSERT(fields[6]=="");
	2977
	2978	fields[2] = "*";
	2979	n = pat1->split(" ", fields, 10, status);
	2980	REGEX_CHECK_STATUS;
	2981	REGEX_ASSERT(n==2);
	2982	REGEX_ASSERT(fields[0]=="");
	2983	REGEX_ASSERT(fields[1]=="");
	2984	REGEX_ASSERT(fields[2]=="*");
	2985
	2986	fields[0] = "foo";
	2987	n = pat1->split("", fields, 10, status);
	2988	REGEX_CHECK_STATUS;
	2989	REGEX_ASSERT(n==0);
	2990	REGEX_ASSERT(fields[0]=="foo");
	2991
	2992	delete pat1;
	2993
	2994	// split, with a pattern with (capture)
	2995	regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
	2996	pat1 = RegexPattern::compile(&re1, pe, status);
	2997	REGEX_CHECK_STATUS;
	2998
	2999	status = U_ZERO_ERROR;
	3000	fields[6] = fields[7] = "*";
	3001	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
	3002	REGEX_CHECK_STATUS;
	3003	REGEX_ASSERT(n==7);
	3004	REGEX_ASSERT(fields[0]=="");
	3005	REGEX_ASSERT(fields[1]=="a");
	3006	REGEX_ASSERT(fields[2]=="Now is ");
	3007	REGEX_ASSERT(fields[3]=="b");
	3008	REGEX_ASSERT(fields[4]=="the time");
	3009	REGEX_ASSERT(fields[5]=="c");
	3010	REGEX_ASSERT(fields[6]=="");
	3011	REGEX_ASSERT(fields[7]=="*");
	3012	REGEX_ASSERT(status==U_ZERO_ERROR);
	3013
	3014	fields[6] = fields[7] = "*";
	3015	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
	3016	REGEX_CHECK_STATUS;
	3017	REGEX_ASSERT(n==7);
	3018	REGEX_ASSERT(fields[0]==" ");
	3019	REGEX_ASSERT(fields[1]=="a");
	3020	REGEX_ASSERT(fields[2]=="Now is ");
	3021	REGEX_ASSERT(fields[3]=="b");
	3022	REGEX_ASSERT(fields[4]=="the time");
	3023	REGEX_ASSERT(fields[5]=="c");
	3024	REGEX_ASSERT(fields[6]=="");
	3025	REGEX_ASSERT(fields[7]=="*");
	3026
	3027	status = U_ZERO_ERROR;
	3028	fields[6] = "foo";
	3029	n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
	3030	REGEX_CHECK_STATUS;
	3031	REGEX_ASSERT(n==6);
	3032	REGEX_ASSERT(fields[0]==" ");
	3033	REGEX_ASSERT(fields[1]=="a");
	3034	REGEX_ASSERT(fields[2]=="Now is ");
	3035	REGEX_ASSERT(fields[3]=="b");
	3036	REGEX_ASSERT(fields[4]=="the time");
	3037	REGEX_ASSERT(fields[5]==" ");
	3038	REGEX_ASSERT(fields[6]=="foo");
	3039
	3040	status = U_ZERO_ERROR;
	3041	fields[5] = "foo";
	3042	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
	3043	REGEX_CHECK_STATUS;
	3044	REGEX_ASSERT(n==5);
	3045	REGEX_ASSERT(fields[0]==" ");
	3046	REGEX_ASSERT(fields[1]=="a");
	3047	REGEX_ASSERT(fields[2]=="Now is ");
	3048	REGEX_ASSERT(fields[3]=="b");
	3049	REGEX_ASSERT(fields[4]=="the time<c>");
	3050	REGEX_ASSERT(fields[5]=="foo");
	3051
	3052	status = U_ZERO_ERROR;
	3053	fields[5] = "foo";
	3054	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
	3055	REGEX_CHECK_STATUS;
	3056	REGEX_ASSERT(n==5);
	3057	REGEX_ASSERT(fields[0]==" ");
	3058	REGEX_ASSERT(fields[1]=="a");
	3059	REGEX_ASSERT(fields[2]=="Now is ");
	3060	REGEX_ASSERT(fields[3]=="b");
	3061	REGEX_ASSERT(fields[4]=="the time");
	3062	REGEX_ASSERT(fields[5]=="foo");
	3063
	3064	status = U_ZERO_ERROR;
	3065	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
	3066	REGEX_CHECK_STATUS;
	3067	REGEX_ASSERT(n==4);
	3068	REGEX_ASSERT(fields[0]==" ");
	3069	REGEX_ASSERT(fields[1]=="a");
	3070	REGEX_ASSERT(fields[2]=="Now is ");
	3071	REGEX_ASSERT(fields[3]=="the time<c>");
	3072	status = U_ZERO_ERROR;
	3073	delete pat1;
	3074
	3075	regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
	3076	pat1 = RegexPattern::compile(&re1, pe, status);
	3077	REGEX_CHECK_STATUS;
	3078	n = pat1->split("1-10,20", fields, 10, status);
	3079	REGEX_CHECK_STATUS;
	3080	REGEX_ASSERT(n==5);
	3081	REGEX_ASSERT(fields[0]=="1");
	3082	REGEX_ASSERT(fields[1]=="-");
	3083	REGEX_ASSERT(fields[2]=="10");
	3084	REGEX_ASSERT(fields[3]==",");
	3085	REGEX_ASSERT(fields[4]=="20");
	3086	delete pat1;
	3087
	3088
	3089	//
	3090	// split of a UText based string, with library allocating output UTexts.
	3091	//
	3092	{
	3093	status = U_ZERO_ERROR;
	3094	RegexMatcher matcher(UnicodeString("(:)"), 0, status);
	3095	UnicodeString stringToSplit("first:second:third");
	3096	UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
	3097	REGEX_CHECK_STATUS;
	3098
	3099	UText *splits[10] = {NULL};
	3100	int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
	3101	REGEX_CHECK_STATUS;
	3102	REGEX_ASSERT(numFields == 5);
	3103	REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
	3104	REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
	3105	REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
	3106	REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
	3107	REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
	3108	REGEX_ASSERT(splits[5] == NULL);
	3109
	3110	for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
	3111	if (splits[i]) {
	3112	utext_close(splits[i]);
	3113	splits[i] = NULL;
	3114	}
	3115	}
	3116	utext_close(textToSplit);
	3117	}
	3118
	3119
	3120	//
	3121	// RegexPattern::pattern() and patternText()
	3122	//
	3123	pat1 = new RegexPattern();
	3124	REGEX_ASSERT(pat1->pattern() == "");
	3125	REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
	3126	delete pat1;
	3127	const char helloWorldInvariant = "(Hello, world)";
	3128	regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
	3129	pat1 = RegexPattern::compile(&re1, pe, status);
	3130	REGEX_CHECK_STATUS;
	3131	REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
	3132	REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
	3133	delete pat1;
	3134
	3135	utext_close(&re1);
	3136	}
	3137
	3138
	3139	//---------------------------------------------------------------------------
	3140	//
	3141	// Extended A more thorough check for features of regex patterns
	3142	// The test cases are in a separate data file,
	3143	// source/tests/testdata/regextst.txt
	3144	// A description of the test data format is included in that file.
	3145	//
	3146	//---------------------------------------------------------------------------
	3147
	3148	const char *
	3149	RegexTest::getPath(char buffer[2048], const char *filename) {
	3150	UErrorCode status=U_ZERO_ERROR;
	3151	const char *testDataDirectory = IntlTest::getSourceTestData(status);
	3152	if (U_FAILURE(status)) {
	3153	errln("ERROR: loadTestData() failed - %s", u_errorName(status));
	3154	return NULL;
	3155	}
	3156
	3157	strcpy(buffer, testDataDirectory);
	3158	strcat(buffer, filename);
	3159	return buffer;
	3160	}
	3161
	3162	void RegexTest::Extended() {
	3163	char tdd[2048];
	3164	const char *srcPath;
	3165	UErrorCode status = U_ZERO_ERROR;
	3166	int32_t lineNum = 0;
	3167
	3168	//
	3169	// Open and read the test data file.
	3170	//
	3171	srcPath=getPath(tdd, "regextst.txt");
	3172	if(srcPath==NULL) {
	3173	return; /* something went wrong, error already output */
	3174	}
	3175
	3176	int32_t len;
	3177	UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
	3178	if (U_FAILURE(status)) {
	3179	return; /* something went wrong, error already output */
	3180	}
	3181
	3182	//
	3183	// Put the test data into a UnicodeString
	3184	//
	3185	UnicodeString testString(FALSE, testData, len);
	3186
	3187	RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s([\\'\\\"/])(.?)\\1"), 0, status);
	3188	RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s(#.)?$"), 0, status);
	3189	RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s([ixsmdteDEGLMQvabtyYzZ2-9])([:letter:]*)"), 0, status);
	3190
	3191	RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
	3192	UnicodeString testPattern; // The pattern for test from the test file.
	3193	UnicodeString testFlags; // the flags for a test.
	3194	UnicodeString matchString; // The marked up string to be used as input
	3195
	3196	if (U_FAILURE(status)){
	3197	dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
	3198	delete [] testData;
	3199	return;
	3200	}
	3201
	3202	//
	3203	// Loop over the test data file, once per line.
	3204	//
	3205	while (lineMat.find()) {
	3206	lineNum++;
	3207	if (U_FAILURE(status)) {
	3208	errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
	3209	}
	3210
	3211	status = U_ZERO_ERROR;
	3212	UnicodeString testLine = lineMat.group(1, status);
	3213	if (testLine.length() == 0) {
	3214	continue;
	3215	}
	3216
	3217	//
	3218	// Parse the test line. Skip blank and comment only lines.
	3219	// Separate out the three main fields - pattern, flags, target.
	3220	//
	3221
	3222	commentMat.reset(testLine);
	3223	if (commentMat.lookingAt(status)) {
	3224	// This line is a comment, or blank.
	3225	continue;
	3226	}
	3227
	3228	//
	3229	// Pull out the pattern field, remove it from the test file line.
	3230	//
	3231	quotedStuffMat.reset(testLine);
	3232	if (quotedStuffMat.lookingAt(status)) {
	3233	testPattern = quotedStuffMat.group(2, status);
	3234	testLine.remove(0, quotedStuffMat.end(0, status));
	3235	} else {
	3236	errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
	3237	continue;
	3238	}
	3239
	3240
	3241	//
	3242	// Pull out the flags from the test file line.
	3243	//
	3244	flagsMat.reset(testLine);
	3245	flagsMat.lookingAt(status); // Will always match, possibly an empty string.
	3246	testFlags = flagsMat.group(1, status);
	3247	if (flagsMat.group(2, status).length() > 0) {
	3248	errln("Bad Match flag at line %d. Scanning %c\n",
	3249	lineNum, flagsMat.group(2, status).charAt(0));
	3250	continue;
	3251	}
	3252	testLine.remove(0, flagsMat.end(0, status));
	3253
	3254	//
	3255	// Pull out the match string, as a whole.
	3256	// We'll process the <tags> later.
	3257	//
	3258	quotedStuffMat.reset(testLine);
	3259	if (quotedStuffMat.lookingAt(status)) {
	3260	matchString = quotedStuffMat.group(2, status);
	3261	testLine.remove(0, quotedStuffMat.end(0, status));
	3262	} else {
	3263	errln("Bad match string at test file line %d", lineNum);
	3264	continue;
	3265	}
	3266
	3267	//
	3268	// The only thing left from the input line should be an optional trailing comment.
	3269	//
	3270	commentMat.reset(testLine);
	3271	if (commentMat.lookingAt(status) == FALSE) {
	3272	errln("Line %d: unexpected characters at end of test line.", lineNum);
	3273	continue;
	3274	}
	3275
	3276	//
	3277	// Run the test
	3278	//
	3279	regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
	3280	}
	3281
	3282	delete [] testData;
	3283
	3284	}
	3285
	3286
	3287
	3288	//---------------------------------------------------------------------------
	3289	//
	3290	// regex_find(pattern, flags, inputString, lineNumber)
	3291	//
	3292	// Function to run a single test from the Extended (data driven) tests.
	3293	// See file test/testdata/regextst.txt for a description of the
	3294	// pattern and inputString fields, and the allowed flags.
	3295	// lineNumber is the source line in regextst.txt of the test.
	3296	//
	3297	//---------------------------------------------------------------------------
	3298
	3299
	3300	// Set a value into a UVector at position specified by a decimal number in
	3301	// a UnicodeString. This is a utility function needed by the actual test function,
	3302	// which follows.
	3303	static void set(UVector &vec, int32_t val, UnicodeString index) {
	3304	UErrorCode status=U_ZERO_ERROR;
	3305	int32_t idx = 0;
	3306	for (int32_t i=0; i<index.length(); i++) {
	3307	int32_t d=u_charDigitValue(index.charAt(i));
	3308	if (d<0) {return;}
	3309	idx = idx*10 + d;
	3310	}
	3311	while (vec.size()<idx+1) {vec.addElement(-1, status);}
	3312	vec.setElementAt(val, idx);
	3313	}
	3314
	3315	static void setInt(UVector &vec, int32_t val, int32_t idx) {
	3316	UErrorCode status=U_ZERO_ERROR;
	3317	while (vec.size()<idx+1) {vec.addElement(-1, status);}
	3318	vec.setElementAt(val, idx);
	3319	}
	3320
	3321	static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
	3322	{
	3323	UBool couldFind = TRUE;
	3324	UTEXT_SETNATIVEINDEX(utext, 0);
	3325	int32_t i = 0;
	3326	while (i < unistrOffset) {
	3327	UChar32 c = UTEXT_NEXT32(utext);
	3328	if (c != U_SENTINEL) {
	3329	i += U16_LENGTH(c);
	3330	} else {
	3331	couldFind = FALSE;
	3332	break;
	3333	}
	3334	}
	3335	nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
	3336	return couldFind;
	3337	}
	3338
	3339
	3340	void RegexTest::regex_find(const UnicodeString &pattern,
	3341	const UnicodeString &flags,
	3342	const UnicodeString &inputString,
	3343	const char *srcPath,
	3344	int32_t line) {
	3345	UnicodeString unEscapedInput;
	3346	UnicodeString deTaggedInput;
	3347
	3348	int32_t patternUTF8Length, inputUTF8Length;
	3349	char patternChars = NULL, inputChars = NULL;
	3350	UText patternText = UTEXT_INITIALIZER;
	3351	UText inputText = UTEXT_INITIALIZER;
	3352	UConverter *UTF8Converter = NULL;
	3353
	3354	UErrorCode status = U_ZERO_ERROR;
	3355	UParseError pe;
	3356	RegexPattern *parsePat = NULL;
	3357	RegexMatcher *parseMatcher = NULL;
	3358	RegexPattern callerPattern = NULL, UTF8Pattern = NULL;
	3359	RegexMatcher matcher = NULL, UTF8Matcher = NULL;
	3360	UVector groupStarts(status);
	3361	UVector groupEnds(status);
	3362	UVector groupStartsUTF8(status);
	3363	UVector groupEndsUTF8(status);
	3364	UBool isMatch = FALSE, isUTF8Match = FALSE;
	3365	UBool failed = FALSE;
	3366	int32_t numFinds;
	3367	int32_t i;
	3368	UBool useMatchesFunc = FALSE;
	3369	UBool useLookingAtFunc = FALSE;
	3370	int32_t regionStart = -1;
	3371	int32_t regionEnd = -1;
	3372	int32_t regionStartUTF8 = -1;
	3373	int32_t regionEndUTF8 = -1;
	3374
	3375
	3376	//
	3377	// Compile the caller's pattern
	3378	//
	3379	uint32_t bflags = 0;
	3380	if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
	3381	bflags \|= UREGEX_CASE_INSENSITIVE;
	3382	}
	3383	if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
	3384	bflags \|= UREGEX_COMMENTS;
	3385	}
	3386	if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
	3387	bflags \|= UREGEX_DOTALL;
	3388	}
	3389	if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
	3390	bflags \|= UREGEX_MULTILINE;
	3391	}
	3392
	3393	if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
	3394	bflags \|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
	3395	}
	3396	if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
	3397	bflags \|= UREGEX_UNIX_LINES;
	3398	}
	3399	if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
	3400	bflags \|= UREGEX_LITERAL;
	3401	}
	3402
	3403
	3404	callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
	3405	if (status != U_ZERO_ERROR) {
	3406	#if UCONFIG_NO_BREAK_ITERATION==1
	3407	// 'v' test flag means that the test pattern should not compile if ICU was configured
	3408	// to not include break iteration. RBBI is needed for Unicode word boundaries.
	3409	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
	3410	goto cleanupAndReturn;
	3411	}
	3412	#endif
	3413	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
	3414	// Expected pattern compilation error.
	3415	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
	3416	logln("Pattern Compile returns \"%s\"", u_errorName(status));
	3417	}
	3418	goto cleanupAndReturn;
	3419	} else {
	3420	// Unexpected pattern compilation error.
	3421	dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
	3422	goto cleanupAndReturn;
	3423	}
	3424	}
	3425
	3426	UTF8Converter = ucnv_open("UTF8", &status);
	3427	ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	3428
	3429	patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
	3430	status = U_ZERO_ERROR; // buffer overflow
	3431	patternChars = new char[patternUTF8Length+1];
	3432	pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
	3433	utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
	3434
	3435	if (status == U_ZERO_ERROR) {
	3436	UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
	3437
	3438	if (status != U_ZERO_ERROR) {
	3439	#if UCONFIG_NO_BREAK_ITERATION==1
	3440	// 'v' test flag means that the test pattern should not compile if ICU was configured
	3441	// to not include break iteration. RBBI is needed for Unicode word boundaries.
	3442	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
	3443	goto cleanupAndReturn;
	3444	}
	3445	#endif
	3446	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
	3447	// Expected pattern compilation error.
	3448	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
	3449	logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
	3450	}
	3451	goto cleanupAndReturn;
	3452	} else {
	3453	// Unexpected pattern compilation error.
	3454	errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
	3455	goto cleanupAndReturn;
	3456	}
	3457	}
	3458	}
	3459
	3460	if (UTF8Pattern == NULL) {
	3461	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
	3462	logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
	3463	status = U_ZERO_ERROR;
	3464	}
	3465
	3466	if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
	3467	callerPattern->dumpPattern();
	3468	}
	3469
	3470	if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
	3471	errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
	3472	goto cleanupAndReturn;
	3473	}
	3474
	3475
	3476	//
	3477	// Number of times find() should be called on the test string, default to 1
	3478	//
	3479	numFinds = 1;
	3480	for (i=2; i<=9; i++) {
	3481	if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
	3482	if (numFinds != 1) {
	3483	errln("Line %d: more than one digit flag. Scanning %d.", line, i);
	3484	goto cleanupAndReturn;
	3485	}
	3486	numFinds = i;
	3487	}
	3488	}
	3489
	3490	// 'M' flag. Use matches() instead of find()
	3491	if (flags.indexOf((UChar)0x4d) >= 0) {
	3492	useMatchesFunc = TRUE;
	3493	}
	3494	if (flags.indexOf((UChar)0x4c) >= 0) {
	3495	useLookingAtFunc = TRUE;
	3496	}
	3497
	3498	//
	3499	// Find the tags in the input data, remove them, and record the group boundary
	3500	// positions.
	3501	//
	3502	parsePat = RegexPattern::compile("<(/?)(r\|[0-9]+)>", 0, pe, status);
	3503	REGEX_CHECK_STATUS_L(line);
	3504
	3505	unEscapedInput = inputString.unescape();
	3506	parseMatcher = parsePat->matcher(unEscapedInput, status);
	3507	REGEX_CHECK_STATUS_L(line);
	3508	while(parseMatcher->find()) {
	3509	parseMatcher->appendReplacement(deTaggedInput, "", status);
	3510	REGEX_CHECK_STATUS;
	3511	UnicodeString groupNum = parseMatcher->group(2, status);
	3512	if (groupNum == "r") {
	3513	// <r> or </r>, a region specification within the string
	3514	if (parseMatcher->group(1, status) == "/") {
	3515	regionEnd = deTaggedInput.length();
	3516	} else {
	3517	regionStart = deTaggedInput.length();
	3518	}
	3519	} else {
	3520	// <digits> or </digits>, a group match boundary tag.
	3521	if (parseMatcher->group(1, status) == "/") {
	3522	set(groupEnds, deTaggedInput.length(), groupNum);
	3523	} else {
	3524	set(groupStarts, deTaggedInput.length(), groupNum);
	3525	}
	3526	}
	3527	}
	3528	parseMatcher->appendTail(deTaggedInput);
	3529
	3530	if (groupStarts.size() != groupEnds.size()) {
	3531	errln("Error at line %d: mismatched <n> group tags in expected results.", line);
	3532	failed = true;
	3533	goto cleanupAndReturn;
	3534	}
	3535	if ((regionStart>=0 \|\| regionEnd>=0) && (regionStart<0 \|\| regionStart>regionEnd)) {
	3536	errln("mismatched <r> tags");
	3537	failed = TRUE;
	3538	goto cleanupAndReturn;
	3539	}
	3540
	3541	//
	3542	// Configure the matcher according to the flags specified with this test.
	3543	//
	3544	matcher = callerPattern->matcher(deTaggedInput, status);
	3545	REGEX_CHECK_STATUS_L(line);
	3546	if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
	3547	matcher->setTrace(TRUE);
	3548	}
	3549
	3550	if (UTF8Pattern != NULL) {
	3551	inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
	3552	status = U_ZERO_ERROR; // buffer overflow
	3553	inputChars = new char[inputUTF8Length+1];
	3554	deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
	3555	utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
	3556
	3557	if (status == U_ZERO_ERROR) {
	3558	UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
	3559	REGEX_CHECK_STATUS_L(line);
	3560	}
	3561
	3562	if (UTF8Matcher == NULL) {
	3563	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
	3564	logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
	3565	status = U_ZERO_ERROR;
	3566	}
	3567	}
	3568
	3569	//
	3570	// Generate native indices for UTF8 versions of region and capture group info
	3571	//
	3572	if (UTF8Matcher != NULL) {
	3573	if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
	3574	UTF8Matcher->setTrace(TRUE);
	3575	}
	3576	if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
	3577	if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
	3578
	3579	// Fill out the native index UVector info.
	3580	// Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
	3581	for (i=0; i<groupStarts.size(); i++) {
	3582	int32_t start = groupStarts.elementAti(i);
	3583	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
	3584	if (start >= 0) {
	3585	int32_t startUTF8;
	3586	if (!utextOffsetToNative(&inputText, start, startUTF8)) {
	3587	errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
	3588	failed = TRUE;
	3589	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3590	}
	3591	setInt(groupStartsUTF8, startUTF8, i);
	3592	}
	3593
	3594	int32_t end = groupEnds.elementAti(i);
	3595	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
	3596	if (end >= 0) {
	3597	int32_t endUTF8;
	3598	if (!utextOffsetToNative(&inputText, end, endUTF8)) {
	3599	errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
	3600	failed = TRUE;
	3601	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3602	}
	3603	setInt(groupEndsUTF8, endUTF8, i);
	3604	}
	3605	}
	3606	}
	3607
	3608	if (regionStart>=0) {
	3609	matcher->region(regionStart, regionEnd, status);
	3610	REGEX_CHECK_STATUS_L(line);
	3611	if (UTF8Matcher != NULL) {
	3612	UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
	3613	REGEX_CHECK_STATUS_L(line);
	3614	}
	3615	}
	3616	if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
	3617	matcher->useAnchoringBounds(FALSE);
	3618	if (UTF8Matcher != NULL) {
	3619	UTF8Matcher->useAnchoringBounds(FALSE);
	3620	}
	3621	}
	3622	if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
	3623	matcher->useTransparentBounds(TRUE);
	3624	if (UTF8Matcher != NULL) {
	3625	UTF8Matcher->useTransparentBounds(TRUE);
	3626	}
	3627	}
	3628
	3629
	3630
	3631	//
	3632	// Do a find on the de-tagged input using the caller's pattern
	3633	// TODO: error on count>1 and not find().
	3634	// error on both matches() and lookingAt().
	3635	//
	3636	for (i=0; i<numFinds; i++) {
	3637	if (useMatchesFunc) {
	3638	isMatch = matcher->matches(status);
	3639	if (UTF8Matcher != NULL) {
	3640	isUTF8Match = UTF8Matcher->matches(status);
	3641	}
	3642	} else if (useLookingAtFunc) {
	3643	isMatch = matcher->lookingAt(status);
	3644	if (UTF8Matcher != NULL) {
	3645	isUTF8Match = UTF8Matcher->lookingAt(status);
	3646	}
	3647	} else {
	3648	isMatch = matcher->find();
	3649	if (UTF8Matcher != NULL) {
	3650	isUTF8Match = UTF8Matcher->find();
	3651	}
	3652	}
	3653	}
	3654	matcher->setTrace(FALSE);
	3655	if (UTF8Matcher) {
	3656	UTF8Matcher->setTrace(FALSE);
	3657	}
	3658	if (U_FAILURE(status)) {
	3659	errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
	3660	}
	3661
	3662	//
	3663	// Match up the groups from the find() with the groups from the tags
	3664	//
	3665
	3666	// number of tags should match number of groups from find operation.
	3667	// matcher->groupCount does not include group 0, the entire match, hence the +1.
	3668	// G option in test means that capture group data is not available in the
	3669	// expected results, so the check needs to be suppressed.
	3670	if (isMatch == FALSE && groupStarts.size() != 0) {
	3671	dataerrln("Error at line %d: Match expected, but none found.", line);
	3672	failed = TRUE;
	3673	goto cleanupAndReturn;
	3674	} else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
	3675	errln("Error at line %d: Match expected, but none found. (UTF8)", line);
	3676	failed = TRUE;
	3677	goto cleanupAndReturn;
	3678	}
	3679	if (isMatch && groupStarts.size() == 0) {
	3680	errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
	3681	failed = TRUE;
	3682	}
	3683	if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
	3684	errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
	3685	failed = TRUE;
	3686	}
	3687
	3688	if (flags.indexOf((UChar)0x47 /G/) >= 0) {
	3689	// Only check for match / no match. Don't check capture groups.
	3690	goto cleanupAndReturn;
	3691	}
	3692
	3693	REGEX_CHECK_STATUS_L(line);
	3694	for (i=0; i<=matcher->groupCount(); i++) {
	3695	int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
	3696	int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
	3697	if (matcher->start(i, status) != expectedStart) {
	3698	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
	3699	line, i, expectedStart, matcher->start(i, status));
	3700	failed = TRUE;
	3701	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3702	} else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
	3703	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
	3704	line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
	3705	failed = TRUE;
	3706	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3707	}
	3708
	3709	int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
	3710	int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
	3711	if (matcher->end(i, status) != expectedEnd) {
	3712	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
	3713	line, i, expectedEnd, matcher->end(i, status));
	3714	failed = TRUE;
	3715	// Error on end position; keep going; real error is probably yet to come as group
	3716	// end positions work from end of the input data towards the front.
	3717	} else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
	3718	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
	3719	line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
	3720	failed = TRUE;
	3721	// Error on end position; keep going; real error is probably yet to come as group
	3722	// end positions work from end of the input data towards the front.
	3723	}
	3724	}
	3725	if ( matcher->groupCount()+1 < groupStarts.size()) {
	3726	errln("Error at line %d: Expected %d capture groups, found %d.",
	3727	line, groupStarts.size()-1, matcher->groupCount());
	3728	failed = TRUE;
	3729	}
	3730	else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
	3731	errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
	3732	line, groupStarts.size()-1, UTF8Matcher->groupCount());
	3733	failed = TRUE;
	3734	}
	3735
	3736	if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
	3737	matcher->requireEnd() == TRUE) {
	3738	errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
	3739	failed = TRUE;
	3740	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
	3741	UTF8Matcher->requireEnd() == TRUE) {
	3742	errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
	3743	failed = TRUE;
	3744	}
	3745
	3746	if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
	3747	matcher->requireEnd() == FALSE) {
	3748	errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
	3749	failed = TRUE;
	3750	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
	3751	UTF8Matcher->requireEnd() == FALSE) {
	3752	errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
	3753	failed = TRUE;
	3754	}
	3755
	3756	if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
	3757	matcher->hitEnd() == TRUE) {
	3758	errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
	3759	failed = TRUE;
	3760	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
	3761	UTF8Matcher->hitEnd() == TRUE) {
	3762	errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
	3763	failed = TRUE;
	3764	}
	3765
	3766	if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
	3767	matcher->hitEnd() == FALSE) {
	3768	errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
	3769	failed = TRUE;
	3770	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
	3771	UTF8Matcher->hitEnd() == FALSE) {
	3772	errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
	3773	failed = TRUE;
	3774	}
	3775
	3776
	3777	cleanupAndReturn:
	3778	if (failed) {
	3779	infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
	3780	+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
	3781	// callerPattern->dump();
	3782	}
	3783	delete parseMatcher;
	3784	delete parsePat;
	3785	delete UTF8Matcher;
	3786	delete UTF8Pattern;
	3787	delete matcher;
	3788	delete callerPattern;
	3789
	3790	utext_close(&inputText);
	3791	delete[] inputChars;
	3792	utext_close(&patternText);
	3793	delete[] patternChars;
	3794	ucnv_close(UTF8Converter);
	3795	}
	3796
	3797
	3798
	3799
	3800	//---------------------------------------------------------------------------
	3801	//
	3802	// Errors Check for error handling in patterns.
		//
		// Each REGEX_ERR(...) line hands one malformed pattern to the REGEX_ERR
		// macro, which is defined elsewhere in this file. The two integers look like
		// the expected error line and column and the last argument the expected
		// UErrorCode -- NOTE(review): confirm against the macro definition.
	3803	//
	3804	//---------------------------------------------------------------------------
	3805	void RegexTest::Errors() {
	3806	// \escape sequences that aren't implemented yet.
	3807	//REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
	3808
	3809	// Missing close parentheses
	3810	REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
	3811	REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
	3812	REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
	3813
	3814	// Extra close paren
	3815	REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
	3816	REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
	3817	REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
	3818
	3819	// Look-ahead, Look-behind
	3820	// TODO: add tests for unbounded length look-behinds.
	3821	REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
	3822
	3823	// Attempt to use non-default flags: this combination must be rejected
		// with U_REGEX_UNIMPLEMENTED.
	3824	{
	3825	UParseError pe;
	3826	UErrorCode status = U_ZERO_ERROR;
	3827	int32_t flags = UREGEX_CANON_EQ \|
	3828	UREGEX_COMMENTS \| UREGEX_DOTALL \|
	3829	UREGEX_MULTILINE;
		// compile() hands back a heap pointer, so pat1 has to be a pointer for
		// the assignment and for the delete below.
	3830	RegexPattern *pat1= RegexPattern::compile(".", flags, pe, status);
	3831	REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
	3832	delete pat1; // deleting a null pointer is a no-op
	3833	}
	3834
	3835
	3836	// Quantifiers are allowed only after something that can be quantified.
	3837	REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
	3838	REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
	3839	REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
	3840
	3841	// Mal-formed {min,max} quantifiers
	3842	REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
	3843	REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
	3844	REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
	3845	REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
	3846	REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
	3847	REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
	3848	REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
	3849	REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
	3850	REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
	3851
	3852	// Ticket 5389
	3853	REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
	3854
	3855	// Invalid Back Reference \0
	3856	// For ICU 3.8 and earlier
	3857	// For ICU versions newer than 3.8, \0 introduces an octal escape.
	3858	//
	3859	REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
	3860
	3861	}
	3862
	3863
	3864	//-------------------------------------------------------------------------------
	3865	//
	3866	// Read a text data file, convert it to UChars, and return the data
	3867	// in one big UChar * buffer, which the caller must release with delete[].
		//
		// fileName     path handed to fopen(); the file is opened in binary mode.
		// ulen         receives the length reported by ucnv_toUChars; it is reset to 0
		//              on entry and again if the function ends with a failure status.
		// defEncoding  converter name used when the data has no Unicode signature
		//              (BOM). A BOM-less file with defEncoding "utf-8" is reported
		//              through errln(), but conversion is still attempted.
		// status       in/out ICU error code. Nothing is done if it already holds a
		//              failure. Set to U_FILE_ACCESS_ERROR when the file cannot be
		//              opened or read.
		// returns      a buffer allocated with new UChar[]; NULL whenever status
		//              ends up a failure.
	3868	//
	3869	//--------------------------------------------------------------------------------
	3870	UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
	3871	const char *defEncoding, UErrorCode &status) {
	3872	UChar *retPtr = NULL;
	3873	char *fileBuf = NULL;
	3874	UConverter* conv = NULL;
	3875	FILE *f = NULL;
	3876
	3877	ulen = 0;
	3878	if (U_FAILURE(status)) {
	3879	return retPtr;
	3880	}
	3881
	3882	//
	3883	// Open the file.
	3884	//
	3885	f = fopen(fileName, "rb");
	3886	if (f == 0) {
	3887	dataerrln("Error opening test data file %s\n", fileName);
	3888	status = U_FILE_ACCESS_ERROR;
	3889	return NULL;
	3890	}
	3891	//
	3892	// Read it in
	3893	//
	3894	int32_t fileSize;
	3895	int32_t amt_read;
	3896
	3897	fseek( f, 0, SEEK_END);
	3898	fileSize = ftell(f);
		// Check the size before it is used as an array length: ftell() reports
		// failure with a negative value, and an empty file has nothing to convert.
		if (fileSize <= 0) {
		errln("Error reading test data file.");
		status = U_FILE_ACCESS_ERROR;
		goto cleanUpAndReturn;
		}
	3899	fileBuf = new char[fileSize];
	3900	fseek(f, 0, SEEK_SET);
	3901	amt_read = static_cast<int32_t>(fread(fileBuf, 1, fileSize, f));
	3902	if (amt_read != fileSize) {
	3903	errln("Error reading test data file.");
		// Report the failure through status as well, the same way the fopen()
		// failure above does, so callers testing U_FAILURE(status) stop here.
		status = U_FILE_ACCESS_ERROR;
	3904	goto cleanUpAndReturn;
	3905	}
	3906
	3907	//
	3908	// Look for a Unicode Signature (BOM) on the data just read
	3909	//
	3910	int32_t signatureLength;
	3911	const char * fileBufC;
	3912	const char* encoding;
	3913
	3914	fileBufC = fileBuf;
	3915	encoding = ucnv_detectUnicodeSignature(
	3916	fileBuf, fileSize, &signatureLength, &status);
	3917	if(encoding!=NULL ){
		// Skip the signature bytes; only the text after them is converted.
	3918	fileBufC += signatureLength;
	3919	fileSize -= signatureLength;
	3920	} else {
	3921	encoding = defEncoding;
	3922	if (strcmp(encoding, "utf-8") == 0) {
	3923	errln("file %s is missing its BOM", fileName);
	3924	}
	3925	}
	3926
	3927	//
	3928	// Open a converter to take the rule file to UTF-16
	3929	//
	3930	conv = ucnv_open(encoding, &status);
	3931	if (U_FAILURE(status)) {
	3932	goto cleanUpAndReturn;
	3933	}
	3934
	3935	//
	3936	// Convert the rules to UChar.
	3937	// Preflight first to determine required buffer size.
	3938	//
	3939	ulen = ucnv_toUChars(conv,
	3940	NULL, // dest,
	3941	0, // destCapacity,
	3942	fileBufC,
	3943	fileSize,
	3944	&status);
	3945	if (status == U_BUFFER_OVERFLOW_ERROR) {
	3946	// Buffer Overflow is expected from the preflight operation.
	3947	status = U_ZERO_ERROR;
	3948
		// One UChar more than the preflighted length is allocated and passed
		// as the capacity of the real conversion.
	3949	retPtr = new UChar[ulen+1];
	3950	ucnv_toUChars(conv,
	3951	retPtr, // dest,
	3952	ulen+1,
	3953	fileBufC,
	3954	fileSize,
	3955	&status);
	3956	}
	3957
		// Common exit: every path that got past fopen() releases the file, the raw
		// byte buffer and the converter here (both may still be NULL).
	3958	cleanUpAndReturn:
	3959	fclose(f);
	3960	delete[] fileBuf;
	3961	ucnv_close(conv);
	3962	if (U_FAILURE(status)) {
		// This message is printed for any failure status reaching this point,
		// not only for failures of ucnv_toUChars().
	3963	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	3964	delete []retPtr;
	3965	retPtr = 0;
	3966	ulen = 0;
	3967	}
	3968	return retPtr;
	3969	}
	3970
	3971
	3972	//-------------------------------------------------------------------------------
	3973	//
	3974	// PerlTests - Run Perl's regular expression tests
	3975	// The input file for this test is re_tests, the standard regular
	3976	// expression test data distributed with the Perl source code.
	3977	//
	3978	// Here is Perl's description of the test data file:
	3979	//
	3980	// # The tests are in a separate file 't/op/re_tests'.
	3981	// # Each line in that file is a separate test.
	3982	// # There are five columns, separated by tabs.
	3983	// #
	3984	// # Column 1 contains the pattern, optionally enclosed in C<''>.
	3985	// # Modifiers can be put after the closing C<'>.
	3986	// #
	3987	// # Column 2 contains the string to be matched.
	3988	// #
	3989	// # Column 3 contains the expected result:
	3990	// # y expect a match
	3991	// # n expect no match
	3992	// # c expect an error
	3993	// # B test exposes a known bug in Perl, should be skipped
	3994	// # b test exposes a known bug in Perl, should be skipped if noamp
	3995	// #
	3996	// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
	3997	// #
	3998	// # Column 4 contains a string, usually C<$&>.
	3999	// #
	4000	// # Column 5 contains the expected result of double-quote
	4001	// # interpolating that string after the match, or start of error message.
	4002	// #
	4003	// # Column 6, if present, contains a reason why the test is skipped.
	4004	// # This is printed with "skipped", for harness to pick up.
	4005	// #
	4006	// # \n in the tests are interpolated, as are variables of the form ${\w+}.
	4007	// #
	4008	// # If you want to add a regular expression test that can't be expressed
	4009	// # in this format, don't add it here: put it in op/pat.t instead.
	4010	//
	4011	// For ICU, if field 3 contains an 'i', the test will be skipped.
	4012	// The test exposes some known incompatibility between ICU and Perl regexps.
	4013	// (The i is in addition to whatever was there before.)
	4014	//
	4015	//-------------------------------------------------------------------------------
	4016	void RegexTest::PerlTests() {
		// Flow: load re_tests.txt, split it into lines and tab-separated fields,
		// compile each pattern with the flags that follow its closing quote, run
		// find() on the match string, then evaluate the Perl expression from the
		// fourth field against the ICU match and compare it with the fifth field.
	4017	char tdd[2048];
	4018	const char *srcPath;
	4019	UErrorCode status = U_ZERO_ERROR;
	4020	UParseError pe;
	4021
	4022	//
	4023	// Open and read the test data file.
	4024	//
	4025	srcPath=getPath(tdd, "re_tests.txt");
	4026	if(srcPath==NULL) {
	4027	return; /* something went wrong, error already output */
	4028	}
	4029
	4030	int32_t len;
	4031	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
	4032	if (U_FAILURE(status)) {
	4033	return; /* something went wrong, error already output */
	4034	}
	4035
	4036	//
	4037	// Put the test data into a UnicodeString
	4038	//
	4039	UnicodeString testDataString(FALSE, testData, len);
	4040
	4041	//
	4042	// Regex to break the input file into lines, and strip the new lines.
	4043	// One line per match, capture group one is the desired data.
	4044	//
	4045	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
	4046	if (U_FAILURE(status)) {
	4047	dataerrln("RegexPattern::compile() error");
		// Release what has been allocated so far before bailing out.
		delete linePat;
		delete [] testData;
	4048	return;
	4049	}
	4050	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
	4051
	4052	//
	4053	// Regex to split a test file line into fields.
	4054	// There are six fields, separated by tabs.
	4055	//
	4056	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
	4057
	4058	//
	4059	// Regex to identify test patterns with flag settings, and to separate them.
	4060	// Test patterns with flags look like 'pattern'i
	4061	// Test patterns without flags are not quoted: pattern
	4062	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
	4063	//
	4064	RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
	4065	RegexMatcher* flagMat = flagPat->matcher(status);
	4066
	4067	//
	4068	// The Perl tests reference several perl-isms, which are evaluated/substituted
	4069	// in the test data. Not being perl, this must be done explicitly. Here
	4070	// are string constants and REs for these constructs.
	4071	//
	4072	UnicodeString nulnulSrc("${nulnul}");
	4073	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
	4074	nulnul = nulnul.unescape();
	4075
	4076	UnicodeString ffffSrc("${ffff}");
	4077	UnicodeString ffff("\\uffff", -1, US_INV);
	4078	ffff = ffff.unescape();
	4079
	4080	// regexp for $-[0], $+[2], etc.
	4081	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
	4082	RegexMatcher *groupsMat = groupsPat->matcher(status);
	4083
	4084	// regexp for $0, $1, $2, etc.
	4085	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
	4086	RegexMatcher *cgMat = cgPat->matcher(status);
	4087
	4088
	4089	//
	4090	// Main Loop for the Perl Tests, runs once per line from the
	4091	// test data file.
	4092	//
	4093	int32_t lineNum = 0;
	4094	int32_t skippedUnimplementedCount = 0;
	4095	while (lineMat->find()) {
	4096	lineNum++;
	4097
	4098	//
	4099	// Get a line, break it into its fields, do the Perl
	4100	// variable substitutions.
	4101	//
	4102	UnicodeString line = lineMat->group(1, status);
	4103	UnicodeString fields[7];
	4104	fieldPat->split(line, fields, 7, status);
	4105
	4106	flagMat->reset(fields[0]);
	4107	flagMat->matches(status);
	4108	UnicodeString pattern = flagMat->group(2, status);
	4109	pattern.findAndReplace("${bang}", "!");
		// In the pattern, ${nulnul} becomes the escape text \u0000\u0000 rather
		// than two real NUL characters (which is what the match string gets).
	4110	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
	4111	pattern.findAndReplace(ffffSrc, ffff);
	4112
	4113	//
	4114	// Identify patterns that include match flag settings,
	4115	// split off the flags, remove the extra quotes.
	4116	//
	4117	UnicodeString flagStr = flagMat->group(3, status);
	4118	if (U_FAILURE(status)) {
		// The failure comes from splitting this line, so report the line number.
	4119	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
		// Leave the loop instead of returning so that the cleanup code after
		// the loop still runs; no per-line objects exist yet at this point.
	4120	break;
	4121	}
	4122	int32_t flags = 0;
	4123	const UChar UChar_c = 0x63; // Char constants for the flag letters,
	4124	const UChar UChar_i = 0x69; // spelled numerically to avoid ASCII assumptions.
	4125	const UChar UChar_m = 0x6d;
	4126	const UChar UChar_x = 0x78;
	4127	const UChar UChar_y = 0x79;
	4128	if (flagStr.indexOf(UChar_i) != -1) {
	4129	flags \|= UREGEX_CASE_INSENSITIVE;
	4130	}
	4131	if (flagStr.indexOf(UChar_m) != -1) {
	4132	flags \|= UREGEX_MULTILINE;
	4133	}
	4134	if (flagStr.indexOf(UChar_x) != -1) {
	4135	flags \|= UREGEX_COMMENTS;
	4136	}
	4137
	4138	//
	4139	// Compile the test pattern.
	4140	//
	4141	status = U_ZERO_ERROR;
	4142	RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
	4143	if (status == U_REGEX_UNIMPLEMENTED) {
	4144	//
	4145	// Test of a feature that is planned for ICU, but not yet implemented.
	4146	// skip the test.
	4147	skippedUnimplementedCount++;
	4148	delete testPat;
	4149	status = U_ZERO_ERROR;
	4150	continue;
	4151	}
	4152
	4153	if (U_FAILURE(status)) {
	4154	// Some tests are supposed to generate errors.
	4155	// Only report an error for tests that are supposed to succeed.
	4156	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
	4157	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
	4158	{
	4159	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
	4160	}
	4161	status = U_ZERO_ERROR;
	4162	delete testPat;
	4163	continue;
	4164	}
	4165
	4166	if (fields[2].indexOf(UChar_i) >= 0) {
	4167	// ICU should skip this test.
	4168	delete testPat;
	4169	continue;
	4170	}
	4171
	4172	if (fields[2].indexOf(UChar_c) >= 0) {
	4173	// This pattern should have caused a compilation error, but didn't.
	4174	errln("line %d: Expected a pattern compile error, got success.", lineNum);
	4175	delete testPat;
	4176	continue;
	4177	}
	4178
	4179	//
	4180	// replace the Perl variables that appear in some of the
	4181	// match data strings.
	4182	//
	4183	UnicodeString matchString = fields[1];
	4184	matchString.findAndReplace(nulnulSrc, nulnul);
	4185	matchString.findAndReplace(ffffSrc, ffff);
	4186
	4187	// Replace any \n in the match string with an actual new-line char.
	4188	// Don't do full unescape, as this unescapes more than Perl does, which
	4189	// causes other spurious failures in the tests.
	4190	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4191
	4192
	4193
	4194	//
	4195	// Run the test, check for expected match/don't match result.
	4196	//
	4197	RegexMatcher *testMat = testPat->matcher(matchString, status);
	4198	UBool found = testMat->find();
	4199	UBool expected = FALSE;
	4200	if (fields[2].indexOf(UChar_y) >=0) {
	4201	expected = TRUE;
	4202	}
	4203	if (expected != found) {
	4204	errln("line %d: Expected %smatch, got %smatch",
	4205	lineNum, expected?"":"no ", found?"":"no " );
		// Free this line's matcher and pattern, as every other exit from the
		// iteration does.
		delete testMat;
		delete testPat;
	4206	continue;
	4207	}
	4208
	4209	// Don't try to check expected results if there is no match.
	4210	// (Some have stuff in the expected fields)
	4211	if (!found) {
	4212	delete testMat;
	4213	delete testPat;
	4214	continue;
	4215	}
	4216
	4217	//
	4218	// Interpret the Perl expression from the fourth field of the data file,
	4219	// building up an ICU string from the results of the ICU match.
	4220	// The Perl expression will contain references to the results of
	4221	// a regex match, including the matched string, capture group strings,
	4222	// group starting and ending indices, etc.
	4223	//
	4224	UnicodeString resultString;
	4225	UnicodeString perlExpr = fields[3];
	4226	#if SUPPORT_MUTATING_INPUT_STRING
	4227	groupsMat->reset(perlExpr);
	4228	cgMat->reset(perlExpr);
	4229	#endif
	4230
		// Each pass consumes one recognized construct (or one literal character)
		// from the front of perlExpr and appends its value to resultString.
	4231	while (perlExpr.length() > 0) {
	4232	#if !SUPPORT_MUTATING_INPUT_STRING
	4233	// Preferred usage. Reset after any modification to input string.
	4234	groupsMat->reset(perlExpr);
	4235	cgMat->reset(perlExpr);
	4236	#endif
	4237
	4238	if (perlExpr.startsWith("$&")) {
	4239	resultString.append(testMat->group(status));
	4240	perlExpr.remove(0, 2);
	4241	}
	4242
	4243	else if (groupsMat->lookingAt(status)) {
	4244	// $-[0] $+[2] etc.
	4245	UnicodeString digitString = groupsMat->group(2, status);
	4246	int32_t t = 0;
	4247	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4248	UnicodeString plusOrMinus = groupsMat->group(1, status);
	4249	int32_t matchPosition;
	4250	if (plusOrMinus.compare("+") == 0) {
	4251	matchPosition = testMat->end(groupNum, status);
	4252	} else {
	4253	matchPosition = testMat->start(groupNum, status);
	4254	}
	4255	if (matchPosition != -1) {
	4256	ICU_Utility::appendNumber(resultString, matchPosition);
	4257	}
	4258	perlExpr.remove(0, groupsMat->end(status));
	4259	}
	4260
	4261	else if (cgMat->lookingAt(status)) {
	4262	// $1, $2, $3, etc.
	4263	UnicodeString digitString = cgMat->group(1, status);
	4264	int32_t t = 0;
	4265	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4266	if (U_SUCCESS(status)) {
	4267	resultString.append(testMat->group(groupNum, status));
		// Any error raised by group() is discarded on purpose here
		// (presumably for group numbers the pattern does not have).
	4268	status = U_ZERO_ERROR;
	4269	}
	4270	perlExpr.remove(0, cgMat->end(status));
	4271	}
	4272
	4273	else if (perlExpr.startsWith("@-")) {
	4274	int32_t i;
	4275	for (i=0; i<=testMat->groupCount(); i++) {
	4276	if (i>0) {
	4277	resultString.append(" ");
	4278	}
	4279	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
	4280	}
	4281	perlExpr.remove(0, 2);
	4282	}
	4283
	4284	else if (perlExpr.startsWith("@+")) {
	4285	int32_t i;
	4286	for (i=0; i<=testMat->groupCount(); i++) {
	4287	if (i>0) {
	4288	resultString.append(" ");
	4289	}
	4290	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
	4291	}
	4292	perlExpr.remove(0, 2);
	4293	}
	4294
	4295	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
	4296	// or as an escaped sequence (e.g. \n)
	4297	if (perlExpr.length() > 1) {
	4298	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
	4299	}
	4300	UChar c = perlExpr.charAt(0);
	4301	switch (c) {
	4302	case 'n': c = '\n'; break;
	4303	// add any other escape sequences that show up in the test expected results.
	4304	}
	4305	resultString.append(c);
	4306	perlExpr.remove(0, 1);
	4307	}
	4308
	4309	else {
	4310	// Any characters from the perl expression that we don't explicitly
	4311	// recognize before here are assumed to be literals and copied
	4312	// as-is to the expected results.
	4313	resultString.append(perlExpr.charAt(0));
	4314	perlExpr.remove(0, 1);
	4315	}
	4316
	4317	if (U_FAILURE(status)) {
	4318	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
	4319	break;
	4320	}
	4321	}
	4322
	4323	//
	4324	// Expected Results Compare
	4325	//
	4326	UnicodeString expectedS(fields[4]);
	4327	expectedS.findAndReplace(nulnulSrc, nulnul);
	4328	expectedS.findAndReplace(ffffSrc, ffff);
	4329	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4330
	4331
	4332	if (expectedS.compare(resultString) != 0) {
	4333	err("Line %d: Incorrect perl expression results.", lineNum);
	4334	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
	4335	}
	4336
	4337	delete testMat;
	4338	delete testPat;
	4339	}
	4340
	4341	//
	4342	// All done. Clean up allocated stuff.
	4343	//
	4344	delete cgMat;
	4345	delete cgPat;
	4346
	4347	delete groupsMat;
	4348	delete groupsPat;
	4349
	4350	delete flagMat;
	4351	delete flagPat;
	4352
	4353	delete lineMat;
	4354	delete linePat;
	4355
	4356	delete fieldPat;
	4357	delete [] testData;
	4358
	4359
	4360	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
	4361
	4362	}
	4363
	4364
	4365	//-------------------------------------------------------------------------------
	4366	//
	4367	// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
	4368	// (instead of using UnicodeStrings) to test the alternate engine.
	4369	// The input file for this test is re_tests, the standard regular
	4370	// expression test data distributed with the Perl source code.
	4371	// See PerlTests() for more information.
	4372	//
	4373	//-------------------------------------------------------------------------------
	4374	void RegexTest::PerlTestsUTF8() {
	4375	char tdd[2048];
	4376	const char *srcPath;
	4377	UErrorCode status = U_ZERO_ERROR;
	4378	UParseError pe;
	4379	LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
	4380	UText patternText = UTEXT_INITIALIZER;
	4381	char *patternChars = NULL;
	4382	int32_t patternLength;
	4383	int32_t patternCapacity = 0;
	4384	UText inputText = UTEXT_INITIALIZER;
	4385	char *inputChars = NULL;
	4386	int32_t inputLength;
	4387	int32_t inputCapacity = 0;
	4388
	4389	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	4390
	4391	//
	4392	// Open and read the test data file.
	4393	//
	4394	srcPath=getPath(tdd, "re_tests.txt");
	4395	if(srcPath==NULL) {
	4396	return; /* something went wrong, error already output */
	4397	}
	4398
	4399	int32_t len;
	4400	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
	4401	if (U_FAILURE(status)) {
	4402	return; /* something went wrong, error already output */
	4403	}
	4404
	4405	//
	4406	// Put the test data into a UnicodeString
	4407	//
	4408	UnicodeString testDataString(FALSE, testData, len);
	4409
	4410	//
	4411	// Regex to break the input file into lines, and strip the new lines.
	4412	// One line per match, capture group one is the desired data.
	4413	//
	4414	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
	4415	if (U_FAILURE(status)) {
	4416	dataerrln("RegexPattern::compile() error");
	4417	return;
	4418	}
	4419	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
	4420
	4421	//
	4422	// Regex to split a test file line into fields.
	4423	// There are six fields, separated by tabs.
	4424	//
	4425	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
	4426
	4427	//
	4428	// Regex to identify test patterns with flag settings, and to separate them.
	4429	// Test patterns with flags look like 'pattern'i
	4430	// Test patterns without flags are not quoted: pattern
	4431	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
	4432	//
	4433	RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
	4434	RegexMatcher* flagMat = flagPat->matcher(status);
	4435
	4436	//
	4437	// The Perl tests reference several perl-isms, which are evaluated/substituted
	4438	// in the test data. Not being perl, this must be done explicitly. Here
	4439	// are string constants and REs for these constructs.
	4440	//
	4441	UnicodeString nulnulSrc("${nulnul}");
	4442	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
	4443	nulnul = nulnul.unescape();
	4444
	4445	UnicodeString ffffSrc("${ffff}");
	4446	UnicodeString ffff("\\uffff", -1, US_INV);
	4447	ffff = ffff.unescape();
	4448
	4449	// regexp for $-[0], $+[2], etc.
	4450	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
	4451	RegexMatcher *groupsMat = groupsPat->matcher(status);
	4452
	4453	// regexp for $0, $1, $2, etc.
	4454	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
	4455	RegexMatcher *cgMat = cgPat->matcher(status);
	4456
	4457
	4458	//
	4459	// Main Loop for the Perl Tests, runs once per line from the
	4460	// test data file.
	4461	//
	4462	int32_t lineNum = 0;
	4463	int32_t skippedUnimplementedCount = 0;
	4464	while (lineMat->find()) {
	4465	lineNum++;
	4466
	4467	//
	4468	// Get a line, break it into its fields, do the Perl
	4469	// variable substitutions.
	4470	//
	4471	UnicodeString line = lineMat->group(1, status);
	4472	UnicodeString fields[7];
	4473	fieldPat->split(line, fields, 7, status);
	4474
	4475	flagMat->reset(fields[0]);
	4476	flagMat->matches(status);
	4477	UnicodeString pattern = flagMat->group(2, status);
	4478	pattern.findAndReplace("${bang}", "!");
	4479	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
	4480	pattern.findAndReplace(ffffSrc, ffff);
	4481
	4482	//
	4483	// Identify patterns that include match flag settings,
	4484	// split off the flags, remove the extra quotes.
	4485	//
	4486	UnicodeString flagStr = flagMat->group(3, status);
	4487	if (U_FAILURE(status)) {
	4488	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	4489	return;
	4490	}
	4491	int32_t flags = 0;
	4492	const UChar UChar_c = 0x63; // Char constants for the flag letters.
	4493	const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
	4494	const UChar UChar_m = 0x6d;
	4495	const UChar UChar_x = 0x78;
	4496	const UChar UChar_y = 0x79;
	4497	if (flagStr.indexOf(UChar_i) != -1) {
	4498	flags \|= UREGEX_CASE_INSENSITIVE;
	4499	}
	4500	if (flagStr.indexOf(UChar_m) != -1) {
	4501	flags \|= UREGEX_MULTILINE;
	4502	}
	4503	if (flagStr.indexOf(UChar_x) != -1) {
	4504	flags \|= UREGEX_COMMENTS;
	4505	}
	4506
	4507	//
	4508	// Put the pattern in a UTF-8 UText
	4509	//
	4510	status = U_ZERO_ERROR;
	4511	patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
	4512	if (status == U_BUFFER_OVERFLOW_ERROR) {
	4513	status = U_ZERO_ERROR;
	4514	delete[] patternChars;
	4515	patternCapacity = patternLength + 1;
	4516	patternChars = new char[patternCapacity];
	4517	pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
	4518	}
	4519	utext_openUTF8(&patternText, patternChars, patternLength, &status);
	4520
	4521	//
	4522	// Compile the test pattern.
	4523	//
	4524	RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
	4525	if (status == U_REGEX_UNIMPLEMENTED) {
	4526	//
	4527	// Test of a feature that is planned for ICU, but not yet implemented.
	4528	// skip the test.
	4529	skippedUnimplementedCount++;
	4530	delete testPat;
	4531	status = U_ZERO_ERROR;
	4532	continue;
	4533	}
	4534
	4535	if (U_FAILURE(status)) {
	4536	// Some tests are supposed to generate errors.
	4537	// Only report an error for tests that are supposed to succeed.
	4538	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
	4539	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
	4540	{
	4541	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
	4542	}
	4543	status = U_ZERO_ERROR;
	4544	delete testPat;
	4545	continue;
	4546	}
	4547
	4548	if (fields[2].indexOf(UChar_i) >= 0) {
	4549	// ICU should skip this test.
	4550	delete testPat;
	4551	continue;
	4552	}
	4553
	4554	if (fields[2].indexOf(UChar_c) >= 0) {
	4555	// This pattern should have caused a compilation error, but didn't/
	4556	errln("line %d: Expected a pattern compile error, got success.", lineNum);
	4557	delete testPat;
	4558	continue;
	4559	}
	4560
	4561
	4562	//
	4563	// replace the Perl variables that appear in some of the
	4564	// match data strings.
	4565	//
	4566	UnicodeString matchString = fields[1];
	4567	matchString.findAndReplace(nulnulSrc, nulnul);
	4568	matchString.findAndReplace(ffffSrc, ffff);
	4569
	4570	// Replace any \n in the match string with an actual new-line char.
	4571	// Don't do full unescape, as this unescapes more than Perl does, which
	4572	// causes other spurious failures in the tests.
	4573	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4574
	4575	//
	4576	// Put the input in a UTF-8 UText
	4577	//
	4578	status = U_ZERO_ERROR;
	4579	inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
	4580	if (status == U_BUFFER_OVERFLOW_ERROR) {
	4581	status = U_ZERO_ERROR;
	4582	delete[] inputChars;
	4583	inputCapacity = inputLength + 1;
	4584	inputChars = new char[inputCapacity];
	4585	matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
	4586	}
	4587	utext_openUTF8(&inputText, inputChars, inputLength, &status);
	4588
	4589	//
	4590	// Run the test, check for expected match/don't match result.
	4591	//
	4592	RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
	4593	UBool found = testMat->find();
	4594	UBool expected = FALSE;
	4595	if (fields[2].indexOf(UChar_y) >=0) {
	4596	expected = TRUE;
	4597	}
	4598	if (expected != found) {
	4599	errln("line %d: Expected %smatch, got %smatch",
	4600	lineNum, expected?"":"no ", found?"":"no " );
	4601	continue;
	4602	}
	4603
	4604	// Don't try to check expected results if there is no match.
	4605	// (Some have stuff in the expected fields)
	4606	if (!found) {
	4607	delete testMat;
	4608	delete testPat;
	4609	continue;
	4610	}
	4611
	4612	//
	4613	// Interpret the Perl expression from the fourth field of the data file,
	4614	// building up an ICU string from the results of the ICU match.
	4615	// The Perl expression will contain references to the results of
	4616	// a regex match, including the matched string, capture group strings,
	4617	// group starting and ending indicies, etc.
	4618	//
	4619	UnicodeString resultString;
	4620	UnicodeString perlExpr = fields[3];
	4621
	4622	while (perlExpr.length() > 0) {
	4623	groupsMat->reset(perlExpr);
	4624	cgMat->reset(perlExpr);
	4625
	4626	if (perlExpr.startsWith("$&")) {
	4627	resultString.append(testMat->group(status));
	4628	perlExpr.remove(0, 2);
	4629	}
	4630
	4631	else if (groupsMat->lookingAt(status)) {
	4632	// $-[0] $+[2] etc.
	4633	UnicodeString digitString = groupsMat->group(2, status);
	4634	int32_t t = 0;
	4635	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4636	UnicodeString plusOrMinus = groupsMat->group(1, status);
	4637	int32_t matchPosition;
	4638	if (plusOrMinus.compare("+") == 0) {
	4639	matchPosition = testMat->end(groupNum, status);
	4640	} else {
	4641	matchPosition = testMat->start(groupNum, status);
	4642	}
	4643	if (matchPosition != -1) {
	4644	ICU_Utility::appendNumber(resultString, matchPosition);
	4645	}
	4646	perlExpr.remove(0, groupsMat->end(status));
	4647	}
	4648
	4649	else if (cgMat->lookingAt(status)) {
	4650	// $1, $2, $3, etc.
	4651	UnicodeString digitString = cgMat->group(1, status);
	4652	int32_t t = 0;
	4653	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4654	if (U_SUCCESS(status)) {
	4655	resultString.append(testMat->group(groupNum, status));
	4656	status = U_ZERO_ERROR;
	4657	}
	4658	perlExpr.remove(0, cgMat->end(status));
	4659	}
	4660
	4661	else if (perlExpr.startsWith("@-")) {
	4662	int32_t i;
	4663	for (i=0; i<=testMat->groupCount(); i++) {
	4664	if (i>0) {
	4665	resultString.append(" ");
	4666	}
	4667	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
	4668	}
	4669	perlExpr.remove(0, 2);
	4670	}
	4671
	4672	else if (perlExpr.startsWith("@+")) {
	4673	int32_t i;
	4674	for (i=0; i<=testMat->groupCount(); i++) {
	4675	if (i>0) {
	4676	resultString.append(" ");
	4677	}
	4678	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
	4679	}
	4680	perlExpr.remove(0, 2);
	4681	}
	4682
	4683	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
	4684	// or as an escaped sequence (e.g. \n)
	4685	if (perlExpr.length() > 1) {
	4686	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
	4687	}
	4688	UChar c = perlExpr.charAt(0);
	4689	switch (c) {
	4690	case 'n': c = '\n'; break;
	4691	// add any other escape sequences that show up in the test expected results.
	4692	}
	4693	resultString.append(c);
	4694	perlExpr.remove(0, 1);
	4695	}
	4696
	4697	else {
	4698	// Any characters from the perl expression that we don't explicitly
	4699	// recognize before here are assumed to be literals and copied
	4700	// as-is to the expected results.
	4701	resultString.append(perlExpr.charAt(0));
	4702	perlExpr.remove(0, 1);
	4703	}
	4704
	4705	if (U_FAILURE(status)) {
	4706	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
	4707	break;
	4708	}
	4709	}
	4710
	4711	//
	4712	// Expected Results Compare
	4713	//
	4714	UnicodeString expectedS(fields[4]);
	4715	expectedS.findAndReplace(nulnulSrc, nulnul);
	4716	expectedS.findAndReplace(ffffSrc, ffff);
	4717	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4718
	4719
	4720	if (expectedS.compare(resultString) != 0) {
	4721	err("Line %d: Incorrect perl expression results.", lineNum);
	4722	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
	4723	}
	4724
	4725	delete testMat;
	4726	delete testPat;
	4727	}
	4728
	4729	//
	4730	// All done. Clean up allocated stuff.
	4731	//
	4732	delete cgMat;
	4733	delete cgPat;
	4734
	4735	delete groupsMat;
	4736	delete groupsPat;
	4737
	4738	delete flagMat;
	4739	delete flagPat;
	4740
	4741	delete lineMat;
	4742	delete linePat;
	4743
	4744	delete fieldPat;
	4745	delete [] testData;
	4746
	4747	utext_close(&patternText);
	4748	utext_close(&inputText);
	4749
	4750	delete [] patternChars;
	4751	delete [] inputChars;
	4752
	4753
	4754	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
	4755
	4756	}
	4757
	4758
	4759	//--------------------------------------------------------------
	4760	//
	4761	// Bug6149 Verify limits to heap expansion for backtrack stack.
	4762	// Use this pattern,
	4763	// "(a?){1,8000000}"
	4764	// Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
	4765	// This test is likely to be fragile, as further optimizations stop
	4766	// more cases of pointless looping in the match engine.
	4767	//
	4768	//---------------------------------------------------------------
	4769	void RegexTest::Bug6149() {
	4770	UnicodeString pattern("(a?){1,8000000}");
	4771	UnicodeString s("xyz");
	4772	uint32_t flags = 0;
	4773	UErrorCode status = U_ZERO_ERROR;
	4774
	4775	RegexMatcher matcher(pattern, s, flags, status);
	4776	UBool result = false;
	4777	REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
	4778	REGEX_ASSERT(result == FALSE);
	4779	}
	4780
	4781
	4782	//
	4783	// Callbacks() Test the callback function.
	4784	// When set, callbacks occur periodically during matching operations,
	4785	// giving the application code the ability to abort the operation
	4786	// before it's normal completion.
	4787	//
	4788
	4789	struct callBackContext {
	4790	RegexTest *test;
	4791	int32_t maxCalls;
	4792	int32_t numCalls;
	4793	int32_t lastSteps;
	4794	void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
	4795	};
	4796
	4797	U_CDECL_BEGIN
	4798	static UBool U_CALLCONV
	4799	testCallBackFn(const void *context, int32_t steps) {
	4800	callBackContext info = (callBackContext )context;
	4801	if (info->lastSteps+1 != steps) {
	4802	info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
	4803	}
	4804	info->lastSteps = steps;
	4805	info->numCalls++;
	4806	return (info->numCalls < info->maxCalls);
	4807	}
	4808	U_CDECL_END
	4809
	4810	void RegexTest::Callbacks() {
	4811	{
	4812	// Getter returns NULLs if no callback has been set
	4813
	4814	// The variables that the getter will fill in.
	4815	// Init to non-null values so that the action of the getter can be seen.
	4816	const void *returnedContext = &returnedContext;
	4817	URegexMatchCallback *returnedFn = &testCallBackFn;
	4818
	4819	UErrorCode status = U_ZERO_ERROR;
	4820	RegexMatcher matcher("x", 0, status);
	4821	REGEX_CHECK_STATUS;
	4822	matcher.getMatchCallback(returnedFn, returnedContext, status);
	4823	REGEX_CHECK_STATUS;
	4824	REGEX_ASSERT(returnedFn == NULL);
	4825	REGEX_ASSERT(returnedContext == NULL);
	4826	}
	4827
	4828	{
	4829	// Set and Get work
	4830	callBackContext cbInfo = {this, 0, 0, 0};
	4831	const void *returnedContext;
	4832	URegexMatchCallback *returnedFn;
	4833	UErrorCode status = U_ZERO_ERROR;
	4834	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
	4835	REGEX_CHECK_STATUS;
	4836	matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
	4837	REGEX_CHECK_STATUS;
	4838	matcher.getMatchCallback(returnedFn, returnedContext, status);
	4839	REGEX_CHECK_STATUS;
	4840	REGEX_ASSERT(returnedFn == testCallBackFn);
	4841	REGEX_ASSERT(returnedContext == &cbInfo);
	4842
	4843	// A short-running match shouldn't invoke the callback
	4844	status = U_ZERO_ERROR;
	4845	cbInfo.reset(1);
	4846	UnicodeString s = "xxx";
	4847	matcher.reset(s);
	4848	REGEX_ASSERT(matcher.matches(status));
	4849	REGEX_CHECK_STATUS;
	4850	REGEX_ASSERT(cbInfo.numCalls == 0);
	4851
	4852	// A medium-length match that runs long enough to invoke the
	4853	// callback, but not so long that the callback aborts it.
	4854	status = U_ZERO_ERROR;
	4855	cbInfo.reset(4);
	4856	s = "aaaaaaaaaaaaaaaaaaab";
	4857	matcher.reset(s);
	4858	REGEX_ASSERT(matcher.matches(status)==FALSE);
	4859	REGEX_CHECK_STATUS;
	4860	REGEX_ASSERT(cbInfo.numCalls > 0);
	4861
	4862	// A longer running match that the callback function will abort.
	4863	status = U_ZERO_ERROR;
	4864	cbInfo.reset(4);
	4865	s = "aaaaaaaaaaaaaaaaaaaaaaab";
	4866	matcher.reset(s);
	4867	REGEX_ASSERT(matcher.matches(status)==FALSE);
	4868	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4869	REGEX_ASSERT(cbInfo.numCalls == 4);
	4870
	4871	// A longer running find that the callback function will abort.
	4872	status = U_ZERO_ERROR;
	4873	cbInfo.reset(4);
	4874	s = "aaaaaaaaaaaaaaaaaaaaaaab";
	4875	matcher.reset(s);
	4876	REGEX_ASSERT(matcher.find(status)==FALSE);
	4877	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4878	REGEX_ASSERT(cbInfo.numCalls == 4);
	4879	}
	4880
	4881
	4882	}
	4883
	4884
	4885	//
	4886	// FindProgressCallbacks() Test the find "progress" callback function.
	4887	// When set, the find progress callback will be invoked during a find operations
	4888	// after each return from a match attempt, giving the application the opportunity
	4889	// to terminate a long-running find operation before it's normal completion.
	4890	//
	4891
	4892	struct progressCallBackContext {
	4893	RegexTest *test;
	4894	int64_t lastIndex;
	4895	int32_t maxCalls;
	4896	int32_t numCalls;
	4897	void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
	4898	};
	4899
	4900	// call-back function for find().
	4901	// Return TRUE to continue the find().
	4902	// Return FALSE to stop the find().
	4903	U_CDECL_BEGIN
	4904	static UBool U_CALLCONV
	4905	testProgressCallBackFn(const void *context, int64_t matchIndex) {
	4906	progressCallBackContext info = (progressCallBackContext )context;
	4907	info->numCalls++;
	4908	info->lastIndex = matchIndex;
	4909	// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
	4910	return (info->numCalls < info->maxCalls);
	4911	}
	4912	U_CDECL_END
	4913
	4914	void RegexTest::FindProgressCallbacks() {
	4915	{
	4916	// Getter returns NULLs if no callback has been set
	4917
	4918	// The variables that the getter will fill in.
	4919	// Init to non-null values so that the action of the getter can be seen.
	4920	const void *returnedContext = &returnedContext;
	4921	URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
	4922
	4923	UErrorCode status = U_ZERO_ERROR;
	4924	RegexMatcher matcher("x", 0, status);
	4925	REGEX_CHECK_STATUS;
	4926	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
	4927	REGEX_CHECK_STATUS;
	4928	REGEX_ASSERT(returnedFn == NULL);
	4929	REGEX_ASSERT(returnedContext == NULL);
	4930	}
	4931
	4932	{
	4933	// Set and Get work
	4934	progressCallBackContext cbInfo = {this, 0, 0, 0};
	4935	const void *returnedContext;
	4936	URegexFindProgressCallback *returnedFn;
	4937	UErrorCode status = U_ZERO_ERROR;
	4938	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
	4939	REGEX_CHECK_STATUS;
	4940	matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
	4941	REGEX_CHECK_STATUS;
	4942	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
	4943	REGEX_CHECK_STATUS;
	4944	REGEX_ASSERT(returnedFn == testProgressCallBackFn);
	4945	REGEX_ASSERT(returnedContext == &cbInfo);
	4946
	4947	// A find that matches on the initial position does NOT invoke the callback.
	4948	status = U_ZERO_ERROR;
	4949	cbInfo.reset(100);
	4950	UnicodeString s = "aaxxx";
	4951	matcher.reset(s);
	4952	#if 0
	4953	matcher.setTrace(TRUE);
	4954	#endif
	4955	REGEX_ASSERT(matcher.find(0, status));
	4956	REGEX_CHECK_STATUS;
	4957	REGEX_ASSERT(cbInfo.numCalls == 0);
	4958
	4959	// A medium running find() that causes matcher.find() to invoke our callback for each index,
	4960	// but not so many times that we interrupt the operation.
	4961	status = U_ZERO_ERROR;
	4962	s = "aaaaaaaaaaaaaaaaaaab";
	4963	cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
	4964	matcher.reset(s);
	4965	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4966	REGEX_CHECK_STATUS;
	4967	REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
	4968
	4969	// A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
	4970	status = U_ZERO_ERROR;
	4971	UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
	4972	cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
	4973	matcher.reset(s1);
	4974	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4975	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4976	REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
	4977
	4978	// Now a match that will succeed, but after an interruption
	4979	status = U_ZERO_ERROR;
	4980	UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
	4981	cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
	4982	matcher.reset(s2);
	4983	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4984	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4985	// Now retry the match from where left off
	4986	cbInfo.maxCalls = 100; // No callback limit
	4987	status = U_ZERO_ERROR;
	4988	REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
	4989	REGEX_CHECK_STATUS;
	4990	}
	4991
	4992
	4993	}
	4994
	4995
	4996	//---------------------------------------------------------------------------
	4997	//
	4998	// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
	4999	// UTexts. The pure-C implementation of UText
	5000	// has no mutable backing stores, but we can
	5001	// use UnicodeString here to test the functionality.
	5002	//
	5003	//---------------------------------------------------------------------------
	5004	void RegexTest::PreAllocatedUTextCAPI () {
	5005	UErrorCode status = U_ZERO_ERROR;
	5006	URegularExpression *re;
	5007	UText patternText = UTEXT_INITIALIZER;
	5008	UnicodeString buffer;
	5009	UText bufferText = UTEXT_INITIALIZER;
	5010
	5011	utext_openUnicodeString(&bufferText, &buffer, &status);
	5012
	5013	/*
	5014	* getText() and getUText()
	5015	*/
	5016	{
	5017	UText text1 = UTEXT_INITIALIZER;
	5018	UText text2 = UTEXT_INITIALIZER;
	5019	UChar text2Chars[20];
	5020	UText *resultText;
	5021
	5022	status = U_ZERO_ERROR;
	5023	regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
	5024	regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
	5025	u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
	5026	utext_openUChars(&text2, text2Chars, -1, &status);
	5027
	5028	regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
	5029	re = uregex_openUText(&patternText, 0, NULL, &status);
	5030
	5031	/* First set a UText */
	5032	uregex_setUText(re, &text1, &status);
	5033	resultText = uregex_getUText(re, &bufferText, &status);
	5034	REGEX_CHECK_STATUS;
	5035	REGEX_ASSERT(resultText == &bufferText);
	5036	utext_setNativeIndex(resultText, 0);
	5037	utext_setNativeIndex(&text1, 0);
	5038	REGEX_ASSERT(testUTextEqual(resultText, &text1));
	5039
	5040	resultText = uregex_getUText(re, &bufferText, &status);
	5041	REGEX_CHECK_STATUS;
	5042	REGEX_ASSERT(resultText == &bufferText);
	5043	utext_setNativeIndex(resultText, 0);
	5044	utext_setNativeIndex(&text1, 0);
	5045	REGEX_ASSERT(testUTextEqual(resultText, &text1));
	5046
	5047	/* Then set a UChar * */
	5048	uregex_setText(re, text2Chars, 7, &status);
	5049	resultText = uregex_getUText(re, &bufferText, &status);
	5050	REGEX_CHECK_STATUS;
	5051	REGEX_ASSERT(resultText == &bufferText);
	5052	utext_setNativeIndex(resultText, 0);
	5053	utext_setNativeIndex(&text2, 0);
	5054	REGEX_ASSERT(testUTextEqual(resultText, &text2));
	5055
	5056	uregex_close(re);
	5057	utext_close(&text1);
	5058	utext_close(&text2);
	5059	}
	5060
	5061	/*
	5062	* group()
	5063	*/
	5064	{
	5065	UChar text1[80];
	5066	UText *actual;
	5067	UBool result;
	5068	int64_t length = 0;
	5069
	5070	u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
	5071	// 012345678901234567890123456789012345678901234567
	5072	// 0 1 2 3 4
	5073
	5074	status = U_ZERO_ERROR;
	5075	re = uregex_openC("abc(.*?)def", 0, NULL, &status);
	5076	REGEX_CHECK_STATUS;
	5077
	5078	uregex_setText(re, text1, -1, &status);
	5079	result = uregex_find(re, 0, &status);
	5080	REGEX_ASSERT(result==TRUE);
	5081
	5082	/* Capture Group 0, the full match. Should succeed. "abc interior def" */
	5083	status = U_ZERO_ERROR;
	5084	actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
	5085	REGEX_CHECK_STATUS;
	5086	REGEX_ASSERT(actual == &bufferText);
	5087	REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
	5088	REGEX_ASSERT(length == 16);
	5089	REGEX_ASSERT(utext_nativeLength(actual) == 47);
	5090
	5091	/* Capture group #1. Should succeed, matching " interior ". */
	5092	status = U_ZERO_ERROR;
	5093	actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
	5094	REGEX_CHECK_STATUS;
	5095	REGEX_ASSERT(actual == &bufferText);
	5096	REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
	5097	REGEX_ASSERT(length == 10);
	5098	REGEX_ASSERT(utext_nativeLength(actual) == 47);
	5099
	5100	/* Capture group out of range. Error. */
	5101	status = U_ZERO_ERROR;
	5102	actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
	5103	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	5104	REGEX_ASSERT(actual == &bufferText);
	5105	uregex_close(re);
	5106
	5107	}
	5108
	5109	/*
	5110	* replaceFirst()
	5111	*/
	5112	{
	5113	UChar text1[80];
	5114	UChar text2[80];
	5115	UText replText = UTEXT_INITIALIZER;
	5116	UText *result;
	5117	status = U_ZERO_ERROR;
	5118	utext_openUnicodeString(&bufferText, &buffer, &status);
	5119
	5120	status = U_ZERO_ERROR;
	5121	u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
	5122	u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
	5123	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
	5124
	5125	re = uregex_openC("x(.*?)x", 0, NULL, &status);
	5126	REGEX_CHECK_STATUS;
	5127
	5128	/* Normal case, with match */
	5129	uregex_setText(re, text1, -1, &status);
	5130	REGEX_CHECK_STATUS;
	5131	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5132	REGEX_CHECK_STATUS;
	5133	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5134	REGEX_CHECK_STATUS;
	5135	REGEX_ASSERT(result == &bufferText);
	5136	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
	5137
	5138	/* No match. Text should copy to output with no changes. */
	5139	uregex_setText(re, text2, -1, &status);
	5140	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5141	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5142	REGEX_CHECK_STATUS;
	5143	REGEX_ASSERT(result == &bufferText);
	5144	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
	5145
	5146	/* Unicode escapes */
	5147	uregex_setText(re, text1, -1, &status);
	5148	regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
	5149	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5150	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5151	REGEX_CHECK_STATUS;
	5152	REGEX_ASSERT(result == &bufferText);
	5153	REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
	5154
	5155	uregex_close(re);
	5156	utext_close(&replText);
	5157	}
	5158
	5159
	5160	/*
	5161	* replaceAll()
	5162	*/
	5163	{
	5164	UChar text1[80];
	5165	UChar text2[80];
	5166	UText replText = UTEXT_INITIALIZER;
	5167	UText *result;
	5168
	5169	status = U_ZERO_ERROR;
	5170	u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
	5171	u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
	5172	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
	5173
	5174	re = uregex_openC("x(.*?)x", 0, NULL, &status);
	5175	REGEX_CHECK_STATUS;
	5176
	5177	/* Normal case, with match */
	5178	uregex_setText(re, text1, -1, &status);
	5179	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5180	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
	5181	REGEX_CHECK_STATUS;
	5182	REGEX_ASSERT(result == &bufferText);
	5183	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
	5184
	5185	/* No match. Text should copy to output with no changes. */
	5186	uregex_setText(re, text2, -1, &status);
	5187	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5188	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
	5189	REGEX_CHECK_STATUS;
	5190	REGEX_ASSERT(result == &bufferText);
	5191	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
	5192
	5193	uregex_close(re);
	5194	utext_close(&replText);
	5195	}
	5196
	5197
	5198	/*
	5199	* splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
	5200	* so we don't need to test it here.
	5201	*/
	5202
	5203	utext_close(&bufferText);
	5204	utext_close(&patternText);
	5205	}
	5206
	5207
	5208	//--------------------------------------------------------------
	5209	//
	5210	// NamedCapture Check basic named capture group functionality
	5211	//
	5212	//--------------------------------------------------------------
	5213	void RegexTest::NamedCapture() {
	5214	UErrorCode status = U_ZERO_ERROR;
	5215	RegexPattern *pat = RegexPattern::compile(UnicodeString(
	5216	"abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
	5217	REGEX_CHECK_STATUS;
	5218	int32_t group = pat->groupNumberFromName("five", -1, status);
	5219	REGEX_CHECK_STATUS;
	5220	REGEX_ASSERT(5 == group);
	5221	group = pat->groupNumberFromName("three", -1, status);
	5222	REGEX_CHECK_STATUS;
	5223	REGEX_ASSERT(3 == group);
	5224
	5225	status = U_ZERO_ERROR;
	5226	group = pat->groupNumberFromName(UnicodeString("six"), status);
	5227	REGEX_CHECK_STATUS;
	5228	REGEX_ASSERT(6 == group);
	5229
	5230	status = U_ZERO_ERROR;
	5231	group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
	5232	U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5233
	5234	status = U_ZERO_ERROR;
	5235
	5236	// After copying a pattern, named capture should still work in the copy.
	5237	RegexPattern copiedPat = new RegexPattern(pat);
	5238	REGEX_ASSERT(copiedPat == pat);
	5239	delete pat; pat = NULL; // Delete original, copy should have no references back to it.
	5240
	5241	group = copiedPat->groupNumberFromName("five", -1, status);
	5242	REGEX_CHECK_STATUS;
	5243	REGEX_ASSERT(5 == group);
	5244	group = copiedPat->groupNumberFromName("three", -1, status);
	5245	REGEX_CHECK_STATUS;
	5246	REGEX_ASSERT(3 == group);
	5247	delete copiedPat;
	5248
	5249	// ReplaceAll with named capture group.
	5250	status = U_ZERO_ERROR;
	5251	UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
	5252	RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
	5253	REGEX_CHECK_STATUS;
	5254	// m.pattern().dumpPattern();
	5255	UnicodeString replacedText = m->replaceAll("'${mid}'", status);
	5256	REGEX_CHECK_STATUS;
	5257	REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
	5258	delete m;
	5259
	5260	// ReplaceAll, allowed capture group numbers.
	5261	text = UnicodeString("abcmxyz");
	5262	m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
	5263	REGEX_CHECK_STATUS;
	5264
	5265	status = U_ZERO_ERROR;
	5266	replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
	5267	REGEX_CHECK_STATUS;
	5268	REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
	5269
	5270	status = U_ZERO_ERROR;
	5271	replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
	5272	REGEX_CHECK_STATUS;
	5273	REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
	5274
	5275	status = U_ZERO_ERROR;
	5276	replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
	5277	REGEX_CHECK_STATUS;
	5278	REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
	5279
	5280	status = U_ZERO_ERROR;
	5281	replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
	5282	REGEX_CHECK_STATUS;
	5283	REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
	5284
	5285	status = U_ZERO_ERROR;
	5286	replacedText = m->replaceAll(UnicodeString("<$3>"), status);
	5287	REGEX_CHECK_STATUS;
	5288	REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
	5289
	5290	status = U_ZERO_ERROR;
	5291	replacedText = m->replaceAll(UnicodeString("<$4>"), status);
	5292	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	5293
	5294	status = U_ZERO_ERROR;
	5295	replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
	5296	REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
	5297	REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
	5298
	5299	status = U_ZERO_ERROR;
	5300	replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
	5301	REGEX_CHECK_STATUS; // that push group num out of range.
	5302	REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
	5303
	5304	status = U_ZERO_ERROR;
	5305	replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
	5306	REGEX_CHECK_STATUS;
	5307	REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
	5308
	5309	status = U_ZERO_ERROR;
	5310	replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
	5311	REGEX_CHECK_STATUS;
	5312	REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
	5313
	5314	status = U_ZERO_ERROR;
	5315	replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
	5316	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5317
	5318	status = U_ZERO_ERROR;
	5319	replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
	5320	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5321
	5322	status = U_ZERO_ERROR;
	5323	replacedText = m->replaceAll(UnicodeString("<${one"), status);
	5324	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5325
	5326	status = U_ZERO_ERROR;
	5327	replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
	5328	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5329
	5330	delete m;
	5331
	5332	// Repeat the above replaceAll() tests using the plain C API, which
	5333	// has a separate implementation internally.
	5334	// TODO: factor out the test data.
	5335
	5336	status = U_ZERO_ERROR;
	5337	URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
	5338	REGEX_CHECK_STATUS;
	5339	text = UnicodeString("abcmxyz");
	5340	uregex_setText(re, text.getBuffer(), text.length(), &status);
	5341	REGEX_CHECK_STATUS;
	5342
	5343	UChar resultBuf[100];
	5344	int32_t resultLength;
	5345	UnicodeString repl;
	5346
	5347	status = U_ZERO_ERROR;
	5348	repl = UnicodeString("<$0>");
	5349	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5350	REGEX_CHECK_STATUS;
	5351	REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
	5352
	5353	status = U_ZERO_ERROR;
	5354	repl = UnicodeString("<$1>");
	5355	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5356	REGEX_CHECK_STATUS;
	5357	REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
	5358
	5359	status = U_ZERO_ERROR;
	5360	repl = UnicodeString("<${one}>");
	5361	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5362	REGEX_CHECK_STATUS;
	5363	REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
	5364
	5365	status = U_ZERO_ERROR;
	5366	repl = UnicodeString("<$2>");
	5367	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5368	REGEX_CHECK_STATUS;
	5369	REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
	5370
	5371	status = U_ZERO_ERROR;
	5372	repl = UnicodeString("<$3>");
	5373	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5374	REGEX_CHECK_STATUS;
	5375	REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
	5376
	5377	status = U_ZERO_ERROR;
	5378	repl = UnicodeString("<$4>");
	5379	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5380	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	5381
	5382	status = U_ZERO_ERROR;
	5383	repl = UnicodeString("<$04>");
	5384	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5385	REGEX_CHECK_STATUS;
	5386	REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
	5387
	5388	status = U_ZERO_ERROR;
	5389	repl = UnicodeString("<$000016>");
	5390	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5391	REGEX_CHECK_STATUS;
	5392	REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
	5393
	5394	status = U_ZERO_ERROR;
	5395	repl = UnicodeString("<$3$2$1${one}>");
	5396	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5397	REGEX_CHECK_STATUS;
	5398	REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
	5399
	5400	status = U_ZERO_ERROR;
	5401	repl = UnicodeString("$3$2$1${one}");
	5402	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5403	REGEX_CHECK_STATUS;
	5404	REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
	5405
	5406	status = U_ZERO_ERROR;
	5407	repl = UnicodeString("<${noSuchName}>");
	5408	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5409	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5410
	5411	status = U_ZERO_ERROR;
	5412	repl = UnicodeString("<${invalid-name}>");
	5413	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5414	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5415
	5416	status = U_ZERO_ERROR;
	5417	repl = UnicodeString("<${one");
	5418	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5419	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5420
	5421	status = U_ZERO_ERROR;
	5422	repl = UnicodeString("$not a capture group");
	5423	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5424	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5425
	5426	uregex_close(re);
	5427	}
	5428
	5429	//--------------------------------------------------------------
	5430	//
	5431	// NamedCaptureLimits Patterns with huge numbers of named capture groups.
	5432	// The point is not so much what the exact limit is,
	5433	// but that a largish number doesn't hit bad non-linear performance,
	5434	// and that exceeding the limit fails cleanly.
	5435	//
	5436	//--------------------------------------------------------------
	5437	void RegexTest::NamedCaptureLimits() {
	5438	if (quick) {
	5439	logln("Skipping test. Runs in exhuastive mode only.");
	5440	return;
	5441	}
	5442	const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
	5443	const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
	5444	char nnbuf[100];
	5445	UnicodeString pattern;
	5446	int32_t nn;
	5447
	5448	for (nn=1; nn<goodLimit; nn++) {
	5449	sprintf(nnbuf, "(?<nn%d>)", nn);
	5450	pattern.append(UnicodeString(nnbuf, -1, US_INV));
	5451	}
	5452	UErrorCode status = U_ZERO_ERROR;
	5453	RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
	5454	REGEX_CHECK_STATUS;
	5455	for (nn=1; nn<goodLimit; nn++) {
	5456	sprintf(nnbuf, "nn%d", nn);
	5457	int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
	5458	REGEX_ASSERT(nn == groupNum);
	5459	if (nn != groupNum) {
	5460	break;
	5461	}
	5462	}
	5463	delete pat;
	5464
	5465	pattern.remove();
	5466	for (nn=1; nn<failLimit; nn++) {
	5467	sprintf(nnbuf, "(?<nn%d>)", nn);
	5468	pattern.append(UnicodeString(nnbuf, -1, US_INV));
	5469	}
	5470	status = U_ZERO_ERROR;
	5471	pat = RegexPattern::compile(pattern, 0, status);
	5472	REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
	5473	delete pat;
	5474	}
	5475
	5476
	5477	//--------------------------------------------------------------
	5478	//
	5479	// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
	5480	//
	5481	//---------------------------------------------------------------
	5482	void RegexTest::Bug7651() {
	5483	UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?\|(https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])\|\\$[A-Za-z]+)");
	5484	// The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
	5485	// It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
	5486	UnicodeString pattern2("((https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])\|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?\|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|\\$[A-Za-z]+)");
	5487	UnicodeString s("#ff @abcd This is test");
	5488	RegexPattern *REPattern = NULL;
	5489	RegexMatcher *REMatcher = NULL;
	5490	UErrorCode status = U_ZERO_ERROR;
	5491	UParseError pe;
	5492
	5493	REPattern = RegexPattern::compile(pattern1, 0, pe, status);
	5494	REGEX_CHECK_STATUS;
	5495	REMatcher = REPattern->matcher(s, status);
	5496	REGEX_CHECK_STATUS;
	5497	REGEX_ASSERT(REMatcher->find());
	5498	REGEX_ASSERT(REMatcher->start(status) == 0);
	5499	delete REPattern;
	5500	delete REMatcher;
	5501	status = U_ZERO_ERROR;
	5502
	5503	REPattern = RegexPattern::compile(pattern2, 0, pe, status);
	5504	REGEX_CHECK_STATUS;
	5505	REMatcher = REPattern->matcher(s, status);
	5506	REGEX_CHECK_STATUS;
	5507	REGEX_ASSERT(REMatcher->find());
	5508	REGEX_ASSERT(REMatcher->start(status) == 0);
	5509	delete REPattern;
	5510	delete REMatcher;
	5511	status = U_ZERO_ERROR;
	5512	}
	5513
	5514	void RegexTest::Bug7740() {
	5515	UErrorCode status = U_ZERO_ERROR;
	5516	UnicodeString pattern = "(a)";
	5517	UnicodeString text = "abcdef";
	5518	RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
	5519	REGEX_CHECK_STATUS;
	5520	REGEX_ASSERT(m->lookingAt(status));
	5521	REGEX_CHECK_STATUS;
	5522	status = U_ILLEGAL_ARGUMENT_ERROR;
	5523	UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
	5524	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	5525	REGEX_ASSERT(s == "");
	5526	delete m;
	5527	}
	5528
	5529	// Bug 8479: was crashing whith a Bogus UnicodeString as input.
	5530
	5531	void RegexTest::Bug8479() {
	5532	UErrorCode status = U_ZERO_ERROR;
	5533
	5534	RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL\|UREGEX_CASE_INSENSITIVE, status);
	5535	REGEX_CHECK_STATUS;
	5536	if (U_SUCCESS(status))
	5537	{
	5538	UnicodeString str;
	5539	str.setToBogus();
	5540	pMatcher->reset(str);
	5541	status = U_ZERO_ERROR;
	5542	pMatcher->matches(status);
	5543	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	5544	delete pMatcher;
	5545	}
	5546	}
	5547
	5548
	5549	// Bug 7029
	5550	void RegexTest::Bug7029() {
	5551	UErrorCode status = U_ZERO_ERROR;
	5552
	5553	RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
	5554	UnicodeString text = "abc.def";
	5555	UnicodeString splits[10];
	5556	REGEX_CHECK_STATUS;
	5557	int32_t numFields = pMatcher->split(text, splits, 10, status);
	5558	REGEX_CHECK_STATUS;
	5559	REGEX_ASSERT(numFields == 8);
	5560	delete pMatcher;
	5561	}
	5562
	5563	// Bug 9283
	5564	// This test is checking for the existance of any supplemental characters that case-fold
	5565	// to a bmp character.
	5566	//
	5567	// At the time of this writing there are none. If any should appear in a subsequent release
	5568	// of Unicode, the code in regular expressions compilation that determines the longest
	5569	// posssible match for a literal string will need to be enhanced.
	5570	//
	5571	// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
	5572	// for details on what to do in case of a failure of this test.
	5573	//
	5574	void RegexTest::Bug9283() {
	5575	#if !UCONFIG_NO_NORMALIZATION
	5576	UErrorCode status = U_ZERO_ERROR;
	5577	UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
	5578	REGEX_CHECK_STATUS;
	5579	int32_t index;
	5580	UChar32 c;
	5581	for (index=0; ; index++) {
	5582	c = supplementalsWithCaseFolding.charAt(index);
	5583	if (c == -1) {
	5584	break;
	5585	}
	5586	UnicodeString cf = UnicodeString(c).foldCase();
	5587	REGEX_ASSERT(cf.length() >= 2);
	5588	}
	5589	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	5590	}
	5591
	5592
	5593	void RegexTest::CheckInvBufSize() {
	5594	if(inv_next>=INV_BUFSIZ) {
	5595	errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
	5596	__FILE__, INV_BUFSIZ, inv_next);
	5597	} else {
	5598	logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
	5599	}
	5600	}
	5601
	5602
	5603	void RegexTest::Bug10459() {
	5604	UErrorCode status = U_ZERO_ERROR;
	5605	UnicodeString patternString("(txt)");
	5606	UnicodeString txtString("txt");
	5607
	5608	UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
	5609	REGEX_CHECK_STATUS;
	5610	UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
	5611	REGEX_CHECK_STATUS;
	5612
	5613	URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
	5614	REGEX_CHECK_STATUS;
	5615
	5616	uregex_setUText(icu_re, utext_txt, &status);
	5617	REGEX_CHECK_STATUS;
	5618
	5619	// The bug was that calling uregex_group() before doing a matching operation
	5620	// was causing a segfault. Only for Regular Expressions created from UText.
	5621	// It should set an U_REGEX_INVALID_STATE.
	5622
	5623	UChar buf[100];
	5624	int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
	5625	REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
	5626	REGEX_ASSERT(len == 0);
	5627
	5628	uregex_close(icu_re);
	5629	utext_close(utext_pat);
	5630	utext_close(utext_txt);
	5631	}
	5632
	5633	void RegexTest::TestCaseInsensitiveStarters() {
	5634	// Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
	5635	// become stale because of new Unicode characters.
	5636	// If it is stale, rerun the generation tool
	5637	// svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
	5638	// and replace the embedded data in i18n/regexcmp.cpp
	5639
	5640	for (UChar32 cp=0; cp<=0x10ffff; cp++) {
	5641	if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
	5642	continue;
	5643	}
	5644	UnicodeSet s(cp, cp);
	5645	s.closeOver(USET_CASE_INSENSITIVE);
	5646	UnicodeSetIterator setIter(s);
	5647	while (setIter.next()) {
	5648	if (!setIter.isString()) {
	5649	continue;
	5650	}
	5651	const UnicodeString &str = setIter.getString();
	5652	UChar32 firstChar = str.char32At(0);
	5653	UnicodeSet starters;
	5654	RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
	5655	if (!starters.contains(cp)) {
	5656	errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
	5657	return;
	5658	}
	5659	}
	5660	}
	5661	}
	5662
	5663
	5664	void RegexTest::TestBug11049() {
	5665	// Original bug report: pattern with match start consisting of one of several individual characters,
	5666	// and the text being matched ending with a supplementary character. find() would read past the
	5667	// end of the input text when searching for potential match starting points.
	5668
	5669	// To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
	5670	// detect the bad read.
	5671
	5672	TestCase11049("A\|B\|C", "a string \\ud800\\udc00", FALSE, __LINE__);
	5673	TestCase11049("A\|B\|C", "string matches at end C", TRUE, __LINE__);
	5674
	5675	// Test again with a pattern starting with a single character,
	5676	// which takes a different code path than starting with an OR expression,
	5677	// but with similar logic.
	5678	TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
	5679	TestCase11049("C", "string matches at end C", TRUE, __LINE__);
	5680	}
	5681
	5682	// Run a single test case from TestBug11049(). Internal function.
	5683	void RegexTest::TestCase11049(const char pattern, const char data, UBool expectMatch, int32_t lineNumber) {
	5684	UErrorCode status = U_ZERO_ERROR;
	5685	UnicodeString patternString = UnicodeString(pattern).unescape();
	5686	LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
	5687
	5688	UnicodeString dataString = UnicodeString(data).unescape();
	5689	UChar *exactBuffer = new UChar[dataString.length()];
	5690	dataString.extract(exactBuffer, dataString.length(), status);
	5691	UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
	5692
	5693	LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
	5694	REGEX_CHECK_STATUS;
	5695	matcher->reset(ut);
	5696	UBool result = matcher->find();
	5697	if (result != expectMatch) {
	5698	errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
	5699	__FILE__, lineNumber, expectMatch, result, pattern, data);
	5700	}
	5701
	5702	// Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
	5703	// off-by-one on find() with match at the last code point.
	5704	// Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
	5705	// because string.unescape() will only shrink it.
	5706	char * utf8Buffer = new char[uprv_strlen(data)+1];
	5707	u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
	5708	REGEX_CHECK_STATUS;
	5709	ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
	5710	REGEX_CHECK_STATUS;
	5711	matcher->reset(ut);
	5712	result = matcher->find();
	5713	if (result != expectMatch) {
	5714	errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
	5715	__FILE__, lineNumber, expectMatch, result, pattern, data);
	5716	}
	5717	delete [] utf8Buffer;
	5718
	5719	utext_close(ut);
	5720	delete [] exactBuffer;
	5721	}
	5722
	5723
	5724	void RegexTest::TestBug11371() {
	5725	if (quick) {
	5726	logln("Skipping test. Runs in exhuastive mode only.");
	5727	return;
	5728	}
	5729	UErrorCode status = U_ZERO_ERROR;
	5730	UnicodeString patternString;
	5731
	5732	for (int i=0; i<8000000; i++) {
	5733	patternString.append(UnicodeString("()"));
	5734	}
	5735	LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
	5736	if (status != U_REGEX_PATTERN_TOO_BIG) {
	5737	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
	5738	__FILE__, __LINE__, u_errorName(status));
	5739	}
	5740
	5741	status = U_ZERO_ERROR;
	5742	patternString = "(";
	5743	for (int i=0; i<20000000; i++) {
	5744	patternString.append(UnicodeString("A++"));
	5745	}
	5746	patternString.append(UnicodeString("){0}B++"));
	5747	LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
	5748	if (status != U_REGEX_PATTERN_TOO_BIG) {
	5749	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
	5750	__FILE__, __LINE__, u_errorName(status));
	5751	}
	5752
	5753	// Pattern with too much string data, such that string indexes overflow operand data field size
	5754	// in compiled instruction.
	5755	status = U_ZERO_ERROR;
	5756	patternString = "";
	5757	while (patternString.length() < 0x00ffffff) {
	5758	patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
	5759	}
	5760	patternString.append(UnicodeString("X? trailing string"));
	5761	LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
	5762	if (status != U_REGEX_PATTERN_TOO_BIG) {
	5763	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
	5764	__FILE__, __LINE__, u_errorName(status));
	5765	}
	5766	}
	5767
	5768	void RegexTest::TestBug11480() {
	5769	// C API, get capture group of a group that does not participate in the match.
	5770	// (Returns a zero length string, with nul termination,
	5771	// indistinguishable from a group with a zero length match.)
	5772
	5773	UErrorCode status = U_ZERO_ERROR;
	5774	URegularExpression *re = uregex_openC("(A)\|(B)", 0, NULL, &status);
	5775	REGEX_CHECK_STATUS;
	5776	UnicodeString text = UNICODE_STRING_SIMPLE("A");
	5777	uregex_setText(re, text.getBuffer(), text.length(), &status);
	5778	REGEX_CHECK_STATUS;
	5779	REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
	5780	UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
	5781	int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
	5782	REGEX_ASSERT(length == 0);
	5783	REGEX_ASSERT(buf[0] == 13);
	5784	REGEX_ASSERT(buf[1] == 0);
	5785	REGEX_ASSERT(buf[2] == 13);
	5786	uregex_close(re);
	5787
	5788	// UText C++ API, length of match is 0 for non-participating matches.
	5789	UText ut = UTEXT_INITIALIZER;
	5790	utext_openUnicodeString(&ut, &text, &status);
	5791	RegexMatcher matcher(UnicodeString("(A)\|(B)"), 0, status);
	5792	REGEX_CHECK_STATUS;
	5793	matcher.reset(&ut);
	5794	REGEX_ASSERT(matcher.lookingAt(0, status));
	5795
	5796	// UText C++ API, Capture group 1 matches "A", position 0, length 1.
	5797	int64_t groupLen = -666;
	5798	UText group = UTEXT_INITIALIZER;
	5799	matcher.group(1, &group, groupLen, status);
	5800	REGEX_CHECK_STATUS;
	5801	REGEX_ASSERT(groupLen == 1);
	5802	REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
	5803
	5804	// Capture group 2, the (B), does not participate in the match.
	5805	matcher.group(2, &group, groupLen, status);
	5806	REGEX_CHECK_STATUS;
	5807	REGEX_ASSERT(groupLen == 0);
	5808	REGEX_ASSERT(matcher.start(2, status) == -1);
	5809	REGEX_CHECK_STATUS;
	5810	}
	5811
	5812	void RegexTest::TestBug12884() {
	5813	// setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
	5814	UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
	5815	UnicodeString text(u"hello");
	5816	UErrorCode status = U_ZERO_ERROR;
	5817	RegexMatcher m(pattern, text, 0, status);
	5818	REGEX_CHECK_STATUS;
	5819	m.setTimeLimit(5, status);
	5820	m.find(status);
	5821	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
	5822
	5823	// Non-greedy loops. They take a different code path during matching.
	5824	UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
	5825	status = U_ZERO_ERROR;
	5826	RegexMatcher ngM(ngPattern, text, 0, status);
	5827	REGEX_CHECK_STATUS;
	5828	ngM.setTimeLimit(5, status);
	5829	ngM.find(status);
	5830	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
	5831
	5832	// UText, wrapping non-UTF-16 text, also takes a different execution path.
	5833	const char text8 = reinterpret_cast<const char>(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
	5834	"carácter, sin importar la plataforma, sin importar el programa,"
	5835	"sin importar el idioma.");
	5836	status = U_ZERO_ERROR;
	5837	LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
	5838	REGEX_CHECK_STATUS;
	5839	m.reset(ut.getAlias());
	5840	m.find(status);
	5841	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
	5842
	5843	status = U_ZERO_ERROR;
	5844	ngM.reset(ut.getAlias());
	5845	ngM.find(status);
	5846	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
	5847	}
	5848
	5849	// Bug 13631. A find() of a pattern with a zero length look-behind assertions
	5850	// can cause a read past the end of the input text.
	5851	// The failure is seen when running this test with Clang's Addresss Sanitizer.
	5852
	5853	void RegexTest::TestBug13631() {
	5854	const UChar *pats[] = { u"(?<!^)",
	5855	u"(?<=^)",
	5856	nullptr
	5857	};
	5858	for (const UChar *pat=pats; pat; ++pat) {
	5859	UErrorCode status = U_ZERO_ERROR;
	5860	UnicodeString upat(*pat);
	5861	RegexMatcher matcher(upat, 0, status);
	5862	const UChar s =u'a';
	5863	UText *ut = utext_openUChars(nullptr, &s, 1, &status);
	5864	REGEX_CHECK_STATUS;
	5865	matcher.reset(ut);
	5866	while (matcher.find()) {
	5867	}
	5868	utext_close(ut);
	5869	}
	5870	}
	5871
	5872	// Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
	5873	// where a following group specification would be expected.
	5874	// Failure shows when running the test under Clang's Address Sanitizer.
	5875
	5876	void RegexTest::TestBug13632() {
	5877	UErrorCode status = U_ZERO_ERROR;
	5878	URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
	5879	const char16_t *sourceString = u"Hello, world.";
	5880	uregex_setText(re, sourceString, u_strlen(sourceString), &status);
	5881
	5882	const int32_t destCap = 20;
	5883	char16_t dest[destCap] = {};
	5884	const char16_t replacement[] = {u'x', u'$'}; // Not nul terminated string.
	5885	uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
	5886
	5887	assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
	5888	uregex_close(re);
	5889	}
	5890
	5891	void RegexTest::TestBug20359() {
	5892	// The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
	5893	// pairs. (Enter and exit pattern literal quote mode). Logic was correct.
	5894	// Changed implementation to loop instead of recursing.
	5895
	5896	UnicodeString pattern;
	5897	for (int i=0; i<50000; ++i) {
	5898	pattern += u"\\Q\\E";
	5899	}
	5900	pattern += u"x";
	5901
	5902	UErrorCode status = U_ZERO_ERROR;
	5903	LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
	5904	0, nullptr, &status));
	5905	assertSuccess(WHERE, status);
	5906
	5907	// We have passed the point where the bug crashed. The following is a small sanity
	5908	// check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
	5909
	5910	uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
	5911	assertSuccess(WHERE, status);
	5912	assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
	5913	assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
	5914	assertSuccess(WHERE, status);
	5915	}
	5916
	5917
	5918	void RegexTest::TestBug20863() {
	5919	// Test that patterns with a large number of named capture groups work correctly.
	5920	//
	5921	// The ticket was not for a bug per se, but to reduce memory usage by using lazy
	5922	// construction of the map from capture names to numbers, and decreasing the
	5923	// default size of the map.
	5924
	5925	constexpr int GROUP_COUNT = 2000;
	5926	std::vector<UnicodeString> groupNames;
	5927	for (int32_t i=0; i<GROUP_COUNT; ++i) {
	5928	UnicodeString name;
	5929	name.append(u"name");
	5930	name.append(Int64ToUnicodeString(i));
	5931	groupNames.push_back(name);
	5932	}
	5933
	5934	UnicodeString patternString;
	5935	for (UnicodeString name: groupNames) {
	5936	patternString.append(u"(?<");
	5937	patternString.append(name);
	5938	patternString.append(u">.)");
	5939	}
	5940
	5941	UErrorCode status = U_ZERO_ERROR;
	5942	UParseError pe;
	5943	LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
	5944	if (!assertSuccess(WHERE, status)) {
	5945	return;
	5946	}
	5947
	5948	for (int32_t i=0; i<GROUP_COUNT; ++i) {
	5949	int32_t group = pattern->groupNumberFromName(groupNames[i], status);
	5950	if (!assertSuccess(WHERE, status)) {
	5951	return;
	5952	}
	5953	assertEquals(WHERE, i+1, group);
	5954	// Note: group 0 is the overall match; group 1 is the first separate capture group.
	5955	}
	5956
	5957	// Verify that assignment of patterns with various combinations of named capture work.
	5958	// Lazy creation of the internal named capture map changed the implementation logic here.
	5959	{
	5960	LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
	5961	LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
	5962	assertSuccess(WHERE, status);
	5963	assertFalse(WHERE, pat1 == pat2);
	5964	pat1 = pat2;
	5965	assertTrue(WHERE, pat1 == pat2);
	5966	assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
	5967	assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
	5968	assertSuccess(WHERE, status);
	5969	}
	5970
	5971	{
	5972	LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
	5973	LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
	5974	assertSuccess(WHERE, status);
	5975	assertFalse(WHERE, pat1 == pat2);
	5976	pat2 = pat1;
	5977	assertTrue(WHERE, pat1 == pat2);
	5978	assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
	5979	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
	5980	status = U_ZERO_ERROR;
	5981	assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
	5982	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
	5983	status = U_ZERO_ERROR;
	5984	}
	5985
	5986	{
	5987	LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
	5988	LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
	5989	assertSuccess(WHERE, status);
	5990	assertFalse(WHERE, pat1 == pat2);
	5991	pat2 = pat1;
	5992	assertTrue(WHERE, pat1 == pat2);
	5993	assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
	5994	assertSuccess(WHERE, status);
	5995	assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
	5996	assertSuccess(WHERE, status);
	5997	assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
	5998	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
	5999	status = U_ZERO_ERROR;
	6000	assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
	6001	assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
	6002	status = U_ZERO_ERROR;
	6003	}
	6004
	6005	}
	6006
	6007
	6008	#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */