git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/regextst.cpp

... / ...

Commit	Line	Data
	1	/********************************************************************
	2	* COPYRIGHT:
	3	* Copyright (c) 2002-2014, International Business Machines Corporation and
	4	* others. All Rights Reserved.
	5	********************************************************************/
	6
	7	//
	8	// regextst.cpp
	9	//
	10	// ICU Regular Expressions test, part of intltest.
	11	//
	12
	13	/*
	14	NOTE!!
	15
	16	PLEASE be careful about ASCII assumptions in this test.
	17	This test is one of the worst repeat offenders.
	18	If you have questions, contact someone on the ICU PMC
	19	who has access to an EBCDIC system.
	20
	21	*/
	22
	23	#include "intltest.h"
	24	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
	25
	26	#include "unicode/regex.h"
	27	#include "unicode/uchar.h"
	28	#include "unicode/ucnv.h"
	29	#include "unicode/uniset.h"
	30	#include "unicode/uregex.h"
	31	#include "unicode/ustring.h"
	32	#include "regextst.h"
	33	#include "uvector.h"
	34	#include "util.h"
	35	#include <stdlib.h>
	36	#include <string.h>
	37	#include <stdio.h>
	38	#include "cstring.h"
	39	#include "uinvchar.h"
	40
	41	#define SUPPORT_MUTATING_INPUT_STRING 0
	42
	43	//---------------------------------------------------------------------------
	44	//
	45	// Test class boilerplate
	46	//
	47	//---------------------------------------------------------------------------
	48	RegexTest::RegexTest()
	49	{
	50	}
	51
	52
	53	RegexTest::~RegexTest()
	54	{
	55	}
	56
	57
	58
	59	void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /par/ )
	60	{
	61	if (exec) logln("TestSuite RegexTest: ");
	62	switch (index) {
	63
	64	case 0: name = "Basic";
	65	if (exec) Basic();
	66	break;
	67	case 1: name = "API_Match";
	68	if (exec) API_Match();
	69	break;
	70	case 2: name = "API_Replace";
	71	if (exec) API_Replace();
	72	break;
	73	case 3: name = "API_Pattern";
	74	if (exec) API_Pattern();
	75	break;
	76	case 4:
	77	#if !UCONFIG_NO_FILE_IO
	78	name = "Extended";
	79	if (exec) Extended();
	80	#else
	81	name = "skip";
	82	#endif
	83	break;
	84	case 5: name = "Errors";
	85	if (exec) Errors();
	86	break;
	87	case 6: name = "PerlTests";
	88	if (exec) PerlTests();
	89	break;
	90	case 7: name = "Callbacks";
	91	if (exec) Callbacks();
	92	break;
	93	case 8: name = "FindProgressCallbacks";
	94	if (exec) FindProgressCallbacks();
	95	break;
	96	case 9: name = "Bug 6149";
	97	if (exec) Bug6149();
	98	break;
	99	case 10: name = "UTextBasic";
	100	if (exec) UTextBasic();
	101	break;
	102	case 11: name = "API_Match_UTF8";
	103	if (exec) API_Match_UTF8();
	104	break;
	105	case 12: name = "API_Replace_UTF8";
	106	if (exec) API_Replace_UTF8();
	107	break;
	108	case 13: name = "API_Pattern_UTF8";
	109	if (exec) API_Pattern_UTF8();
	110	break;
	111	case 14: name = "PerlTestsUTF8";
	112	if (exec) PerlTestsUTF8();
	113	break;
	114	case 15: name = "PreAllocatedUTextCAPI";
	115	if (exec) PreAllocatedUTextCAPI();
	116	break;
	117	case 16: name = "Bug 7651";
	118	if (exec) Bug7651();
	119	break;
	120	case 17: name = "Bug 7740";
	121	if (exec) Bug7740();
	122	break;
	123	case 18: name = "Bug 8479";
	124	if (exec) Bug8479();
	125	break;
	126	case 19: name = "Bug 7029";
	127	if (exec) Bug7029();
	128	break;
	129	case 20: name = "CheckInvBufSize";
	130	if (exec) CheckInvBufSize();
	131	break;
	132	case 21: name = "Bug 9283";
	133	if (exec) Bug9283();
	134	break;
	135	case 22: name = "Bug10459";
	136	if (exec) Bug10459();
	137	break;
	138
	139	default: name = "";
	140	break; //needed to end loop
	141	}
	142	}
	143
	144
	145
	146	/**
	147	* Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
	148	* into ASCII.
	149	* @see utext_openUTF8
	150	*/
	151	static UText* regextst_openUTF8FromInvariant(UText* ut, const char inv, int64_t length, UErrorCode status);
	152
	153	//---------------------------------------------------------------------------
	154	//
	155	// Error Checking / Reporting macros used in all of the tests.
	156	//
	157	//---------------------------------------------------------------------------
	158
	159	static void utextToPrintable(char buf, int32_t bufLen, UText text) {
	160	int64_t oldIndex = utext_getNativeIndex(text);
	161	utext_setNativeIndex(text, 0);
	162	char *bufPtr = buf;
	163	UChar32 c = utext_next32From(text, 0);
	164	while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
	165	if (0x000020<=c && c<0x00007e) {
	166	*bufPtr = c;
	167	} else {
	168	#if 0
	169	sprintf(bufPtr,"U+%04X", c);
	170	bufPtr+= strlen(bufPtr)-1;
	171	#else
	172	*bufPtr = '%';
	173	#endif
	174	}
	175	bufPtr++;
	176	c = UTEXT_NEXT32(text);
	177	}
	178	*bufPtr = 0;
	179	#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
	180	char ebuf = (char)malloc(bufLen);
	181	uprv_eastrncpy((unsigned char)ebuf, (const unsigned char)buf, bufLen);
	182	uprv_strncpy(buf, ebuf, bufLen);
	183	free((void*)ebuf);
	184	#endif
	185	utext_setNativeIndex(text, oldIndex);
	186	}
	187
	188
	189	static char ASSERT_BUF[1024];
	190
	191	const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
	192	if(message.length()==0) {
	193	strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
	194	} else {
	195	UnicodeString buf;
	196	IntlTest::prettify(message,buf);
	197	if(buf.length()==0) {
	198	strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
	199	} else {
	200	buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
	201	if(ASSERT_BUF[0]==0) {
	202	ASSERT_BUF[0]=0;
	203	for(int32_t i=0;i<buf.length();i++) {
	204	UChar ch = buf[i];
	205	sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
	206	}
	207	}
	208	}
	209	}
	210	ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
	211	return ASSERT_BUF;
	212	}
	213
	// Element count of a C array, as an int32_t.
	#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

	// Log the printable form of a UText together with the source location and the expression text.
	#define REGEX_VERBOSE_TEXT(text) { \
	    char buf[200]; \
	    utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text); \
	    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
	}

	// If the local variable 'status' holds a failure: report it and return from the enclosing function.
	#define REGEX_CHECK_STATUS { \
	    if (U_FAILURE(status)) { \
	        dataerrln("%s:%d: RegexTest failure. status=%s", \
	                  __FILE__, __LINE__, u_errorName(status)); \
	        return; \
	    } \
	}

	// Report an error if expr is false; execution continues afterwards.
	#define REGEX_ASSERT(expr) { \
	    if ((expr)==FALSE) { \
	        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
	    } \
	}

	// Evaluate expr against a fresh local 'status' and report if it does not end up equal to errcode.
	#define REGEX_ASSERT_FAIL(expr, errcode) { \
	    UErrorCode status=U_ZERO_ERROR; \
	    (expr); \
	    if (status!=errcode) { \
	        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
	                  __LINE__, u_errorName(errcode), u_errorName(status)); \
	    } \
	}

	// Like REGEX_CHECK_STATUS but also reports the caller-supplied line, and does not return.
	#define REGEX_CHECK_STATUS_L(line) { \
	    if (U_FAILURE(status)) { \
	        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
	    } \
	}

	// Report (with the caller-supplied line) and return from the enclosing function if expr is false.
	#define REGEX_ASSERT_L(expr, line) { \
	    if ((expr)==FALSE) { \
	        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
	        return; \
	    } \
	}

	// Report an error if the string ustr does not compare equal to inv.
	#define REGEX_ASSERT_UNISTR(ustr,inv) { \
	    if (!(ustr==inv)) { \
	        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", \
	              __FILE__, __LINE__, extractToAssertBuf(ustr),inv); \
	    } \
	}
	234
	235
	236	static UBool testUTextEqual(UText uta, UText utb) {
	237	UChar32 ca = 0;
	238	UChar32 cb = 0;
	239	utext_setNativeIndex(uta, 0);
	240	utext_setNativeIndex(utb, 0);
	241	do {
	242	ca = utext_next32(uta);
	243	cb = utext_next32(utb);
	244	if (ca != cb) {
	245	break;
	246	}
	247	} while (ca != U_SENTINEL);
	248	return ca == cb;
	249	}
	250
	251
	252	/**
	253	* @param expected expected text in UTF-8 (not platform) codepage
	254	*/
	255	void RegexTest::assertUText(const char expected, UText actual, const char *file, int line) {
	256	UErrorCode status = U_ZERO_ERROR;
	257	UText expectedText = UTEXT_INITIALIZER;
	258	utext_openUTF8(&expectedText, expected, -1, &status);
	259	if(U_FAILURE(status)) {
	260	errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
	261	return;
	262	}
	263	if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
	264	errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
	265	return;
	266	}
	267	utext_setNativeIndex(actual, 0);
	268	if (!testUTextEqual(&expectedText, actual)) {
	269	char buf[201 /21/];
	270	char expectedBuf[201];
	271	utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
	272	utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
	273	errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
	274	}
	275	utext_close(&expectedText);
	276	}
	277	/**
	278	* @param expected invariant (platform local text) input
	279	*/
	280
	281	void RegexTest::assertUTextInvariant(const char expected, UText actual, const char *file, int line) {
	282	UErrorCode status = U_ZERO_ERROR;
	283	UText expectedText = UTEXT_INITIALIZER;
	284	regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
	285	if(U_FAILURE(status)) {
	286	errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
	287	return;
	288	}
	289	utext_setNativeIndex(actual, 0);
	290	if (!testUTextEqual(&expectedText, actual)) {
	291	char buf[201 /21/];
	292	char expectedBuf[201];
	293	utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
	294	utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
	295	errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
	296	}
	297	utext_close(&expectedText);
	298	}
	299
	/**
	 * Check a UText against expected text given as UTF-8.
	 * Forwards to assertUText() with the caller's file and line.
	 */
	#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
	    assertUText((expected), (actual), __FILE__, __LINE__)

	/**
	 * Check a UText against expected text given in invariant characters.
	 * Forwards to assertUTextInvariant() with the caller's file and line.
	 */
	#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
	    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
	308
	309	/**
	310	* This buffer ( inv_buf ) is used to hold the UTF-8 strings
	311	* passed into utext_openUTF8. An error will be given if
	312	* INV_BUFSIZ is too small. It's only used on EBCDIC systems.
	313	*/
	314
	315	#define INV_BUFSIZ 2048 /* increase this if too small */
	316
	317	static int64_t inv_next=0;
	318
	319	#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
	320	static char inv_buf[INV_BUFSIZ];
	321	#endif
	322
	323	static UText* regextst_openUTF8FromInvariant(UText ut, const char inv, int64_t length, UErrorCode *status) {
	324	if(length==-1) length=strlen(inv);
	325	#if U_CHARSET_FAMILY==U_ASCII_FAMILY
	326	inv_next+=length;
	327	return utext_openUTF8(ut, inv, length, status);
	328	#else
	329	if(inv_next+length+1>INV_BUFSIZ) {
	330	fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
	331	__FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
	332	*status = U_MEMORY_ALLOCATION_ERROR;
	333	return NULL;
	334	}
	335
	336	unsigned char buf = (unsigned char)inv_buf+inv_next;
	337	uprv_aestrncpy(buf, (const uint8_t*)inv, length);
	338	inv_next+=length;
	339
	340	#if 0
	341	fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
	342	#endif
	343
	344	return utext_openUTF8(ut, (const char*)buf, length, status);
	345	#endif
	346	}
	347
	348
	349	//---------------------------------------------------------------------------
	350	//
	351	// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
	352	// for the LookingAt() and Match() functions.
	353	//
	354	// usage:
	355	// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
	356	//
	357	// The expected results are UBool - TRUE or FALSE.
	358	// The input text is unescaped. The pattern is not.
	359	//
	360	//
	361	//---------------------------------------------------------------------------
	362
	363	#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
	364
	365	UBool RegexTest::doRegexLMTest(const char pat, const char text, UBool looking, UBool match, int32_t line) {
	366	const UnicodeString pattern(pat, -1, US_INV);
	367	const UnicodeString inputText(text, -1, US_INV);
	368	UErrorCode status = U_ZERO_ERROR;
	369	UParseError pe;
	370	RegexPattern *REPattern = NULL;
	371	RegexMatcher *REMatcher = NULL;
	372	UBool retVal = TRUE;
	373
	374	UnicodeString patString(pat, -1, US_INV);
	375	REPattern = RegexPattern::compile(patString, 0, pe, status);
	376	if (U_FAILURE(status)) {
	377	dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
	378	line, u_errorName(status));
	379	return FALSE;
	380	}
	381	if (line==376) { REPattern->dumpPattern();}
	382
	383	UnicodeString inputString(inputText);
	384	UnicodeString unEscapedInput = inputString.unescape();
	385	REMatcher = REPattern->matcher(unEscapedInput, status);
	386	if (U_FAILURE(status)) {
	387	errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
	388	line, u_errorName(status));
	389	return FALSE;
	390	}
	391
	392	UBool actualmatch;
	393	actualmatch = REMatcher->lookingAt(status);
	394	if (U_FAILURE(status)) {
	395	errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
	396	line, u_errorName(status));
	397	retVal = FALSE;
	398	}
	399	if (actualmatch != looking) {
	400	errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
	401	retVal = FALSE;
	402	}
	403
	404	status = U_ZERO_ERROR;
	405	actualmatch = REMatcher->matches(status);
	406	if (U_FAILURE(status)) {
	407	errln("RegexTest failure in matches() at line %d. Status = %s\n",
	408	line, u_errorName(status));
	409	retVal = FALSE;
	410	}
	411	if (actualmatch != match) {
	412	errln("RegexTest: wrong return from matches() at line %d.\n", line);
	413	retVal = FALSE;
	414	}
	415
	416	if (retVal == FALSE) {
	417	REPattern->dumpPattern();
	418	}
	419
	420	delete REPattern;
	421	delete REMatcher;
	422	return retVal;
	423	}
	424
	425
	426	UBool RegexTest::doRegexLMTestUTF8(const char pat, const char text, UBool looking, UBool match, int32_t line) {
	427	UText pattern = UTEXT_INITIALIZER;
	428	int32_t inputUTF8Length;
	429	char *textChars = NULL;
	430	UText inputText = UTEXT_INITIALIZER;
	431	UErrorCode status = U_ZERO_ERROR;
	432	UParseError pe;
	433	RegexPattern *REPattern = NULL;
	434	RegexMatcher *REMatcher = NULL;
	435	UBool retVal = TRUE;
	436
	437	regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
	438	REPattern = RegexPattern::compile(&pattern, 0, pe, status);
	439	if (U_FAILURE(status)) {
	440	dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
	441	line, u_errorName(status));
	442	return FALSE;
	443	}
	444
	445	UnicodeString inputString(text, -1, US_INV);
	446	UnicodeString unEscapedInput = inputString.unescape();
	447	LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
	448	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	449
	450	inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
	451	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
	452	// UTF-8 does not allow unpaired surrogates, so this could actually happen
	453	logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
	454	return TRUE; // not a failure of the Regex engine
	455	}
	456	status = U_ZERO_ERROR; // buffer overflow
	457	textChars = new char[inputUTF8Length+1];
	458	unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
	459	utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
	460
	461	REMatcher = &REPattern->matcher(status)->reset(&inputText);
	462	if (U_FAILURE(status)) {
	463	errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
	464	line, u_errorName(status));
	465	return FALSE;
	466	}
	467
	468	UBool actualmatch;
	469	actualmatch = REMatcher->lookingAt(status);
	470	if (U_FAILURE(status)) {
	471	errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
	472	line, u_errorName(status));
	473	retVal = FALSE;
	474	}
	475	if (actualmatch != looking) {
	476	errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
	477	retVal = FALSE;
	478	}
	479
	480	status = U_ZERO_ERROR;
	481	actualmatch = REMatcher->matches(status);
	482	if (U_FAILURE(status)) {
	483	errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
	484	line, u_errorName(status));
	485	retVal = FALSE;
	486	}
	487	if (actualmatch != match) {
	488	errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
	489	retVal = FALSE;
	490	}
	491
	492	if (retVal == FALSE) {
	493	REPattern->dumpPattern();
	494	}
	495
	496	delete REPattern;
	497	delete REMatcher;
	498	utext_close(&inputText);
	499	utext_close(&pattern);
	500	delete[] textChars;
	501	return retVal;
	502	}
	503
	504
	505
	506	//---------------------------------------------------------------------------
	507	//
	508	// REGEX_ERR Macro + invocation function to simplify writing tests
	509	// regex tests for incorrect patterns
	510	//
	511	// usage:
	512	// REGEX_ERR("pattern", expected error line, column, expected status);
	513	//
	514	//---------------------------------------------------------------------------
	515	#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
	516
	517	void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
	518	UErrorCode expectedStatus, int32_t line) {
	519	UnicodeString pattern(pat);
	520
	521	UErrorCode status = U_ZERO_ERROR;
	522	UParseError pe;
	523	RegexPattern *callerPattern = NULL;
	524
	525	//
	526	// Compile the caller's pattern
	527	//
	528	UnicodeString patString(pat);
	529	callerPattern = RegexPattern::compile(patString, 0, pe, status);
	530	if (status != expectedStatus) {
	531	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
	532	} else {
	533	if (status != U_ZERO_ERROR) {
	534	if (pe.line != errLine \|\| pe.offset != errCol) {
	535	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
	536	line, errLine, errCol, pe.line, pe.offset);
	537	}
	538	}
	539	}
	540
	541	delete callerPattern;
	542
	543	//
	544	// Compile again, using a UTF-8-based UText
	545	//
	546	UText patternText = UTEXT_INITIALIZER;
	547	regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
	548	callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
	549	if (status != expectedStatus) {
	550	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
	551	} else {
	552	if (status != U_ZERO_ERROR) {
	553	if (pe.line != errLine \|\| pe.offset != errCol) {
	554	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
	555	line, errLine, errCol, pe.line, pe.offset);
	556	}
	557	}
	558	}
	559
	560	delete callerPattern;
	561	utext_close(&patternText);
	562	}
	563
	564
	565
	566	//---------------------------------------------------------------------------
	567	//
	568	// Basic Check for basic functionality of regex pattern matching.
	569	// Avoid the use of REGEX_FIND test macro, which has
	570	// substantial dependencies on basic Regex functionality.
	571	//
	572	//---------------------------------------------------------------------------
	573	void RegexTest::Basic() {
	574
	575
	576	//
	577	// Debug - slide failing test cases early
	578	//
	579	#if 0
	580	{
	581	// REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
	582	UParseError pe;
	583	UErrorCode status = U_ZERO_ERROR;
	584	RegexPattern *pattern;
	585	pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
	586	pattern->dumpPattern();
	587	RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
	588	UBool result = m->find();
	589	printf("result = %d\n", result);
	590	// REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
	591	// REGEX_FIND("(X([abc=X]+)+X)\|(y[abc=]+)", "=XX====================");
	592	}
	593	exit(1);
	594	#endif
	595
	596
	597	//
	598	// Pattern with parentheses
	599	//
	600	REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
	601	REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
	602	REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
	603
	604	//
	605	// Patterns with *
	606	//
	607	REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
	608	REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
	609	REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
	610	REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
	611	REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
	612
	613	REGEX_TESTLM("a*", "", TRUE, TRUE);
	614	REGEX_TESTLM("a*", "b", TRUE, FALSE);
	615
	616
	617	//
	618	// Patterns with "."
	619	//
	620	REGEX_TESTLM(".", "abc", TRUE, FALSE);
	621	REGEX_TESTLM("...", "abc", TRUE, TRUE);
	622	REGEX_TESTLM("....", "abc", FALSE, FALSE);
	623	REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
	624	REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
	625	REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
	626	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
	627	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
	628
	629	//
	630	// Patterns with * applied to chars at end of literal string
	631	//
	632	REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
	633	REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
	634
	635	//
	636	// Supplemental chars match as single chars, not a pair of surrogates.
	637	//
	638	REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
	639	REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
	640	REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
	641
	642
	643	//
	644	// UnicodeSets in the pattern
	645	//
	646	REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
	647	REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
	648	REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
	649	REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
	650	REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
	651	REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
	652
	653	REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
	654	REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
	655	REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
	656	REGEX_TESTLM("[\\p{Nd}]", "a123456", TRUE, FALSE); // note that matches 0 occurences.
	657	REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
	658
	659	//
	660	// OR operator in patterns
	661	//
	662	REGEX_TESTLM("(a\|b)", "a", TRUE, TRUE);
	663	REGEX_TESTLM("(a\|b)", "b", TRUE, TRUE);
	664	REGEX_TESTLM("(a\|b)", "c", FALSE, FALSE);
	665	REGEX_TESTLM("a\|b", "b", TRUE, TRUE);
	666
	667	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabc", TRUE, TRUE);
	668	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabdc", TRUE, FALSE);
	669	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "ac", TRUE, TRUE);
	670	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "123", TRUE, TRUE);
	671	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "123", TRUE, TRUE);
	672	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "222211111czzzzw", TRUE, FALSE);
	673
	674	//
	675	// +
	676	//
	677	REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
	678	REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
	679	REGEX_TESTLM("b+", "", FALSE, FALSE);
	680	REGEX_TESTLM("(abc\|def)+", "defabc", TRUE, TRUE);
	681	REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
	682	REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
	683
	684	//
	685	// ?
	686	//
	687	REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
	688	REGEX_TESTLM("ab?", "a", TRUE, TRUE);
	689	REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
	690	REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
	691	REGEX_TESTLM("a(b\|c)?d", "abd", TRUE, TRUE);
	692	REGEX_TESTLM("a(b\|c)?d", "acd", TRUE, TRUE);
	693	REGEX_TESTLM("a(b\|c)?d", "ad", TRUE, TRUE);
	694	REGEX_TESTLM("a(b\|c)?d", "abcd", FALSE, FALSE);
	695	REGEX_TESTLM("a(b\|c)?d", "ab", FALSE, FALSE);
	696
	697	//
	698	// Escape sequences that become single literal chars, handled internally
	699	// by ICU's Unescape.
	700	//
	701
	702	// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
	703	REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
	704	REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
	705	REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
	706	REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
	707	REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
	708	REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
	709	REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
	710	REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
	711	REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
	712
	713	REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
	714	REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
	715
	716	// Escape of special chars in patterns
	717	REGEX_TESTLM("\\\\\\\|\$\$\\[\\{\\~\\$\\\\+\\?\\.", "\\\\\|()[{~$+?.", TRUE, TRUE);
	718	}
	719
	720
	721	//---------------------------------------------------------------------------
	722	//
	723	// UTextBasic Check for quirks that are specific to the UText
	724	// implementation.
	725	//
	726	//---------------------------------------------------------------------------
	727	void RegexTest::UTextBasic() {
	728	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	729	UErrorCode status = U_ZERO_ERROR;
	730	UText pattern = UTEXT_INITIALIZER;
	731	utext_openUTF8(&pattern, str_abc, -1, &status);
	732	RegexMatcher matcher(&pattern, 0, status);
	733	REGEX_CHECK_STATUS;
	734
	735	UText input = UTEXT_INITIALIZER;
	736	utext_openUTF8(&input, str_abc, -1, &status);
	737	REGEX_CHECK_STATUS;
	738	matcher.reset(&input);
	739	REGEX_CHECK_STATUS;
	740	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
	741
	742	matcher.reset(matcher.inputText());
	743	REGEX_CHECK_STATUS;
	744	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
	745
	746	utext_close(&pattern);
	747	utext_close(&input);
	748	}
	749
	750
	751	//---------------------------------------------------------------------------
	752	//
	753	// API_Match Test that the API for class RegexMatcher
	754	// is present and nominally working, but excluding functions
	755	// implementing replace operations.
	756	//
	757	//---------------------------------------------------------------------------
	758	void RegexTest::API_Match() {
	759	UParseError pe;
	760	UErrorCode status=U_ZERO_ERROR;
	761	int32_t flags = 0;
	762
	763	//
	764	// Debug - slide failing test cases early
	765	//
	766	#if 0
	767	{
	768	}
	769	return;
	770	#endif
	771
	772	//
	773	// Simple pattern compilation
	774	//
	775	{
	776	UnicodeString re("abc");
	777	RegexPattern *pat2;
	778	pat2 = RegexPattern::compile(re, flags, pe, status);
	779	REGEX_CHECK_STATUS;
	780
	781	UnicodeString inStr1 = "abcdef this is a test";
	782	UnicodeString instr2 = "not abc";
	783	UnicodeString empty = "";
	784
	785
	786	//
	787	// Matcher creation and reset.
	788	//
	789	RegexMatcher *m1 = pat2->matcher(inStr1, status);
	790	REGEX_CHECK_STATUS;
	791	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	792	REGEX_ASSERT(m1->input() == inStr1);
	793	m1->reset(instr2);
	794	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	795	REGEX_ASSERT(m1->input() == instr2);
	796	m1->reset(inStr1);
	797	REGEX_ASSERT(m1->input() == inStr1);
	798	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	799	m1->reset(empty);
	800	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	801	REGEX_ASSERT(m1->input() == empty);
	802	REGEX_ASSERT(&m1->pattern() == pat2);
	803
	804	//
	805	// reset(pos, status)
	806	//
	807	m1->reset(inStr1);
	808	m1->reset(4, status);
	809	REGEX_CHECK_STATUS;
	810	REGEX_ASSERT(m1->input() == inStr1);
	811	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	812
	813	m1->reset(-1, status);
	814	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	815	status = U_ZERO_ERROR;
	816
	817	m1->reset(0, status);
	818	REGEX_CHECK_STATUS;
	819	status = U_ZERO_ERROR;
	820
	821	int32_t len = m1->input().length();
	822	m1->reset(len-1, status);
	823	REGEX_CHECK_STATUS;
	824	status = U_ZERO_ERROR;
	825
	826	m1->reset(len, status);
	827	REGEX_CHECK_STATUS;
	828	status = U_ZERO_ERROR;
	829
	830	m1->reset(len+1, status);
	831	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	832	status = U_ZERO_ERROR;
	833
	834	//
	835	// match(pos, status)
	836	//
	837	m1->reset(instr2);
	838	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	839	m1->reset();
	840	REGEX_ASSERT(m1->matches(3, status) == FALSE);
	841	m1->reset();
	842	REGEX_ASSERT(m1->matches(5, status) == FALSE);
	843	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	844	REGEX_ASSERT(m1->matches(-1, status) == FALSE);
	845	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	846
	847	// Match() at end of string should fail, but should not
	848	// be an error.
	849	status = U_ZERO_ERROR;
	850	len = m1->input().length();
	851	REGEX_ASSERT(m1->matches(len, status) == FALSE);
	852	REGEX_CHECK_STATUS;
	853
	854	// Match beyond end of string should fail with an error.
	855	status = U_ZERO_ERROR;
	856	REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
	857	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	858
	859	// Successful match at end of string.
	860	{
	861	status = U_ZERO_ERROR;
	862	RegexMatcher m("A?", 0, status); // will match zero length string.
	863	REGEX_CHECK_STATUS;
	864	m.reset(inStr1);
	865	len = inStr1.length();
	866	REGEX_ASSERT(m.matches(len, status) == TRUE);
	867	REGEX_CHECK_STATUS;
	868	m.reset(empty);
	869	REGEX_ASSERT(m.matches(0, status) == TRUE);
	870	REGEX_CHECK_STATUS;
	871	}
	872
	873
	874	//
	875	// lookingAt(pos, status)
	876	//
	877	status = U_ZERO_ERROR;
	878	m1->reset(instr2); // "not abc"
	879	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	880	REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
	881	REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
	882	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	883	REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
	884	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	885	status = U_ZERO_ERROR;
	886	len = m1->input().length();
	887	REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
	888	REGEX_CHECK_STATUS;
	889	REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
	890	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	891
	892	delete m1;
	893	delete pat2;
	894	}
	895
	896
	897	//
	898	// Capture Group.
	899	// RegexMatcher::start();
	900	// RegexMatcher::end();
	901	// RegexMatcher::groupCount();
	902	//
	903	{
	904	int32_t flags=0;
	905	UParseError pe;
	906	UErrorCode status=U_ZERO_ERROR;
	907
	908	UnicodeString re("01(23(45)67)(.*)");
	909	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	910	REGEX_CHECK_STATUS;
	911	UnicodeString data = "0123456789";
	912
	913	RegexMatcher *matcher = pat->matcher(data, status);
	914	REGEX_CHECK_STATUS;
	915	REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
	916	static const int32_t matchStarts[] = {0, 2, 4, 8};
	917	static const int32_t matchEnds[] = {10, 8, 6, 10};
	918	int32_t i;
	919	for (i=0; i<4; i++) {
	920	int32_t actualStart = matcher->start(i, status);
	921	REGEX_CHECK_STATUS;
	922	if (actualStart != matchStarts[i]) {
	923	errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
	924	__LINE__, i, matchStarts[i], actualStart);
	925	}
	926	int32_t actualEnd = matcher->end(i, status);
	927	REGEX_CHECK_STATUS;
	928	if (actualEnd != matchEnds[i]) {
	929	errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
	930	__LINE__, i, matchEnds[i], actualEnd);
	931	}
	932	}
	933
	934	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
	935	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
	936
	937	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	938	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	939	matcher->reset();
	940	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
	941
	942	matcher->lookingAt(status);
	943	REGEX_ASSERT(matcher->group(status) == "0123456789");
	944	REGEX_ASSERT(matcher->group(0, status) == "0123456789");
	945	REGEX_ASSERT(matcher->group(1, status) == "234567" );
	946	REGEX_ASSERT(matcher->group(2, status) == "45" );
	947	REGEX_ASSERT(matcher->group(3, status) == "89" );
	948	REGEX_CHECK_STATUS;
	949	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	950	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	951	matcher->reset();
	952	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
	953
	954	delete matcher;
	955	delete pat;
	956
	957	}
	958
	959	//
	960	// find
	961	//
	962	{
	963	int32_t flags=0;
	964	UParseError pe;
	965	UErrorCode status=U_ZERO_ERROR;
	966
	967	UnicodeString re("abc");
	968	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	969	REGEX_CHECK_STATUS;
	970	UnicodeString data = ".abc..abc...abc..";
	971	// 012345678901234567
	972
	973	RegexMatcher *matcher = pat->matcher(data, status);
	974	REGEX_CHECK_STATUS;
	975	REGEX_ASSERT(matcher->find());
	976	REGEX_ASSERT(matcher->start(status) == 1);
	977	REGEX_ASSERT(matcher->find());
	978	REGEX_ASSERT(matcher->start(status) == 6);
	979	REGEX_ASSERT(matcher->find());
	980	REGEX_ASSERT(matcher->start(status) == 12);
	981	REGEX_ASSERT(matcher->find() == FALSE);
	982	REGEX_ASSERT(matcher->find() == FALSE);
	983
	984	matcher->reset();
	985	REGEX_ASSERT(matcher->find());
	986	REGEX_ASSERT(matcher->start(status) == 1);
	987
	988	REGEX_ASSERT(matcher->find(0, status));
	989	REGEX_ASSERT(matcher->start(status) == 1);
	990	REGEX_ASSERT(matcher->find(1, status));
	991	REGEX_ASSERT(matcher->start(status) == 1);
	992	REGEX_ASSERT(matcher->find(2, status));
	993	REGEX_ASSERT(matcher->start(status) == 6);
	994	REGEX_ASSERT(matcher->find(12, status));
	995	REGEX_ASSERT(matcher->start(status) == 12);
	996	REGEX_ASSERT(matcher->find(13, status) == FALSE);
	997	REGEX_ASSERT(matcher->find(16, status) == FALSE);
	998	REGEX_ASSERT(matcher->find(17, status) == FALSE);
	999	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
	1000
	1001	status = U_ZERO_ERROR;
	1002	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	1003	status = U_ZERO_ERROR;
	1004	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
	1005
	1006	REGEX_ASSERT(matcher->groupCount() == 0);
	1007
	1008	delete matcher;
	1009	delete pat;
	1010	}
	1011
	1012
	1013	//
	1014	// find, with \G in pattern (true if at the end of a previous match).
	1015	//
	1016	{
	1017	int32_t flags=0;
	1018	UParseError pe;
	1019	UErrorCode status=U_ZERO_ERROR;
	1020
	1021	UnicodeString re(".*?(?:(\\Gabc)\|(abc))", -1, US_INV);
	1022	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	1023	REGEX_CHECK_STATUS;
	1024	UnicodeString data = ".abcabc.abc..";
	1025	// 012345678901234567
	1026
	1027	RegexMatcher *matcher = pat->matcher(data, status);
	1028	REGEX_CHECK_STATUS;
	1029	REGEX_ASSERT(matcher->find());
	1030	REGEX_ASSERT(matcher->start(status) == 0);
	1031	REGEX_ASSERT(matcher->start(1, status) == -1);
	1032	REGEX_ASSERT(matcher->start(2, status) == 1);
	1033
	1034	REGEX_ASSERT(matcher->find());
	1035	REGEX_ASSERT(matcher->start(status) == 4);
	1036	REGEX_ASSERT(matcher->start(1, status) == 4);
	1037	REGEX_ASSERT(matcher->start(2, status) == -1);
	1038	REGEX_CHECK_STATUS;
	1039
	1040	delete matcher;
	1041	delete pat;
	1042	}
	1043
	1044	//
	1045	// find with zero length matches, match position should bump ahead
	1046	// to prevent loops.
	1047	//
	1048	{
	1049	int32_t i;
	1050	UErrorCode status=U_ZERO_ERROR;
	1051	RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
	1052	// using an always-true look-ahead.
	1053	REGEX_CHECK_STATUS;
	1054	UnicodeString s(" ");
	1055	m.reset(s);
	1056	for (i=0; ; i++) {
	1057	if (m.find() == FALSE) {
	1058	break;
	1059	}
	1060	REGEX_ASSERT(m.start(status) == i);
	1061	REGEX_ASSERT(m.end(status) == i);
	1062	}
	1063	REGEX_ASSERT(i==5);
	1064
	1065	// Check that the bump goes over surrogate pairs OK
	1066	s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
	1067	s = s.unescape();
	1068	m.reset(s);
	1069	for (i=0; ; i+=2) {
	1070	if (m.find() == FALSE) {
	1071	break;
	1072	}
	1073	REGEX_ASSERT(m.start(status) == i);
	1074	REGEX_ASSERT(m.end(status) == i);
	1075	}
	1076	REGEX_ASSERT(i==10);
	1077	}
	1078	{
	1079	// find() loop breaking test.
	1080	// with pattern of /.?/, should see a series of one char matches, then a single
	1081	// match of zero length at the end of the input string.
	1082	int32_t i;
	1083	UErrorCode status=U_ZERO_ERROR;
	1084	RegexMatcher m(".?", 0, status);
	1085	REGEX_CHECK_STATUS;
	1086	UnicodeString s(" ");
	1087	m.reset(s);
	1088	for (i=0; ; i++) {
	1089	if (m.find() == FALSE) {
	1090	break;
	1091	}
	1092	REGEX_ASSERT(m.start(status) == i);
	1093	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
	1094	}
	1095	REGEX_ASSERT(i==5);
	1096	}
	1097
	1098
	1099	//
	1100	// Matchers with no input string behave as if they had an empty input string.
	1101	//
	1102
	1103	{
	1104	UErrorCode status = U_ZERO_ERROR;
	1105	RegexMatcher m(".?", 0, status);
	1106	REGEX_CHECK_STATUS;
	1107	REGEX_ASSERT(m.find());
	1108	REGEX_ASSERT(m.start(status) == 0);
	1109	REGEX_ASSERT(m.input() == "");
	1110	}
	1111	{
	1112	UErrorCode status = U_ZERO_ERROR;
	1113	RegexPattern *p = RegexPattern::compile(".", 0, status);
	1114	RegexMatcher *m = p->matcher(status);
	1115	REGEX_CHECK_STATUS;
	1116
	1117	REGEX_ASSERT(m->find() == FALSE);
	1118	REGEX_ASSERT(m->input() == "");
	1119	delete m;
	1120	delete p;
	1121	}
	1122
	1123	//
	1124	// Regions
	1125	//
	1126	{
	1127	UErrorCode status = U_ZERO_ERROR;
	1128	UnicodeString testString("This is test data");
	1129	RegexMatcher m(".*", testString, 0, status);
	1130	REGEX_CHECK_STATUS;
	1131	REGEX_ASSERT(m.regionStart() == 0);
	1132	REGEX_ASSERT(m.regionEnd() == testString.length());
	1133	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1134	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1135
	1136	m.region(2,4, status);
	1137	REGEX_CHECK_STATUS;
	1138	REGEX_ASSERT(m.matches(status));
	1139	REGEX_ASSERT(m.start(status)==2);
	1140	REGEX_ASSERT(m.end(status)==4);
	1141	REGEX_CHECK_STATUS;
	1142
	1143	m.reset();
	1144	REGEX_ASSERT(m.regionStart() == 0);
	1145	REGEX_ASSERT(m.regionEnd() == testString.length());
	1146
	1147	UnicodeString shorterString("short");
	1148	m.reset(shorterString);
	1149	REGEX_ASSERT(m.regionStart() == 0);
	1150	REGEX_ASSERT(m.regionEnd() == shorterString.length());
	1151
	1152	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1153	REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
	1154	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	1155	REGEX_ASSERT(&m == &m.reset());
	1156	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	1157
	1158	REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
	1159	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1160	REGEX_ASSERT(&m == &m.reset());
	1161	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1162
	1163	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1164	REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
	1165	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	1166	REGEX_ASSERT(&m == &m.reset());
	1167	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	1168
	1169	REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
	1170	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1171	REGEX_ASSERT(&m == &m.reset());
	1172	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1173
	1174	}
	1175
	1176	//
	1177	// hitEnd() and requireEnd()
	1178	//
	1179	{
	1180	UErrorCode status = U_ZERO_ERROR;
	1181	UnicodeString testString("aabb");
	1182	RegexMatcher m1(".*", testString, 0, status);
	1183	REGEX_ASSERT(m1.lookingAt(status) == TRUE);
	1184	REGEX_ASSERT(m1.hitEnd() == TRUE);
	1185	REGEX_ASSERT(m1.requireEnd() == FALSE);
	1186	REGEX_CHECK_STATUS;
	1187
	1188	status = U_ZERO_ERROR;
	1189	RegexMatcher m2("a*", testString, 0, status);
	1190	REGEX_ASSERT(m2.lookingAt(status) == TRUE);
	1191	REGEX_ASSERT(m2.hitEnd() == FALSE);
	1192	REGEX_ASSERT(m2.requireEnd() == FALSE);
	1193	REGEX_CHECK_STATUS;
	1194
	1195	status = U_ZERO_ERROR;
	1196	RegexMatcher m3(".*$", testString, 0, status);
	1197	REGEX_ASSERT(m3.lookingAt(status) == TRUE);
	1198	REGEX_ASSERT(m3.hitEnd() == TRUE);
	1199	REGEX_ASSERT(m3.requireEnd() == TRUE);
	1200	REGEX_CHECK_STATUS;
	1201	}
	1202
	1203
	1204	//
	1205	// Compilation error on reset with UChar *
	1206	// These were a hazard that people were stumbling over with runtime errors.
	1207	// Changed them to compiler errors by adding private methods that more closely
	1208	// matched the incorrect use of the functions.
	1209	//
	1210	#if 0
	1211	{
	1212	UErrorCode status = U_ZERO_ERROR;
	1213	UChar ucharString[20];
	1214	RegexMatcher m(".", 0, status);
	1215	m.reset(ucharString); // should not compile.
	1216
	1217	RegexPattern *p = RegexPattern::compile(".", 0, status);
	1218	RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
	1219
	1220	RegexMatcher m3(".", ucharString, 0, status); // Should not compile
	1221	}
	1222	#endif
	1223
	1224	//
	1225	// Time Outs.
	1226	// Note: These tests will need to be changed when the regexp engine is
	1227	// able to detect and cut short the exponential time behavior on
	1228	// this type of match.
	1229	//
	1230	{
	1231	UErrorCode status = U_ZERO_ERROR;
	1232	// Enough 'a's in the string to cause the match to time out.
	1233	// (Each additional 'a' doubles the time)
	1234	UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
	1235	RegexMatcher matcher("(a+)+b", testString, 0, status);
	1236	REGEX_CHECK_STATUS;
	1237	REGEX_ASSERT(matcher.getTimeLimit() == 0);
	1238	matcher.setTimeLimit(100, status);
	1239	REGEX_ASSERT(matcher.getTimeLimit() == 100);
	1240	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1241	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
	1242	}
	1243	{
	1244	UErrorCode status = U_ZERO_ERROR;
	1245	// Few enough 'a's to slip in under the time limit.
	1246	UnicodeString testString("aaaaaaaaaaaaaaaaaa");
	1247	RegexMatcher matcher("(a+)+b", testString, 0, status);
	1248	REGEX_CHECK_STATUS;
	1249	matcher.setTimeLimit(100, status);
	1250	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1251	REGEX_CHECK_STATUS;
	1252	}
	1253
	1254	//
	1255	// Stack Limits
	1256	//
	1257	{
	1258	UErrorCode status = U_ZERO_ERROR;
	1259	UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
	1260
	1261	// Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
	1262	// of the '+', and makes the stack frames larger.
	1263	RegexMatcher matcher("(A)+A$", testString, 0, status);
	1264
	1265	// With the default stack, this match should fail to run
	1266	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1267	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
	1268
	1269	// With unlimited stack, it should run
	1270	status = U_ZERO_ERROR;
	1271	matcher.setStackLimit(0, status);
	1272	REGEX_CHECK_STATUS;
	1273	REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
	1274	REGEX_CHECK_STATUS;
	1275	REGEX_ASSERT(matcher.getStackLimit() == 0);
	1276
	1277	// With a limited stack, the match should fail
	1278	status = U_ZERO_ERROR;
	1279	matcher.setStackLimit(10000, status);
	1280	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1281	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
	1282	REGEX_ASSERT(matcher.getStackLimit() == 10000);
	1283	}
	1284
	1285	// A pattern that doesn't save state should work with
	1286	// a minimal sized stack
	1287	{
	1288	UErrorCode status = U_ZERO_ERROR;
	1289	UnicodeString testString = "abc";
	1290	RegexMatcher matcher("abc", testString, 0, status);
	1291	REGEX_CHECK_STATUS;
	1292	matcher.setStackLimit(30, status);
	1293	REGEX_CHECK_STATUS;
	1294	REGEX_ASSERT(matcher.matches(status) == TRUE);
	1295	REGEX_CHECK_STATUS;
	1296	REGEX_ASSERT(matcher.getStackLimit() == 30);
	1297
	1298	// Negative stack sizes should fail
	1299	status = U_ZERO_ERROR;
	1300	matcher.setStackLimit(1000, status);
	1301	REGEX_CHECK_STATUS;
	1302	matcher.setStackLimit(-1, status);
	1303	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	1304	REGEX_ASSERT(matcher.getStackLimit() == 1000);
	1305	}
	1306
	1307
	1308	}
	1309
	1310
	1311
	1312
	1313
	1314
	1315	//---------------------------------------------------------------------------
	1316	//
	1317	// API_Replace API test for class RegexMatcher, testing the
	1318	// Replace family of functions: replaceFirst(), replaceAll(),
	1319	// appendReplacement() and appendTail().
	1320	//---------------------------------------------------------------------------
	1321	void RegexTest::API_Replace() {
	1322	//
	1323	// Replace. The "abc" pattern and its matcher created here are reused by
	1324	// most sections below; reset(newInput) points the matcher at fresh text.
	1325	int32_t flags=0;
	1326	UParseError pe;
	1327	UErrorCode status=U_ZERO_ERROR;
	1328
	1329	UnicodeString re("abc");
	1330	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	1331	REGEX_CHECK_STATUS;
	1332	UnicodeString data = ".abc..abc...abc..";
	1333	// 012345678901234567
	1334	RegexMatcher *matcher = pat->matcher(data, status);
	1335
	1336	//
	1337	// Plain vanilla matches.
	1338	// replaceAll() is expected to replace every match even though replaceFirst() ran first.
	1339	UnicodeString dest;
	1340	dest = matcher->replaceFirst("yz", status);
	1341	REGEX_CHECK_STATUS;
	1342	REGEX_ASSERT(dest == ".yz..abc...abc..");
	1343
	1344	dest = matcher->replaceAll("yz", status);
	1345	REGEX_CHECK_STATUS;
	1346	REGEX_ASSERT(dest == ".yz..yz...yz..");
	1347
	1348	//
	1349	// Plain vanilla non-matches.
	1350	// With no match in the input, both calls must return the input unchanged.
	1351	UnicodeString d2 = ".abx..abx...abx..";
	1352	matcher->reset(d2);
	1353	dest = matcher->replaceFirst("yz", status);
	1354	REGEX_CHECK_STATUS;
	1355	REGEX_ASSERT(dest == ".abx..abx...abx..");
	1356
	1357	dest = matcher->replaceAll("yz", status);
	1358	REGEX_CHECK_STATUS;
	1359	REGEX_ASSERT(dest == ".abx..abx...abx..");
	1360
	1361	//
	1362	// Empty source string
	1363	// The result must be empty as well, without reporting an error.
	1364	UnicodeString d3 = "";
	1365	matcher->reset(d3);
	1366	dest = matcher->replaceFirst("yz", status);
	1367	REGEX_CHECK_STATUS;
	1368	REGEX_ASSERT(dest == "");
	1369
	1370	dest = matcher->replaceAll("yz", status);
	1371	REGEX_CHECK_STATUS;
	1372	REGEX_ASSERT(dest == "");
	1373
	1374	//
	1375	// Empty substitution string
	1376	// The matched text is expected to be removed from the result.
	1377	matcher->reset(data); // ".abc..abc...abc.."
	1378	dest = matcher->replaceFirst("", status);
	1379	REGEX_CHECK_STATUS;
	1380	REGEX_ASSERT(dest == "...abc...abc..");
	1381
	1382	dest = matcher->replaceAll("", status);
	1383	REGEX_CHECK_STATUS;
	1384	REGEX_ASSERT(dest == "........");
	1385
	1386	//
	1387	// match whole string
	1388	// The pattern covers the entire input, so the result is just the replacement text.
	1389	UnicodeString d4 = "abc";
	1390	matcher->reset(d4);
	1391	dest = matcher->replaceFirst("xyz", status);
	1392	REGEX_CHECK_STATUS;
	1393	REGEX_ASSERT(dest == "xyz");
	1394
	1395	dest = matcher->replaceAll("xyz", status);
	1396	REGEX_CHECK_STATUS;
	1397	REGEX_ASSERT(dest == "xyz");
	1398
	1399	//
	1400	// Capture Group, simple case
	1401	// "$1" in the replacement text inserts capture group 1; "\$" yields a literal '$'.
	1402	UnicodeString re2("a(..)");
	1403	RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
	1404	REGEX_CHECK_STATUS;
	1405	UnicodeString d5 = "abcdefg";
	1406	RegexMatcher *matcher2 = pat2->matcher(d5, status);
	1407	REGEX_CHECK_STATUS;
	1408	dest = matcher2->replaceFirst("$1$1", status);
	1409	REGEX_CHECK_STATUS;
	1410	REGEX_ASSERT(dest == "bcbcdefg");
	1411
	1412	dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
	1413	REGEX_CHECK_STATUS;
	1414	REGEX_ASSERT(dest == "The value of $1 is bc.defg");
	1415	// A '$' that is not followed by a group number is expected to be copied through literally.
	1416	dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
	1417	REGEX_CHECK_STATUS;
	1418	REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
	1419	// Group number written with the supplementary-plane digit U+1D7CF; the assertion expects it to act as "$1".
	1420	UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
	1421	replacement = replacement.unescape();
	1422	dest = matcher2->replaceFirst(replacement, status);
	1423	REGEX_CHECK_STATUS;
	1424	REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
	1425	// The pattern "a(..)" has a single capture group, so "$5" must be reported as out of bounds.
	1426	REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
	1427
	1428
	1429	//
	1430	// Replacement String with \u hex escapes
	1431	// Both the \uXXXX and \UXXXXXXXX forms must be expanded to the code point they name.
	1432	{
	1433	UnicodeString src = "abc 1 abc 2 abc 3";
	1434	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
	1435	matcher->reset(src);
	1436	UnicodeString result = matcher->replaceAll(substitute, status);
	1437	REGEX_CHECK_STATUS;
	1438	REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
	1439	}
	1440	{
	1441	UnicodeString src = "abc !";
	1442	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
	1443	matcher->reset(src);
	1444	UnicodeString result = matcher->replaceAll(substitute, status);
	1445	REGEX_CHECK_STATUS;
	1446	UnicodeString expected = UnicodeString("--");
	1447	expected.append((UChar32)0x10000);
	1448	expected.append("-- !");
	1449	REGEX_ASSERT(result == expected);
	1450	}
	1451	// TODO: need more thorough testing of capture substitutions.
	1452
	1453	// Bug 4057
	1454	// appendReplacement() output must begin at the start of the input, however the match was reached.
	1455	{
	1456	status = U_ZERO_ERROR;
	1457	UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
	1458	RegexMatcher m("ss(.*?)ee", 0, status);
	1459	REGEX_CHECK_STATUS;
	1460	UnicodeString result;
	1461
	1462	// Multiple finds do NOT bump up the previous appendReplacement position.
	1463	m.reset(s);
	1464	m.find();
	1465	m.find();
	1466	m.appendReplacement(result, "ooh", status);
	1467	REGEX_CHECK_STATUS;
	1468	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1469
	1470	// After a reset into the interior of a string, appendReplacement still starts at beginning.
	1471	status = U_ZERO_ERROR;
	1472	result.truncate(0);
	1473	m.reset(10, status);
	1474	m.find();
	1475	m.find();
	1476	m.appendReplacement(result, "ooh", status);
	1477	REGEX_CHECK_STATUS;
	1478	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1479
	1480	// find() at interior of string, appendReplacement still starts at beginning.
	1481	status = U_ZERO_ERROR;
	1482	result.truncate(0);
	1483	m.reset();
	1484	m.find(10, status);
	1485	m.find();
	1486	m.appendReplacement(result, "ooh", status);
	1487	REGEX_CHECK_STATUS;
	1488	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1489	// appendTail() adds the input text that follows the last match.
	1490	m.appendTail(result);
	1491	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
	1492
	1493	}
	1494
	1495	delete matcher2;
	1496	delete pat2;
	1497	delete matcher;
	1498	delete pat;
	1499	}
	1500
	1501
	1502	//---------------------------------------------------------------------------
	1503	//
	1504	// API_Pattern Test that the API for class RegexPattern is
	1505	// present and nominally working.
	1506	// Covers equality/assignment/copy/clone, matches(), split(), pattern() and class IDs.
	1507	//---------------------------------------------------------------------------
	1508	void RegexTest::API_Pattern() {
	1509	RegexPattern pata; // Test default constructor to not crash.
	1510	RegexPattern patb;
	1511
	1512	REGEX_ASSERT(pata == patb);
	1513	REGEX_ASSERT(pata == pata);
	1514
	1515	UnicodeString re1("abc[a-l][m-z]");
	1516	UnicodeString re2("def");
	1517	UErrorCode status = U_ZERO_ERROR;
	1518	UParseError pe;
	1519	// Equality checks on heap patterns dereference both sides so that RegexPattern::operator==/!= runs, not a pointer compare.
	1520	RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
	1521	RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
	1522	REGEX_CHECK_STATUS;
	1523	REGEX_ASSERT(*pat1 == *pat1);
	1524	REGEX_ASSERT(*pat1 != pata);
	1525
	1526	// Assign
	1527	patb = *pat1;
	1528	REGEX_ASSERT(patb == *pat1);
	1529
	1530	// Copy Construct
	1531	RegexPattern patc(*pat1);
	1532	REGEX_ASSERT(patc == *pat1);
	1533	REGEX_ASSERT(patb == patc);
	1534	REGEX_ASSERT(*pat1 != *pat2);
	1535	patb = *pat2;
	1536	REGEX_ASSERT(patb != patc);
	1537	REGEX_ASSERT(patb == *pat2);
	1538
	1539	// Compile with no flags.
	1540	RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
	1541	REGEX_ASSERT(*pat1a == *pat1); // same pattern text and flags, but a distinct object
	1542
	1543	REGEX_ASSERT(pat1a->flags() == 0);
	1544
	1545	// Compile with different flags should be not equal
	1546	RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
	1547	REGEX_CHECK_STATUS;
	1548
	1549	REGEX_ASSERT(*pat1b != *pat1a);
	1550	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
	1551	REGEX_ASSERT(pat1a->flags() == 0);
	1552	delete pat1b;
	1553
	1554	// clone
	1555	RegexPattern *pat1c = pat1->clone();
	1556	REGEX_ASSERT(*pat1c == *pat1); // the clone is a separate object that must compare equal
	1557	REGEX_ASSERT(*pat1c != *pat2);
	1558
	1559	delete pat1c;
	1560	delete pat1a;
	1561	delete pat1;
	1562	delete pat2;
	1563
	1564
	1565	//
	1566	// Verify that a matcher created from a cloned pattern works.
	1567	// (Jitterbug 3423)
	1568	//
	1569	{
	1570	UErrorCode status = U_ZERO_ERROR;
	1571	RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
	1572	RegexPattern *pClone = pSource->clone();
	1573	delete pSource; // the source is destroyed before the clone is used
	1574	RegexMatcher *mFromClone = pClone->matcher(status);
	1575	REGEX_CHECK_STATUS;
	1576	UnicodeString s = "Hello World";
	1577	mFromClone->reset(s);
	1578	REGEX_ASSERT(mFromClone->find() == TRUE);
	1579	REGEX_ASSERT(mFromClone->group(status) == "Hello");
	1580	REGEX_ASSERT(mFromClone->find() == TRUE);
	1581	REGEX_ASSERT(mFromClone->group(status) == "World");
	1582	REGEX_ASSERT(mFromClone->find() == FALSE);
	1583	delete mFromClone;
	1584	delete pClone;
	1585	}
	1586
	1587	//
	1588	// matches convenience API
	1589	// (the whole input has to match the pattern, as the ".*u" case shows)
	1590	REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
	1591	REGEX_CHECK_STATUS;
	1592	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
	1593	REGEX_CHECK_STATUS;
	1594	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
	1595	REGEX_CHECK_STATUS;
	1596	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
	1597	REGEX_CHECK_STATUS;
	1598	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
	1599	REGEX_CHECK_STATUS;
	1600	status = U_INDEX_OUTOFBOUNDS_ERROR; // with an error already set, matches() must return FALSE and leave status alone
	1601	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
	1602	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1603
	1604
	1605	//
	1606	// Split()
	1607	// The assertions also check that fields[] entries beyond the capacity passed in are left unmodified.
	1608	status = U_ZERO_ERROR;
	1609	pat1 = RegexPattern::compile(" +", pe, status);
	1610	REGEX_CHECK_STATUS;
	1611	UnicodeString fields[10];
	1612
	1613	int32_t n;
	1614	n = pat1->split("Now is the time", fields, 10, status);
	1615	REGEX_CHECK_STATUS;
	1616	REGEX_ASSERT(n==4);
	1617	REGEX_ASSERT(fields[0]=="Now");
	1618	REGEX_ASSERT(fields[1]=="is");
	1619	REGEX_ASSERT(fields[2]=="the");
	1620	REGEX_ASSERT(fields[3]=="time");
	1621	REGEX_ASSERT(fields[4]=="");
	1622	// With capacity 2, the last field receives the whole remainder of the input.
	1623	n = pat1->split("Now is the time", fields, 2, status);
	1624	REGEX_CHECK_STATUS;
	1625	REGEX_ASSERT(n==2);
	1626	REGEX_ASSERT(fields[0]=="Now");
	1627	REGEX_ASSERT(fields[1]=="is the time");
	1628	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
	1629
	1630	fields[1] = "*";
	1631	status = U_ZERO_ERROR;
	1632	n = pat1->split("Now is the time", fields, 1, status);
	1633	REGEX_CHECK_STATUS;
	1634	REGEX_ASSERT(n==1);
	1635	REGEX_ASSERT(fields[0]=="Now is the time");
	1636	REGEX_ASSERT(fields[1]=="*");
	1637	status = U_ZERO_ERROR;
	1638	// Leading and trailing delimiters produce empty first and last fields.
	1639	n = pat1->split(" Now is the time ", fields, 10, status);
	1640	REGEX_CHECK_STATUS;
	1641	REGEX_ASSERT(n==6);
	1642	REGEX_ASSERT(fields[0]=="");
	1643	REGEX_ASSERT(fields[1]=="Now");
	1644	REGEX_ASSERT(fields[2]=="is");
	1645	REGEX_ASSERT(fields[3]=="the");
	1646	REGEX_ASSERT(fields[4]=="time");
	1647	REGEX_ASSERT(fields[5]=="");
	1648
	1649	n = pat1->split(" ", fields, 10, status);
	1650	REGEX_CHECK_STATUS;
	1651	REGEX_ASSERT(n==2);
	1652	REGEX_ASSERT(fields[0]=="");
	1653	REGEX_ASSERT(fields[1]=="");
	1654	// Empty input: zero fields are reported and fields[0] keeps its old value.
	1655	fields[0] = "foo";
	1656	n = pat1->split("", fields, 10, status);
	1657	REGEX_CHECK_STATUS;
	1658	REGEX_ASSERT(n==0);
	1659	REGEX_ASSERT(fields[0]=="foo");
	1660
	1661	delete pat1;
	1662
	1663	// split, with a pattern with (capture); the captured text is returned as a field of its own
	1664	pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
	1665	REGEX_CHECK_STATUS;
	1666
	1667	status = U_ZERO_ERROR;
	1668	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
	1669	REGEX_CHECK_STATUS;
	1670	REGEX_ASSERT(n==7);
	1671	REGEX_ASSERT(fields[0]=="");
	1672	REGEX_ASSERT(fields[1]=="a");
	1673	REGEX_ASSERT(fields[2]=="Now is ");
	1674	REGEX_ASSERT(fields[3]=="b");
	1675	REGEX_ASSERT(fields[4]=="the time");
	1676	REGEX_ASSERT(fields[5]=="c");
	1677	REGEX_ASSERT(fields[6]=="");
	1678	REGEX_ASSERT(status==U_ZERO_ERROR);
	1679
	1680	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
	1681	REGEX_CHECK_STATUS;
	1682	REGEX_ASSERT(n==7);
	1683	REGEX_ASSERT(fields[0]==" ");
	1684	REGEX_ASSERT(fields[1]=="a");
	1685	REGEX_ASSERT(fields[2]=="Now is ");
	1686	REGEX_ASSERT(fields[3]=="b");
	1687	REGEX_ASSERT(fields[4]=="the time");
	1688	REGEX_ASSERT(fields[5]=="c");
	1689	REGEX_ASSERT(fields[6]=="");
	1690
	1691	status = U_ZERO_ERROR;
	1692	fields[6] = "foo";
	1693	n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
	1694	REGEX_CHECK_STATUS;
	1695	REGEX_ASSERT(n==6);
	1696	REGEX_ASSERT(fields[0]==" ");
	1697	REGEX_ASSERT(fields[1]=="a");
	1698	REGEX_ASSERT(fields[2]=="Now is ");
	1699	REGEX_ASSERT(fields[3]=="b");
	1700	REGEX_ASSERT(fields[4]=="the time");
	1701	REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
	1702	REGEX_ASSERT(fields[6]=="foo");
	1703
	1704	status = U_ZERO_ERROR;
	1705	fields[5] = "foo";
	1706	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
	1707	REGEX_CHECK_STATUS;
	1708	REGEX_ASSERT(n==5);
	1709	REGEX_ASSERT(fields[0]==" ");
	1710	REGEX_ASSERT(fields[1]=="a");
	1711	REGEX_ASSERT(fields[2]=="Now is ");
	1712	REGEX_ASSERT(fields[3]=="b");
	1713	REGEX_ASSERT(fields[4]=="the time<c>");
	1714	REGEX_ASSERT(fields[5]=="foo");
	1715
	1716	status = U_ZERO_ERROR;
	1717	fields[5] = "foo";
	1718	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
	1719	REGEX_CHECK_STATUS;
	1720	REGEX_ASSERT(n==5);
	1721	REGEX_ASSERT(fields[0]==" ");
	1722	REGEX_ASSERT(fields[1]=="a");
	1723	REGEX_ASSERT(fields[2]=="Now is ");
	1724	REGEX_ASSERT(fields[3]=="b");
	1725	REGEX_ASSERT(fields[4]=="the time");
	1726	REGEX_ASSERT(fields[5]=="foo");
	1727
	1728	status = U_ZERO_ERROR;
	1729	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
	1730	REGEX_CHECK_STATUS;
	1731	REGEX_ASSERT(n==4);
	1732	REGEX_ASSERT(fields[0]==" ");
	1733	REGEX_ASSERT(fields[1]=="a");
	1734	REGEX_ASSERT(fields[2]=="Now is ");
	1735	REGEX_ASSERT(fields[3]=="the time<c>");
	1736	status = U_ZERO_ERROR;
	1737	delete pat1;
	1738
	1739	pat1 = RegexPattern::compile("([-,])", pe, status);
	1740	REGEX_CHECK_STATUS;
	1741	n = pat1->split("1-10,20", fields, 10, status);
	1742	REGEX_CHECK_STATUS;
	1743	REGEX_ASSERT(n==5);
	1744	REGEX_ASSERT(fields[0]=="1");
	1745	REGEX_ASSERT(fields[1]=="-");
	1746	REGEX_ASSERT(fields[2]=="10");
	1747	REGEX_ASSERT(fields[3]==",");
	1748	REGEX_ASSERT(fields[4]=="20");
	1749	delete pat1;
	1750
	1751	// Test split of string with empty trailing fields
	1752	pat1 = RegexPattern::compile(",", pe, status);
	1753	REGEX_CHECK_STATUS;
	1754	n = pat1->split("a,b,c,", fields, 10, status);
	1755	REGEX_CHECK_STATUS;
	1756	REGEX_ASSERT(n==4);
	1757	REGEX_ASSERT(fields[0]=="a");
	1758	REGEX_ASSERT(fields[1]=="b");
	1759	REGEX_ASSERT(fields[2]=="c");
	1760	REGEX_ASSERT(fields[3]=="");
	1761
	1762	n = pat1->split("a,,,", fields, 10, status);
	1763	REGEX_CHECK_STATUS;
	1764	REGEX_ASSERT(n==4);
	1765	REGEX_ASSERT(fields[0]=="a");
	1766	REGEX_ASSERT(fields[1]=="");
	1767	REGEX_ASSERT(fields[2]=="");
	1768	REGEX_ASSERT(fields[3]=="");
	1769	delete pat1;
	1770
	1771	// Split Separator with zero length match.
	1772	pat1 = RegexPattern::compile(":?", pe, status);
	1773	REGEX_CHECK_STATUS;
	1774	n = pat1->split("abc", fields, 10, status);
	1775	REGEX_CHECK_STATUS;
	1776	REGEX_ASSERT(n==5);
	1777	REGEX_ASSERT(fields[0]=="");
	1778	REGEX_ASSERT(fields[1]=="a");
	1779	REGEX_ASSERT(fields[2]=="b");
	1780	REGEX_ASSERT(fields[3]=="c");
	1781	REGEX_ASSERT(fields[4]=="");
	1782
	1783	delete pat1;
	1784
	1785	//
	1786	// RegexPattern::pattern()
	1787	// (returns the source text; empty for a default-constructed pattern)
	1788	pat1 = new RegexPattern();
	1789	REGEX_ASSERT(pat1->pattern() == "");
	1790	delete pat1;
	1791
	1792	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
	1793	REGEX_CHECK_STATUS;
	1794	REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
	1795	delete pat1;
	1796
	1797
	1798	//
	1799	// classID functions
	1800	// (dynamic and static IDs must agree per class, be non-NULL, and differ between pattern and matcher)
	1801	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
	1802	REGEX_CHECK_STATUS;
	1803	REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
	1804	REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
	1805	UnicodeString Hello("Hello, world.");
	1806	RegexMatcher *m = pat1->matcher(Hello, status);
	1807	REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
	1808	REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
	1809	REGEX_ASSERT(m->getDynamicClassID() != NULL);
	1810	delete m;
	1811	delete pat1;
	1812
	1813	}
	1814
	1815	//---------------------------------------------------------------------------
	1816	//
	1817	// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
	1818	// is present and working, but excluding functions
	1819	// implementing replace operations.
	1820	//
	1821	//---------------------------------------------------------------------------
	1822	void RegexTest::API_Match_UTF8() {
	1823	UParseError pe;
	1824	UErrorCode status=U_ZERO_ERROR;
	1825	int32_t flags = 0;
	1826
	1827	//
	1828	// Debug - slide failing test cases early
	1829	//
	1830	#if 0
	1831	{
	1832	}
	1833	return;
	1834	#endif
	1835
	1836	//
	1837	// Simple pattern compilation
	1838	//
	1839	{
	1840	UText re = UTEXT_INITIALIZER;
	1841	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
	1842	REGEX_VERBOSE_TEXT(&re);
	1843	RegexPattern *pat2;
	1844	pat2 = RegexPattern::compile(&re, flags, pe, status);
	1845	REGEX_CHECK_STATUS;
	1846
	1847	UText input1 = UTEXT_INITIALIZER;
	1848	UText input2 = UTEXT_INITIALIZER;
	1849	UText empty = UTEXT_INITIALIZER;
	1850	regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
	1851	REGEX_VERBOSE_TEXT(&input1);
	1852	regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
	1853	REGEX_VERBOSE_TEXT(&input2);
	1854	utext_openUChars(&empty, NULL, 0, &status);
	1855
	1856	int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
	1857	int32_t input2Len = strlen("not abc");
	1858
	1859
	1860	//
	1861	// Matcher creation and reset.
	1862	//
	1863	RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
	1864	REGEX_CHECK_STATUS;
	1865	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1866	const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
	1867	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1868	m1->reset(&input2);
	1869	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	1870	const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
	1871	REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
	1872	m1->reset(&input1);
	1873	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1874	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1875	m1->reset(&empty);
	1876	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	1877	REGEX_ASSERT(utext_nativeLength(&empty) == 0);
	1878
	1879	//
	1880	// reset(pos, status)
	1881	//
	1882	m1->reset(&input1);
	1883	m1->reset(4, status);
	1884	REGEX_CHECK_STATUS;
	1885	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1886	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1887
	1888	m1->reset(-1, status);
	1889	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1890	status = U_ZERO_ERROR;
	1891
	1892	m1->reset(0, status);
	1893	REGEX_CHECK_STATUS;
	1894	status = U_ZERO_ERROR;
	1895
	1896	m1->reset(input1Len-1, status);
	1897	REGEX_CHECK_STATUS;
	1898	status = U_ZERO_ERROR;
	1899
	1900	m1->reset(input1Len, status);
	1901	REGEX_CHECK_STATUS;
	1902	status = U_ZERO_ERROR;
	1903
	1904	m1->reset(input1Len+1, status);
	1905	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1906	status = U_ZERO_ERROR;
	1907
	1908	//
	1909	// match(pos, status)
	1910	//
	1911	m1->reset(&input2);
	1912	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	1913	m1->reset();
	1914	REGEX_ASSERT(m1->matches(3, status) == FALSE);
	1915	m1->reset();
	1916	REGEX_ASSERT(m1->matches(5, status) == FALSE);
	1917	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	1918	REGEX_ASSERT(m1->matches(-1, status) == FALSE);
	1919	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1920
	1921	// Match() at end of string should fail, but should not
	1922	// be an error.
	1923	status = U_ZERO_ERROR;
	1924	REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
	1925	REGEX_CHECK_STATUS;
	1926
	1927	// Match beyond end of string should fail with an error.
	1928	status = U_ZERO_ERROR;
	1929	REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
	1930	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1931
	1932	// Successful match at end of string.
	1933	{
	1934	status = U_ZERO_ERROR;
	1935	RegexMatcher m("A?", 0, status); // will match zero length string.
	1936	REGEX_CHECK_STATUS;
	1937	m.reset(&input1);
	1938	REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
	1939	REGEX_CHECK_STATUS;
	1940	m.reset(&empty);
	1941	REGEX_ASSERT(m.matches(0, status) == TRUE);
	1942	REGEX_CHECK_STATUS;
	1943	}
	1944
	1945
	1946	//
	1947	// lookingAt(pos, status)
	1948	//
	1949	status = U_ZERO_ERROR;
	1950	m1->reset(&input2); // "not abc"
	1951	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	1952	REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
	1953	REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
	1954	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	1955	REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
	1956	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1957	status = U_ZERO_ERROR;
	1958	REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
	1959	REGEX_CHECK_STATUS;
	1960	REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
	1961	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1962
	1963	delete m1;
	1964	delete pat2;
	1965
	1966	utext_close(&re);
	1967	utext_close(&input1);
	1968	utext_close(&input2);
	1969	utext_close(&empty);
	1970	}
	1971
	1972
	1973	//
	1974	// Capture Group.
	1975	// RegexMatcher::start();
	1976	// RegexMatcher::end();
	1977	// RegexMatcher::groupCount();
	1978	//
	1979	{
	1980	int32_t flags=0;
	1981	UParseError pe;
	1982	UErrorCode status=U_ZERO_ERROR;
	1983	UText re=UTEXT_INITIALIZER;
	1984	const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
	1985	utext_openUTF8(&re, str_01234567_pat, -1, &status);
	1986
	1987	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	1988	REGEX_CHECK_STATUS;
	1989
	1990	UText input = UTEXT_INITIALIZER;
	1991	const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
	1992	utext_openUTF8(&input, str_0123456789, -1, &status);
	1993
	1994	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	1995	REGEX_CHECK_STATUS;
	1996	REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
	1997	static const int32_t matchStarts[] = {0, 2, 4, 8};
	1998	static const int32_t matchEnds[] = {10, 8, 6, 10};
	1999	int32_t i;
	2000	for (i=0; i<4; i++) {
	2001	int32_t actualStart = matcher->start(i, status);
	2002	REGEX_CHECK_STATUS;
	2003	if (actualStart != matchStarts[i]) {
	2004	errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
	2005	__FILE__, __LINE__, i, matchStarts[i], actualStart);
	2006	}
	2007	int32_t actualEnd = matcher->end(i, status);
	2008	REGEX_CHECK_STATUS;
	2009	if (actualEnd != matchEnds[i]) {
	2010	errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
	2011	__FILE__, __LINE__, i, matchEnds[i], actualEnd);
	2012	}
	2013	}
	2014
	2015	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
	2016	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
	2017
	2018	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2019	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2020	matcher->reset();
	2021	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
	2022
	2023	matcher->lookingAt(status);
	2024
	2025	UnicodeString dest;
	2026	UText destText = UTEXT_INITIALIZER;
	2027	utext_openUnicodeString(&destText, &dest, &status);
	2028	UText *result;
	2029	//const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
	2030	// Test shallow-clone API
	2031	int64_t group_len;
	2032	result = matcher->group((UText *)NULL, group_len, status);
	2033	REGEX_CHECK_STATUS;
	2034	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2035	utext_close(result);
	2036	result = matcher->group(0, &destText, group_len, status);
	2037	REGEX_CHECK_STATUS;
	2038	REGEX_ASSERT(result == &destText);
	2039	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2040	// destText is now immutable, reopen it
	2041	utext_close(&destText);
	2042	utext_openUnicodeString(&destText, &dest, &status);
	2043
	2044	result = matcher->group(0, NULL, status);
	2045	REGEX_CHECK_STATUS;
	2046	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2047	utext_close(result);
	2048	result = matcher->group(0, &destText, status);
	2049	REGEX_CHECK_STATUS;
	2050	REGEX_ASSERT(result == &destText);
	2051	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2052
	2053	result = matcher->group(1, NULL, status);
	2054	REGEX_CHECK_STATUS;
	2055	const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
	2056	REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
	2057	utext_close(result);
	2058	result = matcher->group(1, &destText, status);
	2059	REGEX_CHECK_STATUS;
	2060	REGEX_ASSERT(result == &destText);
	2061	REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
	2062
	2063	result = matcher->group(2, NULL, status);
	2064	REGEX_CHECK_STATUS;
	2065	const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
	2066	REGEX_ASSERT_UTEXT_UTF8(str_45, result);
	2067	utext_close(result);
	2068	result = matcher->group(2, &destText, status);
	2069	REGEX_CHECK_STATUS;
	2070	REGEX_ASSERT(result == &destText);
	2071	REGEX_ASSERT_UTEXT_UTF8(str_45, result);
	2072
	2073	result = matcher->group(3, NULL, status);
	2074	REGEX_CHECK_STATUS;
	2075	const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
	2076	REGEX_ASSERT_UTEXT_UTF8(str_89, result);
	2077	utext_close(result);
	2078	result = matcher->group(3, &destText, status);
	2079	REGEX_CHECK_STATUS;
	2080	REGEX_ASSERT(result == &destText);
	2081	REGEX_ASSERT_UTEXT_UTF8(str_89, result);
	2082
	2083	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2084	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2085	matcher->reset();
	2086	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
	2087
	2088	delete matcher;
	2089	delete pat;
	2090
	2091	utext_close(&destText);
	2092	utext_close(&input);
	2093	utext_close(&re);
	2094	}
	2095
	2096	//
	2097	// find
	2098	//
	2099	{
	2100	int32_t flags=0;
	2101	UParseError pe;
	2102	UErrorCode status=U_ZERO_ERROR;
	2103	UText re=UTEXT_INITIALIZER;
	2104	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2105	utext_openUTF8(&re, str_abc, -1, &status);
	2106
	2107	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2108	REGEX_CHECK_STATUS;
	2109	UText input = UTEXT_INITIALIZER;
	2110	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
	2111	utext_openUTF8(&input, str_abcabcabc, -1, &status);
	2112	// 012345678901234567
	2113
	2114	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	2115	REGEX_CHECK_STATUS;
	2116	REGEX_ASSERT(matcher->find());
	2117	REGEX_ASSERT(matcher->start(status) == 1);
	2118	REGEX_ASSERT(matcher->find());
	2119	REGEX_ASSERT(matcher->start(status) == 6);
	2120	REGEX_ASSERT(matcher->find());
	2121	REGEX_ASSERT(matcher->start(status) == 12);
	2122	REGEX_ASSERT(matcher->find() == FALSE);
	2123	REGEX_ASSERT(matcher->find() == FALSE);
	2124
	2125	matcher->reset();
	2126	REGEX_ASSERT(matcher->find());
	2127	REGEX_ASSERT(matcher->start(status) == 1);
	2128
	2129	REGEX_ASSERT(matcher->find(0, status));
	2130	REGEX_ASSERT(matcher->start(status) == 1);
	2131	REGEX_ASSERT(matcher->find(1, status));
	2132	REGEX_ASSERT(matcher->start(status) == 1);
	2133	REGEX_ASSERT(matcher->find(2, status));
	2134	REGEX_ASSERT(matcher->start(status) == 6);
	2135	REGEX_ASSERT(matcher->find(12, status));
	2136	REGEX_ASSERT(matcher->start(status) == 12);
	2137	REGEX_ASSERT(matcher->find(13, status) == FALSE);
	2138	REGEX_ASSERT(matcher->find(16, status) == FALSE);
	2139	REGEX_ASSERT(matcher->find(17, status) == FALSE);
	2140	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
	2141
	2142	status = U_ZERO_ERROR;
	2143	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2144	status = U_ZERO_ERROR;
	2145	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2146
	2147	REGEX_ASSERT(matcher->groupCount() == 0);
	2148
	2149	delete matcher;
	2150	delete pat;
	2151
	2152	utext_close(&input);
	2153	utext_close(&re);
	2154	}
	2155
	2156
	2157	//
	2158	// find, with \G in pattern (true if at the end of a previous match).
	2159	//
	2160	{
	2161	int32_t flags=0;
	2162	UParseError pe;
	2163	UErrorCode status=U_ZERO_ERROR;
	2164	UText re=UTEXT_INITIALIZER;
	2165	const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
	2166	utext_openUTF8(&re, str_Gabcabc, -1, &status);
	2167
	2168	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2169
	2170	REGEX_CHECK_STATUS;
	2171	UText input = UTEXT_INITIALIZER;
	2172	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
	2173	utext_openUTF8(&input, str_abcabcabc, -1, &status);
	2174	// 012345678901234567
	2175
	2176	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	2177	REGEX_CHECK_STATUS;
	2178	REGEX_ASSERT(matcher->find());
	2179	REGEX_ASSERT(matcher->start(status) == 0);
	2180	REGEX_ASSERT(matcher->start(1, status) == -1);
	2181	REGEX_ASSERT(matcher->start(2, status) == 1);
	2182
	2183	REGEX_ASSERT(matcher->find());
	2184	REGEX_ASSERT(matcher->start(status) == 4);
	2185	REGEX_ASSERT(matcher->start(1, status) == 4);
	2186	REGEX_ASSERT(matcher->start(2, status) == -1);
	2187	REGEX_CHECK_STATUS;
	2188
	2189	delete matcher;
	2190	delete pat;
	2191
	2192	utext_close(&input);
	2193	utext_close(&re);
	2194	}
	2195
	2196	//
	2197	// find with zero length matches, match position should bump ahead
	2198	// to prevent loops.
	2199	//
	2200	{
	2201	int32_t i;
	2202	UErrorCode status=U_ZERO_ERROR;
	2203	RegexMatcher m("(?= ?)", 0, status); // This pattern will produce zero-length matches anywhere,
	2204	// using an always-true look-ahead.
	2205	REGEX_CHECK_STATUS;
	2206	UText s = UTEXT_INITIALIZER;
	2207	utext_openUTF8(&s, "    ", -1, &status); // four spaces: the loop below expects matches at offsets 0..4 (i==5)
	2208	m.reset(&s);
	2209	for (i=0; ; i++) {
	2210	if (m.find() == FALSE) {
	2211	break;
	2212	}
	2213	REGEX_ASSERT(m.start(status) == i);
	2214	REGEX_ASSERT(m.end(status) == i);
	2215	}
	2216	REGEX_ASSERT(i==5);
	2217
	2218	// Check that the bump goes over characters outside the BMP OK
	2219	// "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
	2220	unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
	2221	utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
	2222	m.reset(&s);
	2223	for (i=0; ; i+=4) {
	2224	if (m.find() == FALSE) {
	2225	break;
	2226	}
	2227	REGEX_ASSERT(m.start(status) == i);
	2228	REGEX_ASSERT(m.end(status) == i);
	2229	}
	2230	REGEX_ASSERT(i==20);
	2231
	2232	utext_close(&s);
	2233	}
	2234	{
	2235	// find() loop breaking test.
	2236	// with pattern of /.?/, should see a series of one char matches, then a single
	2237	// match of zero length at the end of the input string.
	2238	int32_t i;
	2239	UErrorCode status=U_ZERO_ERROR;
	2240	RegexMatcher m(".?", 0, status);
	2241	REGEX_CHECK_STATUS;
	2242	UText s = UTEXT_INITIALIZER;
	2243	utext_openUTF8(&s, "    ", -1, &status); // four spaces: four one-char matches, then an empty match at offset 4
	2244	m.reset(&s);
	2245	for (i=0; ; i++) {
	2246	if (m.find() == FALSE) {
	2247	break;
	2248	}
	2249	REGEX_ASSERT(m.start(status) == i);
	2250	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
	2251	}
	2252	REGEX_ASSERT(i==5);
	2253
	2254	utext_close(&s);
	2255	}
	2256
	2257
	2258	//
	2259	// Matchers with no input string behave as if they had an empty input string.
	2260	//
	2261
	2262	{
	2263	UErrorCode status = U_ZERO_ERROR;
	2264	RegexMatcher m(".?", 0, status);
	2265	REGEX_CHECK_STATUS;
	2266	REGEX_ASSERT(m.find());
	2267	REGEX_ASSERT(m.start(status) == 0);
	2268	REGEX_ASSERT(m.input() == "");
	2269	}
	2270	{
	2271	UErrorCode status = U_ZERO_ERROR;
	2272	RegexPattern *p = RegexPattern::compile(".", 0, status);
	2273	RegexMatcher *m = p->matcher(status);
	2274	REGEX_CHECK_STATUS;
	2275
	2276	REGEX_ASSERT(m->find() == FALSE);
	2277	REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
	2278	delete m;
	2279	delete p;
	2280	}
	2281
	2282	//
	2283	// Regions
	2284	//
	2285	{
	2286	UErrorCode status = U_ZERO_ERROR;
	2287	UText testPattern = UTEXT_INITIALIZER;
	2288	UText testText = UTEXT_INITIALIZER;
	2289	regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
	2290	REGEX_VERBOSE_TEXT(&testPattern);
	2291	regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
	2292	REGEX_VERBOSE_TEXT(&testText);
	2293
	2294	RegexMatcher m(&testPattern, &testText, 0, status);
	2295	REGEX_CHECK_STATUS;
	2296	REGEX_ASSERT(m.regionStart() == 0);
	2297	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
	2298	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2299	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2300
	2301	m.region(2,4, status);
	2302	REGEX_CHECK_STATUS;
	2303	REGEX_ASSERT(m.matches(status));
	2304	REGEX_ASSERT(m.start(status)==2);
	2305	REGEX_ASSERT(m.end(status)==4);
	2306	REGEX_CHECK_STATUS;
	2307
	2308	m.reset();
	2309	REGEX_ASSERT(m.regionStart() == 0);
	2310	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
	2311
	2312	regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
	2313	REGEX_VERBOSE_TEXT(&testText);
	2314	m.reset(&testText);
	2315	REGEX_ASSERT(m.regionStart() == 0);
	2316	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
	2317
	2318	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2319	REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
	2320	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	2321	REGEX_ASSERT(&m == &m.reset());
	2322	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	2323
	2324	REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
	2325	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2326	REGEX_ASSERT(&m == &m.reset());
	2327	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2328
	2329	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2330	REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
	2331	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	2332	REGEX_ASSERT(&m == &m.reset());
	2333	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	2334
	2335	REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
	2336	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2337	REGEX_ASSERT(&m == &m.reset());
	2338	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2339
	2340	utext_close(&testText);
	2341	utext_close(&testPattern);
	2342	}
	2343
	2344	//
	2345	// hitEnd() and requireEnd()
	2346	//
	2347	{
	2348	UErrorCode status = U_ZERO_ERROR;
	2349	UText testPattern = UTEXT_INITIALIZER;
	2350	UText testText = UTEXT_INITIALIZER;
	2351	const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
	2352	const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
	2353	utext_openUTF8(&testPattern, str_, -1, &status);
	2354	utext_openUTF8(&testText, str_aabb, -1, &status);
	2355
	2356	RegexMatcher m1(&testPattern, &testText, 0, status);
	2357	REGEX_ASSERT(m1.lookingAt(status) == TRUE);
	2358	REGEX_ASSERT(m1.hitEnd() == TRUE);
	2359	REGEX_ASSERT(m1.requireEnd() == FALSE);
	2360	REGEX_CHECK_STATUS;
	2361
	2362	status = U_ZERO_ERROR;
	2363	const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
	2364	utext_openUTF8(&testPattern, str_a, -1, &status);
	2365	RegexMatcher m2(&testPattern, &testText, 0, status);
	2366	REGEX_ASSERT(m2.lookingAt(status) == TRUE);
	2367	REGEX_ASSERT(m2.hitEnd() == FALSE);
	2368	REGEX_ASSERT(m2.requireEnd() == FALSE);
	2369	REGEX_CHECK_STATUS;
	2370
	2371	status = U_ZERO_ERROR;
	2372	const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
	2373	utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
	2374	RegexMatcher m3(&testPattern, &testText, 0, status);
	2375	REGEX_ASSERT(m3.lookingAt(status) == TRUE);
	2376	REGEX_ASSERT(m3.hitEnd() == TRUE);
	2377	REGEX_ASSERT(m3.requireEnd() == TRUE);
	2378	REGEX_CHECK_STATUS;
	2379
	2380	utext_close(&testText);
	2381	utext_close(&testPattern);
	2382	}
	2383	}
	2384
	2385
	2386	//---------------------------------------------------------------------------
	2387	//
	2388	// API_Replace_UTF8 API test for class RegexMatcher, testing the
	2389	// Replace family of functions.
	2390	//
	2391	//---------------------------------------------------------------------------
	2392	void RegexTest::API_Replace_UTF8() {
	2393	//
	2394	// Replace
	2395	//
	2396	int32_t flags=0;
	2397	UParseError pe;
	2398	UErrorCode status=U_ZERO_ERROR;
	2399
	2400	UText re=UTEXT_INITIALIZER;
	2401	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
	2402	REGEX_VERBOSE_TEXT(&re);
	2403	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2404	REGEX_CHECK_STATUS;
	2405
	2406	char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
	2407	// 012345678901234567
	2408	UText dataText = UTEXT_INITIALIZER;
	2409	utext_openUTF8(&dataText, data, -1, &status);
	2410	REGEX_CHECK_STATUS;
	2411	REGEX_VERBOSE_TEXT(&dataText);
	2412	RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
	2413
	2414	//
	2415	// Plain vanilla matches.
	2416	//
	2417	UnicodeString dest;
	2418	UText destText = UTEXT_INITIALIZER;
	2419	utext_openUnicodeString(&destText, &dest, &status);
	2420	UText *result;
	2421
	2422	UText replText = UTEXT_INITIALIZER;
	2423
	2424	const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
	2425	utext_openUTF8(&replText, str_yz, -1, &status);
	2426	REGEX_VERBOSE_TEXT(&replText);
	2427	result = matcher->replaceFirst(&replText, NULL, status);
	2428	REGEX_CHECK_STATUS;
	2429	const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
	2430	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
	2431	utext_close(result);
	2432	result = matcher->replaceFirst(&replText, &destText, status);
	2433	REGEX_CHECK_STATUS;
	2434	REGEX_ASSERT(result == &destText);
	2435	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
	2436
	2437	result = matcher->replaceAll(&replText, NULL, status);
	2438	REGEX_CHECK_STATUS;
	2439	const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
	2440	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
	2441	utext_close(result);
	2442
	2443	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2444	result = matcher->replaceAll(&replText, &destText, status);
	2445	REGEX_CHECK_STATUS;
	2446	REGEX_ASSERT(result == &destText);
	2447	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
	2448
	2449	//
	2450	// Plain vanilla non-matches.
	2451	//
	2452	const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
	2453	utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
	2454	matcher->reset(&dataText);
	2455
	2456	result = matcher->replaceFirst(&replText, NULL, status);
	2457	REGEX_CHECK_STATUS;
	2458	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2459	utext_close(result);
	2460	result = matcher->replaceFirst(&replText, &destText, status);
	2461	REGEX_CHECK_STATUS;
	2462	REGEX_ASSERT(result == &destText);
	2463	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2464
	2465	result = matcher->replaceAll(&replText, NULL, status);
	2466	REGEX_CHECK_STATUS;
	2467	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2468	utext_close(result);
	2469	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2470	result = matcher->replaceAll(&replText, &destText, status);
	2471	REGEX_CHECK_STATUS;
	2472	REGEX_ASSERT(result == &destText);
	2473	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2474
	2475	//
	2476	// Empty source string
	2477	//
	2478	utext_openUTF8(&dataText, NULL, 0, &status);
	2479	matcher->reset(&dataText);
	2480
	2481	result = matcher->replaceFirst(&replText, NULL, status);
	2482	REGEX_CHECK_STATUS;
	2483	REGEX_ASSERT_UTEXT_UTF8("", result);
	2484	utext_close(result);
	2485	result = matcher->replaceFirst(&replText, &destText, status);
	2486	REGEX_CHECK_STATUS;
	2487	REGEX_ASSERT(result == &destText);
	2488	REGEX_ASSERT_UTEXT_UTF8("", result);
	2489
	2490	result = matcher->replaceAll(&replText, NULL, status);
	2491	REGEX_CHECK_STATUS;
	2492	REGEX_ASSERT_UTEXT_UTF8("", result);
	2493	utext_close(result);
	2494	result = matcher->replaceAll(&replText, &destText, status);
	2495	REGEX_CHECK_STATUS;
	2496	REGEX_ASSERT(result == &destText);
	2497	REGEX_ASSERT_UTEXT_UTF8("", result);
	2498
	2499	//
	2500	// Empty substitution string
	2501	//
	2502	utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
	2503	matcher->reset(&dataText);
	2504
	2505	utext_openUTF8(&replText, NULL, 0, &status);
	2506	result = matcher->replaceFirst(&replText, NULL, status);
	2507	REGEX_CHECK_STATUS;
	2508	const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
	2509	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
	2510	utext_close(result);
	2511	result = matcher->replaceFirst(&replText, &destText, status);
	2512	REGEX_CHECK_STATUS;
	2513	REGEX_ASSERT(result == &destText);
	2514	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
	2515
	2516	result = matcher->replaceAll(&replText, NULL, status);
	2517	REGEX_CHECK_STATUS;
	2518	const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
	2519	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
	2520	utext_close(result);
	2521	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2522	result = matcher->replaceAll(&replText, &destText, status);
	2523	REGEX_CHECK_STATUS;
	2524	REGEX_ASSERT(result == &destText);
	2525	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
	2526
	2527	//
	2528	// match whole string
	2529	//
	2530	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2531	utext_openUTF8(&dataText, str_abc, -1, &status);
	2532	matcher->reset(&dataText);
	2533
	2534	const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
	2535	utext_openUTF8(&replText, str_xyz, -1, &status);
	2536	result = matcher->replaceFirst(&replText, NULL, status);
	2537	REGEX_CHECK_STATUS;
	2538	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2539	utext_close(result);
	2540	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2541	result = matcher->replaceFirst(&replText, &destText, status);
	2542	REGEX_CHECK_STATUS;
	2543	REGEX_ASSERT(result == &destText);
	2544	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2545
	2546	result = matcher->replaceAll(&replText, NULL, status);
	2547	REGEX_CHECK_STATUS;
	2548	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2549	utext_close(result);
	2550	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2551	result = matcher->replaceAll(&replText, &destText, status);
	2552	REGEX_CHECK_STATUS;
	2553	REGEX_ASSERT(result == &destText);
	2554	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2555
	2556	//
	2557	// Capture Group, simple case
	2558	//
	2559	const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
	2560	utext_openUTF8(&re, str_add, -1, &status);
	2561	RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
	2562	REGEX_CHECK_STATUS;
	2563
	2564	const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
	2565	utext_openUTF8(&dataText, str_abcdefg, -1, &status);
	2566	RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
	2567	REGEX_CHECK_STATUS;
	2568
	2569	const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
	2570	utext_openUTF8(&replText, str_11, -1, &status);
	2571	result = matcher2->replaceFirst(&replText, NULL, status);
	2572	REGEX_CHECK_STATUS;
	2573	const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
	2574	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
	2575	utext_close(result);
	2576	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2577	result = matcher2->replaceFirst(&replText, &destText, status);
	2578	REGEX_CHECK_STATUS;
	2579	REGEX_ASSERT(result == &destText);
	2580	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
	2581
	2582	const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
	2583	utext_openUTF8(&replText, str_v, -1, &status);
	2584	REGEX_VERBOSE_TEXT(&replText);
	2585	result = matcher2->replaceFirst(&replText, NULL, status);
	2586	REGEX_CHECK_STATUS;
	2587	const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
	2588	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
	2589	utext_close(result);
	2590	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2591	result = matcher2->replaceFirst(&replText, &destText, status);
	2592	REGEX_CHECK_STATUS;
	2593	REGEX_ASSERT(result == &destText);
	2594	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
	2595
	2596	const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
	2597	utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
	2598	result = matcher2->replaceFirst(&replText, NULL, status);
	2599	REGEX_CHECK_STATUS;
	2600	const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
	2601	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
	2602	utext_close(result);
	2603	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2604	result = matcher2->replaceFirst(&replText, &destText, status);
	2605	REGEX_CHECK_STATUS;
	2606	REGEX_ASSERT(result == &destText);
	2607	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
	2608
	2609	unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
	2610	//unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
	2611	// 012345678901234567890123456
	2612	supplDigitChars[22] = 0xF0;
	2613	supplDigitChars[23] = 0x9D;
	2614	supplDigitChars[24] = 0x9F;
	2615	supplDigitChars[25] = 0x8F;
	2616	utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
	2617
	2618	result = matcher2->replaceFirst(&replText, NULL, status);
	2619	REGEX_CHECK_STATUS;
	2620	const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
	2621	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
	2622	utext_close(result);
	2623	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2624	result = matcher2->replaceFirst(&replText, &destText, status);
	2625	REGEX_CHECK_STATUS;
	2626	REGEX_ASSERT(result == &destText);
	2627	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
	2628	const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
	2629	utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
	2630	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
	2631	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
	2632	utext_close(result);
	2633	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2634	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
	2635	REGEX_ASSERT(result == &destText);
	2636	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
	2637
	2638	//
	2639	// Replacement String with \u hex escapes
	2640	//
	2641	{
	2642	const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
	2643	const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
	2644	utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
	2645	utext_openUTF8(&replText, str_u0043, -1, &status);
	2646	matcher->reset(&dataText);
	2647
	2648	result = matcher->replaceAll(&replText, NULL, status);
	2649	REGEX_CHECK_STATUS;
	2650	const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
	2651	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
	2652	utext_close(result);
	2653	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2654	result = matcher->replaceAll(&replText, &destText, status);
	2655	REGEX_CHECK_STATUS;
	2656	REGEX_ASSERT(result == &destText);
	2657	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
	2658	}
	2659	{
	2660	const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
	2661	utext_openUTF8(&dataText, str_abc, -1, &status);
	2662	const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
	2663	utext_openUTF8(&replText, str_U00010000, -1, &status);
	2664	matcher->reset(&dataText);
	2665
	2666	unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
	2667	// 0123456789
	2668	expected[2] = 0xF0;
	2669	expected[3] = 0x90;
	2670	expected[4] = 0x80;
	2671	expected[5] = 0x80;
	2672
	2673	result = matcher->replaceAll(&replText, NULL, status);
	2674	REGEX_CHECK_STATUS;
	2675	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
	2676	utext_close(result);
	2677	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2678	result = matcher->replaceAll(&replText, &destText, status);
	2679	REGEX_CHECK_STATUS;
	2680	REGEX_ASSERT(result == &destText);
	2681	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
	2682	}
	2683	// TODO: need more thorough testing of capture substitutions.
	2684
	2685	// Bug 4057
	2686	//
	2687	{
	2688	status = U_ZERO_ERROR;
	2689	const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.?)ee /
	2690	const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
	2691	const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
	2692	utext_openUTF8(&re, str_ssee, -1, &status);
	2693	utext_openUTF8(&dataText, str_blah, -1, &status);
	2694	utext_openUTF8(&replText, str_ooh, -1, &status);
	2695
	2696	RegexMatcher m(&re, 0, status);
	2697	REGEX_CHECK_STATUS;
	2698
	2699	UnicodeString result;
	2700	UText resultText = UTEXT_INITIALIZER;
	2701	utext_openUnicodeString(&resultText, &result, &status);
	2702
	2703	// Multiple finds do NOT bump up the previous appendReplacement position.
	2704	m.reset(&dataText);
	2705	m.find();
	2706	m.find();
	2707	m.appendReplacement(&resultText, &replText, status);
	2708	REGEX_CHECK_STATUS;
	2709	const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2710	REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
	2711
	2712	// After a reset into the interior of a string, appendReplacement still starts at beginning.
	2713	status = U_ZERO_ERROR;
	2714	result.truncate(0);
	2715	utext_openUnicodeString(&resultText, &result, &status);
	2716	m.reset(10, status);
	2717	m.find();
	2718	m.find();
	2719	m.appendReplacement(&resultText, &replText, status);
	2720	REGEX_CHECK_STATUS;
	2721	const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2722	REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
	2723
	2724	// find() at interior of string, appendReplacement still starts at beginning.
	2725	status = U_ZERO_ERROR;
	2726	result.truncate(0);
	2727	utext_openUnicodeString(&resultText, &result, &status);
	2728	m.reset();
	2729	m.find(10, status);
	2730	m.find();
	2731	m.appendReplacement(&resultText, &replText, status);
	2732	REGEX_CHECK_STATUS;
	2733	const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2734	REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
	2735
	2736	m.appendTail(&resultText, status);
	2737	const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
	2738	REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
	2739
	2740	utext_close(&resultText);
	2741	}
	2742
	2743	delete matcher2;
	2744	delete pat2;
	2745	delete matcher;
	2746	delete pat;
	2747
	2748	utext_close(&dataText);
	2749	utext_close(&replText);
	2750	utext_close(&destText);
	2751	utext_close(&re);
	2752	}
	2753
	2754
	2755	//---------------------------------------------------------------------------
	2756	//
	2757	// API_Pattern_UTF8 Test that the API for class RegexPattern is
	2758	// present and nominally working.
	2759	//
	2760	//---------------------------------------------------------------------------
	2761	void RegexTest::API_Pattern_UTF8() {
	2762	RegexPattern pata; // Test default constructor to not crash.
	2763	RegexPattern patb;
	2764
	2765	REGEX_ASSERT(pata == patb);
	2766	REGEX_ASSERT(pata == pata);
	2767
	2768	UText re1 = UTEXT_INITIALIZER;
	2769	UText re2 = UTEXT_INITIALIZER;
	2770	UErrorCode status = U_ZERO_ERROR;
	2771	UParseError pe;
	2772
	2773	const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
	2774	const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
	2775	utext_openUTF8(&re1, str_abcalmz, -1, &status);
	2776	utext_openUTF8(&re2, str_def, -1, &status);
	2777
	2778	RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
	2779	RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
	2780	REGEX_CHECK_STATUS;
	2781	REGEX_ASSERT(pat1 == pat1);
	2782	REGEX_ASSERT(*pat1 != pata);
	2783
	2784	// Assign
	2785	patb = *pat1;
	2786	REGEX_ASSERT(patb == *pat1);
	2787
	2788	// Copy Construct
	2789	RegexPattern patc(*pat1);
	2790	REGEX_ASSERT(patc == *pat1);
	2791	REGEX_ASSERT(patb == patc);
	2792	REGEX_ASSERT(pat1 != pat2);
	2793	patb = *pat2;
	2794	REGEX_ASSERT(patb != patc);
	2795	REGEX_ASSERT(patb == *pat2);
	2796
	2797	// Compile with no flags.
	2798	RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
	2799	REGEX_ASSERT(pat1a == pat1);
	2800
	2801	REGEX_ASSERT(pat1a->flags() == 0);
	2802
	2803	// Compile with different flags should be not equal
	2804	RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
	2805	REGEX_CHECK_STATUS;
	2806
	2807	REGEX_ASSERT(pat1b != pat1a);
	2808	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
	2809	REGEX_ASSERT(pat1a->flags() == 0);
	2810	delete pat1b;
	2811
	2812	// clone
	2813	RegexPattern *pat1c = pat1->clone();
	2814	REGEX_ASSERT(pat1c == pat1);
	2815	REGEX_ASSERT(pat1c != pat2);
	2816
	2817	delete pat1c;
	2818	delete pat1a;
	2819	delete pat1;
	2820	delete pat2;
	2821
	2822	utext_close(&re1);
	2823	utext_close(&re2);
	2824
	2825
	2826	//
	2827	// Verify that a matcher created from a cloned pattern works.
	2828	// (Jitterbug 3423)
	2829	//
	2830	{
	2831	UErrorCode status = U_ZERO_ERROR;
	2832	UText pattern = UTEXT_INITIALIZER;
	2833	const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
	2834	utext_openUTF8(&pattern, str_pL, -1, &status);
	2835
	2836	RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
	2837	RegexPattern *pClone = pSource->clone();
	2838	delete pSource;
	2839	RegexMatcher *mFromClone = pClone->matcher(status);
	2840	REGEX_CHECK_STATUS;
	2841
	2842	UText input = UTEXT_INITIALIZER;
	2843	const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
	2844	utext_openUTF8(&input, str_HelloWorld, -1, &status);
	2845	mFromClone->reset(&input);
	2846	REGEX_ASSERT(mFromClone->find() == TRUE);
	2847	REGEX_ASSERT(mFromClone->group(status) == "Hello");
	2848	REGEX_ASSERT(mFromClone->find() == TRUE);
	2849	REGEX_ASSERT(mFromClone->group(status) == "World");
	2850	REGEX_ASSERT(mFromClone->find() == FALSE);
	2851	delete mFromClone;
	2852	delete pClone;
	2853
	2854	utext_close(&input);
	2855	utext_close(&pattern);
	2856	}
	2857
	2858	//
	2859	// matches convenience API
	2860	//
	2861	{
	2862	UErrorCode status = U_ZERO_ERROR;
	2863	UText pattern = UTEXT_INITIALIZER;
	2864	UText input = UTEXT_INITIALIZER;
	2865
	2866	const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
	2867	utext_openUTF8(&input, str_randominput, -1, &status);
	2868
	2869	const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
	2870	utext_openUTF8(&pattern, str_dotstar, -1, &status);
	2871	REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
	2872	REGEX_CHECK_STATUS;
	2873
	2874	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2875	utext_openUTF8(&pattern, str_abc, -1, &status);
	2876	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
	2877	REGEX_CHECK_STATUS;
	2878
	2879	const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .nput /
	2880	utext_openUTF8(&pattern, str_nput, -1, &status);
	2881	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
	2882	REGEX_CHECK_STATUS;
	2883
	2884	utext_openUTF8(&pattern, str_randominput, -1, &status);
	2885	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
	2886	REGEX_CHECK_STATUS;
	2887
	2888	const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .u /
	2889	utext_openUTF8(&pattern, str_u, -1, &status);
	2890	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
	2891	REGEX_CHECK_STATUS;
	2892
	2893	utext_openUTF8(&input, str_abc, -1, &status);
	2894	utext_openUTF8(&pattern, str_abc, -1, &status);
	2895	status = U_INDEX_OUTOFBOUNDS_ERROR;
	2896	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
	2897	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	2898
	2899	utext_close(&input);
	2900	utext_close(&pattern);
	2901	}
	2902
	2903
	2904	//
	2905	// Split()
	2906	//
	2907	status = U_ZERO_ERROR;
	2908	const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
	2909	utext_openUTF8(&re1, str_spaceplus, -1, &status);
	2910	pat1 = RegexPattern::compile(&re1, pe, status);
	2911	REGEX_CHECK_STATUS;
	2912	UnicodeString fields[10];
	2913
	2914	int32_t n;
	2915	n = pat1->split("Now is the time", fields, 10, status);
	2916	REGEX_CHECK_STATUS;
	2917	REGEX_ASSERT(n==4);
	2918	REGEX_ASSERT(fields[0]=="Now");
	2919	REGEX_ASSERT(fields[1]=="is");
	2920	REGEX_ASSERT(fields[2]=="the");
	2921	REGEX_ASSERT(fields[3]=="time");
	2922	REGEX_ASSERT(fields[4]=="");
	2923
	2924	n = pat1->split("Now is the time", fields, 2, status);
	2925	REGEX_CHECK_STATUS;
	2926	REGEX_ASSERT(n==2);
	2927	REGEX_ASSERT(fields[0]=="Now");
	2928	REGEX_ASSERT(fields[1]=="is the time");
	2929	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
	2930
	2931	fields[1] = "*";
	2932	status = U_ZERO_ERROR;
	2933	n = pat1->split("Now is the time", fields, 1, status);
	2934	REGEX_CHECK_STATUS;
	2935	REGEX_ASSERT(n==1);
	2936	REGEX_ASSERT(fields[0]=="Now is the time");
	2937	REGEX_ASSERT(fields[1]=="*");
	2938	status = U_ZERO_ERROR;
	2939
	2940	n = pat1->split(" Now is the time ", fields, 10, status);
	2941	REGEX_CHECK_STATUS;
	2942	REGEX_ASSERT(n==6);
	2943	REGEX_ASSERT(fields[0]=="");
	2944	REGEX_ASSERT(fields[1]=="Now");
	2945	REGEX_ASSERT(fields[2]=="is");
	2946	REGEX_ASSERT(fields[3]=="the");
	2947	REGEX_ASSERT(fields[4]=="time");
	2948	REGEX_ASSERT(fields[5]=="");
	2949	REGEX_ASSERT(fields[6]=="");
	2950
	2951	fields[2] = "*";
	2952	n = pat1->split(" ", fields, 10, status);
	2953	REGEX_CHECK_STATUS;
	2954	REGEX_ASSERT(n==2);
	2955	REGEX_ASSERT(fields[0]=="");
	2956	REGEX_ASSERT(fields[1]=="");
	2957	REGEX_ASSERT(fields[2]=="*");
	2958
	2959	fields[0] = "foo";
	2960	n = pat1->split("", fields, 10, status);
	2961	REGEX_CHECK_STATUS;
	2962	REGEX_ASSERT(n==0);
	2963	REGEX_ASSERT(fields[0]=="foo");
	2964
	2965	delete pat1;
	2966
	2967	// split, with a pattern with (capture)
	2968	regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
	2969	pat1 = RegexPattern::compile(&re1, pe, status);
	2970	REGEX_CHECK_STATUS;
	2971
	2972	status = U_ZERO_ERROR;
	2973	fields[6] = fields[7] = "*";
	2974	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
	2975	REGEX_CHECK_STATUS;
	2976	REGEX_ASSERT(n==7);
	2977	REGEX_ASSERT(fields[0]=="");
	2978	REGEX_ASSERT(fields[1]=="a");
	2979	REGEX_ASSERT(fields[2]=="Now is ");
	2980	REGEX_ASSERT(fields[3]=="b");
	2981	REGEX_ASSERT(fields[4]=="the time");
	2982	REGEX_ASSERT(fields[5]=="c");
	2983	REGEX_ASSERT(fields[6]=="");
	2984	REGEX_ASSERT(fields[7]=="*");
	2985	REGEX_ASSERT(status==U_ZERO_ERROR);
	2986
	2987	fields[6] = fields[7] = "*";
	2988	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
	2989	REGEX_CHECK_STATUS;
	2990	REGEX_ASSERT(n==7);
	2991	REGEX_ASSERT(fields[0]==" ");
	2992	REGEX_ASSERT(fields[1]=="a");
	2993	REGEX_ASSERT(fields[2]=="Now is ");
	2994	REGEX_ASSERT(fields[3]=="b");
	2995	REGEX_ASSERT(fields[4]=="the time");
	2996	REGEX_ASSERT(fields[5]=="c");
	2997	REGEX_ASSERT(fields[6]=="");
	2998	REGEX_ASSERT(fields[7]=="*");
	2999
	3000	status = U_ZERO_ERROR;
	3001	fields[6] = "foo";
	3002	n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
	3003	REGEX_CHECK_STATUS;
	3004	REGEX_ASSERT(n==6);
	3005	REGEX_ASSERT(fields[0]==" ");
	3006	REGEX_ASSERT(fields[1]=="a");
	3007	REGEX_ASSERT(fields[2]=="Now is ");
	3008	REGEX_ASSERT(fields[3]=="b");
	3009	REGEX_ASSERT(fields[4]=="the time");
	3010	REGEX_ASSERT(fields[5]==" ");
	3011	REGEX_ASSERT(fields[6]=="foo");
	3012
	3013	status = U_ZERO_ERROR;
	3014	fields[5] = "foo";
	3015	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
	3016	REGEX_CHECK_STATUS;
	3017	REGEX_ASSERT(n==5);
	3018	REGEX_ASSERT(fields[0]==" ");
	3019	REGEX_ASSERT(fields[1]=="a");
	3020	REGEX_ASSERT(fields[2]=="Now is ");
	3021	REGEX_ASSERT(fields[3]=="b");
	3022	REGEX_ASSERT(fields[4]=="the time<c>");
	3023	REGEX_ASSERT(fields[5]=="foo");
	3024
	3025	status = U_ZERO_ERROR;
	3026	fields[5] = "foo";
	3027	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
	3028	REGEX_CHECK_STATUS;
	3029	REGEX_ASSERT(n==5);
	3030	REGEX_ASSERT(fields[0]==" ");
	3031	REGEX_ASSERT(fields[1]=="a");
	3032	REGEX_ASSERT(fields[2]=="Now is ");
	3033	REGEX_ASSERT(fields[3]=="b");
	3034	REGEX_ASSERT(fields[4]=="the time");
	3035	REGEX_ASSERT(fields[5]=="foo");
	3036
	3037	status = U_ZERO_ERROR;
	3038	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
	3039	REGEX_CHECK_STATUS;
	3040	REGEX_ASSERT(n==4);
	3041	REGEX_ASSERT(fields[0]==" ");
	3042	REGEX_ASSERT(fields[1]=="a");
	3043	REGEX_ASSERT(fields[2]=="Now is ");
	3044	REGEX_ASSERT(fields[3]=="the time<c>");
	3045	status = U_ZERO_ERROR;
	3046	delete pat1;
	3047
	3048	regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
	3049	pat1 = RegexPattern::compile(&re1, pe, status);
	3050	REGEX_CHECK_STATUS;
	3051	n = pat1->split("1-10,20", fields, 10, status);
	3052	REGEX_CHECK_STATUS;
	3053	REGEX_ASSERT(n==5);
	3054	REGEX_ASSERT(fields[0]=="1");
	3055	REGEX_ASSERT(fields[1]=="-");
	3056	REGEX_ASSERT(fields[2]=="10");
	3057	REGEX_ASSERT(fields[3]==",");
	3058	REGEX_ASSERT(fields[4]=="20");
	3059	delete pat1;
	3060
	3061
	3062	//
	3063	// RegexPattern::pattern() and patternText()
	3064	//
	3065	pat1 = new RegexPattern();
	3066	REGEX_ASSERT(pat1->pattern() == "");
	3067	REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
	3068	delete pat1;
	3069	const char helloWorldInvariant = "(Hello, world)";
	3070	regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
	3071	pat1 = RegexPattern::compile(&re1, pe, status);
	3072	REGEX_CHECK_STATUS;
	3073	REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
	3074	REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
	3075	delete pat1;
	3076
	3077	utext_close(&re1);
	3078	}
	3079
	3080
	3081	//---------------------------------------------------------------------------
	3082	//
	3083	// Extended A more thorough check for features of regex patterns
	3084	// The test cases are in a separate data file,
	3085	// source/test/testdata/regextst.txt
	3086	// A description of the test data format is included in that file.
	3087	//
	3088	//---------------------------------------------------------------------------
	3089
	3090	const char *
	3091	RegexTest::getPath(char buffer[2048], const char *filename) {
	3092	UErrorCode status=U_ZERO_ERROR;
	3093	const char *testDataDirectory = IntlTest::getSourceTestData(status);
	3094	if (U_FAILURE(status)) {
	3095	errln("ERROR: loadTestData() failed - %s", u_errorName(status));
	3096	return NULL;
	3097	}
	3098
	3099	strcpy(buffer, testDataDirectory);
	3100	strcat(buffer, filename);
	3101	return buffer;
	3102	}
	3103
	3104	void RegexTest::Extended() {
	3105	char tdd[2048];
	3106	const char *srcPath;
	3107	UErrorCode status = U_ZERO_ERROR;
	3108	int32_t lineNum = 0;
	3109
	3110	//
	3111	// Open and read the test data file.
	3112	//
	3113	srcPath=getPath(tdd, "regextst.txt");
	3114	if(srcPath==NULL) {
	3115	return; /* something went wrong, error already output */
	3116	}
	3117
	3118	int32_t len;
	3119	UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
	3120	if (U_FAILURE(status)) {
	3121	return; /* something went wrong, error already output */
	3122	}
	3123
	3124	//
	3125	// Put the test data into a UnicodeString
	3126	//
	3127	UnicodeString testString(FALSE, testData, len);
	3128
	3129	RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s([\\'\\\"/])(.?)\\1"), 0, status);
	3130	RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s(#.)?$"), 0, status);
	3131	RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s([ixsmdteDEGLMQvabtyYzZ2-9])([:letter:]*)"), 0, status);
	3132
	3133	RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
	3134	UnicodeString testPattern; // The pattern for test from the test file.
	3135	UnicodeString testFlags; // the flags for a test.
	3136	UnicodeString matchString; // The marked up string to be used as input
	3137
	3138	if (U_FAILURE(status)){
	3139	dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
	3140	delete [] testData;
	3141	return;
	3142	}
	3143
	3144	//
	3145	// Loop over the test data file, once per line.
	3146	//
	3147	while (lineMat.find()) {
	3148	lineNum++;
	3149	if (U_FAILURE(status)) {
	3150	errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
	3151	}
	3152
	3153	status = U_ZERO_ERROR;
	3154	UnicodeString testLine = lineMat.group(1, status);
	3155	if (testLine.length() == 0) {
	3156	continue;
	3157	}
	3158
	3159	//
	3160	// Parse the test line. Skip blank and comment only lines.
	3161	// Separate out the three main fields - pattern, flags, target.
	3162	//
	3163
	3164	commentMat.reset(testLine);
	3165	if (commentMat.lookingAt(status)) {
	3166	// This line is a comment, or blank.
	3167	continue;
	3168	}
	3169
	3170	//
	3171	// Pull out the pattern field, remove it from the test file line.
	3172	//
	3173	quotedStuffMat.reset(testLine);
	3174	if (quotedStuffMat.lookingAt(status)) {
	3175	testPattern = quotedStuffMat.group(2, status);
	3176	testLine.remove(0, quotedStuffMat.end(0, status));
	3177	} else {
	3178	errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
	3179	continue;
	3180	}
	3181
	3182
	3183	//
	3184	// Pull out the flags from the test file line.
	3185	//
	3186	flagsMat.reset(testLine);
	3187	flagsMat.lookingAt(status); // Will always match, possibly an empty string.
	3188	testFlags = flagsMat.group(1, status);
	3189	if (flagsMat.group(2, status).length() > 0) {
	3190	errln("Bad Match flag at line %d. Scanning %c\n",
	3191	lineNum, flagsMat.group(2, status).charAt(0));
	3192	continue;
	3193	}
	3194	testLine.remove(0, flagsMat.end(0, status));
	3195
	3196	//
	3197	// Pull out the match string, as a whole.
	3198	// We'll process the <tags> later.
	3199	//
	3200	quotedStuffMat.reset(testLine);
	3201	if (quotedStuffMat.lookingAt(status)) {
	3202	matchString = quotedStuffMat.group(2, status);
	3203	testLine.remove(0, quotedStuffMat.end(0, status));
	3204	} else {
	3205	errln("Bad match string at test file line %d", lineNum);
	3206	continue;
	3207	}
	3208
	3209	//
	3210	// The only thing left from the input line should be an optional trailing comment.
	3211	//
	3212	commentMat.reset(testLine);
	3213	if (commentMat.lookingAt(status) == FALSE) {
	3214	errln("Line %d: unexpected characters at end of test line.", lineNum);
	3215	continue;
	3216	}
	3217
	3218	//
	3219	// Run the test
	3220	//
	3221	regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
	3222	}
	3223
	3224	delete [] testData;
	3225
	3226	}
	3227
	3228
	3229
	3230	//---------------------------------------------------------------------------
	3231	//
	3232	// regex_find(pattern, flags, inputString, lineNumber)
	3233	//
	3234	// Function to run a single test from the Extended (data driven) tests.
	3235	// See file test/testdata/regextst.txt for a description of the
	3236	// pattern and inputString fields, and the allowed flags.
	3237	// lineNumber is the source line in regextst.txt of the test.
	3238	//
	3239	//---------------------------------------------------------------------------
	3240
	3241
	3242	// Set a value into a UVector at position specified by a decimal number in
	3243	// a UnicodeString. This is a utility function needed by the actual test function,
	3244	// which follows.
	3245	static void set(UVector &vec, int32_t val, UnicodeString index) {
	3246	UErrorCode status=U_ZERO_ERROR;
	3247	int32_t idx = 0;
	3248	for (int32_t i=0; i<index.length(); i++) {
	3249	int32_t d=u_charDigitValue(index.charAt(i));
	3250	if (d<0) {return;}
	3251	idx = idx*10 + d;
	3252	}
	3253	while (vec.size()<idx+1) {vec.addElement(-1, status);}
	3254	vec.setElementAt(val, idx);
	3255	}
	3256
	3257	static void setInt(UVector &vec, int32_t val, int32_t idx) {
	3258	UErrorCode status=U_ZERO_ERROR;
	3259	while (vec.size()<idx+1) {vec.addElement(-1, status);}
	3260	vec.setElementAt(val, idx);
	3261	}
	3262
	3263	static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
	3264	{
	3265	UBool couldFind = TRUE;
	3266	UTEXT_SETNATIVEINDEX(utext, 0);
	3267	int32_t i = 0;
	3268	while (i < unistrOffset) {
	3269	UChar32 c = UTEXT_NEXT32(utext);
	3270	if (c != U_SENTINEL) {
	3271	i += U16_LENGTH(c);
	3272	} else {
	3273	couldFind = FALSE;
	3274	break;
	3275	}
	3276	}
	3277	nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
	3278	return couldFind;
	3279	}
	3280
	3281
	3282	void RegexTest::regex_find(const UnicodeString &pattern,
	3283	const UnicodeString &flags,
	3284	const UnicodeString &inputString,
	3285	const char *srcPath,
	3286	int32_t line) {
	3287	UnicodeString unEscapedInput;
	3288	UnicodeString deTaggedInput;
	3289
	3290	int32_t patternUTF8Length, inputUTF8Length;
	3291	char patternChars = NULL, inputChars = NULL;
	3292	UText patternText = UTEXT_INITIALIZER;
	3293	UText inputText = UTEXT_INITIALIZER;
	3294	UConverter *UTF8Converter = NULL;
	3295
	3296	UErrorCode status = U_ZERO_ERROR;
	3297	UParseError pe;
	3298	RegexPattern *parsePat = NULL;
	3299	RegexMatcher *parseMatcher = NULL;
	3300	RegexPattern callerPattern = NULL, UTF8Pattern = NULL;
	3301	RegexMatcher matcher = NULL, UTF8Matcher = NULL;
	3302	UVector groupStarts(status);
	3303	UVector groupEnds(status);
	3304	UVector groupStartsUTF8(status);
	3305	UVector groupEndsUTF8(status);
	3306	UBool isMatch = FALSE, isUTF8Match = FALSE;
	3307	UBool failed = FALSE;
	3308	int32_t numFinds;
	3309	int32_t i;
	3310	UBool useMatchesFunc = FALSE;
	3311	UBool useLookingAtFunc = FALSE;
	3312	int32_t regionStart = -1;
	3313	int32_t regionEnd = -1;
	3314	int32_t regionStartUTF8 = -1;
	3315	int32_t regionEndUTF8 = -1;
	3316
	3317
	3318	//
	3319	// Compile the caller's pattern
	3320	//
	3321	uint32_t bflags = 0;
	3322	if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
	3323	bflags \|= UREGEX_CASE_INSENSITIVE;
	3324	}
	3325	if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
	3326	bflags \|= UREGEX_COMMENTS;
	3327	}
	3328	if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
	3329	bflags \|= UREGEX_DOTALL;
	3330	}
	3331	if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
	3332	bflags \|= UREGEX_MULTILINE;
	3333	}
	3334
	3335	if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
	3336	bflags \|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
	3337	}
	3338	if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
	3339	bflags \|= UREGEX_UNIX_LINES;
	3340	}
	3341	if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
	3342	bflags \|= UREGEX_LITERAL;
	3343	}
	3344
	3345
	3346	callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
	3347	if (status != U_ZERO_ERROR) {
	3348	#if UCONFIG_NO_BREAK_ITERATION==1
	3349	// 'v' test flag means that the test pattern should not compile if ICU was configured
	3350	// to not include break iteration. RBBI is needed for Unicode word boundaries.
	3351	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
	3352	goto cleanupAndReturn;
	3353	}
	3354	#endif
	3355	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
	3356	// Expected pattern compilation error.
	3357	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
	3358	logln("Pattern Compile returns \"%s\"", u_errorName(status));
	3359	}
	3360	goto cleanupAndReturn;
	3361	} else {
	3362	// Unexpected pattern compilation error.
	3363	dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
	3364	goto cleanupAndReturn;
	3365	}
	3366	}
	3367
	3368	UTF8Converter = ucnv_open("UTF8", &status);
	3369	ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	3370
	3371	patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
	3372	status = U_ZERO_ERROR; // buffer overflow
	3373	patternChars = new char[patternUTF8Length+1];
	3374	pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
	3375	utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
	3376
	3377	if (status == U_ZERO_ERROR) {
	3378	UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
	3379
	3380	if (status != U_ZERO_ERROR) {
	3381	#if UCONFIG_NO_BREAK_ITERATION==1
	3382	// 'v' test flag means that the test pattern should not compile if ICU was configured
	3383	// to not include break iteration. RBBI is needed for Unicode word boundaries.
	3384	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
	3385	goto cleanupAndReturn;
	3386	}
	3387	#endif
	3388	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
	3389	// Expected pattern compilation error.
	3390	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
	3391	logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
	3392	}
	3393	goto cleanupAndReturn;
	3394	} else {
	3395	// Unexpected pattern compilation error.
	3396	errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
	3397	goto cleanupAndReturn;
	3398	}
	3399	}
	3400	}
	3401
	3402	if (UTF8Pattern == NULL) {
	3403	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
	3404	logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
	3405	status = U_ZERO_ERROR;
	3406	}
	3407
	3408	if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
	3409	callerPattern->dumpPattern();
	3410	}
	3411
	3412	if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
	3413	errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
	3414	goto cleanupAndReturn;
	3415	}
	3416
	3417
	3418	//
	3419	// Number of times find() should be called on the test string, default to 1
	3420	//
	3421	numFinds = 1;
	3422	for (i=2; i<=9; i++) {
	3423	if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
	3424	if (numFinds != 1) {
	3425	errln("Line %d: more than one digit flag. Scanning %d.", line, i);
	3426	goto cleanupAndReturn;
	3427	}
	3428	numFinds = i;
	3429	}
	3430	}
	3431
	3432	// 'M' flag. Use matches() instead of find()
	3433	if (flags.indexOf((UChar)0x4d) >= 0) {
	3434	useMatchesFunc = TRUE;
	3435	}
	3436	if (flags.indexOf((UChar)0x4c) >= 0) {
	3437	useLookingAtFunc = TRUE;
	3438	}
	3439
	3440	//
	3441	// Find the tags in the input data, remove them, and record the group boundary
	3442	// positions.
	3443	//
	3444	parsePat = RegexPattern::compile("<(/?)(r\|[0-9]+)>", 0, pe, status);
	3445	REGEX_CHECK_STATUS_L(line);
	3446
	3447	unEscapedInput = inputString.unescape();
	3448	parseMatcher = parsePat->matcher(unEscapedInput, status);
	3449	REGEX_CHECK_STATUS_L(line);
	3450	while(parseMatcher->find()) {
	3451	parseMatcher->appendReplacement(deTaggedInput, "", status);
	3452	REGEX_CHECK_STATUS;
	3453	UnicodeString groupNum = parseMatcher->group(2, status);
	3454	if (groupNum == "r") {
	3455	// <r> or </r>, a region specification within the string
	3456	if (parseMatcher->group(1, status) == "/") {
	3457	regionEnd = deTaggedInput.length();
	3458	} else {
	3459	regionStart = deTaggedInput.length();
	3460	}
	3461	} else {
	3462	// <digits> or </digits>, a group match boundary tag.
	3463	if (parseMatcher->group(1, status) == "/") {
	3464	set(groupEnds, deTaggedInput.length(), groupNum);
	3465	} else {
	3466	set(groupStarts, deTaggedInput.length(), groupNum);
	3467	}
	3468	}
	3469	}
	3470	parseMatcher->appendTail(deTaggedInput);
	3471	REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
	3472	if ((regionStart>=0 \|\| regionEnd>=0) && (regionStart<0 \|\| regionStart>regionEnd)) {
	3473	errln("mismatched <r> tags");
	3474	failed = TRUE;
	3475	goto cleanupAndReturn;
	3476	}
	3477
	3478	//
	3479	// Configure the matcher according to the flags specified with this test.
	3480	//
	3481	matcher = callerPattern->matcher(deTaggedInput, status);
	3482	REGEX_CHECK_STATUS_L(line);
	3483	if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
	3484	matcher->setTrace(TRUE);
	3485	}
	3486
	3487	if (UTF8Pattern != NULL) {
	3488	inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
	3489	status = U_ZERO_ERROR; // buffer overflow
	3490	inputChars = new char[inputUTF8Length+1];
	3491	deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
	3492	utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
	3493
	3494	if (status == U_ZERO_ERROR) {
	3495	UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
	3496	REGEX_CHECK_STATUS_L(line);
	3497	}
	3498
	3499	if (UTF8Matcher == NULL) {
	3500	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
	3501	logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
	3502	status = U_ZERO_ERROR;
	3503	}
	3504	}
	3505
	3506	//
	3507	// Generate native indices for UTF8 versions of region and capture group info
	3508	//
	3509	if (UTF8Matcher != NULL) {
	3510	if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
	3511	if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
	3512
	3513	// Fill out the native index UVector info.
	3514	// Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
	3515	for (i=0; i<groupStarts.size(); i++) {
	3516	int32_t start = groupStarts.elementAti(i);
	3517	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
	3518	if (start >= 0) {
	3519	int32_t startUTF8;
	3520	if (!utextOffsetToNative(&inputText, start, startUTF8)) {
	3521	errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
	3522	failed = TRUE;
	3523	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3524	}
	3525	setInt(groupStartsUTF8, startUTF8, i);
	3526	}
	3527
	3528	int32_t end = groupEnds.elementAti(i);
	3529	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
	3530	if (end >= 0) {
	3531	int32_t endUTF8;
	3532	if (!utextOffsetToNative(&inputText, end, endUTF8)) {
	3533	errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
	3534	failed = TRUE;
	3535	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3536	}
	3537	setInt(groupEndsUTF8, endUTF8, i);
	3538	}
	3539	}
	3540	}
	3541
	3542	if (regionStart>=0) {
	3543	matcher->region(regionStart, regionEnd, status);
	3544	REGEX_CHECK_STATUS_L(line);
	3545	if (UTF8Matcher != NULL) {
	3546	UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
	3547	REGEX_CHECK_STATUS_L(line);
	3548	}
	3549	}
	3550	if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
	3551	matcher->useAnchoringBounds(FALSE);
	3552	if (UTF8Matcher != NULL) {
	3553	UTF8Matcher->useAnchoringBounds(FALSE);
	3554	}
	3555	}
	3556	if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
	3557	matcher->useTransparentBounds(TRUE);
	3558	if (UTF8Matcher != NULL) {
	3559	UTF8Matcher->useTransparentBounds(TRUE);
	3560	}
	3561	}
	3562
	3563
	3564
	3565	//
	3566	// Do a find on the de-tagged input using the caller's pattern
	3567	// TODO: error on count>1 and not find().
	3568	// error on both matches() and lookingAt().
	3569	//
	3570	for (i=0; i<numFinds; i++) {
	3571	if (useMatchesFunc) {
	3572	isMatch = matcher->matches(status);
	3573	if (UTF8Matcher != NULL) {
	3574	isUTF8Match = UTF8Matcher->matches(status);
	3575	}
	3576	} else if (useLookingAtFunc) {
	3577	isMatch = matcher->lookingAt(status);
	3578	if (UTF8Matcher != NULL) {
	3579	isUTF8Match = UTF8Matcher->lookingAt(status);
	3580	}
	3581	} else {
	3582	isMatch = matcher->find();
	3583	if (UTF8Matcher != NULL) {
	3584	isUTF8Match = UTF8Matcher->find();
	3585	}
	3586	}
	3587	}
	3588	matcher->setTrace(FALSE);
	3589	if (U_FAILURE(status)) {
	3590	errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
	3591	}
	3592
	3593	//
	3594	// Match up the groups from the find() with the groups from the tags
	3595	//
	3596
	3597	// number of tags should match number of groups from find operation.
	3598	// matcher->groupCount does not include group 0, the entire match, hence the +1.
	3599	// G option in test means that capture group data is not available in the
	3600	// expected results, so the check needs to be suppressed.
	3601	if (isMatch == FALSE && groupStarts.size() != 0) {
	3602	dataerrln("Error at line %d: Match expected, but none found.", line);
	3603	failed = TRUE;
	3604	goto cleanupAndReturn;
	3605	} else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
	3606	errln("Error at line %d: Match expected, but none found. (UTF8)", line);
	3607	failed = TRUE;
	3608	goto cleanupAndReturn;
	3609	}
	3610
	3611	if (flags.indexOf((UChar)0x47 /G/) >= 0) {
	3612	// Only check for match / no match. Don't check capture groups.
	3613	if (isMatch && groupStarts.size() == 0) {
	3614	errln("Error at line %d: No match expected, but one found.", line);
	3615	failed = TRUE;
	3616	} else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
	3617	errln("Error at line %d: No match expected, but one found. (UTF8)", line);
	3618	failed = TRUE;
	3619	}
	3620	goto cleanupAndReturn;
	3621	}
	3622
	3623	REGEX_CHECK_STATUS_L(line);
	3624	for (i=0; i<=matcher->groupCount(); i++) {
	3625	int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
	3626	int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
	3627	if (matcher->start(i, status) != expectedStart) {
	3628	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
	3629	line, i, expectedStart, matcher->start(i, status));
	3630	failed = TRUE;
	3631	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3632	} else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
	3633	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
	3634	line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
	3635	failed = TRUE;
	3636	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3637	}
	3638
	3639	int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
	3640	int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
	3641	if (matcher->end(i, status) != expectedEnd) {
	3642	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
	3643	line, i, expectedEnd, matcher->end(i, status));
	3644	failed = TRUE;
	3645	// Error on end position; keep going; real error is probably yet to come as group
	3646	// end positions work from end of the input data towards the front.
	3647	} else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
	3648	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
	3649	line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
	3650	failed = TRUE;
	3651	// Error on end position; keep going; real error is probably yet to come as group
	3652	// end positions work from end of the input data towards the front.
	3653	}
	3654	}
	3655	if ( matcher->groupCount()+1 < groupStarts.size()) {
	3656	errln("Error at line %d: Expected %d capture groups, found %d.",
	3657	line, groupStarts.size()-1, matcher->groupCount());
	3658	failed = TRUE;
	3659	}
	3660	else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
	3661	errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
	3662	line, groupStarts.size()-1, UTF8Matcher->groupCount());
	3663	failed = TRUE;
	3664	}
	3665
	3666	if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
	3667	matcher->requireEnd() == TRUE) {
	3668	errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
	3669	failed = TRUE;
	3670	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
	3671	UTF8Matcher->requireEnd() == TRUE) {
	3672	errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
	3673	failed = TRUE;
	3674	}
	3675
	3676	if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
	3677	matcher->requireEnd() == FALSE) {
	3678	errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
	3679	failed = TRUE;
	3680	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
	3681	UTF8Matcher->requireEnd() == FALSE) {
	3682	errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
	3683	failed = TRUE;
	3684	}
	3685
	3686	if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
	3687	matcher->hitEnd() == TRUE) {
	3688	errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
	3689	failed = TRUE;
	3690	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
	3691	UTF8Matcher->hitEnd() == TRUE) {
	3692	errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
	3693	failed = TRUE;
	3694	}
	3695
	3696	if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
	3697	matcher->hitEnd() == FALSE) {
	3698	errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
	3699	failed = TRUE;
	3700	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
	3701	UTF8Matcher->hitEnd() == FALSE) {
	3702	errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
	3703	failed = TRUE;
	3704	}
	3705
	3706
	3707	cleanupAndReturn:
	3708	if (failed) {
	3709	infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
	3710	+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
	3711	// callerPattern->dump();
	3712	}
	3713	delete parseMatcher;
	3714	delete parsePat;
	3715	delete UTF8Matcher;
	3716	delete UTF8Pattern;
	3717	delete matcher;
	3718	delete callerPattern;
	3719
	3720	utext_close(&inputText);
	3721	delete[] inputChars;
	3722	utext_close(&patternText);
	3723	delete[] patternChars;
	3724	ucnv_close(UTF8Converter);
	3725	}
	3726
	3727
	3728
	3729
	3730	//---------------------------------------------------------------------------
	3731	//
	3732	// Errors Check for error handling in patterns.
	3733	//
	3734	//---------------------------------------------------------------------------
	3735	void RegexTest::Errors() {
	3736	// \escape sequences that aren't implemented yet.
	3737	//REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
	3738
	3739	// Missing close parentheses
	3740	REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
	3741	REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
	3742	REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
	3743
	3744	// Extra close paren
	3745	REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
	3746	REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
	3747	REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
	3748
	3749	// Look-ahead, Look-behind
	3750	// TODO: add tests for unbounded length look-behinds.
	3751	REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
	3752
	3753	// Attempt to use non-default flags
	3754	{
	3755	UParseError pe;
	3756	UErrorCode status = U_ZERO_ERROR;
	3757	int32_t flags = UREGEX_CANON_EQ \|
	3758	UREGEX_COMMENTS \| UREGEX_DOTALL \|
	3759	UREGEX_MULTILINE;
	3760	RegexPattern pat1= RegexPattern::compile(".", flags, pe, status);
	3761	REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
	3762	delete pat1;
	3763	}
	3764
	3765
	3766	// Quantifiers are allowed only after something that can be quantified.
	3767	REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
	3768	REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
	3769	REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
	3770
	3771	// Mal-formed {min,max} quantifiers
	3772	REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
	3773	REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
	3774	REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
	3775	REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
	3776	REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
	3777	REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
	3778	REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
	3779	REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
	3780	REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
	3781
	3782	// Ticket 5389
	3783	REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
	3784
	3785	// Invalid Back Reference \0
	3786	// For ICU 3.8 and earlier
	3787	// For ICU versions newer than 3.8, \0 introduces an octal escape.
	3788	//
	3789	REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
	3790
	3791	}
	3792
	3793
	3794	//-------------------------------------------------------------------------------
	3795	//
	3796	// Read a text data file, convert it to UChars, and return the data
	3797	// in one big UChar * buffer, which the caller must delete.
	3798	//
	3799	//--------------------------------------------------------------------------------
	3800	UChar RegexTest::ReadAndConvertFile(const char fileName, int32_t &ulen,
	3801	const char *defEncoding, UErrorCode &status) {
	3802	UChar *retPtr = NULL;
	3803	char *fileBuf = NULL;
	3804	UConverter* conv = NULL;
	3805	FILE *f = NULL;
	3806
	3807	ulen = 0;
	3808	if (U_FAILURE(status)) {
	3809	return retPtr;
	3810	}
	3811
	3812	//
	3813	// Open the file.
	3814	//
	3815	f = fopen(fileName, "rb");
	3816	if (f == 0) {
	3817	dataerrln("Error opening test data file %s\n", fileName);
	3818	status = U_FILE_ACCESS_ERROR;
	3819	return NULL;
	3820	}
	3821	//
	3822	// Read it in
	3823	//
	3824	int32_t fileSize;
	3825	int32_t amt_read;
	3826
	3827	fseek( f, 0, SEEK_END);
	3828	fileSize = ftell(f);
	3829	fileBuf = new char[fileSize];
	3830	fseek(f, 0, SEEK_SET);
	3831	amt_read = fread(fileBuf, 1, fileSize, f);
	3832	if (amt_read != fileSize \|\| fileSize <= 0) {
	3833	errln("Error reading test data file.");
	3834	goto cleanUpAndReturn;
	3835	}
	3836
	3837	//
	3838	// Look for a Unicode Signature (BOM) on the data just read
	3839	//
	3840	int32_t signatureLength;
	3841	const char * fileBufC;
	3842	const char* encoding;
	3843
	3844	fileBufC = fileBuf;
	3845	encoding = ucnv_detectUnicodeSignature(
	3846	fileBuf, fileSize, &signatureLength, &status);
	3847	if(encoding!=NULL ){
	3848	fileBufC += signatureLength;
	3849	fileSize -= signatureLength;
	3850	} else {
	3851	encoding = defEncoding;
	3852	if (strcmp(encoding, "utf-8") == 0) {
	3853	errln("file %s is missing its BOM", fileName);
	3854	}
	3855	}
	3856
	3857	//
	3858	// Open a converter to take the rule file to UTF-16
	3859	//
	3860	conv = ucnv_open(encoding, &status);
	3861	if (U_FAILURE(status)) {
	3862	goto cleanUpAndReturn;
	3863	}
	3864
	3865	//
	3866	// Convert the rules to UChar.
	3867	// Preflight first to determine required buffer size.
	3868	//
	3869	ulen = ucnv_toUChars(conv,
	3870	NULL, // dest,
	3871	0, // destCapacity,
	3872	fileBufC,
	3873	fileSize,
	3874	&status);
	3875	if (status == U_BUFFER_OVERFLOW_ERROR) {
	3876	// Buffer Overflow is expected from the preflight operation.
	3877	status = U_ZERO_ERROR;
	3878
	3879	retPtr = new UChar[ulen+1];
	3880	ucnv_toUChars(conv,
	3881	retPtr, // dest,
	3882	ulen+1,
	3883	fileBufC,
	3884	fileSize,
	3885	&status);
	3886	}
	3887
	3888	cleanUpAndReturn:
	3889	fclose(f);
	3890	delete[] fileBuf;
	3891	ucnv_close(conv);
	3892	if (U_FAILURE(status)) {
	3893	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	3894	delete []retPtr;
	3895	retPtr = 0;
	3896	ulen = 0;
	3897	};
	3898	return retPtr;
	3899	}
	3900
	3901
	3902	//-------------------------------------------------------------------------------
	3903	//
	3904	// PerlTests - Run Perl's regular expression tests
	3905	// The input file for this test is re_tests, the standard regular
	3906	// expression test data distributed with the Perl source code.
	3907	//
	3908	// Here is Perl's description of the test data file:
	3909	//
	3910	// # The tests are in a separate file 't/op/re_tests'.
	3911	// # Each line in that file is a separate test.
	3912	// # There are five columns, separated by tabs.
	3913	// #
	3914	// # Column 1 contains the pattern, optionally enclosed in C<''>.
	3915	// # Modifiers can be put after the closing C<'>.
	3916	// #
	3917	// # Column 2 contains the string to be matched.
	3918	// #
	3919	// # Column 3 contains the expected result:
	3920	// # y expect a match
	3921	// # n expect no match
	3922	// # c expect an error
	3923	// # B test exposes a known bug in Perl, should be skipped
	3924	// # b test exposes a known bug in Perl, should be skipped if noamp
	3925	// #
	3926	// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
	3927	// #
	3928	// # Column 4 contains a string, usually C<$&>.
	3929	// #
	3930	// # Column 5 contains the expected result of double-quote
	3931	// # interpolating that string after the match, or start of error message.
	3932	// #
	3933	// # Column 6, if present, contains a reason why the test is skipped.
	3934	// # This is printed with "skipped", for harness to pick up.
	3935	// #
	3936	// # \n in the tests are interpolated, as are variables of the form ${\w+}.
	3937	// #
	3938	// # If you want to add a regular expression test that can't be expressed
	3939	// # in this format, don't add it here: put it in op/pat.t instead.
	3940	//
	3941	// For ICU, if field 3 contains an 'i', the test will be skipped.
	3942	// Such a test exposes some known incompatibility between ICU and Perl regexps.
	3943	// (The i is in addition to whatever was there before.)
	3944	//
	3945	//-------------------------------------------------------------------------------
	3946	void RegexTest::PerlTests() {
	3947	char tdd[2048];
	3948	const char *srcPath;
	3949	UErrorCode status = U_ZERO_ERROR;
	3950	UParseError pe;
	3951
	3952	//
	3953	// Open and read the test data file.
	3954	//
	3955	srcPath=getPath(tdd, "re_tests.txt");
	3956	if(srcPath==NULL) {
	3957	return; /* something went wrong, error already output */
	3958	}
	3959
	3960	int32_t len;
	3961	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
	3962	if (U_FAILURE(status)) {
	3963	return; /* something went wrong, error already output */
	3964	}
	3965
	3966	//
	3967	// Put the test data into a UnicodeString
	3968	//
	3969	UnicodeString testDataString(FALSE, testData, len);
	3970
	3971	//
	3972	// Regex to break the input file into lines, and strip the new lines.
	3973	// One line per match, capture group one is the desired data.
	3974	//
	3975	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
	3976	if (U_FAILURE(status)) {
	3977	dataerrln("RegexPattern::compile() error");
	3978	return;
	3979	}
	3980	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
	3981
	3982	//
	3983	// Regex to split a test file line into fields.
	3984	// There are six fields, separated by tabs.
	3985	//
	3986	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
	3987
	3988	//
	3989	// Regex to identify test patterns with flag settings, and to separate them.
	3990	// Test patterns with flags look like 'pattern'i
	3991	// Test patterns without flags are not quoted: pattern
	3992	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
	3993	//
	3994	RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
	3995	RegexMatcher* flagMat = flagPat->matcher(status);
	3996
	3997	//
	3998	// The Perl tests reference several perl-isms, which are evaluated/substituted
	3999	// in the test data. Not being perl, this must be done explicitly. Here
	4000	// are string constants and REs for these constructs.
	4001	//
	4002	UnicodeString nulnulSrc("${nulnul}");
	4003	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
	4004	nulnul = nulnul.unescape();
	4005
	4006	UnicodeString ffffSrc("${ffff}");
	4007	UnicodeString ffff("\\uffff", -1, US_INV);
	4008	ffff = ffff.unescape();
	4009
	4010	// regexp for $-[0], $+[2], etc.
	4011	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
	4012	RegexMatcher *groupsMat = groupsPat->matcher(status);
	4013
	4014	// regexp for $0, $1, $2, etc.
	4015	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
	4016	RegexMatcher *cgMat = cgPat->matcher(status);
	4017
	4018
	4019	//
	4020	// Main Loop for the Perl Tests, runs once per line from the
	4021	// test data file.
	4022	//
	4023	int32_t lineNum = 0;
	4024	int32_t skippedUnimplementedCount = 0;
	4025	while (lineMat->find()) {
	4026	lineNum++;
	4027
	4028	//
	4029	// Get a line, break it into its fields, do the Perl
	4030	// variable substitutions.
	4031	//
	4032	UnicodeString line = lineMat->group(1, status);
	4033	UnicodeString fields[7];
	4034	fieldPat->split(line, fields, 7, status);
	4035
	4036	flagMat->reset(fields[0]);
	4037	flagMat->matches(status);
	4038	UnicodeString pattern = flagMat->group(2, status);
	4039	pattern.findAndReplace("${bang}", "!");
	4040	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
	4041	pattern.findAndReplace(ffffSrc, ffff);
	4042
	4043	//
	4044	// Identify patterns that include match flag settings,
	4045	// split off the flags, remove the extra quotes.
	4046	//
	4047	UnicodeString flagStr = flagMat->group(3, status);
	4048	if (U_FAILURE(status)) {
	4049	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	4050	return;
	4051	}
	4052	int32_t flags = 0;
	4053	const UChar UChar_c = 0x63; // Char constants for the flag letters.
	4054	const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
	4055	const UChar UChar_m = 0x6d;
	4056	const UChar UChar_x = 0x78;
	4057	const UChar UChar_y = 0x79;
	4058	if (flagStr.indexOf(UChar_i) != -1) {
	4059	flags \|= UREGEX_CASE_INSENSITIVE;
	4060	}
	4061	if (flagStr.indexOf(UChar_m) != -1) {
	4062	flags \|= UREGEX_MULTILINE;
	4063	}
	4064	if (flagStr.indexOf(UChar_x) != -1) {
	4065	flags \|= UREGEX_COMMENTS;
	4066	}
	4067
	4068	//
	4069	// Compile the test pattern.
	4070	//
	4071	status = U_ZERO_ERROR;
	4072	RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
	4073	if (status == U_REGEX_UNIMPLEMENTED) {
	4074	//
	4075	// Test of a feature that is planned for ICU, but not yet implemented.
	4076	// skip the test.
	4077	skippedUnimplementedCount++;
	4078	delete testPat;
	4079	status = U_ZERO_ERROR;
	4080	continue;
	4081	}
	4082
	4083	if (U_FAILURE(status)) {
	4084	// Some tests are supposed to generate errors.
	4085	// Only report an error for tests that are supposed to succeed.
	4086	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
	4087	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
	4088	{
	4089	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
	4090	}
	4091	status = U_ZERO_ERROR;
	4092	delete testPat;
	4093	continue;
	4094	}
	4095
	4096	if (fields[2].indexOf(UChar_i) >= 0) {
	4097	// ICU should skip this test.
	4098	delete testPat;
	4099	continue;
	4100	}
	4101
	4102	if (fields[2].indexOf(UChar_c) >= 0) {
	4103	// This pattern should have caused a compilation error, but didn't/
	4104	errln("line %d: Expected a pattern compile error, got success.", lineNum);
	4105	delete testPat;
	4106	continue;
	4107	}
	4108
	4109	//
	4110	// replace the Perl variables that appear in some of the
	4111	// match data strings.
	4112	//
	4113	UnicodeString matchString = fields[1];
	4114	matchString.findAndReplace(nulnulSrc, nulnul);
	4115	matchString.findAndReplace(ffffSrc, ffff);
	4116
	4117	// Replace any \n in the match string with an actual new-line char.
	4118	// Don't do full unescape, as this unescapes more than Perl does, which
	4119	// causes other spurious failures in the tests.
	4120	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4121
	4122
	4123
	4124	//
	4125	// Run the test, check for expected match/don't match result.
	4126	//
	4127	RegexMatcher *testMat = testPat->matcher(matchString, status);
	4128	UBool found = testMat->find();
	4129	UBool expected = FALSE;
	4130	if (fields[2].indexOf(UChar_y) >=0) {
	4131	expected = TRUE;
	4132	}
	4133	if (expected != found) {
	4134	errln("line %d: Expected %smatch, got %smatch",
	4135	lineNum, expected?"":"no ", found?"":"no " );
	4136	continue;
	4137	}
	4138
	4139	// Don't try to check expected results if there is no match.
	4140	// (Some have stuff in the expected fields)
	4141	if (!found) {
	4142	delete testMat;
	4143	delete testPat;
	4144	continue;
	4145	}
	4146
	4147	//
	4148	// Interpret the Perl expression from the fourth field of the data file,
	4149	// building up an ICU string from the results of the ICU match.
	4150	// The Perl expression will contain references to the results of
	4151	// a regex match, including the matched string, capture group strings,
	4152	// group starting and ending indicies, etc.
	4153	//
	4154	UnicodeString resultString;
	4155	UnicodeString perlExpr = fields[3];
	4156	#if SUPPORT_MUTATING_INPUT_STRING
	4157	groupsMat->reset(perlExpr);
	4158	cgMat->reset(perlExpr);
	4159	#endif
	4160
	4161	while (perlExpr.length() > 0) {
	4162	#if !SUPPORT_MUTATING_INPUT_STRING
	4163	// Perferred usage. Reset after any modification to input string.
	4164	groupsMat->reset(perlExpr);
	4165	cgMat->reset(perlExpr);
	4166	#endif
	4167
	4168	if (perlExpr.startsWith("$&")) {
	4169	resultString.append(testMat->group(status));
	4170	perlExpr.remove(0, 2);
	4171	}
	4172
	4173	else if (groupsMat->lookingAt(status)) {
	4174	// $-[0] $+[2] etc.
	4175	UnicodeString digitString = groupsMat->group(2, status);
	4176	int32_t t = 0;
	4177	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4178	UnicodeString plusOrMinus = groupsMat->group(1, status);
	4179	int32_t matchPosition;
	4180	if (plusOrMinus.compare("+") == 0) {
	4181	matchPosition = testMat->end(groupNum, status);
	4182	} else {
	4183	matchPosition = testMat->start(groupNum, status);
	4184	}
	4185	if (matchPosition != -1) {
	4186	ICU_Utility::appendNumber(resultString, matchPosition);
	4187	}
	4188	perlExpr.remove(0, groupsMat->end(status));
	4189	}
	4190
	4191	else if (cgMat->lookingAt(status)) {
	4192	// $1, $2, $3, etc.
	4193	UnicodeString digitString = cgMat->group(1, status);
	4194	int32_t t = 0;
	4195	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4196	if (U_SUCCESS(status)) {
	4197	resultString.append(testMat->group(groupNum, status));
	4198	status = U_ZERO_ERROR;
	4199	}
	4200	perlExpr.remove(0, cgMat->end(status));
	4201	}
	4202
	4203	else if (perlExpr.startsWith("@-")) {
	4204	int32_t i;
	4205	for (i=0; i<=testMat->groupCount(); i++) {
	4206	if (i>0) {
	4207	resultString.append(" ");
	4208	}
	4209	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
	4210	}
	4211	perlExpr.remove(0, 2);
	4212	}
	4213
	4214	else if (perlExpr.startsWith("@+")) {
	4215	int32_t i;
	4216	for (i=0; i<=testMat->groupCount(); i++) {
	4217	if (i>0) {
	4218	resultString.append(" ");
	4219	}
	4220	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
	4221	}
	4222	perlExpr.remove(0, 2);
	4223	}
	4224
	4225	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
	4226	// or as an escaped sequence (e.g. \n)
	4227	if (perlExpr.length() > 1) {
	4228	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
	4229	}
	4230	UChar c = perlExpr.charAt(0);
	4231	switch (c) {
	4232	case 'n': c = '\n'; break;
	4233	// add any other escape sequences that show up in the test expected results.
	4234	}
	4235	resultString.append(c);
	4236	perlExpr.remove(0, 1);
	4237	}
	4238
	4239	else {
	4240	// Any characters from the perl expression that we don't explicitly
	4241	// recognize before here are assumed to be literals and copied
	4242	// as-is to the expected results.
	4243	resultString.append(perlExpr.charAt(0));
	4244	perlExpr.remove(0, 1);
	4245	}
	4246
	4247	if (U_FAILURE(status)) {
	4248	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
	4249	break;
	4250	}
	4251	}
	4252
	4253	//
	4254	// Expected Results Compare
	4255	//
	4256	UnicodeString expectedS(fields[4]);
	4257	expectedS.findAndReplace(nulnulSrc, nulnul);
	4258	expectedS.findAndReplace(ffffSrc, ffff);
	4259	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4260
	4261
	4262	if (expectedS.compare(resultString) != 0) {
	4263	err("Line %d: Incorrect perl expression results.", lineNum);
	4264	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
	4265	}
	4266
	4267	delete testMat;
	4268	delete testPat;
	4269	}
	4270
	4271	//
	4272	// All done. Clean up allocated stuff.
	4273	//
	4274	delete cgMat;
	4275	delete cgPat;
	4276
	4277	delete groupsMat;
	4278	delete groupsPat;
	4279
	4280	delete flagMat;
	4281	delete flagPat;
	4282
	4283	delete lineMat;
	4284	delete linePat;
	4285
	4286	delete fieldPat;
	4287	delete [] testData;
	4288
	4289
	4290	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
	4291
	4292	}
	4293
	4294
	4295	//-------------------------------------------------------------------------------
	4296	//
	4297	// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
	4298	// (instead of using UnicodeStrings) to test the alternate engine.
	4299	// The input file for this test is re_tests, the standard regular
	4300	// expression test data distributed with the Perl source code.
	4301	// See PerlTests() for more information.
	4302	//
	4303	//-------------------------------------------------------------------------------
	4304	void RegexTest::PerlTestsUTF8() {
	4305	char tdd[2048];
	4306	const char *srcPath;
	4307	UErrorCode status = U_ZERO_ERROR;
	4308	UParseError pe;
	4309	LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
	4310	UText patternText = UTEXT_INITIALIZER;
	4311	char *patternChars = NULL;
	4312	int32_t patternLength;
	4313	int32_t patternCapacity = 0;
	4314	UText inputText = UTEXT_INITIALIZER;
	4315	char *inputChars = NULL;
	4316	int32_t inputLength;
	4317	int32_t inputCapacity = 0;
	4318
	4319	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	4320
	4321	//
	4322	// Open and read the test data file.
	4323	//
	4324	srcPath=getPath(tdd, "re_tests.txt");
	4325	if(srcPath==NULL) {
	4326	return; /* something went wrong, error already output */
	4327	}
	4328
	4329	int32_t len;
	4330	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
	4331	if (U_FAILURE(status)) {
	4332	return; /* something went wrong, error already output */
	4333	}
	4334
	4335	//
	4336	// Put the test data into a UnicodeString
	4337	//
	4338	UnicodeString testDataString(FALSE, testData, len);
	4339
	4340	//
	4341	// Regex to break the input file into lines, and strip the new lines.
	4342	// One line per match, capture group one is the desired data.
	4343	//
	4344	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
	4345	if (U_FAILURE(status)) {
	4346	dataerrln("RegexPattern::compile() error");
	4347	return;
	4348	}
	4349	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
	4350
	4351	//
	4352	// Regex to split a test file line into fields.
	4353	// There are six fields, separated by tabs.
	4354	//
	4355	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
	4356
	4357	//
	4358	// Regex to identify test patterns with flag settings, and to separate them.
	4359	// Test patterns with flags look like 'pattern'i
	4360	// Test patterns without flags are not quoted: pattern
	4361	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
	4362	//
	4363	RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
	4364	RegexMatcher* flagMat = flagPat->matcher(status);
	4365
	4366	//
	4367	// The Perl tests reference several perl-isms, which are evaluated/substituted
	4368	// in the test data. Not being perl, this must be done explicitly. Here
	4369	// are string constants and REs for these constructs.
	4370	//
	4371	UnicodeString nulnulSrc("${nulnul}");
	4372	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
	4373	nulnul = nulnul.unescape();
	4374
	4375	UnicodeString ffffSrc("${ffff}");
	4376	UnicodeString ffff("\\uffff", -1, US_INV);
	4377	ffff = ffff.unescape();
	4378
	4379	// regexp for $-[0], $+[2], etc.
	4380	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
	4381	RegexMatcher *groupsMat = groupsPat->matcher(status);
	4382
	4383	// regexp for $0, $1, $2, etc.
	4384	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
	4385	RegexMatcher *cgMat = cgPat->matcher(status);
	4386
	4387
	4388	//
	4389	// Main Loop for the Perl Tests, runs once per line from the
	4390	// test data file.
	4391	//
	4392	int32_t lineNum = 0;
	4393	int32_t skippedUnimplementedCount = 0;
	4394	while (lineMat->find()) {
	4395	lineNum++;
	4396
	4397	//
	4398	// Get a line, break it into its fields, do the Perl
	4399	// variable substitutions.
	4400	//
	4401	UnicodeString line = lineMat->group(1, status);
	4402	UnicodeString fields[7];
	4403	fieldPat->split(line, fields, 7, status);
	4404
	4405	flagMat->reset(fields[0]);
	4406	flagMat->matches(status);
	4407	UnicodeString pattern = flagMat->group(2, status);
	4408	pattern.findAndReplace("${bang}", "!");
	4409	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
	4410	pattern.findAndReplace(ffffSrc, ffff);
	4411
	4412	//
	4413	// Identify patterns that include match flag settings,
	4414	// split off the flags, remove the extra quotes.
	4415	//
	4416	UnicodeString flagStr = flagMat->group(3, status);
	4417	if (U_FAILURE(status)) {
	4418	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	4419	return;
	4420	}
	4421	int32_t flags = 0;
	4422	const UChar UChar_c = 0x63; // Char constants for the flag letters.
	4423	const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
	4424	const UChar UChar_m = 0x6d;
	4425	const UChar UChar_x = 0x78;
	4426	const UChar UChar_y = 0x79;
	4427	if (flagStr.indexOf(UChar_i) != -1) {
	4428	flags \|= UREGEX_CASE_INSENSITIVE;
	4429	}
	4430	if (flagStr.indexOf(UChar_m) != -1) {
	4431	flags \|= UREGEX_MULTILINE;
	4432	}
	4433	if (flagStr.indexOf(UChar_x) != -1) {
	4434	flags \|= UREGEX_COMMENTS;
	4435	}
	4436
	4437	//
	4438	// Put the pattern in a UTF-8 UText
	4439	//
	4440	status = U_ZERO_ERROR;
	4441	patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
	4442	if (status == U_BUFFER_OVERFLOW_ERROR) {
	4443	status = U_ZERO_ERROR;
	4444	delete[] patternChars;
	4445	patternCapacity = patternLength + 1;
	4446	patternChars = new char[patternCapacity];
	4447	pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
	4448	}
	4449	utext_openUTF8(&patternText, patternChars, patternLength, &status);
	4450
	4451	//
	4452	// Compile the test pattern.
	4453	//
	4454	RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
	4455	if (status == U_REGEX_UNIMPLEMENTED) {
	4456	//
	4457	// Test of a feature that is planned for ICU, but not yet implemented.
	4458	// skip the test.
	4459	skippedUnimplementedCount++;
	4460	delete testPat;
	4461	status = U_ZERO_ERROR;
	4462	continue;
	4463	}
	4464
	4465	if (U_FAILURE(status)) {
	4466	// Some tests are supposed to generate errors.
	4467	// Only report an error for tests that are supposed to succeed.
	4468	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
	4469	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
	4470	{
	4471	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
	4472	}
	4473	status = U_ZERO_ERROR;
	4474	delete testPat;
	4475	continue;
	4476	}
	4477
	4478	if (fields[2].indexOf(UChar_i) >= 0) {
	4479	// ICU should skip this test.
	4480	delete testPat;
	4481	continue;
	4482	}
	4483
	4484	if (fields[2].indexOf(UChar_c) >= 0) {
	4485	// This pattern should have caused a compilation error, but didn't/
	4486	errln("line %d: Expected a pattern compile error, got success.", lineNum);
	4487	delete testPat;
	4488	continue;
	4489	}
	4490
	4491
	4492	//
	4493	// replace the Perl variables that appear in some of the
	4494	// match data strings.
	4495	//
	4496	UnicodeString matchString = fields[1];
	4497	matchString.findAndReplace(nulnulSrc, nulnul);
	4498	matchString.findAndReplace(ffffSrc, ffff);
	4499
	4500	// Replace any \n in the match string with an actual new-line char.
	4501	// Don't do full unescape, as this unescapes more than Perl does, which
	4502	// causes other spurious failures in the tests.
	4503	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4504
	4505	//
	4506	// Put the input in a UTF-8 UText
	4507	//
	4508	status = U_ZERO_ERROR;
	4509	inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
	4510	if (status == U_BUFFER_OVERFLOW_ERROR) {
	4511	status = U_ZERO_ERROR;
	4512	delete[] inputChars;
	4513	inputCapacity = inputLength + 1;
	4514	inputChars = new char[inputCapacity];
	4515	matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
	4516	}
	4517	utext_openUTF8(&inputText, inputChars, inputLength, &status);
	4518
	4519	//
	4520	// Run the test, check for expected match/don't match result.
	4521	//
	4522	RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
	4523	UBool found = testMat->find();
	4524	UBool expected = FALSE;
	4525	if (fields[2].indexOf(UChar_y) >=0) {
	4526	expected = TRUE;
	4527	}
	4528	if (expected != found) {
	4529	errln("line %d: Expected %smatch, got %smatch",
	4530	lineNum, expected?"":"no ", found?"":"no " );
	4531	continue;
	4532	}
	4533
	4534	// Don't try to check expected results if there is no match.
	4535	// (Some have stuff in the expected fields)
	4536	if (!found) {
	4537	delete testMat;
	4538	delete testPat;
	4539	continue;
	4540	}
	4541
	4542	//
	4543	// Interpret the Perl expression from the fourth field of the data file,
	4544	// building up an ICU string from the results of the ICU match.
	4545	// The Perl expression will contain references to the results of
	4546	// a regex match, including the matched string, capture group strings,
	4547	// group starting and ending indicies, etc.
	4548	//
	4549	UnicodeString resultString;
	4550	UnicodeString perlExpr = fields[3];
	4551
	4552	while (perlExpr.length() > 0) {
	4553	groupsMat->reset(perlExpr);
	4554	cgMat->reset(perlExpr);
	4555
	4556	if (perlExpr.startsWith("$&")) {
	4557	resultString.append(testMat->group(status));
	4558	perlExpr.remove(0, 2);
	4559	}
	4560
	4561	else if (groupsMat->lookingAt(status)) {
	4562	// $-[0] $+[2] etc.
	4563	UnicodeString digitString = groupsMat->group(2, status);
	4564	int32_t t = 0;
	4565	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4566	UnicodeString plusOrMinus = groupsMat->group(1, status);
	4567	int32_t matchPosition;
	4568	if (plusOrMinus.compare("+") == 0) {
	4569	matchPosition = testMat->end(groupNum, status);
	4570	} else {
	4571	matchPosition = testMat->start(groupNum, status);
	4572	}
	4573	if (matchPosition != -1) {
	4574	ICU_Utility::appendNumber(resultString, matchPosition);
	4575	}
	4576	perlExpr.remove(0, groupsMat->end(status));
	4577	}
	4578
	4579	else if (cgMat->lookingAt(status)) {
	4580	// $1, $2, $3, etc.
	4581	UnicodeString digitString = cgMat->group(1, status);
	4582	int32_t t = 0;
	4583	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4584	if (U_SUCCESS(status)) {
	4585	resultString.append(testMat->group(groupNum, status));
	4586	status = U_ZERO_ERROR;
	4587	}
	4588	perlExpr.remove(0, cgMat->end(status));
	4589	}
	4590
	4591	else if (perlExpr.startsWith("@-")) {
	4592	int32_t i;
	4593	for (i=0; i<=testMat->groupCount(); i++) {
	4594	if (i>0) {
	4595	resultString.append(" ");
	4596	}
	4597	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
	4598	}
	4599	perlExpr.remove(0, 2);
	4600	}
	4601
	4602	else if (perlExpr.startsWith("@+")) {
	4603	int32_t i;
	4604	for (i=0; i<=testMat->groupCount(); i++) {
	4605	if (i>0) {
	4606	resultString.append(" ");
	4607	}
	4608	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
	4609	}
	4610	perlExpr.remove(0, 2);
	4611	}
	4612
	4613	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
	4614	// or as an escaped sequence (e.g. \n)
	4615	if (perlExpr.length() > 1) {
	4616	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
	4617	}
	4618	UChar c = perlExpr.charAt(0);
	4619	switch (c) {
	4620	case 'n': c = '\n'; break;
	4621	// add any other escape sequences that show up in the test expected results.
	4622	}
	4623	resultString.append(c);
	4624	perlExpr.remove(0, 1);
	4625	}
	4626
	4627	else {
	4628	// Any characters from the perl expression that we don't explicitly
	4629	// recognize before here are assumed to be literals and copied
	4630	// as-is to the expected results.
	4631	resultString.append(perlExpr.charAt(0));
	4632	perlExpr.remove(0, 1);
	4633	}
	4634
	4635	if (U_FAILURE(status)) {
	4636	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
	4637	break;
	4638	}
	4639	}
	4640
	4641	//
	4642	// Expected Results Compare
	4643	//
	4644	UnicodeString expectedS(fields[4]);
	4645	expectedS.findAndReplace(nulnulSrc, nulnul);
	4646	expectedS.findAndReplace(ffffSrc, ffff);
	4647	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4648
	4649
	4650	if (expectedS.compare(resultString) != 0) {
	4651	err("Line %d: Incorrect perl expression results.", lineNum);
	4652	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
	4653	}
	4654
	4655	delete testMat;
	4656	delete testPat;
	4657	}
	4658
	4659	//
	4660	// All done. Clean up allocated stuff.
	4661	//
	4662	delete cgMat;
	4663	delete cgPat;
	4664
	4665	delete groupsMat;
	4666	delete groupsPat;
	4667
	4668	delete flagMat;
	4669	delete flagPat;
	4670
	4671	delete lineMat;
	4672	delete linePat;
	4673
	4674	delete fieldPat;
	4675	delete [] testData;
	4676
	4677	utext_close(&patternText);
	4678	utext_close(&inputText);
	4679
	4680	delete [] patternChars;
	4681	delete [] inputChars;
	4682
	4683
	4684	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
	4685
	4686	}
	4687
	4688
	4689	//--------------------------------------------------------------
	4690	//
	4691	// Bug6149 Verify limits to heap expansion for backtrack stack.
	4692	// Use this pattern,
	4693	// "(a?){1,8000000}"
	4694	// Note: the pattern formerly had an unbounded upper bound, but that form now has loop-breaking enabled.
	4695	// This test is likely to be fragile, as further optimizations stop
	4696	// more cases of pointless looping in the match engine.
	4697	//
	4698	//---------------------------------------------------------------
	4699	void RegexTest::Bug6149() {
	4700	UnicodeString pattern("(a?){1,8000000}");
	4701	UnicodeString s("xyz");
	4702	uint32_t flags = 0;
	4703	UErrorCode status = U_ZERO_ERROR;
	4704
	4705	RegexMatcher matcher(pattern, s, flags, status);
	4706	UBool result = false;
		// The match attempt is expected to finish with U_REGEX_STACK_OVERFLOW and no match.
		// NOTE(review): REGEX_ASSERT_FAIL is defined outside this excerpt; presumably it
		// evaluates the expression and compares the resulting status with the given code -- confirm.
	4707	REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
	4708	REGEX_ASSERT(result == FALSE);
	4709	}
	4710
	4711
	4712	//
	4713	// Callbacks() Test the callback function.
	4714	// When set, callbacks occur periodically during matching operations,
	4715	// giving the application code the ability to abort the operation
	4716	// before its normal completion.
	4717	//
	4718
		// Bookkeeping handed to testCallBackFn() through the callback's context pointer.
	4719	struct callBackContext {
	4720	RegexTest *test; // Test object the callback uses to report errors (errln).
	4721	int32_t maxCalls; // The callback returns FALSE once numCalls reaches this value.
	4722	int32_t numCalls; // Callback invocations counted since the last reset().
	4723	int32_t lastSteps; // "steps" value received by the most recent callback.
	4724	void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; // Set a new call limit and zero the counters.
	4725	};
	4726
	4727	U_CDECL_BEGIN
		// Match callback installed by RegexTest::Callbacks().
		//   context - the callBackContext that was registered together with this function.
		//   steps   - step count supplied by the caller; each call is expected to carry
		//             exactly one more than the previous call, otherwise an error is reported.
		// Counts the call and returns FALSE on the call that brings numCalls up to maxCalls
		// (and on any later call); returns TRUE before that.
	4728	static UBool U_CALLCONV
	4729	testCallBackFn(const void *context, int32_t steps) {
		// The context arrives as an opaque pointer; it must be viewed as a pointer to
		// callBackContext because the fields below are accessed (and updated) through "->".
	4730	callBackContext *info = (callBackContext *)context;
	4731	if (info->lastSteps+1 != steps) {
	4732	info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
	4733	}
	4734	info->lastSteps = steps;
	4735	info->numCalls++;
	4736	return (info->numCalls < info->maxCalls);
	4737	}
	4738	U_CDECL_END
	4739
	4740	void RegexTest::Callbacks() {
		// Exercises RegexMatcher::setMatchCallback()/getMatchCallback() and checks how many
		// times testCallBackFn() runs for a short, a medium and a long-running match.
	4741	{
	4742	// Getter returns NULLs if no callback has been set
	4743
	4744	// The variables that the getter will fill in.
	4745	// Init to non-null values so that the action of the getter can be seen.
	4746	const void *returnedContext = &returnedContext;
	4747	URegexMatchCallback *returnedFn = &testCallBackFn;
	4748
	4749	UErrorCode status = U_ZERO_ERROR;
	4750	RegexMatcher matcher("x", 0, status);
	4751	REGEX_CHECK_STATUS;
	4752	matcher.getMatchCallback(returnedFn, returnedContext, status);
	4753	REGEX_CHECK_STATUS;
	4754	REGEX_ASSERT(returnedFn == NULL);
	4755	REGEX_ASSERT(returnedContext == NULL);
	4756	}
	4757
	4758	{
	4759	// Set and Get work
	4760	callBackContext cbInfo = {this, 0, 0, 0};
	4761	const void *returnedContext;
	4762	URegexMatchCallback *returnedFn;
	4763	UErrorCode status = U_ZERO_ERROR;
	4764	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
	4765	REGEX_CHECK_STATUS;
	4766	matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
	4767	REGEX_CHECK_STATUS;
	4768	matcher.getMatchCallback(returnedFn, returnedContext, status);
	4769	REGEX_CHECK_STATUS;
		// The getter must hand back exactly the function and context that were just set.
	4770	REGEX_ASSERT(returnedFn == testCallBackFn);
	4771	REGEX_ASSERT(returnedContext == &cbInfo);
	4772
	4773	// A short-running match shouldn't invoke the callback
	4774	status = U_ZERO_ERROR;
	4775	cbInfo.reset(1);
	4776	UnicodeString s = "xxx";
	4777	matcher.reset(s);
	4778	REGEX_ASSERT(matcher.matches(status));
	4779	REGEX_CHECK_STATUS;
	4780	REGEX_ASSERT(cbInfo.numCalls == 0);
	4781
	4782	// A medium-length match that runs long enough to invoke the
	4783	// callback, but not so long that the callback aborts it.
	4784	status = U_ZERO_ERROR;
	4785	cbInfo.reset(4);
	4786	s = "aaaaaaaaaaaaaaaaaaab";
	4787	matcher.reset(s);
	4788	REGEX_ASSERT(matcher.matches(status)==FALSE);
	4789	REGEX_CHECK_STATUS;
	4790	REGEX_ASSERT(cbInfo.numCalls > 0);
	4791
	4792	// A longer running match that the callback function will abort.
		// Same call limit (4) as above, but the input is four characters longer; the match is
		// expected to stop with U_REGEX_STOPPED_BY_CALLER after exactly 4 callback calls.
	4793	status = U_ZERO_ERROR;
	4794	cbInfo.reset(4);
	4795	s = "aaaaaaaaaaaaaaaaaaaaaaab";
	4796	matcher.reset(s);
	4797	REGEX_ASSERT(matcher.matches(status)==FALSE);
	4798	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4799	REGEX_ASSERT(cbInfo.numCalls == 4);
	4800	}
	4801
	4802
	4803	}
	4804
	4805
	4806	//
	4807	// FindProgressCallbacks() Test the find "progress" callback function.
	4808	// When set, the find progress callback will be invoked during a find operation
	4809	// after each return from a match attempt, giving the application the opportunity
	4810	// to terminate a long-running find operation before its normal completion.
	4811	//
	4812
		// Bookkeeping handed to testProgressCallBackFn() through the callback's context pointer.
	4813	struct progressCallBackContext {
	4814	RegexTest *test; // Test object; only referenced by the commented-out trace line in the callback.
	4815	int64_t lastIndex; // matchIndex value received by the most recent callback.
	4816	int32_t maxCalls; // The callback returns FALSE once numCalls reaches this value.
	4817	int32_t numCalls; // Callback invocations counted since the last reset().
	4818	void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; // Set a new call limit and zero the counters.
	4819	};
	4820
	4821	U_CDECL_BEGIN
		// Find-progress callback installed by RegexTest::FindProgressCallbacks().
		//   context    - the progressCallBackContext that was registered with this function.
		//   matchIndex - index value supplied by the caller; recorded in lastIndex.
		// Counts the call and returns FALSE on the call that brings numCalls up to maxCalls
		// (and on any later call); returns TRUE before that.
	4822	static UBool U_CALLCONV
	4823	testProgressCallBackFn(const void *context, int64_t matchIndex) {
		// The context arrives as an opaque pointer; it must be viewed as a pointer to
		// progressCallBackContext because the fields below are updated through "->".
	4824	progressCallBackContext *info = (progressCallBackContext *)context;
	4825	info->numCalls++;
	4826	info->lastIndex = matchIndex;
	4827	// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
	4828	return (info->numCalls < info->maxCalls);
	4829	}
	4830	U_CDECL_END
	4831
	4832	void RegexTest::FindProgressCallbacks() {
		// Exercises RegexMatcher::setFindProgressCallback()/getFindProgressCallback() and checks
		// how many times testProgressCallBackFn() runs for short, medium and long find() calls.
	4833	{
	4834	// Getter returns NULLs if no callback has been set
	4835
	4836	// The variables that the getter will fill in.
	4837	// Init to non-null values so that the action of the getter can be seen.
	4838	const void *returnedContext = &returnedContext;
	4839	URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
	4840
	4841	UErrorCode status = U_ZERO_ERROR;
	4842	RegexMatcher matcher("x", 0, status);
	4843	REGEX_CHECK_STATUS;
	4844	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
	4845	REGEX_CHECK_STATUS;
	4846	REGEX_ASSERT(returnedFn == NULL);
	4847	REGEX_ASSERT(returnedContext == NULL);
	4848	}
	4849
	4850	{
	4851	// Set and Get work
	4852	progressCallBackContext cbInfo = {this, 0, 0, 0};
	4853	const void *returnedContext;
	4854	URegexFindProgressCallback *returnedFn;
	4855	UErrorCode status = U_ZERO_ERROR;
	4856	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
	4857	REGEX_CHECK_STATUS;
	4858	matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
	4859	REGEX_CHECK_STATUS;
	4860	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
	4861	REGEX_CHECK_STATUS;
		// The getter must hand back exactly the function and context that were just set.
	4862	REGEX_ASSERT(returnedFn == testProgressCallBackFn);
	4863	REGEX_ASSERT(returnedContext == &cbInfo);
	4864
	4865	// A short-running match should NOT invoke the callback.
	4866	status = U_ZERO_ERROR;
	4867	cbInfo.reset(100);
	4868	UnicodeString s = "abxxx";
	4869	matcher.reset(s);
	4870	#if 0
	4871	matcher.setTrace(TRUE);
	4872	#endif
	4873	REGEX_ASSERT(matcher.find(0, status));
	4874	REGEX_CHECK_STATUS;
	4875	REGEX_ASSERT(cbInfo.numCalls == 0);
	4876
	4877	// A medium running match that causes matcher.find() to invoke our callback for each index.
	4878	status = U_ZERO_ERROR;
	4879	s = "aaaaaaaaaaaaaaaaaaab";
	4880	cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
	4881	matcher.reset(s);
	4882	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4883	REGEX_CHECK_STATUS;
	4884	REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
	4885
	4886	// A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
		// The call limit is set 5 below the input length; find() must return FALSE with a
		// non-failing status, and the callback must have run exactly that many times.
	4887	status = U_ZERO_ERROR;
	4888	UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
	4889	cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
	4890	matcher.reset(s1);
	4891	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4892	REGEX_CHECK_STATUS;
	4893	REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
	4894
		// The section below is compiled out (#if 0): resuming a find() after an interruption.
	4895	#if 0
	4896	// Now a match that will succeed, but after an interruption
	4897	status = U_ZERO_ERROR;
	4898	UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
	4899	cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
	4900	matcher.reset(s2);
	4901	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4902	REGEX_CHECK_STATUS;
	4903	// Now retry the match from where left off
	4904	cbInfo.maxCalls = 100; // No callback limit
	4905	REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
	4906	REGEX_CHECK_STATUS;
	4907	#endif
	4908	}
	4909
	4910
	4911	}
	4912
	4913
	4914	//---------------------------------------------------------------------------
	4915	//
	4916	// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
	4917	// UTexts. The pure-C implementation of UText
	4918	// has no mutable backing stores, but we can
	4919	// use UnicodeString here to test the functionality.
	4920	//
	4921	//---------------------------------------------------------------------------
	4922	void RegexTest::PreAllocatedUTextCAPI () {
	4923	UErrorCode status = U_ZERO_ERROR;
	4924	URegularExpression *re;
	4925	UText patternText = UTEXT_INITIALIZER;
	4926	UnicodeString buffer;
	4927	UText bufferText = UTEXT_INITIALIZER;
	4928
		// bufferText is a UText opened over the UnicodeString "buffer". Every section below
		// passes &bufferText as the destination and checks that the same pointer is returned.
	4929	utext_openUnicodeString(&bufferText, &buffer, &status);
	4930
	4931	/*
	4932	* getText() and getUText()
	4933	*/
	4934	{
	4935	UText text1 = UTEXT_INITIALIZER;
	4936	UText text2 = UTEXT_INITIALIZER;
	4937	UChar text2Chars[20];
	4938	UText *resultText;
	4939
	4940	status = U_ZERO_ERROR;
	4941	regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
	4942	regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
		// The copy limit is derived from the destination array text2Chars (UChar[20]),
		// not from the UText struct text2, whose size is unrelated to the array capacity.
	4943	u_uastrncpy(text2Chars, "abcccxd", sizeof(text2Chars)/2);
	4944	utext_openUChars(&text2, text2Chars, -1, &status);
	4945
	4946	regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
	4947	re = uregex_openUText(&patternText, 0, NULL, &status);
	4948
	4949	/* First set a UText */
	4950	uregex_setUText(re, &text1, &status);
	4951	resultText = uregex_getUText(re, &bufferText, &status);
	4952	REGEX_CHECK_STATUS;
	4953	REGEX_ASSERT(resultText == &bufferText);
	4954	utext_setNativeIndex(resultText, 0);
	4955	utext_setNativeIndex(&text1, 0);
	4956	REGEX_ASSERT(testUTextEqual(resultText, &text1));
	4957
		// Fetch the text a second time; the result must again match text1.
	4958	resultText = uregex_getUText(re, &bufferText, &status);
	4959	REGEX_CHECK_STATUS;
	4960	REGEX_ASSERT(resultText == &bufferText);
	4961	utext_setNativeIndex(resultText, 0);
	4962	utext_setNativeIndex(&text1, 0);
	4963	REGEX_ASSERT(testUTextEqual(resultText, &text1));
	4964
	4965	/* Then set a UChar * */
	4966	uregex_setText(re, text2Chars, 7, &status);
	4967	resultText = uregex_getUText(re, &bufferText, &status);
	4968	REGEX_CHECK_STATUS;
	4969	REGEX_ASSERT(resultText == &bufferText);
	4970	utext_setNativeIndex(resultText, 0);
	4971	utext_setNativeIndex(&text2, 0);
	4972	REGEX_ASSERT(testUTextEqual(resultText, &text2));
	4973
	4974	uregex_close(re);
	4975	utext_close(&text1);
	4976	utext_close(&text2);
	4977	}
	4978
	4979	/*
	4980	* group()
	4981	*/
	4982	{
	4983	UChar text1[80];
	4984	UText *actual;
	4985	UBool result;
	4986	u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
	4987
	4988	status = U_ZERO_ERROR;
	4989	re = uregex_openC("abc(.*?)def", 0, NULL, &status);
	4990	REGEX_CHECK_STATUS;
	4991
	4992	uregex_setText(re, text1, -1, &status);
	4993	result = uregex_find(re, 0, &status);
	4994	REGEX_ASSERT(result==TRUE);
	4995
	4996	/* Capture Group 0, the full match. Should succeed. */
	4997	status = U_ZERO_ERROR;
	4998	actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
	4999	REGEX_CHECK_STATUS;
	5000	REGEX_ASSERT(actual == &bufferText);
	5001	REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
	5002
	5003	/* Capture group #1. Should succeed. */
	5004	status = U_ZERO_ERROR;
	5005	actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
	5006	REGEX_CHECK_STATUS;
	5007	REGEX_ASSERT(actual == &bufferText);
	5008	REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
	5009
	5010	/* Capture group out of range. Error. */
		// The pattern has only one capture group, so group 2 must yield
		// U_INDEX_OUTOFBOUNDS_ERROR; the destination pointer is still expected back.
	5011	status = U_ZERO_ERROR;
	5012	actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
	5013	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	5014	REGEX_ASSERT(actual == &bufferText);
	5015
	5016	uregex_close(re);
	5017
	5018	}
	5019
	5020	/*
	5021	* replaceFirst()
	5022	*/
	5023	{
	5024	UChar text1[80];
	5025	UChar text2[80];
	5026	UText replText = UTEXT_INITIALIZER;
	5027	UText *result;
	5028
	5029	status = U_ZERO_ERROR;
	5030	u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
	5031	u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
	5032	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
	5033
	5034	re = uregex_openC("x(.*?)x", 0, NULL, &status);
	5035	REGEX_CHECK_STATUS;
	5036
	5037	/* Normal case, with match */
	5038	uregex_setText(re, text1, -1, &status);
		// Replace the whole current content of bufferText with nothing before it is reused.
	5039	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5040	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5041	REGEX_CHECK_STATUS;
	5042	REGEX_ASSERT(result == &bufferText);
	5043	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
	5044
	5045	/* No match. Text should copy to output with no changes. */
	5046	uregex_setText(re, text2, -1, &status);
	5047	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5048	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5049	REGEX_CHECK_STATUS;
	5050	REGEX_ASSERT(result == &bufferText);
	5051	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
	5052
	5053	/* Unicode escapes */
	5054	uregex_setText(re, text1, -1, &status);
	5055	regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
	5056	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5057	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5058	REGEX_CHECK_STATUS;
	5059	REGEX_ASSERT(result == &bufferText);
	5060	REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
	5061
	5062	uregex_close(re);
	5063	utext_close(&replText);
	5064	}
	5065
	5066
	5067	/*
	5068	* replaceAll()
	5069	*/
	5070	{
	5071	UChar text1[80];
	5072	UChar text2[80];
	5073	UText replText = UTEXT_INITIALIZER;
	5074	UText *result;
	5075
	5076	status = U_ZERO_ERROR;
	5077	u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
	5078	u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
	5079	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
	5080
	5081	re = uregex_openC("x(.*?)x", 0, NULL, &status);
	5082	REGEX_CHECK_STATUS;
	5083
	5084	/* Normal case, with match */
	5085	uregex_setText(re, text1, -1, &status);
	5086	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5087	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
	5088	REGEX_CHECK_STATUS;
	5089	REGEX_ASSERT(result == &bufferText);
	5090	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
	5091
	5092	/* No match. Text should copy to output with no changes. */
	5093	uregex_setText(re, text2, -1, &status);
	5094	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5095	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
	5096	REGEX_CHECK_STATUS;
	5097	REGEX_ASSERT(result == &bufferText);
	5098	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
	5099
	5100	uregex_close(re);
	5101	utext_close(&replText);
	5102	}
	5103
	5104
	5105	/*
	5106	* splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
	5107	* so we don't need to test it here.
	5108	*/
	5109
	5110	utext_close(&bufferText);
	5111	utext_close(&patternText);
	5112	}
	5113
	5114	//--------------------------------------------------------------
	5115	//
	5116	// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
	5117	//
	5118	//---------------------------------------------------------------
	5119	void RegexTest::Bug7651() {
		// Both patterns are alternations joined with '|'; each is compiled and run against the
		// same input, and the first match is expected to start at index 0.
	5120	UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
	5121	// The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
	5122	// It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
	5123	UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
	5124	UnicodeString s("#ff @abcd This is test");
	5125	RegexPattern *REPattern = NULL;
	5126	RegexMatcher *REMatcher = NULL;
	5127	UErrorCode status = U_ZERO_ERROR;
	5128	UParseError pe;
	5129
		// First pattern.
	5130	REPattern = RegexPattern::compile(pattern1, 0, pe, status);
	5131	REGEX_CHECK_STATUS;
	5132	REMatcher = REPattern->matcher(s, status);
	5133	REGEX_CHECK_STATUS;
	5134	REGEX_ASSERT(REMatcher->find());
	5135	REGEX_ASSERT(REMatcher->start(status) == 0);
	5136	delete REPattern;
	5137	delete REMatcher;
	5138	status = U_ZERO_ERROR;
	5139
		// Second pattern, the one described above as exceeding the default operator stack depth.
	5140	REPattern = RegexPattern::compile(pattern2, 0, pe, status);
	5141	REGEX_CHECK_STATUS;
	5142	REMatcher = REPattern->matcher(s, status);
	5143	REGEX_CHECK_STATUS;
	5144	REGEX_ASSERT(REMatcher->find());
	5145	REGEX_ASSERT(REMatcher->start(status) == 0);
	5146	delete REPattern;
	5147	delete REMatcher;
	5148	status = U_ZERO_ERROR;
	5149	}
	5150
	5151	void RegexTest::Bug7740() {
		// Bug 7740: calling group() with an error code that is already set to a failure
		// crashed. The call must leave the status untouched and return an empty string.
	5152	UErrorCode status = U_ZERO_ERROR;
	5153	UnicodeString pattern = "(a)";
	5154	UnicodeString text = "abcdef";
	5155	RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
	5156	REGEX_CHECK_STATUS;
	5157	REGEX_ASSERT(m->lookingAt(status));
	5158	REGEX_CHECK_STATUS;
		// Deliberately enter group() with a failing status.
	5159	status = U_ILLEGAL_ARGUMENT_ERROR;
	5160	UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
	5161	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	5162	REGEX_ASSERT(s == "");
	5163	delete m;
	5164	}
	5165
	5166	// Bug 8479: was crashing with a Bogus UnicodeString as input.
	5167
	5168	void RegexTest::Bug8479() {
	5169	UErrorCode status = U_ZERO_ERROR;
	5170
		// The two match flags are combined with bitwise OR.
	5171	RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
	5172	REGEX_CHECK_STATUS;
		// NOTE(review): pMatcher is deleted only inside the success branch below.
	5173	if (U_SUCCESS(status))
	5174	{
	5175	UnicodeString str;
	5176	str.setToBogus();
	5177	pMatcher->reset(str);
	5178	status = U_ZERO_ERROR;
		// Matching against the bogus string must report U_ILLEGAL_ARGUMENT_ERROR.
	5179	pMatcher->matches(status);
	5180	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	5181	delete pMatcher;
	5182	}
	5183	}
	5184
	5185
	5186	// Bug 7029
	5187	void RegexTest::Bug7029() {
		// Splits "abc.def" with the pattern "." into at most 10 fields and expects
		// split() to report 8 fields.
	5188	UErrorCode status = U_ZERO_ERROR;
	5189
	5190	RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
	5191	UnicodeString text = "abc.def";
	5192	UnicodeString splits[10];
	5193	REGEX_CHECK_STATUS;
	5194	int32_t numFields = pMatcher->split(text, splits, 10, status);
	5195	REGEX_CHECK_STATUS;
	5196	REGEX_ASSERT(numFields == 8);
	5197	delete pMatcher;
	5198	}
	5199
	5200	// Bug 9283
	5201	// This test is checking for the existence of any supplemental characters that case-fold
	5202	// to a bmp character.
	5203	//
	5204	// At the time of this writing there are none. If any should appear in a subsequent release
	5205	// of Unicode, the code in regular expressions compilation that determines the longest
	5206	// possible match for a literal string will need to be enhanced.
	5207	//
	5208	// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
	5209	// for details on what to do in case of a failure of this test.
	5210	//
	5211	void RegexTest::Bug9283() {
	5212	#if !UCONFIG_NO_NORMALIZATION
	5213	UErrorCode status = U_ZERO_ERROR;
		// Set of code points in U+10000..U+10FFFF that have the CWCF property.
	5214	UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
	5215	REGEX_CHECK_STATUS;
	5216	int32_t index;
	5217	UChar32 c;
		// Walk the set by index; the loop ends when charAt() yields -1.
	5218	for (index=0; ; index++) {
	5219	c = supplementalsWithCaseFolding.charAt(index);
	5220	if (c == -1) {
	5221	break;
	5222	}
		// The case-folded form must be at least two UTF-16 code units long.
	5223	UnicodeString cf = UnicodeString(c).foldCase();
	5224	REGEX_ASSERT(cf.length() >= 2);
	5225	}
	5226	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	5227	}
	5228
	5229
	5230	void RegexTest::CheckInvBufSize() {
		// Reports an error when inv_next has reached or passed INV_BUFSIZ; otherwise logs
		// the current usage. Both names are defined outside this excerpt.
	5231	if(inv_next>=INV_BUFSIZ) {
	5232	errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
	5233	__FILE__, INV_BUFSIZ, inv_next);
	5234	} else {
	5235	logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
	5236	}
	5237	}
	5238
	5239
	5240	void RegexTest::Bug10459() {
		// Regression test: uregex_group() called on a UText-based regular expression before
		// any match has been attempted must fail cleanly instead of crashing.
	5241	UErrorCode status = U_ZERO_ERROR;
	5242	UnicodeString patternString("(txt)");
	5243	UnicodeString txtString("txt");
	5244
	5245	UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
	5246	REGEX_CHECK_STATUS;
	5247	UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
	5248	REGEX_CHECK_STATUS;
	5249
	5250	URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
	5251	REGEX_CHECK_STATUS;
	5252
	5253	uregex_setUText(icu_re, utext_txt, &status);
	5254	REGEX_CHECK_STATUS;
	5255
	5256	// The bug was that calling uregex_group() before doing a matching operation
	5257	// was causing a segfault. Only for Regular Expressions created from UText.
	5258	// It should set an U_REGEX_INVALID_STATE.
	5259
	5260	UChar buf[100];
	5261	int32_t len = uregex_group(icu_re, 0, buf, LENGTHOF(buf), &status);
	5262	REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
	5263	REGEX_ASSERT(len == 0);
	5264
		// Release the regular expression and both UTexts.
	5265	uregex_close(icu_re);
	5266	utext_close(utext_pat);
	5267	utext_close(utext_txt);
	5268	}
	5269
	5270	#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
	5271