git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/regextst.cpp

... / ...

Commit	Line	Data
	1	/********************************************************************
	2	* COPYRIGHT:
	3	* Copyright (c) 2002-2015, International Business Machines Corporation and
	4	* others. All Rights Reserved.
	5	********************************************************************/
	6
	7	//
	8	// regextst.cpp
	9	//
	10	// ICU Regular Expressions test, part of intltest.
	11	//
	12
	13	/*
	14	NOTE!!
	15
	16	PLEASE be careful about ASCII assumptions in this test.
	17	This test is one of the worst repeat offenders.
	18	If you have questions, contact someone on the ICU PMC
	19	who has access to an EBCDIC system.
	20
	21	*/
	22
	23	#include "intltest.h"
	24	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
	25
	26	#include "unicode/localpointer.h"
	27	#include "unicode/regex.h"
	28	#include "unicode/uchar.h"
	29	#include "unicode/ucnv.h"
	30	#include "unicode/uniset.h"
	31	#include "unicode/uregex.h"
	32	#include "unicode/usetiter.h"
	33	#include "unicode/ustring.h"
	34	#include "regextst.h"
	35	#include "regexcmp.h"
	36	#include "uvector.h"
	37	#include "util.h"
	38	#include <stdlib.h>
	39	#include <string.h>
	40	#include <stdio.h>
	41	#include "cmemory.h"
	42	#include "cstring.h"
	43	#include "uinvchar.h"
	44
	45	#define SUPPORT_MUTATING_INPUT_STRING 0
	46
	47	//---------------------------------------------------------------------------
	48	//
	49	// Test class boilerplate
	50	//
	51	//---------------------------------------------------------------------------
	52	RegexTest::RegexTest()
	53	{
	54	}
	55
	56
	57	RegexTest::~RegexTest()
	58	{
	59	}
	60
	61
	62
	63	void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /par/ )
	64	{
	65	if (exec) logln("TestSuite RegexTest: ");
	66	switch (index) {
	67
	68	case 0: name = "Basic";
	69	if (exec) Basic();
	70	break;
	71	case 1: name = "API_Match";
	72	if (exec) API_Match();
	73	break;
	74	case 2: name = "API_Replace";
	75	if (exec) API_Replace();
	76	break;
	77	case 3: name = "API_Pattern";
	78	if (exec) API_Pattern();
	79	break;
	80	case 4:
	81	#if !UCONFIG_NO_FILE_IO
	82	name = "Extended";
	83	if (exec) Extended();
	84	#else
	85	name = "skip";
	86	#endif
	87	break;
	88	case 5: name = "Errors";
	89	if (exec) Errors();
	90	break;
	91	case 6: name = "PerlTests";
	92	if (exec) PerlTests();
	93	break;
	94	case 7: name = "Callbacks";
	95	if (exec) Callbacks();
	96	break;
	97	case 8: name = "FindProgressCallbacks";
	98	if (exec) FindProgressCallbacks();
	99	break;
	100	case 9: name = "Bug 6149";
	101	if (exec) Bug6149();
	102	break;
	103	case 10: name = "UTextBasic";
	104	if (exec) UTextBasic();
	105	break;
	106	case 11: name = "API_Match_UTF8";
	107	if (exec) API_Match_UTF8();
	108	break;
	109	case 12: name = "API_Replace_UTF8";
	110	if (exec) API_Replace_UTF8();
	111	break;
	112	case 13: name = "API_Pattern_UTF8";
	113	if (exec) API_Pattern_UTF8();
	114	break;
	115	case 14: name = "PerlTestsUTF8";
	116	if (exec) PerlTestsUTF8();
	117	break;
	118	case 15: name = "PreAllocatedUTextCAPI";
	119	if (exec) PreAllocatedUTextCAPI();
	120	break;
	121	case 16: name = "Bug 7651";
	122	if (exec) Bug7651();
	123	break;
	124	case 17: name = "Bug 7740";
	125	if (exec) Bug7740();
	126	break;
	127	case 18: name = "Bug 8479";
	128	if (exec) Bug8479();
	129	break;
	130	case 19: name = "Bug 7029";
	131	if (exec) Bug7029();
	132	break;
	133	case 20: name = "CheckInvBufSize";
	134	if (exec) CheckInvBufSize();
	135	break;
	136	case 21: name = "Bug 9283";
	137	if (exec) Bug9283();
	138	break;
	139	case 22: name = "Bug10459";
	140	if (exec) Bug10459();
	141	break;
	142	case 23: name = "TestCaseInsensitiveStarters";
	143	if (exec) TestCaseInsensitiveStarters();
	144	break;
	145	case 24: name = "TestBug11049";
	146	if (exec) TestBug11049();
	147	break;
	148	case 25: name = "TestBug11371";
	149	if (exec) TestBug11371();
	150	break;
	151	case 26: name = "TestBug11480";
	152	if (exec) TestBug11480();
	153	break;
	154	case 27: name = "NamedCapture";
	155	if (exec) NamedCapture();
	156	break;
	157	case 28: name = "NamedCaptureLimits";
	158	if (exec) NamedCaptureLimits();
	159	break;
	160	default: name = "";
	161	break; //needed to end loop
	162	}
	163	}
	164
	165
	166
	167	/**
	168	* Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
	169	* into ASCII.
	170	* @see utext_openUTF8
	171	*/
	172	static UText* regextst_openUTF8FromInvariant(UText* ut, const char inv, int64_t length, UErrorCode status);
	173
	174	//---------------------------------------------------------------------------
	175	//
	176	// Error Checking / Reporting macros used in all of the tests.
	177	//
	178	//---------------------------------------------------------------------------
	179
	180	static void utextToPrintable(char buf, int32_t bufLen, UText text) {
	181	int64_t oldIndex = utext_getNativeIndex(text);
	182	utext_setNativeIndex(text, 0);
	183	char *bufPtr = buf;
	184	UChar32 c = utext_next32From(text, 0);
	185	while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
	186	if (0x000020<=c && c<0x00007e) {
	187	*bufPtr = c;
	188	} else {
	189	#if 0
	190	sprintf(bufPtr,"U+%04X", c);
	191	bufPtr+= strlen(bufPtr)-1;
	192	#else
	193	*bufPtr = '%';
	194	#endif
	195	}
	196	bufPtr++;
	197	c = UTEXT_NEXT32(text);
	198	}
	199	*bufPtr = 0;
	200	#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
	201	char ebuf = (char)malloc(bufLen);
	202	uprv_eastrncpy((unsigned char)ebuf, (const unsigned char)buf, bufLen);
	203	uprv_strncpy(buf, ebuf, bufLen);
	204	free((void*)ebuf);
	205	#endif
	206	utext_setNativeIndex(text, oldIndex);
	207	}
	208
	209
	210	static char ASSERT_BUF[1024];
	211
	212	const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
	213	if(message.length()==0) {
	214	strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
	215	} else {
	216	UnicodeString buf;
	217	IntlTest::prettify(message,buf);
	218	if(buf.length()==0) {
	219	strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
	220	} else {
	221	buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
	222	if(ASSERT_BUF[0]==0) {
	223	ASSERT_BUF[0]=0;
	224	for(int32_t i=0;i<buf.length();i++) {
	225	UChar ch = buf[i];
	226	sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
	227	}
	228	}
	229	}
	230	}
	231	ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
	232	return ASSERT_BUF;
	233	}
	234
	// Logs a printable rendering of a UText together with the call site.
	#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

	// Reports a failure and RETURNS from the enclosing function if the local
	// variable 'status' holds an error.
	#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
	    __FILE__, __LINE__, u_errorName(status)); return;}}

	// Reports a failure if expr is false; execution continues.
	#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

	// Evaluates expr with a fresh local 'status' and reports a failure unless it
	// ends up equal to errcode.
	#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
	    if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
	    __LINE__, u_errorName(errcode), u_errorName(status));};}

	// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and does
	// not return. The status is printed by name, as in the other macros.
	#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
	    "RegexTest failure at line %d, from %d. status=%s\n",__LINE__, (line), u_errorName(status)); }}

	// Like REGEX_ASSERT, but reports the caller-supplied line and RETURNS on failure.
	#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
	    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

	// expected: const char * , restricted to invariant characters.
	// actual: const UnicodeString &
	#define REGEX_ASSERT_UNISTR(expected, actual) { \
	    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
	        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
	            __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
	258
	259
	260	static UBool testUTextEqual(UText uta, UText utb) {
	261	UChar32 ca = 0;
	262	UChar32 cb = 0;
	263	utext_setNativeIndex(uta, 0);
	264	utext_setNativeIndex(utb, 0);
	265	do {
	266	ca = utext_next32(uta);
	267	cb = utext_next32(utb);
	268	if (ca != cb) {
	269	break;
	270	}
	271	} while (ca != U_SENTINEL);
	272	return ca == cb;
	273	}
	274
	275
	276	/**
	277	* @param expected expected text in UTF-8 (not platform) codepage
	278	*/
	279	void RegexTest::assertUText(const char expected, UText actual, const char *file, int line) {
	280	UErrorCode status = U_ZERO_ERROR;
	281	UText expectedText = UTEXT_INITIALIZER;
	282	utext_openUTF8(&expectedText, expected, -1, &status);
	283	if(U_FAILURE(status)) {
	284	errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
	285	return;
	286	}
	287	if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
	288	errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
	289	return;
	290	}
	291	utext_setNativeIndex(actual, 0);
	292	if (!testUTextEqual(&expectedText, actual)) {
	293	char buf[201 /21/];
	294	char expectedBuf[201];
	295	utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
	296	utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
	297	errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
	298	}
	299	utext_close(&expectedText);
	300	}
	301	/**
	302	* @param expected invariant (platform local text) input
	303	*/
	304
	305	void RegexTest::assertUTextInvariant(const char expected, UText actual, const char *file, int line) {
	306	UErrorCode status = U_ZERO_ERROR;
	307	UText expectedText = UTEXT_INITIALIZER;
	308	regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
	309	if(U_FAILURE(status)) {
	310	errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
	311	return;
	312	}
	313	utext_setNativeIndex(actual, 0);
	314	if (!testUTextEqual(&expectedText, actual)) {
	315	char buf[201 /21/];
	316	char expectedBuf[201];
	317	utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
	318	utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
	319	errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
	320	}
	321	utext_close(&expectedText);
	322	}
	323
	324	/**
	325	* Assumes utf-8 input
	326	*/
	327	#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
	328	/**
	329	* Assumes Invariant input
	330	*/
	331	#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
	332
	333	/**
	334	* This buffer ( inv_buf ) is used to hold the UTF-8 strings
	335	* passed into utext_openUTF8. An error will be given if
	336	* INV_BUFSIZ is too small. It's only used on EBCDIC systems.
	337	*/
	338
	339	#define INV_BUFSIZ 2048 /* increase this if too small */
	340
	341	static int64_t inv_next=0;
	342
	343	#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
	344	static char inv_buf[INV_BUFSIZ];
	345	#endif
	346
	347	static UText* regextst_openUTF8FromInvariant(UText ut, const char inv, int64_t length, UErrorCode *status) {
	348	if(length==-1) length=strlen(inv);
	349	#if U_CHARSET_FAMILY==U_ASCII_FAMILY
	350	inv_next+=length;
	351	return utext_openUTF8(ut, inv, length, status);
	352	#else
	353	if(inv_next+length+1>INV_BUFSIZ) {
	354	fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
	355	__FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
	356	*status = U_MEMORY_ALLOCATION_ERROR;
	357	return NULL;
	358	}
	359
	360	unsigned char buf = (unsigned char)inv_buf+inv_next;
	361	uprv_aestrncpy(buf, (const uint8_t*)inv, length);
	362	inv_next+=length;
	363
	364	#if 0
	365	fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
	366	#endif
	367
	368	return utext_openUTF8(ut, (const char*)buf, length, status);
	369	#endif
	370	}
	371
	372
	373	//---------------------------------------------------------------------------
	374	//
	375	// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
	376	// for the LookingAt() and Match() functions.
	377	//
	378	// usage:
	379	// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
	380	//
	381	// The expected results are UBool - TRUE or FALSE.
	382	// The input text is unescaped. The pattern is not.
	383	//
	384	//
	385	//---------------------------------------------------------------------------
	386
	387	#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
	388
	389	UBool RegexTest::doRegexLMTest(const char pat, const char text, UBool looking, UBool match, int32_t line) {
	390	const UnicodeString pattern(pat, -1, US_INV);
	391	const UnicodeString inputText(text, -1, US_INV);
	392	UErrorCode status = U_ZERO_ERROR;
	393	UParseError pe;
	394	RegexPattern *REPattern = NULL;
	395	RegexMatcher *REMatcher = NULL;
	396	UBool retVal = TRUE;
	397
	398	UnicodeString patString(pat, -1, US_INV);
	399	REPattern = RegexPattern::compile(patString, 0, pe, status);
	400	if (U_FAILURE(status)) {
	401	dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
	402	line, u_errorName(status));
	403	return FALSE;
	404	}
	405	if (line==376) { REPattern->dumpPattern();}
	406
	407	UnicodeString inputString(inputText);
	408	UnicodeString unEscapedInput = inputString.unescape();
	409	REMatcher = REPattern->matcher(unEscapedInput, status);
	410	if (U_FAILURE(status)) {
	411	errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
	412	line, u_errorName(status));
	413	return FALSE;
	414	}
	415
	416	UBool actualmatch;
	417	actualmatch = REMatcher->lookingAt(status);
	418	if (U_FAILURE(status)) {
	419	errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
	420	line, u_errorName(status));
	421	retVal = FALSE;
	422	}
	423	if (actualmatch != looking) {
	424	errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
	425	retVal = FALSE;
	426	}
	427
	428	status = U_ZERO_ERROR;
	429	actualmatch = REMatcher->matches(status);
	430	if (U_FAILURE(status)) {
	431	errln("RegexTest failure in matches() at line %d. Status = %s\n",
	432	line, u_errorName(status));
	433	retVal = FALSE;
	434	}
	435	if (actualmatch != match) {
	436	errln("RegexTest: wrong return from matches() at line %d.\n", line);
	437	retVal = FALSE;
	438	}
	439
	440	if (retVal == FALSE) {
	441	REPattern->dumpPattern();
	442	}
	443
	444	delete REPattern;
	445	delete REMatcher;
	446	return retVal;
	447	}
	448
	449
	450	UBool RegexTest::doRegexLMTestUTF8(const char pat, const char text, UBool looking, UBool match, int32_t line) {
	451	UText pattern = UTEXT_INITIALIZER;
	452	int32_t inputUTF8Length;
	453	char *textChars = NULL;
	454	UText inputText = UTEXT_INITIALIZER;
	455	UErrorCode status = U_ZERO_ERROR;
	456	UParseError pe;
	457	RegexPattern *REPattern = NULL;
	458	RegexMatcher *REMatcher = NULL;
	459	UBool retVal = TRUE;
	460
	461	regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
	462	REPattern = RegexPattern::compile(&pattern, 0, pe, status);
	463	if (U_FAILURE(status)) {
	464	dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
	465	line, u_errorName(status));
	466	return FALSE;
	467	}
	468
	469	UnicodeString inputString(text, -1, US_INV);
	470	UnicodeString unEscapedInput = inputString.unescape();
	471	LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
	472	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	473
	474	inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
	475	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
	476	// UTF-8 does not allow unpaired surrogates, so this could actually happen
	477	logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
	478	return TRUE; // not a failure of the Regex engine
	479	}
	480	status = U_ZERO_ERROR; // buffer overflow
	481	textChars = new char[inputUTF8Length+1];
	482	unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
	483	utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
	484
	485	REMatcher = &REPattern->matcher(status)->reset(&inputText);
	486	if (U_FAILURE(status)) {
	487	errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
	488	line, u_errorName(status));
	489	return FALSE;
	490	}
	491
	492	UBool actualmatch;
	493	actualmatch = REMatcher->lookingAt(status);
	494	if (U_FAILURE(status)) {
	495	errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
	496	line, u_errorName(status));
	497	retVal = FALSE;
	498	}
	499	if (actualmatch != looking) {
	500	errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
	501	retVal = FALSE;
	502	}
	503
	504	status = U_ZERO_ERROR;
	505	actualmatch = REMatcher->matches(status);
	506	if (U_FAILURE(status)) {
	507	errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
	508	line, u_errorName(status));
	509	retVal = FALSE;
	510	}
	511	if (actualmatch != match) {
	512	errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
	513	retVal = FALSE;
	514	}
	515
	516	if (retVal == FALSE) {
	517	REPattern->dumpPattern();
	518	}
	519
	520	delete REPattern;
	521	delete REMatcher;
	522	utext_close(&inputText);
	523	utext_close(&pattern);
	524	delete[] textChars;
	525	return retVal;
	526	}
	527
	528
	529
	530	//---------------------------------------------------------------------------
	531	//
	532	// REGEX_ERR Macro + invocation function to simplify writing tests
	533	// regex tests for incorrect patterns
	534	//
	535	// usage:
	536	// REGEX_ERR("pattern", expected error line, column, expected status);
	537	//
	538	//---------------------------------------------------------------------------
	539	#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
	540
	541	void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
	542	UErrorCode expectedStatus, int32_t line) {
	543	UnicodeString pattern(pat);
	544
	545	UErrorCode status = U_ZERO_ERROR;
	546	UParseError pe;
	547	RegexPattern *callerPattern = NULL;
	548
	549	//
	550	// Compile the caller's pattern
	551	//
	552	UnicodeString patString(pat);
	553	callerPattern = RegexPattern::compile(patString, 0, pe, status);
	554	if (status != expectedStatus) {
	555	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
	556	} else {
	557	if (status != U_ZERO_ERROR) {
	558	if (pe.line != errLine \|\| pe.offset != errCol) {
	559	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
	560	line, errLine, errCol, pe.line, pe.offset);
	561	}
	562	}
	563	}
	564
	565	delete callerPattern;
	566
	567	//
	568	// Compile again, using a UTF-8-based UText
	569	//
	570	UText patternText = UTEXT_INITIALIZER;
	571	regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
	572	callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
	573	if (status != expectedStatus) {
	574	dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
	575	} else {
	576	if (status != U_ZERO_ERROR) {
	577	if (pe.line != errLine \|\| pe.offset != errCol) {
	578	errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
	579	line, errLine, errCol, pe.line, pe.offset);
	580	}
	581	}
	582	}
	583
	584	delete callerPattern;
	585	utext_close(&patternText);
	586	}
	587
	588
	589
	590	//---------------------------------------------------------------------------
	591	//
	592	// Basic Check for basic functionality of regex pattern matching.
	593	// Avoid the use of REGEX_FIND test macro, which has
	594	// substantial dependencies on basic Regex functionality.
	595	//
	596	//---------------------------------------------------------------------------
	597	void RegexTest::Basic() {
	598
	599
	600	//
	601	// Debug - slide failing test cases early
	602	//
	603	#if 0
	604	{
	605	// REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
	606	UParseError pe;
	607	UErrorCode status = U_ZERO_ERROR;
	608	RegexPattern *pattern;
	609	pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
	610	pattern->dumpPattern();
	611	RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
	612	UBool result = m->find();
	613	printf("result = %d\n", result);
	614	// REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
	615	// REGEX_FIND("(X([abc=X]+)+X)\|(y[abc=]+)", "=XX====================");
	616	}
	617	exit(1);
	618	#endif
	619
	620
	621	//
	622	// Pattern with parentheses
	623	//
	624	REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
	625	REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
	626	REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
	627
	628	//
	629	// Patterns with *
	630	//
	631	REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
	632	REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
	633	REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
	634	REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
	635	REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
	636
	637	REGEX_TESTLM("a*", "", TRUE, TRUE);
	638	REGEX_TESTLM("a*", "b", TRUE, FALSE);
	639
	640
	641	//
	642	// Patterns with "."
	643	//
	644	REGEX_TESTLM(".", "abc", TRUE, FALSE);
	645	REGEX_TESTLM("...", "abc", TRUE, TRUE);
	646	REGEX_TESTLM("....", "abc", FALSE, FALSE);
	647	REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
	648	REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
	649	REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
	650	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
	651	REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
	652
	653	//
	654	// Patterns with * applied to chars at end of literal string
	655	//
	656	REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
	657	REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
	658
	659	//
	660	// Supplemental chars match as single chars, not a pair of surrogates.
	661	//
	662	REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
	663	REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
	664	REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
	665
	666
	667	//
	668	// UnicodeSets in the pattern
	669	//
	670	REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
	671	REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
	672	REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
	673	REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
	674	REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
	675	REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
	676
	677	REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
	678	REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
	679	REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
	680	REGEX_TESTLM("[\\p{Nd}]", "a123456", TRUE, FALSE); // note that matches 0 occurences.
	681	REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
	682
	683	//
	684	// OR operator in patterns
	685	//
	686	REGEX_TESTLM("(a\|b)", "a", TRUE, TRUE);
	687	REGEX_TESTLM("(a\|b)", "b", TRUE, TRUE);
	688	REGEX_TESTLM("(a\|b)", "c", FALSE, FALSE);
	689	REGEX_TESTLM("a\|b", "b", TRUE, TRUE);
	690
	691	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabc", TRUE, TRUE);
	692	REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabdc", TRUE, FALSE);
	693	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "ac", TRUE, TRUE);
	694	REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "123", TRUE, TRUE);
	695	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "123", TRUE, TRUE);
	696	REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "222211111czzzzw", TRUE, FALSE);
	697
	698	//
	699	// +
	700	//
	701	REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
	702	REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
	703	REGEX_TESTLM("b+", "", FALSE, FALSE);
	704	REGEX_TESTLM("(abc\|def)+", "defabc", TRUE, TRUE);
	705	REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
	706	REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
	707
	708	//
	709	// ?
	710	//
	711	REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
	712	REGEX_TESTLM("ab?", "a", TRUE, TRUE);
	713	REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
	714	REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
	715	REGEX_TESTLM("a(b\|c)?d", "abd", TRUE, TRUE);
	716	REGEX_TESTLM("a(b\|c)?d", "acd", TRUE, TRUE);
	717	REGEX_TESTLM("a(b\|c)?d", "ad", TRUE, TRUE);
	718	REGEX_TESTLM("a(b\|c)?d", "abcd", FALSE, FALSE);
	719	REGEX_TESTLM("a(b\|c)?d", "ab", FALSE, FALSE);
	720
	721	//
	722	// Escape sequences that become single literal chars, handled internally
	723	// by ICU's Unescape.
	724	//
	725
	726	// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
	727	REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
	728	REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
	729	REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
	730	REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
	731	REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
	732	REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
	733	REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
	734	REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
	735	REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
	736
	737	REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
	738	REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
	739
	740	// Escape of special chars in patterns
	741	REGEX_TESTLM("\\\\\\\|\$\$\\[\\{\\~\\$\\\\+\\?\\.", "\\\\\|()[{~$+?.", TRUE, TRUE);
	742	}
	743
	744
	745	//---------------------------------------------------------------------------
	746	//
	747	// UTextBasic Check for quirks that are specific to the UText
	748	// implementation.
	749	//
	750	//---------------------------------------------------------------------------
	751	void RegexTest::UTextBasic() {
	752	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	753	UErrorCode status = U_ZERO_ERROR;
	754	UText pattern = UTEXT_INITIALIZER;
	755	utext_openUTF8(&pattern, str_abc, -1, &status);
	756	RegexMatcher matcher(&pattern, 0, status);
	757	REGEX_CHECK_STATUS;
	758
	759	UText input = UTEXT_INITIALIZER;
	760	utext_openUTF8(&input, str_abc, -1, &status);
	761	REGEX_CHECK_STATUS;
	762	matcher.reset(&input);
	763	REGEX_CHECK_STATUS;
	764	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
	765
	766	matcher.reset(matcher.inputText());
	767	REGEX_CHECK_STATUS;
	768	REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
	769
	770	utext_close(&pattern);
	771	utext_close(&input);
	772	}
	773
	774
	775	//---------------------------------------------------------------------------
	776	//
	777	// API_Match Test that the API for class RegexMatcher
	778	// is present and nominally working, but excluding functions
	779	// implementing replace operations.
	780	//
	781	//---------------------------------------------------------------------------
	782	void RegexTest::API_Match() {
	783	UParseError pe;
	784	UErrorCode status=U_ZERO_ERROR;
	785	int32_t flags = 0;
	786
	787	//
	788	// Debug - slide failing test cases early
	789	//
	790	#if 0
	791	{
	792	}
	793	return;
	794	#endif
	795
	796	//
	797	// Simple pattern compilation
	798	//
	799	{
	800	UnicodeString re("abc");
	801	RegexPattern *pat2;
	802	pat2 = RegexPattern::compile(re, flags, pe, status);
	803	REGEX_CHECK_STATUS;
	804
	805	UnicodeString inStr1 = "abcdef this is a test";
	806	UnicodeString instr2 = "not abc";
	807	UnicodeString empty = "";
	808
	809
	810	//
	811	// Matcher creation and reset.
	812	//
	813	RegexMatcher *m1 = pat2->matcher(inStr1, status);
	814	REGEX_CHECK_STATUS;
	815	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	816	REGEX_ASSERT(m1->input() == inStr1);
	817	m1->reset(instr2);
	818	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	819	REGEX_ASSERT(m1->input() == instr2);
	820	m1->reset(inStr1);
	821	REGEX_ASSERT(m1->input() == inStr1);
	822	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	823	m1->reset(empty);
	824	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	825	REGEX_ASSERT(m1->input() == empty);
	826	REGEX_ASSERT(&m1->pattern() == pat2);
	827
	828	//
	829	// reset(pos, status)
	830	//
	831	m1->reset(inStr1);
	832	m1->reset(4, status);
	833	REGEX_CHECK_STATUS;
	834	REGEX_ASSERT(m1->input() == inStr1);
	835	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	836
	837	m1->reset(-1, status);
	838	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	839	status = U_ZERO_ERROR;
	840
	841	m1->reset(0, status);
	842	REGEX_CHECK_STATUS;
	843	status = U_ZERO_ERROR;
	844
	845	int32_t len = m1->input().length();
	846	m1->reset(len-1, status);
	847	REGEX_CHECK_STATUS;
	848	status = U_ZERO_ERROR;
	849
	850	m1->reset(len, status);
	851	REGEX_CHECK_STATUS;
	852	status = U_ZERO_ERROR;
	853
	854	m1->reset(len+1, status);
	855	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	856	status = U_ZERO_ERROR;
	857
	858	//
	859	// match(pos, status)
	860	//
	861	m1->reset(instr2);
	862	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	863	m1->reset();
	864	REGEX_ASSERT(m1->matches(3, status) == FALSE);
	865	m1->reset();
	866	REGEX_ASSERT(m1->matches(5, status) == FALSE);
	867	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	868	REGEX_ASSERT(m1->matches(-1, status) == FALSE);
	869	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	870
	871	// Match() at end of string should fail, but should not
	872	// be an error.
	873	status = U_ZERO_ERROR;
	874	len = m1->input().length();
	875	REGEX_ASSERT(m1->matches(len, status) == FALSE);
	876	REGEX_CHECK_STATUS;
	877
	878	// Match beyond end of string should fail with an error.
	879	status = U_ZERO_ERROR;
	880	REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
	881	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	882
	883	// Successful match at end of string.
	884	{
	885	status = U_ZERO_ERROR;
	886	RegexMatcher m("A?", 0, status); // will match zero length string.
	887	REGEX_CHECK_STATUS;
	888	m.reset(inStr1);
	889	len = inStr1.length();
	890	REGEX_ASSERT(m.matches(len, status) == TRUE);
	891	REGEX_CHECK_STATUS;
	892	m.reset(empty);
	893	REGEX_ASSERT(m.matches(0, status) == TRUE);
	894	REGEX_CHECK_STATUS;
	895	}
	896
	897
	898	//
	899	// lookingAt(pos, status)
	900	//
	901	status = U_ZERO_ERROR;
	902	m1->reset(instr2); // "not abc"
	903	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	904	REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
	905	REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
	906	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	907	REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
	908	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	909	status = U_ZERO_ERROR;
	910	len = m1->input().length();
	911	REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
	912	REGEX_CHECK_STATUS;
	913	REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
	914	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	915
	916	delete m1;
	917	delete pat2;
	918	}
	919
	920
	921	//
	922	// Capture Group.
	923	// RegexMatcher::start();
	924	// RegexMatcher::end();
	925	// RegexMatcher::groupCount();
	926	//
	927	{
	928	int32_t flags=0;
	929	UParseError pe;
	930	UErrorCode status=U_ZERO_ERROR;
	931
	932	UnicodeString re("01(23(45)67)(.*)");
	933	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	934	REGEX_CHECK_STATUS;
	935	UnicodeString data = "0123456789";
	936
	937	RegexMatcher *matcher = pat->matcher(data, status);
	938	REGEX_CHECK_STATUS;
	939	REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
	940	static const int32_t matchStarts[] = {0, 2, 4, 8};
	941	static const int32_t matchEnds[] = {10, 8, 6, 10};
	942	int32_t i;
	943	for (i=0; i<4; i++) {
	944	int32_t actualStart = matcher->start(i, status);
	945	REGEX_CHECK_STATUS;
	946	if (actualStart != matchStarts[i]) {
	947	errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
	948	__LINE__, i, matchStarts[i], actualStart);
	949	}
	950	int32_t actualEnd = matcher->end(i, status);
	951	REGEX_CHECK_STATUS;
	952	if (actualEnd != matchEnds[i]) {
	953	errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
	954	__LINE__, i, matchEnds[i], actualEnd);
	955	}
	956	}
	957
	958	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
	959	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
	960
	961	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	962	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	963	matcher->reset();
	964	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
	965
	966	matcher->lookingAt(status);
	967	REGEX_ASSERT(matcher->group(status) == "0123456789");
	968	REGEX_ASSERT(matcher->group(0, status) == "0123456789");
	969	REGEX_ASSERT(matcher->group(1, status) == "234567" );
	970	REGEX_ASSERT(matcher->group(2, status) == "45" );
	971	REGEX_ASSERT(matcher->group(3, status) == "89" );
	972	REGEX_CHECK_STATUS;
	973	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	974	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	975	matcher->reset();
	976	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
	977
	978	delete matcher;
	979	delete pat;
	980
	981	}
	982
	983	//
	984	// find
	985	//
	986	{
	987	int32_t flags=0;
	988	UParseError pe;
	989	UErrorCode status=U_ZERO_ERROR;
	990
	991	UnicodeString re("abc");
	992	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	993	REGEX_CHECK_STATUS;
	994	UnicodeString data = ".abc..abc...abc..";
	995	// 012345678901234567
	996
	997	RegexMatcher *matcher = pat->matcher(data, status);
	998	REGEX_CHECK_STATUS;
	999	REGEX_ASSERT(matcher->find());
	1000	REGEX_ASSERT(matcher->start(status) == 1);
	1001	REGEX_ASSERT(matcher->find());
	1002	REGEX_ASSERT(matcher->start(status) == 6);
	1003	REGEX_ASSERT(matcher->find());
	1004	REGEX_ASSERT(matcher->start(status) == 12);
	1005	REGEX_ASSERT(matcher->find() == FALSE);
	1006	REGEX_ASSERT(matcher->find() == FALSE);
	1007
	1008	matcher->reset();
	1009	REGEX_ASSERT(matcher->find());
	1010	REGEX_ASSERT(matcher->start(status) == 1);
	1011
	1012	REGEX_ASSERT(matcher->find(0, status));
	1013	REGEX_ASSERT(matcher->start(status) == 1);
	1014	REGEX_ASSERT(matcher->find(1, status));
	1015	REGEX_ASSERT(matcher->start(status) == 1);
	1016	REGEX_ASSERT(matcher->find(2, status));
	1017	REGEX_ASSERT(matcher->start(status) == 6);
	1018	REGEX_ASSERT(matcher->find(12, status));
	1019	REGEX_ASSERT(matcher->start(status) == 12);
	1020	REGEX_ASSERT(matcher->find(13, status) == FALSE);
	1021	REGEX_ASSERT(matcher->find(16, status) == FALSE);
	1022	REGEX_ASSERT(matcher->find(17, status) == FALSE);
	1023	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
	1024
	1025	status = U_ZERO_ERROR;
	1026	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	1027	status = U_ZERO_ERROR;
	1028	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
	1029
	1030	REGEX_ASSERT(matcher->groupCount() == 0);
	1031
	1032	delete matcher;
	1033	delete pat;
	1034	}
	1035
	1036
	1037	//
	1038	// find, with \G in pattern (true if at the end of a previous match).
	1039	//
	1040	{
	1041	int32_t flags=0;
	1042	UParseError pe;
	1043	UErrorCode status=U_ZERO_ERROR;
	1044
	1045	UnicodeString re(".*?(?:(\\Gabc)\|(abc))", -1, US_INV);
	1046	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	1047	REGEX_CHECK_STATUS;
	1048	UnicodeString data = ".abcabc.abc..";
	1049	// 012345678901234567
	1050
	1051	RegexMatcher *matcher = pat->matcher(data, status);
	1052	REGEX_CHECK_STATUS;
	1053	REGEX_ASSERT(matcher->find());
	1054	REGEX_ASSERT(matcher->start(status) == 0);
	1055	REGEX_ASSERT(matcher->start(1, status) == -1);
	1056	REGEX_ASSERT(matcher->start(2, status) == 1);
	1057
	1058	REGEX_ASSERT(matcher->find());
	1059	REGEX_ASSERT(matcher->start(status) == 4);
	1060	REGEX_ASSERT(matcher->start(1, status) == 4);
	1061	REGEX_ASSERT(matcher->start(2, status) == -1);
	1062	REGEX_CHECK_STATUS;
	1063
	1064	delete matcher;
	1065	delete pat;
	1066	}
	1067
	1068	//
	1069	// find with zero length matches, match position should bump ahead
	1070	// to prevent loops.
	1071	//
	1072	{
	1073	int32_t i;
	1074	UErrorCode status=U_ZERO_ERROR;
	1075	RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
	1076	// using an always-true look-ahead.
	1077	REGEX_CHECK_STATUS;
	1078	UnicodeString s(" ");
	1079	m.reset(s);
	1080	for (i=0; ; i++) {
	1081	if (m.find() == FALSE) {
	1082	break;
	1083	}
	1084	REGEX_ASSERT(m.start(status) == i);
	1085	REGEX_ASSERT(m.end(status) == i);
	1086	}
	1087	REGEX_ASSERT(i==5);
	1088
	1089	// Check that the bump goes over surrogate pairs OK
	1090	s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
	1091	s = s.unescape();
	1092	m.reset(s);
	1093	for (i=0; ; i+=2) {
	1094	if (m.find() == FALSE) {
	1095	break;
	1096	}
	1097	REGEX_ASSERT(m.start(status) == i);
	1098	REGEX_ASSERT(m.end(status) == i);
	1099	}
	1100	REGEX_ASSERT(i==10);
	1101	}
	1102	{
	1103	// find() loop breaking test.
	1104	// with pattern of /.?/, should see a series of one char matches, then a single
	1105	// match of zero length at the end of the input string.
	1106	int32_t i;
	1107	UErrorCode status=U_ZERO_ERROR;
	1108	RegexMatcher m(".?", 0, status);
	1109	REGEX_CHECK_STATUS;
	1110	UnicodeString s(" ");
	1111	m.reset(s);
	1112	for (i=0; ; i++) {
	1113	if (m.find() == FALSE) {
	1114	break;
	1115	}
	1116	REGEX_ASSERT(m.start(status) == i);
	1117	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
	1118	}
	1119	REGEX_ASSERT(i==5);
	1120	}
	1121
	1122
	1123	//
	1124	// Matchers with no input string behave as if they had an empty input string.
	1125	//
	1126
	1127	{
	1128	UErrorCode status = U_ZERO_ERROR;
	1129	RegexMatcher m(".?", 0, status);
	1130	REGEX_CHECK_STATUS;
	1131	REGEX_ASSERT(m.find());
	1132	REGEX_ASSERT(m.start(status) == 0);
	1133	REGEX_ASSERT(m.input() == "");
	1134	}
	1135	{
	1136	UErrorCode status = U_ZERO_ERROR;
	1137	RegexPattern *p = RegexPattern::compile(".", 0, status);
	1138	RegexMatcher *m = p->matcher(status);
	1139	REGEX_CHECK_STATUS;
	1140
	1141	REGEX_ASSERT(m->find() == FALSE);
	1142	REGEX_ASSERT(m->input() == "");
	1143	delete m;
	1144	delete p;
	1145	}
	1146
	1147	//
	1148	// Regions
	1149	//
	1150	{
	1151	UErrorCode status = U_ZERO_ERROR;
	1152	UnicodeString testString("This is test data");
	1153	RegexMatcher m(".*", testString, 0, status);
	1154	REGEX_CHECK_STATUS;
	1155	REGEX_ASSERT(m.regionStart() == 0);
	1156	REGEX_ASSERT(m.regionEnd() == testString.length());
	1157	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1158	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1159
	1160	m.region(2,4, status);
	1161	REGEX_CHECK_STATUS;
	1162	REGEX_ASSERT(m.matches(status));
	1163	REGEX_ASSERT(m.start(status)==2);
	1164	REGEX_ASSERT(m.end(status)==4);
	1165	REGEX_CHECK_STATUS;
	1166
	1167	m.reset();
	1168	REGEX_ASSERT(m.regionStart() == 0);
	1169	REGEX_ASSERT(m.regionEnd() == testString.length());
	1170
	1171	UnicodeString shorterString("short");
	1172	m.reset(shorterString);
	1173	REGEX_ASSERT(m.regionStart() == 0);
	1174	REGEX_ASSERT(m.regionEnd() == shorterString.length());
	1175
	1176	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1177	REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
	1178	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	1179	REGEX_ASSERT(&m == &m.reset());
	1180	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	1181
	1182	REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
	1183	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1184	REGEX_ASSERT(&m == &m.reset());
	1185	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	1186
	1187	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1188	REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
	1189	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	1190	REGEX_ASSERT(&m == &m.reset());
	1191	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	1192
	1193	REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
	1194	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1195	REGEX_ASSERT(&m == &m.reset());
	1196	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	1197
	1198	}
	1199
	1200	//
	1201	// hitEnd() and requireEnd()
	1202	//
	1203	{
	1204	UErrorCode status = U_ZERO_ERROR;
	1205	UnicodeString testString("aabb");
	1206	RegexMatcher m1(".*", testString, 0, status);
	1207	REGEX_ASSERT(m1.lookingAt(status) == TRUE);
	1208	REGEX_ASSERT(m1.hitEnd() == TRUE);
	1209	REGEX_ASSERT(m1.requireEnd() == FALSE);
	1210	REGEX_CHECK_STATUS;
	1211
	1212	status = U_ZERO_ERROR;
	1213	RegexMatcher m2("a*", testString, 0, status);
	1214	REGEX_ASSERT(m2.lookingAt(status) == TRUE);
	1215	REGEX_ASSERT(m2.hitEnd() == FALSE);
	1216	REGEX_ASSERT(m2.requireEnd() == FALSE);
	1217	REGEX_CHECK_STATUS;
	1218
	1219	status = U_ZERO_ERROR;
	1220	RegexMatcher m3(".*$", testString, 0, status);
	1221	REGEX_ASSERT(m3.lookingAt(status) == TRUE);
	1222	REGEX_ASSERT(m3.hitEnd() == TRUE);
	1223	REGEX_ASSERT(m3.requireEnd() == TRUE);
	1224	REGEX_CHECK_STATUS;
	1225	}
	1226
	1227
	1228	//
	1229	// Compilation error on reset with UChar *
	1230	// These were a hazard that people were stumbling over with runtime errors.
	1231	// Changed them to compiler errors by adding private methods that more closely
	1232	// matched the incorrect use of the functions.
	1233	//
	1234	#if 0
	1235	{
	1236	UErrorCode status = U_ZERO_ERROR;
	1237	UChar ucharString[20];
	1238	RegexMatcher m(".", 0, status);
	1239	m.reset(ucharString); // should not compile.
	1240
	1241	RegexPattern *p = RegexPattern::compile(".", 0, status);
	1242	RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
	1243
	1244	RegexMatcher m3(".", ucharString, 0, status); // Should not compile
	1245	}
	1246	#endif
	1247
	1248	//
	1249	// Time Outs.
	1250	// Note: These tests will need to be changed when the regexp engine is
	1251	// able to detect and cut short the exponential time behavior on
	1252	// this type of match.
	1253	//
	1254	{
	1255	UErrorCode status = U_ZERO_ERROR;
	1256	// Enough 'a's in the string to cause the match to time out.
	1257	// (Each additional 'a' doubles the time)
	1258	UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
	1259	RegexMatcher matcher("(a+)+b", testString, 0, status);
	1260	REGEX_CHECK_STATUS;
	1261	REGEX_ASSERT(matcher.getTimeLimit() == 0);
	1262	matcher.setTimeLimit(100, status);
	1263	REGEX_ASSERT(matcher.getTimeLimit() == 100);
	1264	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1265	REGEX_ASSERT(status == U_REGEX_TIME_OUT);
	1266	}
	1267	{
	1268	UErrorCode status = U_ZERO_ERROR;
	1269	// Few enough 'a's to slip in under the time limit.
	1270	UnicodeString testString("aaaaaaaaaaaaaaaaaa");
	1271	RegexMatcher matcher("(a+)+b", testString, 0, status);
	1272	REGEX_CHECK_STATUS;
	1273	matcher.setTimeLimit(100, status);
	1274	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1275	REGEX_CHECK_STATUS;
	1276	}
	1277
	1278	//
	1279	// Stack Limits
	1280	//
	1281	{
	1282	UErrorCode status = U_ZERO_ERROR;
	1283	UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
	1284
	1285	// Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
	1286	// of the '+', and makes the stack frames larger.
	1287	RegexMatcher matcher("(A)+A$", testString, 0, status);
	1288
	1289	// With the default stack, this match should fail to run
	1290	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1291	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
	1292
	1293	// With unlimited stack, it should run
	1294	status = U_ZERO_ERROR;
	1295	matcher.setStackLimit(0, status);
	1296	REGEX_CHECK_STATUS;
	1297	REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
	1298	REGEX_CHECK_STATUS;
	1299	REGEX_ASSERT(matcher.getStackLimit() == 0);
	1300
	1301	// With a limited stack, the match should fail
	1302	status = U_ZERO_ERROR;
	1303	matcher.setStackLimit(10000, status);
	1304	REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
	1305	REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
	1306	REGEX_ASSERT(matcher.getStackLimit() == 10000);
	1307	}
	1308
	1309	// A pattern that doesn't save state should work with
	1310	// a minimal sized stack
	1311	{
	1312	UErrorCode status = U_ZERO_ERROR;
	1313	UnicodeString testString = "abc";
	1314	RegexMatcher matcher("abc", testString, 0, status);
	1315	REGEX_CHECK_STATUS;
	1316	matcher.setStackLimit(30, status);
	1317	REGEX_CHECK_STATUS;
	1318	REGEX_ASSERT(matcher.matches(status) == TRUE);
	1319	REGEX_CHECK_STATUS;
	1320	REGEX_ASSERT(matcher.getStackLimit() == 30);
	1321
	1322	// Negative stack sizes should fail
	1323	status = U_ZERO_ERROR;
	1324	matcher.setStackLimit(1000, status);
	1325	REGEX_CHECK_STATUS;
	1326	matcher.setStackLimit(-1, status);
	1327	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	1328	REGEX_ASSERT(matcher.getStackLimit() == 1000);
	1329	}
	1330
	1331
	1332	}
	1333
	1334
	1335
	1336
	1337
	1338
	1339	//---------------------------------------------------------------------------
	1340	//
	1341	// API_Replace: API test for class RegexMatcher, testing the
	1342	// Replace family of functions.
	1343	//
	1344	//---------------------------------------------------------------------------
	1345	void RegexTest::API_Replace() {
	1346	// Exercises RegexMatcher::replaceFirst() and replaceAll() on a shared "abc"
	1347	// matcher, then capture-group and escape handling in replacement text, and
	1348	// finally appendReplacement()/appendTail() (bug 4057).
	1349	int32_t flags=0;
	1350	UParseError pe;
	1351	UErrorCode status=U_ZERO_ERROR;
	1352
	1353	UnicodeString re("abc");
	1354	RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
	1355	REGEX_CHECK_STATUS;
	1356	UnicodeString data = ".abc..abc...abc..";
	1357	// 012345678901234567
	1358	RegexMatcher *matcher = pat->matcher(data, status);
	1359	REGEX_CHECK_STATUS; // confirm creation succeeded before matcher is dereferenced below
	1360	//
	1361	// Plain vanilla matches.
	1362	//
	1363	UnicodeString dest;
	1364	dest = matcher->replaceFirst("yz", status);
	1365	REGEX_CHECK_STATUS;
	1366	REGEX_ASSERT(dest == ".yz..abc...abc..");
	1367
	1368	dest = matcher->replaceAll("yz", status);
	1369	REGEX_CHECK_STATUS;
	1370	REGEX_ASSERT(dest == ".yz..yz...yz..");
	1371
	1372	//
	1373	// Plain vanilla non-matches.
	1374	//
	1375	UnicodeString d2 = ".abx..abx...abx..";
	1376	matcher->reset(d2);
	1377	dest = matcher->replaceFirst("yz", status);
	1378	REGEX_CHECK_STATUS;
	1379	REGEX_ASSERT(dest == ".abx..abx...abx..");
	1380
	1381	dest = matcher->replaceAll("yz", status);
	1382	REGEX_CHECK_STATUS;
	1383	REGEX_ASSERT(dest == ".abx..abx...abx..");
	1384
	1385	//
	1386	// Empty source string
	1387	//
	1388	UnicodeString d3 = "";
	1389	matcher->reset(d3);
	1390	dest = matcher->replaceFirst("yz", status);
	1391	REGEX_CHECK_STATUS;
	1392	REGEX_ASSERT(dest == "");
	1393
	1394	dest = matcher->replaceAll("yz", status);
	1395	REGEX_CHECK_STATUS;
	1396	REGEX_ASSERT(dest == "");
	1397
	1398	//
	1399	// Empty substitution string
	1400	//
	1401	matcher->reset(data); // ".abc..abc...abc.."
	1402	dest = matcher->replaceFirst("", status);
	1403	REGEX_CHECK_STATUS;
	1404	REGEX_ASSERT(dest == "...abc...abc..");
	1405
	1406	dest = matcher->replaceAll("", status);
	1407	REGEX_CHECK_STATUS;
	1408	REGEX_ASSERT(dest == "........");
	1409
	1410	//
	1411	// match whole string
	1412	//
	1413	UnicodeString d4 = "abc";
	1414	matcher->reset(d4);
	1415	dest = matcher->replaceFirst("xyz", status);
	1416	REGEX_CHECK_STATUS;
	1417	REGEX_ASSERT(dest == "xyz");
	1418
	1419	dest = matcher->replaceAll("xyz", status);
	1420	REGEX_CHECK_STATUS;
	1421	REGEX_ASSERT(dest == "xyz");
	1422
	1423	//
	1424	// Capture Group, simple case
	1425	//
	1426	UnicodeString re2("a(..)");
	1427	RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
	1428	REGEX_CHECK_STATUS;
	1429	UnicodeString d5 = "abcdefg";
	1430	RegexMatcher *matcher2 = pat2->matcher(d5, status);
	1431	REGEX_CHECK_STATUS;
	1432	dest = matcher2->replaceFirst("$1$1", status);
	1433	REGEX_CHECK_STATUS;
	1434	REGEX_ASSERT(dest == "bcbcdefg");
	1435
	1436	dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
	1437	REGEX_CHECK_STATUS;
	1438	REGEX_ASSERT(dest == "The value of $1 is bc.defg");
	1439
	1440	dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
	1441	REGEX_ASSERT(U_FAILURE(status));
	1442	status = U_ZERO_ERROR; // clear the expected failure before the next call
	1443	// A supplementary-plane digit (U+1D7CF) after '$' must be read as group number 1.
	1444	UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
	1445	replacement = replacement.unescape();
	1446	dest = matcher2->replaceFirst(replacement, status);
	1447	REGEX_CHECK_STATUS;
	1448	REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
	1449
	1450	REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
	1451
	1452
	1453	//
	1454	// Replacement String with \u hex escapes
	1455	//
	1456	{
	1457	UnicodeString src = "abc 1 abc 2 abc 3";
	1458	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
	1459	matcher->reset(src);
	1460	UnicodeString result = matcher->replaceAll(substitute, status);
	1461	REGEX_CHECK_STATUS;
	1462	REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
	1463	}
	1464	{
	1465	UnicodeString src = "abc !";
	1466	UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
	1467	matcher->reset(src);
	1468	UnicodeString result = matcher->replaceAll(substitute, status);
	1469	REGEX_CHECK_STATUS;
	1470	UnicodeString expected = UnicodeString("--");
	1471	expected.append((UChar32)0x10000);
	1472	expected.append("-- !");
	1473	REGEX_ASSERT(result == expected);
	1474	}
	1475	// TODO: need more thorough testing of capture substitutions.
	1476
	1477	// Bug 4057
	1478	//
	1479	{
	1480	status = U_ZERO_ERROR;
	1481	UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
	1482	RegexMatcher m("ss(.*?)ee", 0, status);
	1483	REGEX_CHECK_STATUS;
	1484	UnicodeString result;
	1485
	1486	// Multiple finds do NOT bump up the previous appendReplacement position.
	1487	m.reset(s);
	1488	m.find();
	1489	m.find();
	1490	m.appendReplacement(result, "ooh", status);
	1491	REGEX_CHECK_STATUS;
	1492	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1493
	1494	// After a reset into the interior of a string, appendReplacement still starts at beginning.
	1495	status = U_ZERO_ERROR;
	1496	result.truncate(0);
	1497	m.reset(10, status);
	1498	m.find();
	1499	m.find();
	1500	m.appendReplacement(result, "ooh", status);
	1501	REGEX_CHECK_STATUS;
	1502	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1503
	1504	// find() at interior of string, appendReplacement still starts at beginning.
	1505	status = U_ZERO_ERROR;
	1506	result.truncate(0);
	1507	m.reset();
	1508	m.find(10, status);
	1509	m.find();
	1510	m.appendReplacement(result, "ooh", status);
	1511	REGEX_CHECK_STATUS;
	1512	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
	1513
	1514	m.appendTail(result);
	1515	REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
	1516
	1517	}
	1518
	1519	delete matcher2;
	1520	delete pat2;
	1521	delete matcher;
	1522	delete pat;
	1523	}
	1524
	1525
	1526	//---------------------------------------------------------------------------
	1527	//
	1528	// API_Pattern: Test that the API for class RegexPattern is
	1529	// present and nominally working.
	1530	//
	1531	//---------------------------------------------------------------------------
	1532	void RegexTest::API_Pattern() { // Covers: ctor, ==/!=, assign, copy, compile flags, clone, matches(), split(), pattern(), class IDs.
	1533	RegexPattern pata; // Test default constructor to not crash.
	1534	RegexPattern patb;
	1535	// Default-constructed patterns must compare equal to each other and to themselves.
	1536	REGEX_ASSERT(pata == patb);
	1537	REGEX_ASSERT(pata == pata);
	1538
	1539	UnicodeString re1("abc[a-l][m-z]");
	1540	UnicodeString re2("def");
	1541	UErrorCode status = U_ZERO_ERROR;
	1542	UParseError pe;
	1543	// Patterns are compared by value below, so heap-allocated patterns are dereferenced first.
	1544	RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
	1545	RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
	1546	REGEX_CHECK_STATUS;
	1547	REGEX_ASSERT(*pat1 == *pat1);
	1548	REGEX_ASSERT(*pat1 != pata);
	1549
	1550	// Assign
	1551	patb = *pat1;
	1552	REGEX_ASSERT(patb == *pat1);
	1553
	1554	// Copy Construct
	1555	RegexPattern patc(*pat1);
	1556	REGEX_ASSERT(patc == *pat1);
	1557	REGEX_ASSERT(patb == patc);
	1558	REGEX_ASSERT(*pat1 != *pat2);
	1559	patb = *pat2;
	1560	REGEX_ASSERT(patb != patc);
	1561	REGEX_ASSERT(patb == *pat2);
	1562
	1563	// Compile with no flags.
	1564	RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
	1565	REGEX_ASSERT(*pat1a == *pat1);
	1566
	1567	REGEX_ASSERT(pat1a->flags() == 0);
	1568
	1569	// Compile with different flags should be not equal
	1570	RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
	1571	REGEX_CHECK_STATUS;
	1572
	1573	REGEX_ASSERT(*pat1b != *pat1a);
	1574	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
	1575	REGEX_ASSERT(pat1a->flags() == 0);
	1576	delete pat1b;
	1577
	1578	// clone
	1579	RegexPattern *pat1c = pat1->clone();
	1580	REGEX_ASSERT(*pat1c == *pat1);
	1581	REGEX_ASSERT(*pat1c != *pat2);
	1582
	1583	delete pat1c;
	1584	delete pat1a;
	1585	delete pat1;
	1586	delete pat2;
	1587
	1588
	1589	//
	1590	// Verify that a matcher created from a cloned pattern works.
	1591	// (Jitterbug 3423)
	1592	//
	1593	{
	1594	UErrorCode status = U_ZERO_ERROR;
	1595	RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
	1596	RegexPattern *pClone = pSource->clone();
	1597	delete pSource; // the clone must keep working after its source pattern is gone
	1598	RegexMatcher *mFromClone = pClone->matcher(status);
	1599	REGEX_CHECK_STATUS;
	1600	UnicodeString s = "Hello World";
	1601	mFromClone->reset(s);
	1602	REGEX_ASSERT(mFromClone->find() == TRUE);
	1603	REGEX_ASSERT(mFromClone->group(status) == "Hello");
	1604	REGEX_ASSERT(mFromClone->find() == TRUE);
	1605	REGEX_ASSERT(mFromClone->group(status) == "World");
	1606	REGEX_ASSERT(mFromClone->find() == FALSE);
	1607	delete mFromClone;
	1608	delete pClone;
	1609	}
	1610
	1611	//
	1612	// matches convenience API
	1613	//
	1614	REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
	1615	REGEX_CHECK_STATUS;
	1616	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
	1617	REGEX_CHECK_STATUS;
	1618	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
	1619	REGEX_CHECK_STATUS;
	1620	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
	1621	REGEX_CHECK_STATUS;
	1622	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
	1623	REGEX_CHECK_STATUS;
	1624	status = U_INDEX_OUTOFBOUNDS_ERROR; // pre-set a failure: matches() must return FALSE and leave status unchanged
	1625	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
	1626	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1627
	1628
	1629	//
	1630	// Split()
	1631	//
	1632	status = U_ZERO_ERROR;
	1633	pat1 = RegexPattern::compile(" +", pe, status);
	1634	REGEX_CHECK_STATUS;
	1635	UnicodeString fields[10];
	1636
	1637	int32_t n;
	1638	n = pat1->split("Now is the time", fields, 10, status);
	1639	REGEX_CHECK_STATUS;
	1640	REGEX_ASSERT(n==4);
	1641	REGEX_ASSERT(fields[0]=="Now");
	1642	REGEX_ASSERT(fields[1]=="is");
	1643	REGEX_ASSERT(fields[2]=="the");
	1644	REGEX_ASSERT(fields[3]=="time");
	1645	REGEX_ASSERT(fields[4]=="");
	1646
	1647	n = pat1->split("Now is the time", fields, 2, status);
	1648	REGEX_CHECK_STATUS;
	1649	REGEX_ASSERT(n==2);
	1650	REGEX_ASSERT(fields[0]=="Now");
	1651	REGEX_ASSERT(fields[1]=="is the time");
	1652	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
	1653
	1654	fields[1] = "*"; // sentinel: a split with capacity 1 must not touch fields[1]
	1655	status = U_ZERO_ERROR;
	1656	n = pat1->split("Now is the time", fields, 1, status);
	1657	REGEX_CHECK_STATUS;
	1658	REGEX_ASSERT(n==1);
	1659	REGEX_ASSERT(fields[0]=="Now is the time");
	1660	REGEX_ASSERT(fields[1]=="*");
	1661	status = U_ZERO_ERROR;
	1662
	1663	n = pat1->split(" Now is the time ", fields, 10, status);
	1664	REGEX_CHECK_STATUS;
	1665	REGEX_ASSERT(n==6);
	1666	REGEX_ASSERT(fields[0]=="");
	1667	REGEX_ASSERT(fields[1]=="Now");
	1668	REGEX_ASSERT(fields[2]=="is");
	1669	REGEX_ASSERT(fields[3]=="the");
	1670	REGEX_ASSERT(fields[4]=="time");
	1671	REGEX_ASSERT(fields[5]=="");
	1672
	1673	n = pat1->split(" ", fields, 10, status);
	1674	REGEX_CHECK_STATUS;
	1675	REGEX_ASSERT(n==2);
	1676	REGEX_ASSERT(fields[0]=="");
	1677	REGEX_ASSERT(fields[1]=="");
	1678
	1679	fields[0] = "foo"; // sentinel: splitting empty input must yield 0 fields and leave fields[0] alone
	1680	n = pat1->split("", fields, 10, status);
	1681	REGEX_CHECK_STATUS;
	1682	REGEX_ASSERT(n==0);
	1683	REGEX_ASSERT(fields[0]=="foo");
	1684
	1685	delete pat1;
	1686
	1687	// split, with a pattern with (capture)
	1688	pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
	1689	REGEX_CHECK_STATUS;
	1690
	1691	status = U_ZERO_ERROR;
	1692	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
	1693	REGEX_CHECK_STATUS;
	1694	REGEX_ASSERT(n==7);
	1695	REGEX_ASSERT(fields[0]=="");
	1696	REGEX_ASSERT(fields[1]=="a");
	1697	REGEX_ASSERT(fields[2]=="Now is ");
	1698	REGEX_ASSERT(fields[3]=="b");
	1699	REGEX_ASSERT(fields[4]=="the time");
	1700	REGEX_ASSERT(fields[5]=="c");
	1701	REGEX_ASSERT(fields[6]=="");
	1702	REGEX_ASSERT(status==U_ZERO_ERROR);
	1703
	1704	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
	1705	REGEX_CHECK_STATUS;
	1706	REGEX_ASSERT(n==7);
	1707	REGEX_ASSERT(fields[0]==" ");
	1708	REGEX_ASSERT(fields[1]=="a");
	1709	REGEX_ASSERT(fields[2]=="Now is ");
	1710	REGEX_ASSERT(fields[3]=="b");
	1711	REGEX_ASSERT(fields[4]=="the time");
	1712	REGEX_ASSERT(fields[5]=="c");
	1713	REGEX_ASSERT(fields[6]=="");
	1714
	1715	status = U_ZERO_ERROR;
	1716	fields[6] = "foo"; // sentinel just past the requested capacity of 6
	1717	n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
	1718	REGEX_CHECK_STATUS;
	1719	REGEX_ASSERT(n==6);
	1720	REGEX_ASSERT(fields[0]==" ");
	1721	REGEX_ASSERT(fields[1]=="a");
	1722	REGEX_ASSERT(fields[2]=="Now is ");
	1723	REGEX_ASSERT(fields[3]=="b");
	1724	REGEX_ASSERT(fields[4]=="the time");
	1725	REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
	1726	REGEX_ASSERT(fields[6]=="foo");
	1727
	1728	status = U_ZERO_ERROR;
	1729	fields[5] = "foo"; // sentinel just past the requested capacity of 5
	1730	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
	1731	REGEX_CHECK_STATUS;
	1732	REGEX_ASSERT(n==5);
	1733	REGEX_ASSERT(fields[0]==" ");
	1734	REGEX_ASSERT(fields[1]=="a");
	1735	REGEX_ASSERT(fields[2]=="Now is ");
	1736	REGEX_ASSERT(fields[3]=="b");
	1737	REGEX_ASSERT(fields[4]=="the time<c>");
	1738	REGEX_ASSERT(fields[5]=="foo");
	1739
	1740	status = U_ZERO_ERROR;
	1741	fields[5] = "foo";
	1742	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
	1743	REGEX_CHECK_STATUS;
	1744	REGEX_ASSERT(n==5);
	1745	REGEX_ASSERT(fields[0]==" ");
	1746	REGEX_ASSERT(fields[1]=="a");
	1747	REGEX_ASSERT(fields[2]=="Now is ");
	1748	REGEX_ASSERT(fields[3]=="b");
	1749	REGEX_ASSERT(fields[4]=="the time");
	1750	REGEX_ASSERT(fields[5]=="foo");
	1751
	1752	status = U_ZERO_ERROR;
	1753	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
	1754	REGEX_CHECK_STATUS;
	1755	REGEX_ASSERT(n==4);
	1756	REGEX_ASSERT(fields[0]==" ");
	1757	REGEX_ASSERT(fields[1]=="a");
	1758	REGEX_ASSERT(fields[2]=="Now is ");
	1759	REGEX_ASSERT(fields[3]=="the time<c>");
	1760	status = U_ZERO_ERROR;
	1761	delete pat1;
	1762
	1763	pat1 = RegexPattern::compile("([-,])", pe, status);
	1764	REGEX_CHECK_STATUS;
	1765	n = pat1->split("1-10,20", fields, 10, status);
	1766	REGEX_CHECK_STATUS;
	1767	REGEX_ASSERT(n==5);
	1768	REGEX_ASSERT(fields[0]=="1");
	1769	REGEX_ASSERT(fields[1]=="-");
	1770	REGEX_ASSERT(fields[2]=="10");
	1771	REGEX_ASSERT(fields[3]==",");
	1772	REGEX_ASSERT(fields[4]=="20");
	1773	delete pat1;
	1774
	1775	// Test split of string with empty trailing fields
	1776	pat1 = RegexPattern::compile(",", pe, status);
	1777	REGEX_CHECK_STATUS;
	1778	n = pat1->split("a,b,c,", fields, 10, status);
	1779	REGEX_CHECK_STATUS;
	1780	REGEX_ASSERT(n==4);
	1781	REGEX_ASSERT(fields[0]=="a");
	1782	REGEX_ASSERT(fields[1]=="b");
	1783	REGEX_ASSERT(fields[2]=="c");
	1784	REGEX_ASSERT(fields[3]=="");
	1785
	1786	n = pat1->split("a,,,", fields, 10, status);
	1787	REGEX_CHECK_STATUS;
	1788	REGEX_ASSERT(n==4);
	1789	REGEX_ASSERT(fields[0]=="a");
	1790	REGEX_ASSERT(fields[1]=="");
	1791	REGEX_ASSERT(fields[2]=="");
	1792	REGEX_ASSERT(fields[3]=="");
	1793	delete pat1;
	1794
	1795	// Split Separator with zero length match.
	1796	pat1 = RegexPattern::compile(":?", pe, status);
	1797	REGEX_CHECK_STATUS;
	1798	n = pat1->split("abc", fields, 10, status);
	1799	REGEX_CHECK_STATUS;
	1800	REGEX_ASSERT(n==5);
	1801	REGEX_ASSERT(fields[0]=="");
	1802	REGEX_ASSERT(fields[1]=="a");
	1803	REGEX_ASSERT(fields[2]=="b");
	1804	REGEX_ASSERT(fields[3]=="c");
	1805	REGEX_ASSERT(fields[4]=="");
	1806
	1807	delete pat1;
	1808
	1809	//
	1810	// RegexPattern::pattern()
	1811	//
	1812	pat1 = new RegexPattern();
	1813	REGEX_ASSERT(pat1->pattern() == "");
	1814	delete pat1;
	1815
	1816	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
	1817	REGEX_CHECK_STATUS;
	1818	REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
	1819	delete pat1;
	1820
	1821
	1822	//
	1823	// classID functions
	1824	//
	1825	pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
	1826	REGEX_CHECK_STATUS;
	1827	REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
	1828	REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
	1829	UnicodeString Hello("Hello, world.");
	1830	RegexMatcher *m = pat1->matcher(Hello, status);
	1831	REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
	1832	REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
	1833	REGEX_ASSERT(m->getDynamicClassID() != NULL);
	1834	delete m;
	1835	delete pat1;
	1836
	1837	}
	1838
	1839	//---------------------------------------------------------------------------
	1840	//
	1841	// API_Match_UTF8: Test that the alternate engine for class RegexMatcher
	1842	// is present and working, but excluding functions
	1843	// implementing replace operations.
	1844	//
	1845	//---------------------------------------------------------------------------
	1846	void RegexTest::API_Match_UTF8() { // Exercises the RegexPattern/RegexMatcher matching APIs (reset, matches, lookingAt, find, group, regions, hitEnd/requireEnd) on UText input.
	1847	UParseError pe; // passed to RegexPattern::compile() below
	1848	UErrorCode status=U_ZERO_ERROR;
	1849	int32_t flags = 0; // no pattern flags are set for the compile below
	1850
	1851	//
	1852	// Debug - slide failing test cases early
	1853	//
	1854	#if 0
	1855	{
	1856	}
	1857	return;
	1858	#endif
	1859
	1860	//
	1861	// Simple pattern compilation
	1862	//
	1863	{
	1864	UText re = UTEXT_INITIALIZER;
	1865	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
	1866	REGEX_VERBOSE_TEXT(&re);
	1867	RegexPattern *pat2;
	1868	pat2 = RegexPattern::compile(&re, flags, pe, status);
	1869	REGEX_CHECK_STATUS;
	1870
	1871	UText input1 = UTEXT_INITIALIZER;
	1872	UText input2 = UTEXT_INITIALIZER;
	1873	UText empty = UTEXT_INITIALIZER;
	1874	regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
	1875	REGEX_VERBOSE_TEXT(&input1);
	1876	regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
	1877	REGEX_VERBOSE_TEXT(&input2);
	1878	utext_openUChars(&empty, NULL, 0, &status);
	1879
	1880	int32_t input1Len = (int32_t)strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
	1881	int32_t input2Len = (int32_t)strlen("not abc");
	1882
	1883
	1884	//
	1885	// Matcher creation and reset.
	1886	//
	1887	RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
	1888	REGEX_CHECK_STATUS;
	1889	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1890	const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
	1891	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1892	m1->reset(&input2);
	1893	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	1894	const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
	1895	REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
	1896	m1->reset(&input1);
	1897	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1898	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1899	m1->reset(&empty);
	1900	REGEX_ASSERT(m1->lookingAt(status) == FALSE);
	1901	REGEX_ASSERT(utext_nativeLength(&empty) == 0);
	1902
	1903	//
	1904	// reset(pos, status)
	1905	//
	1906	m1->reset(&input1);
	1907	m1->reset(4, status);
	1908	REGEX_CHECK_STATUS;
	1909	REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
	1910	REGEX_ASSERT(m1->lookingAt(status) == TRUE);
	1911
	1912	m1->reset(-1, status);
	1913	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1914	status = U_ZERO_ERROR;
	1915
	1916	m1->reset(0, status);
	1917	REGEX_CHECK_STATUS;
	1918	status = U_ZERO_ERROR;
	1919
	1920	m1->reset(input1Len-1, status);
	1921	REGEX_CHECK_STATUS;
	1922	status = U_ZERO_ERROR;
	1923
	1924	m1->reset(input1Len, status);
	1925	REGEX_CHECK_STATUS;
	1926	status = U_ZERO_ERROR;
	1927
	1928	m1->reset(input1Len+1, status);
	1929	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1930	status = U_ZERO_ERROR;
	1931
	1932	//
	1933	// match(pos, status)
	1934	//
	1935	m1->reset(&input2);
	1936	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	1937	m1->reset();
	1938	REGEX_ASSERT(m1->matches(3, status) == FALSE);
	1939	m1->reset();
	1940	REGEX_ASSERT(m1->matches(5, status) == FALSE);
	1941	REGEX_ASSERT(m1->matches(4, status) == TRUE);
	1942	REGEX_ASSERT(m1->matches(-1, status) == FALSE);
	1943	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1944
	1945	// Match() at end of string should fail, but should not
	1946	// be an error.
	1947	status = U_ZERO_ERROR;
	1948	REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
	1949	REGEX_CHECK_STATUS;
	1950
	1951	// Match beyond end of string should fail with an error.
	1952	status = U_ZERO_ERROR;
	1953	REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
	1954	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1955
	1956	// Successful match at end of string.
	1957	{
	1958	status = U_ZERO_ERROR;
	1959	RegexMatcher m("A?", 0, status); // will match zero length string.
	1960	REGEX_CHECK_STATUS;
	1961	m.reset(&input1);
	1962	REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
	1963	REGEX_CHECK_STATUS;
	1964	m.reset(&empty);
	1965	REGEX_ASSERT(m.matches(0, status) == TRUE);
	1966	REGEX_CHECK_STATUS;
	1967	}
	1968
	1969
	1970	//
	1971	// lookingAt(pos, status)
	1972	//
	1973	status = U_ZERO_ERROR;
	1974	m1->reset(&input2); // "not abc"
	1975	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	1976	REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
	1977	REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
	1978	REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
	1979	REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
	1980	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1981	status = U_ZERO_ERROR;
	1982	REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
	1983	REGEX_CHECK_STATUS;
	1984	REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
	1985	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	1986
	1987	delete m1;
	1988	delete pat2;
	1989
	1990	utext_close(&re);
	1991	utext_close(&input1);
	1992	utext_close(&input2);
	1993	utext_close(&empty);
	1994	}
	1995
	1996
	1997	//
	1998	// Capture Group.
	1999	// RegexMatcher::start();
	2000	// RegexMatcher::end();
	2001	// RegexMatcher::groupCount();
	2002	//
	2003	{
	2004	int32_t flags=0;
	2005	UParseError pe;
	2006	UErrorCode status=U_ZERO_ERROR;
	2007	UText re=UTEXT_INITIALIZER;
	2008	const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
	2009	utext_openUTF8(&re, str_01234567_pat, -1, &status);
	2010
	2011	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2012	REGEX_CHECK_STATUS;
	2013
	2014	UText input = UTEXT_INITIALIZER;
	2015	const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
	2016	utext_openUTF8(&input, str_0123456789, -1, &status);
	2017
	2018	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	2019	REGEX_CHECK_STATUS;
	2020	REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
	2021	static const int32_t matchStarts[] = {0, 2, 4, 8}; // expected start(i) for groups 0..3
	2022	static const int32_t matchEnds[] = {10, 8, 6, 10}; // expected end(i) for groups 0..3
	2023	int32_t i;
	2024	for (i=0; i<4; i++) {
	2025	int32_t actualStart = matcher->start(i, status);
	2026	REGEX_CHECK_STATUS;
	2027	if (actualStart != matchStarts[i]) {
	2028	errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
	2029	__FILE__, __LINE__, i, matchStarts[i], actualStart);
	2030	}
	2031	int32_t actualEnd = matcher->end(i, status);
	2032	REGEX_CHECK_STATUS;
	2033	if (actualEnd != matchEnds[i]) {
	2034	errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
	2035	__FILE__, __LINE__, i, matchEnds[i], actualEnd);
	2036	}
	2037	}
	2038
	2039	REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
	2040	REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
	2041
	2042	REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2043	REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2044	matcher->reset();
	2045	REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
	2046
	2047	matcher->lookingAt(status);
	2048
	2049	UnicodeString dest;
	2050	UText destText = UTEXT_INITIALIZER;
	2051	utext_openUnicodeString(&destText, &dest, &status);
	2052	UText *result;
	2053	//const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
	2054	// Test shallow-clone API
	2055	int64_t group_len;
	2056	result = matcher->group((UText *)NULL, group_len, status);
	2057	REGEX_CHECK_STATUS;
	2058	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2059	utext_close(result);
	2060	result = matcher->group(0, &destText, group_len, status);
	2061	REGEX_CHECK_STATUS;
	2062	REGEX_ASSERT(result == &destText);
	2063	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2064	// destText is now immutable, reopen it
	2065	utext_close(&destText);
	2066	utext_openUnicodeString(&destText, &dest, &status);
	2067
	2068	int64_t length;
	2069	result = matcher->group(0, NULL, length, status);
	2070	REGEX_CHECK_STATUS;
	2071	REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
	2072	utext_close(result);
	2073	result = matcher->group(0, &destText, length, status);
	2074	REGEX_CHECK_STATUS;
	2075	REGEX_ASSERT(result == &destText);
	2076	REGEX_ASSERT(utext_getNativeIndex(result) == 0);
	2077	REGEX_ASSERT(length == 10);
	2078	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2079
	2080	// Capture Group 1 == "234567"
	2081	result = matcher->group(1, NULL, length, status);
	2082	REGEX_CHECK_STATUS;
	2083	REGEX_ASSERT(utext_getNativeIndex(result) == 2);
	2084	REGEX_ASSERT(length == 6);
	2085	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2086	utext_close(result);
	2087
	2088	result = matcher->group(1, &destText, length, status);
	2089	REGEX_CHECK_STATUS;
	2090	REGEX_ASSERT(result == &destText);
	2091	REGEX_ASSERT(utext_getNativeIndex(result) == 2);
	2092	REGEX_ASSERT(length == 6);
	2093	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2094	utext_close(result);
	2095
	2096	// Capture Group 2 == "45"
	2097	result = matcher->group(2, NULL, length, status);
	2098	REGEX_CHECK_STATUS;
	2099	REGEX_ASSERT(utext_getNativeIndex(result) == 4);
	2100	REGEX_ASSERT(length == 2);
	2101	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2102	utext_close(result);
	2103
	2104	result = matcher->group(2, &destText, length, status);
	2105	REGEX_CHECK_STATUS;
	2106	REGEX_ASSERT(result == &destText);
	2107	REGEX_ASSERT(utext_getNativeIndex(result) == 4);
	2108	REGEX_ASSERT(length == 2);
	2109	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2110	utext_close(result);
	2111
	2112	// Capture Group 3 == "89"
	2113	result = matcher->group(3, NULL, length, status);
	2114	REGEX_CHECK_STATUS;
	2115	REGEX_ASSERT(utext_getNativeIndex(result) == 8);
	2116	REGEX_ASSERT(length == 2);
	2117	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2118	utext_close(result);
	2119
	2120	result = matcher->group(3, &destText, length, status);
	2121	REGEX_CHECK_STATUS;
	2122	REGEX_ASSERT(result == &destText);
	2123	REGEX_ASSERT(utext_getNativeIndex(result) == 8);
	2124	REGEX_ASSERT(length == 2);
	2125	REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
	2126	utext_close(result);
	2127
	2128	// Capture Group number out of range.
	2129	status = U_ZERO_ERROR;
	2130	REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2131	status = U_ZERO_ERROR;
	2132	REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2133	status = U_ZERO_ERROR;
	2134	matcher->reset();
	2135	REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
	2136
	2137	delete matcher;
	2138	delete pat;
	2139
	2140	utext_close(&destText);
	2141	utext_close(&input);
	2142	utext_close(&re);
	2143	}
	2144
	2145	//
	2146	// find
	2147	//
	2148	{
	2149	int32_t flags=0;
	2150	UParseError pe;
	2151	UErrorCode status=U_ZERO_ERROR;
	2152	UText re=UTEXT_INITIALIZER;
	2153	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2154	utext_openUTF8(&re, str_abc, -1, &status);
	2155
	2156	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2157	REGEX_CHECK_STATUS;
	2158	UText input = UTEXT_INITIALIZER;
	2159	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
	2160	utext_openUTF8(&input, str_abcabcabc, -1, &status);
	2161	// 012345678901234567
	2162
	2163	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	2164	REGEX_CHECK_STATUS;
	2165	REGEX_ASSERT(matcher->find());
	2166	REGEX_ASSERT(matcher->start(status) == 1);
	2167	REGEX_ASSERT(matcher->find());
	2168	REGEX_ASSERT(matcher->start(status) == 6);
	2169	REGEX_ASSERT(matcher->find());
	2170	REGEX_ASSERT(matcher->start(status) == 12);
	2171	REGEX_ASSERT(matcher->find() == FALSE);
	2172	REGEX_ASSERT(matcher->find() == FALSE);
	2173
	2174	matcher->reset();
	2175	REGEX_ASSERT(matcher->find());
	2176	REGEX_ASSERT(matcher->start(status) == 1);
	2177
	2178	REGEX_ASSERT(matcher->find(0, status));
	2179	REGEX_ASSERT(matcher->start(status) == 1);
	2180	REGEX_ASSERT(matcher->find(1, status));
	2181	REGEX_ASSERT(matcher->start(status) == 1);
	2182	REGEX_ASSERT(matcher->find(2, status));
	2183	REGEX_ASSERT(matcher->start(status) == 6);
	2184	REGEX_ASSERT(matcher->find(12, status));
	2185	REGEX_ASSERT(matcher->start(status) == 12);
	2186	REGEX_ASSERT(matcher->find(13, status) == FALSE);
	2187	REGEX_ASSERT(matcher->find(16, status) == FALSE);
	2188	REGEX_ASSERT(matcher->find(17, status) == FALSE);
	2189	REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
	2190
	2191	status = U_ZERO_ERROR;
	2192	REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2193	status = U_ZERO_ERROR;
	2194	REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
	2195
	2196	REGEX_ASSERT(matcher->groupCount() == 0);
	2197
	2198	delete matcher;
	2199	delete pat;
	2200
	2201	utext_close(&input);
	2202	utext_close(&re);
	2203	}
	2204
	2205
	2206	//
	2207	// find, with \G in pattern (true if at the end of a previous match).
	2208	//
	2209	{
	2210	int32_t flags=0;
	2211	UParseError pe;
	2212	UErrorCode status=U_ZERO_ERROR;
	2213	UText re=UTEXT_INITIALIZER;
	2214	const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
	2215	utext_openUTF8(&re, str_Gabcabc, -1, &status);
	2216
	2217	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2218
	2219	REGEX_CHECK_STATUS;
	2220	UText input = UTEXT_INITIALIZER;
	2221	const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
	2222	utext_openUTF8(&input, str_abcabcabc, -1, &status);
	2223	// 012345678901234567
	2224
	2225	RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
	2226	REGEX_CHECK_STATUS;
	2227	REGEX_ASSERT(matcher->find());
	2228	REGEX_ASSERT(matcher->start(status) == 0);
	2229	REGEX_ASSERT(matcher->start(1, status) == -1);
	2230	REGEX_ASSERT(matcher->start(2, status) == 1);
	2231
	2232	REGEX_ASSERT(matcher->find());
	2233	REGEX_ASSERT(matcher->start(status) == 4);
	2234	REGEX_ASSERT(matcher->start(1, status) == 4);
	2235	REGEX_ASSERT(matcher->start(2, status) == -1);
	2236	REGEX_CHECK_STATUS;
	2237
	2238	delete matcher;
	2239	delete pat;
	2240
	2241	utext_close(&input);
	2242	utext_close(&re);
	2243	}
	2244
	2245	//
	2246	// find with zero length matches, match position should bump ahead
	2247	// to prevent loops.
	2248	//
	2249	{
	2250	int32_t i;
	2251	UErrorCode status=U_ZERO_ERROR;
	2252	RegexMatcher m("(?= ?)", 0, status); // This pattern produces zero-length matches anywhere,
	2253	// using an always-true look-ahead.
	2254	REGEX_CHECK_STATUS;
	2255	UText s = UTEXT_INITIALIZER;
	2256	utext_openUTF8(&s, "    ", -1, &status); // four spaces: the loop below expects matches at offsets 0..4
	2257	m.reset(&s);
	2258	for (i=0; ; i++) {
	2259	if (m.find() == FALSE) {
	2260	break;
	2261	}
	2262	REGEX_ASSERT(m.start(status) == i);
	2263	REGEX_ASSERT(m.end(status) == i);
	2264	}
	2265	REGEX_ASSERT(i==5);
	2266
	2267	// Check that the bump goes over characters outside the BMP OK
	2268	// "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
	2269	unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
	2270	utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
	2271	m.reset(&s);
	2272	for (i=0; ; i+=4) { // each character in aboveBMP is a 4-byte UTF-8 sequence, so expected positions step by 4
	2273	if (m.find() == FALSE) {
	2274	break;
	2275	}
	2276	REGEX_ASSERT(m.start(status) == i);
	2277	REGEX_ASSERT(m.end(status) == i);
	2278	}
	2279	REGEX_ASSERT(i==20);
	2280
	2281	utext_close(&s);
	2282	}
	2283	{
	2284	// find() loop breaking test.
	2285	// with pattern of /.?/, should see a series of one char matches, then a single
	2286	// match of zero length at the end of the input string.
	2287	int32_t i;
	2288	UErrorCode status=U_ZERO_ERROR;
	2289	RegexMatcher m(".?", 0, status);
	2290	REGEX_CHECK_STATUS;
	2291	UText s = UTEXT_INITIALIZER;
	2292	utext_openUTF8(&s, "    ", -1, &status); // four spaces: four one-char matches, then one empty match at the end
	2293	m.reset(&s);
	2294	for (i=0; ; i++) {
	2295	if (m.find() == FALSE) {
	2296	break;
	2297	}
	2298	REGEX_ASSERT(m.start(status) == i);
	2299	REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
	2300	}
	2301	REGEX_ASSERT(i==5);
	2302
	2303	utext_close(&s);
	2304	}
	2305
	2306
	2307	//
	2308	// Matchers with no input string behave as if they had an empty input string.
	2309	//
	2310
	2311	{
	2312	UErrorCode status = U_ZERO_ERROR;
	2313	RegexMatcher m(".?", 0, status);
	2314	REGEX_CHECK_STATUS;
	2315	REGEX_ASSERT(m.find());
	2316	REGEX_ASSERT(m.start(status) == 0);
	2317	REGEX_ASSERT(m.input() == "");
	2318	}
	2319	{
	2320	UErrorCode status = U_ZERO_ERROR;
	2321	RegexPattern *p = RegexPattern::compile(".", 0, status);
	2322	RegexMatcher *m = p->matcher(status);
	2323	REGEX_CHECK_STATUS;
	2324
	2325	REGEX_ASSERT(m->find() == FALSE);
	2326	REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
	2327	delete m;
	2328	delete p;
	2329	}
	2330
	2331	//
	2332	// Regions
	2333	//
	2334	{
	2335	UErrorCode status = U_ZERO_ERROR;
	2336	UText testPattern = UTEXT_INITIALIZER;
	2337	UText testText = UTEXT_INITIALIZER;
	2338	regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
	2339	REGEX_VERBOSE_TEXT(&testPattern);
	2340	regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
	2341	REGEX_VERBOSE_TEXT(&testText);
	2342
	2343	RegexMatcher m(&testPattern, &testText, 0, status);
	2344	REGEX_CHECK_STATUS;
	2345	REGEX_ASSERT(m.regionStart() == 0);
	2346	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
	2347	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2348	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2349
	2350	m.region(2,4, status);
	2351	REGEX_CHECK_STATUS;
	2352	REGEX_ASSERT(m.matches(status));
	2353	REGEX_ASSERT(m.start(status)==2);
	2354	REGEX_ASSERT(m.end(status)==4);
	2355	REGEX_CHECK_STATUS;
	2356
	2357	m.reset();
	2358	REGEX_ASSERT(m.regionStart() == 0);
	2359	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
	2360
	2361	regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
	2362	REGEX_VERBOSE_TEXT(&testText);
	2363	m.reset(&testText);
	2364	REGEX_ASSERT(m.regionStart() == 0);
	2365	REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
	2366
	2367	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2368	REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
	2369	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	2370	REGEX_ASSERT(&m == &m.reset());
	2371	REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
	2372
	2373	REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
	2374	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2375	REGEX_ASSERT(&m == &m.reset());
	2376	REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
	2377
	2378	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2379	REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
	2380	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	2381	REGEX_ASSERT(&m == &m.reset());
	2382	REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
	2383
	2384	REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
	2385	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2386	REGEX_ASSERT(&m == &m.reset());
	2387	REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
	2388
	2389	utext_close(&testText);
	2390	utext_close(&testPattern);
	2391	}
	2392
	2393	//
	2394	// hitEnd() and requireEnd()
	2395	//
	2396	{
	2397	UErrorCode status = U_ZERO_ERROR;
	2398	UText testPattern = UTEXT_INITIALIZER;
	2399	UText testText = UTEXT_INITIALIZER;
	2400	const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
	2401	const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
	2402	utext_openUTF8(&testPattern, str_, -1, &status);
	2403	utext_openUTF8(&testText, str_aabb, -1, &status);
	2404
	2405	RegexMatcher m1(&testPattern, &testText, 0, status);
	2406	REGEX_ASSERT(m1.lookingAt(status) == TRUE);
	2407	REGEX_ASSERT(m1.hitEnd() == TRUE);
	2408	REGEX_ASSERT(m1.requireEnd() == FALSE);
	2409	REGEX_CHECK_STATUS;
	2410
	2411	status = U_ZERO_ERROR;
	2412	const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
	2413	utext_openUTF8(&testPattern, str_a, -1, &status);
	2414	RegexMatcher m2(&testPattern, &testText, 0, status);
	2415	REGEX_ASSERT(m2.lookingAt(status) == TRUE);
	2416	REGEX_ASSERT(m2.hitEnd() == FALSE);
	2417	REGEX_ASSERT(m2.requireEnd() == FALSE);
	2418	REGEX_CHECK_STATUS;
	2419
	2420	status = U_ZERO_ERROR;
	2421	const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
	2422	utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
	2423	RegexMatcher m3(&testPattern, &testText, 0, status);
	2424	REGEX_ASSERT(m3.lookingAt(status) == TRUE);
	2425	REGEX_ASSERT(m3.hitEnd() == TRUE);
	2426	REGEX_ASSERT(m3.requireEnd() == TRUE);
	2427	REGEX_CHECK_STATUS;
	2428
	2429	utext_close(&testText);
	2430	utext_close(&testPattern);
	2431	}
	2432	}
	2433
	2434
	2435	//---------------------------------------------------------------------------
	2436	//
	2437	// API_Replace_UTF8 API test for class RegexMatcher, testing the
	2438	// Replace family of functions.
	2439	//
	2440	//---------------------------------------------------------------------------
	2441	void RegexTest::API_Replace_UTF8() {
	2442	//
	2443	// Replace
	2444	//
	2445	int32_t flags=0;
	2446	UParseError pe;
	2447	UErrorCode status=U_ZERO_ERROR;
	2448
	2449	UText re=UTEXT_INITIALIZER;
	2450	regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
	2451	REGEX_VERBOSE_TEXT(&re);
	2452	RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
	2453	REGEX_CHECK_STATUS;
	2454
	2455	char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
	2456	// 012345678901234567
	2457	UText dataText = UTEXT_INITIALIZER;
	2458	utext_openUTF8(&dataText, data, -1, &status);
	2459	REGEX_CHECK_STATUS;
	2460	REGEX_VERBOSE_TEXT(&dataText);
	2461	RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
	2462
	2463	//
	2464	// Plain vanilla matches.
	2465	//
	2466	UnicodeString dest;
	2467	UText destText = UTEXT_INITIALIZER;
	2468	utext_openUnicodeString(&destText, &dest, &status);
	2469	UText *result;
	2470
	2471	UText replText = UTEXT_INITIALIZER;
	2472
	2473	const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
	2474	utext_openUTF8(&replText, str_yz, -1, &status);
	2475	REGEX_VERBOSE_TEXT(&replText);
	2476	result = matcher->replaceFirst(&replText, NULL, status);
	2477	REGEX_CHECK_STATUS;
	2478	const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
	2479	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
	2480	utext_close(result);
	2481	result = matcher->replaceFirst(&replText, &destText, status);
	2482	REGEX_CHECK_STATUS;
	2483	REGEX_ASSERT(result == &destText);
	2484	REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
	2485
	2486	result = matcher->replaceAll(&replText, NULL, status);
	2487	REGEX_CHECK_STATUS;
	2488	const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
	2489	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
	2490	utext_close(result);
	2491
	2492	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2493	result = matcher->replaceAll(&replText, &destText, status);
	2494	REGEX_CHECK_STATUS;
	2495	REGEX_ASSERT(result == &destText);
	2496	REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
	2497
	2498	//
	2499	// Plain vanilla non-matches.
	2500	//
	2501	const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
	2502	utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
	2503	matcher->reset(&dataText);
	2504
	2505	result = matcher->replaceFirst(&replText, NULL, status);
	2506	REGEX_CHECK_STATUS;
	2507	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2508	utext_close(result);
	2509	result = matcher->replaceFirst(&replText, &destText, status);
	2510	REGEX_CHECK_STATUS;
	2511	REGEX_ASSERT(result == &destText);
	2512	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2513
	2514	result = matcher->replaceAll(&replText, NULL, status);
	2515	REGEX_CHECK_STATUS;
	2516	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2517	utext_close(result);
	2518	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2519	result = matcher->replaceAll(&replText, &destText, status);
	2520	REGEX_CHECK_STATUS;
	2521	REGEX_ASSERT(result == &destText);
	2522	REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
	2523
	2524	//
	2525	// Empty source string
	2526	//
	2527	utext_openUTF8(&dataText, NULL, 0, &status);
	2528	matcher->reset(&dataText);
	2529
	2530	result = matcher->replaceFirst(&replText, NULL, status);
	2531	REGEX_CHECK_STATUS;
	2532	REGEX_ASSERT_UTEXT_UTF8("", result);
	2533	utext_close(result);
	2534	result = matcher->replaceFirst(&replText, &destText, status);
	2535	REGEX_CHECK_STATUS;
	2536	REGEX_ASSERT(result == &destText);
	2537	REGEX_ASSERT_UTEXT_UTF8("", result);
	2538
	2539	result = matcher->replaceAll(&replText, NULL, status);
	2540	REGEX_CHECK_STATUS;
	2541	REGEX_ASSERT_UTEXT_UTF8("", result);
	2542	utext_close(result);
	2543	result = matcher->replaceAll(&replText, &destText, status);
	2544	REGEX_CHECK_STATUS;
	2545	REGEX_ASSERT(result == &destText);
	2546	REGEX_ASSERT_UTEXT_UTF8("", result);
	2547
	2548	//
	2549	// Empty substitution string
	2550	//
	2551	utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
	2552	matcher->reset(&dataText);
	2553
	2554	utext_openUTF8(&replText, NULL, 0, &status);
	2555	result = matcher->replaceFirst(&replText, NULL, status);
	2556	REGEX_CHECK_STATUS;
	2557	const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
	2558	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
	2559	utext_close(result);
	2560	result = matcher->replaceFirst(&replText, &destText, status);
	2561	REGEX_CHECK_STATUS;
	2562	REGEX_ASSERT(result == &destText);
	2563	REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
	2564
	2565	result = matcher->replaceAll(&replText, NULL, status);
	2566	REGEX_CHECK_STATUS;
	2567	const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
	2568	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
	2569	utext_close(result);
	2570	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2571	result = matcher->replaceAll(&replText, &destText, status);
	2572	REGEX_CHECK_STATUS;
	2573	REGEX_ASSERT(result == &destText);
	2574	REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
	2575
	2576	//
	2577	// match whole string
	2578	//
	2579	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2580	utext_openUTF8(&dataText, str_abc, -1, &status);
	2581	matcher->reset(&dataText);
	2582
	2583	const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
	2584	utext_openUTF8(&replText, str_xyz, -1, &status);
	2585	result = matcher->replaceFirst(&replText, NULL, status);
	2586	REGEX_CHECK_STATUS;
	2587	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2588	utext_close(result);
	2589	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2590	result = matcher->replaceFirst(&replText, &destText, status);
	2591	REGEX_CHECK_STATUS;
	2592	REGEX_ASSERT(result == &destText);
	2593	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2594
	2595	result = matcher->replaceAll(&replText, NULL, status);
	2596	REGEX_CHECK_STATUS;
	2597	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2598	utext_close(result);
	2599	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2600	result = matcher->replaceAll(&replText, &destText, status);
	2601	REGEX_CHECK_STATUS;
	2602	REGEX_ASSERT(result == &destText);
	2603	REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
	2604
	2605	//
	2606	// Capture Group, simple case
	2607	//
	2608	const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
	2609	utext_openUTF8(&re, str_add, -1, &status);
	2610	RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
	2611	REGEX_CHECK_STATUS;
	2612
	2613	const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
	2614	utext_openUTF8(&dataText, str_abcdefg, -1, &status);
	2615	RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
	2616	REGEX_CHECK_STATUS;
	2617
	2618	const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
	2619	utext_openUTF8(&replText, str_11, -1, &status);
	2620	result = matcher2->replaceFirst(&replText, NULL, status);
	2621	REGEX_CHECK_STATUS;
	2622	const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
	2623	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
	2624	utext_close(result);
	2625	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2626	result = matcher2->replaceFirst(&replText, &destText, status);
	2627	REGEX_CHECK_STATUS;
	2628	REGEX_ASSERT(result == &destText);
	2629	REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
	2630
	2631	const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
	2632	utext_openUTF8(&replText, str_v, -1, &status);
	2633	REGEX_VERBOSE_TEXT(&replText);
	2634	result = matcher2->replaceFirst(&replText, NULL, status);
	2635	REGEX_CHECK_STATUS;
	2636	const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
	2637	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
	2638	utext_close(result);
	2639	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2640	result = matcher2->replaceFirst(&replText, &destText, status);
	2641	REGEX_CHECK_STATUS;
	2642	REGEX_ASSERT(result == &destText);
	2643	REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
	2644
	2645	const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
	2646	0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
	2647	0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
	2648	utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
	2649	result = matcher2->replaceFirst(&replText, NULL, status);
	2650	REGEX_CHECK_STATUS;
	2651	const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
	2652	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
	2653	utext_close(result);
	2654	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2655	result = matcher2->replaceFirst(&replText, &destText, status);
	2656	REGEX_CHECK_STATUS;
	2657	REGEX_ASSERT(result == &destText);
	2658	REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
	2659
	2660	unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
	2661	//unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
	2662	// 012345678901234567890123456
	2663	supplDigitChars[22] = 0xF0;
	2664	supplDigitChars[23] = 0x9D;
	2665	supplDigitChars[24] = 0x9F;
	2666	supplDigitChars[25] = 0x8F;
	2667	utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
	2668
	2669	result = matcher2->replaceFirst(&replText, NULL, status);
	2670	REGEX_CHECK_STATUS;
	2671	const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
	2672	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
	2673	utext_close(result);
	2674	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2675	result = matcher2->replaceFirst(&replText, &destText, status);
	2676	REGEX_CHECK_STATUS;
	2677	REGEX_ASSERT(result == &destText);
	2678	REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
	2679	const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
	2680	utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
	2681	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
	2682	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
	2683	utext_close(result);
	2684	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2685	REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
	2686	REGEX_ASSERT(result == &destText);
	2687	// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
	2688
	2689	//
	2690	// Replacement String with \u hex escapes
	2691	//
	2692	{
	2693	const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
	2694	const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
	2695	utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
	2696	utext_openUTF8(&replText, str_u0043, -1, &status);
	2697	matcher->reset(&dataText);
	2698
	2699	result = matcher->replaceAll(&replText, NULL, status);
	2700	REGEX_CHECK_STATUS;
	2701	const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
	2702	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
	2703	utext_close(result);
	2704	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2705	result = matcher->replaceAll(&replText, &destText, status);
	2706	REGEX_CHECK_STATUS;
	2707	REGEX_ASSERT(result == &destText);
	2708	REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
	2709	}
	2710	{
	2711	const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
	2712	utext_openUTF8(&dataText, str_abc, -1, &status);
	2713	const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
	2714	utext_openUTF8(&replText, str_U00010000, -1, &status);
	2715	matcher->reset(&dataText);
	2716
	2717	unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
	2718	// 0123456789
	2719	expected[2] = 0xF0;
	2720	expected[3] = 0x90;
	2721	expected[4] = 0x80;
	2722	expected[5] = 0x80;
	2723
	2724	result = matcher->replaceAll(&replText, NULL, status);
	2725	REGEX_CHECK_STATUS;
	2726	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
	2727	utext_close(result);
	2728	utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
	2729	result = matcher->replaceAll(&replText, &destText, status);
	2730	REGEX_CHECK_STATUS;
	2731	REGEX_ASSERT(result == &destText);
	2732	REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
	2733	}
	2734	// TODO: need more thorough testing of capture substitutions.
	2735
	2736	// Bug 4057
	2737	//
	2738	{
	2739	status = U_ZERO_ERROR;
	2740	const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
	2741	const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
	2742	const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
	2743	utext_openUTF8(&re, str_ssee, -1, &status);
	2744	utext_openUTF8(&dataText, str_blah, -1, &status);
	2745	utext_openUTF8(&replText, str_ooh, -1, &status);
	2746
	2747	RegexMatcher m(&re, 0, status);
	2748	REGEX_CHECK_STATUS;
	2749
	2750	UnicodeString result;
	2751	UText resultText = UTEXT_INITIALIZER;
	2752	utext_openUnicodeString(&resultText, &result, &status);
	2753
	2754	// Multiple finds do NOT bump up the previous appendReplacement position.
	2755	m.reset(&dataText);
	2756	m.find();
	2757	m.find();
	2758	m.appendReplacement(&resultText, &replText, status);
	2759	REGEX_CHECK_STATUS;
	2760	const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2761	REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
	2762
	2763	// After a reset into the interior of a string, appendReplacement still starts at beginning.
	2764	status = U_ZERO_ERROR;
	2765	result.truncate(0);
	2766	utext_openUnicodeString(&resultText, &result, &status);
	2767	m.reset(10, status);
	2768	m.find();
	2769	m.find();
	2770	m.appendReplacement(&resultText, &replText, status);
	2771	REGEX_CHECK_STATUS;
	2772	const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2773	REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
	2774
	2775	// find() at interior of string, appendReplacement still starts at beginning.
	2776	status = U_ZERO_ERROR;
	2777	result.truncate(0);
	2778	utext_openUnicodeString(&resultText, &result, &status);
	2779	m.reset();
	2780	m.find(10, status);
	2781	m.find();
	2782	m.appendReplacement(&resultText, &replText, status);
	2783	REGEX_CHECK_STATUS;
	2784	const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
	2785	REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
	2786
	2787	m.appendTail(&resultText, status);
	2788	const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
	2789	REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
	2790
	2791	utext_close(&resultText);
	2792	}
	2793
	2794	delete matcher2;
	2795	delete pat2;
	2796	delete matcher;
	2797	delete pat;
	2798
	2799	utext_close(&dataText);
	2800	utext_close(&replText);
	2801	utext_close(&destText);
	2802	utext_close(&re);
	2803	}
	2804
	2805
	2806	//---------------------------------------------------------------------------
	2807	//
	2808	// API_Pattern_UTF8 Test that the API for class RegexPattern is
	2809	// present and nominally working.
	2810	//
	2811	//---------------------------------------------------------------------------
	2812	void RegexTest::API_Pattern_UTF8() {
	2813	RegexPattern pata; // Test default constructor to not crash.
	2814	RegexPattern patb;
	2815
	2816	REGEX_ASSERT(pata == patb);
	2817	REGEX_ASSERT(pata == pata);
	2818
	2819	UText re1 = UTEXT_INITIALIZER;
	2820	UText re2 = UTEXT_INITIALIZER;
	2821	UErrorCode status = U_ZERO_ERROR;
	2822	UParseError pe;
	2823
	2824	const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
	2825	const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
	2826	utext_openUTF8(&re1, str_abcalmz, -1, &status);
	2827	utext_openUTF8(&re2, str_def, -1, &status);
	2828
	2829	RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
	2830	RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
	2831	REGEX_CHECK_STATUS;
	2832	REGEX_ASSERT(pat1 == pat1);
	2833	REGEX_ASSERT(*pat1 != pata);
	2834
	2835	// Assign
	2836	patb = *pat1;
	2837	REGEX_ASSERT(patb == *pat1);
	2838
	2839	// Copy Construct
	2840	RegexPattern patc(*pat1);
	2841	REGEX_ASSERT(patc == *pat1);
	2842	REGEX_ASSERT(patb == patc);
	2843	REGEX_ASSERT(pat1 != pat2);
	2844	patb = *pat2;
	2845	REGEX_ASSERT(patb != patc);
	2846	REGEX_ASSERT(patb == *pat2);
	2847
	2848	// Compile with no flags.
	2849	RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
	2850	REGEX_ASSERT(pat1a == pat1);
	2851
	2852	REGEX_ASSERT(pat1a->flags() == 0);
	2853
	2854	// Compile with different flags should be not equal
	2855	RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
	2856	REGEX_CHECK_STATUS;
	2857
	2858	REGEX_ASSERT(pat1b != pat1a);
	2859	REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
	2860	REGEX_ASSERT(pat1a->flags() == 0);
	2861	delete pat1b;
	2862
	2863	// clone
	2864	RegexPattern *pat1c = pat1->clone();
	2865	REGEX_ASSERT(pat1c == pat1);
	2866	REGEX_ASSERT(pat1c != pat2);
	2867
	2868	delete pat1c;
	2869	delete pat1a;
	2870	delete pat1;
	2871	delete pat2;
	2872
	2873	utext_close(&re1);
	2874	utext_close(&re2);
	2875
	2876
	2877	//
	2878	// Verify that a matcher created from a cloned pattern works.
	2879	// (Jitterbug 3423)
	2880	//
	2881	{
	2882	UErrorCode status = U_ZERO_ERROR;
	2883	UText pattern = UTEXT_INITIALIZER;
	2884	const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
	2885	utext_openUTF8(&pattern, str_pL, -1, &status);
	2886
	2887	RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
	2888	RegexPattern *pClone = pSource->clone();
	2889	delete pSource;
	2890	RegexMatcher *mFromClone = pClone->matcher(status);
	2891	REGEX_CHECK_STATUS;
	2892
	2893	UText input = UTEXT_INITIALIZER;
	2894	const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
	2895	utext_openUTF8(&input, str_HelloWorld, -1, &status);
	2896	mFromClone->reset(&input);
	2897	REGEX_ASSERT(mFromClone->find() == TRUE);
	2898	REGEX_ASSERT(mFromClone->group(status) == "Hello");
	2899	REGEX_ASSERT(mFromClone->find() == TRUE);
	2900	REGEX_ASSERT(mFromClone->group(status) == "World");
	2901	REGEX_ASSERT(mFromClone->find() == FALSE);
	2902	delete mFromClone;
	2903	delete pClone;
	2904
	2905	utext_close(&input);
	2906	utext_close(&pattern);
	2907	}
	2908
	2909	//
	2910	// matches convenience API
	2911	//
	2912	{
	2913	UErrorCode status = U_ZERO_ERROR;
	2914	UText pattern = UTEXT_INITIALIZER;
	2915	UText input = UTEXT_INITIALIZER;
	2916
	2917	const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
	2918	utext_openUTF8(&input, str_randominput, -1, &status);
	2919
	2920	const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
	2921	utext_openUTF8(&pattern, str_dotstar, -1, &status);
	2922	REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
	2923	REGEX_CHECK_STATUS;
	2924
	2925	const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
	2926	utext_openUTF8(&pattern, str_abc, -1, &status);
	2927	REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
	2928	REGEX_CHECK_STATUS;
	2929
	2930	const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .nput /
	2931	utext_openUTF8(&pattern, str_nput, -1, &status);
	2932	REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
	2933	REGEX_CHECK_STATUS;
	2934
	2935	utext_openUTF8(&pattern, str_randominput, -1, &status);
	2936	REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
	2937	REGEX_CHECK_STATUS;
	2938
	2939	const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .u /
	2940	utext_openUTF8(&pattern, str_u, -1, &status);
	2941	REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
	2942	REGEX_CHECK_STATUS;
	2943
	2944	utext_openUTF8(&input, str_abc, -1, &status);
	2945	utext_openUTF8(&pattern, str_abc, -1, &status);
	2946	status = U_INDEX_OUTOFBOUNDS_ERROR;
	2947	REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
	2948	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	2949
	2950	utext_close(&input);
	2951	utext_close(&pattern);
	2952	}
	2953
	2954
	2955	//
	2956	// Split()
	2957	//
	2958	status = U_ZERO_ERROR;
	2959	const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
	2960	utext_openUTF8(&re1, str_spaceplus, -1, &status);
	2961	pat1 = RegexPattern::compile(&re1, pe, status);
	2962	REGEX_CHECK_STATUS;
	2963	UnicodeString fields[10];
	2964
	2965	int32_t n;
	2966	n = pat1->split("Now is the time", fields, 10, status);
	2967	REGEX_CHECK_STATUS;
	2968	REGEX_ASSERT(n==4);
	2969	REGEX_ASSERT(fields[0]=="Now");
	2970	REGEX_ASSERT(fields[1]=="is");
	2971	REGEX_ASSERT(fields[2]=="the");
	2972	REGEX_ASSERT(fields[3]=="time");
	2973	REGEX_ASSERT(fields[4]=="");
	2974
	2975	n = pat1->split("Now is the time", fields, 2, status);
	2976	REGEX_CHECK_STATUS;
	2977	REGEX_ASSERT(n==2);
	2978	REGEX_ASSERT(fields[0]=="Now");
	2979	REGEX_ASSERT(fields[1]=="is the time");
	2980	REGEX_ASSERT(fields[2]=="the"); // left over from previous test
	2981
	2982	fields[1] = "*";
	2983	status = U_ZERO_ERROR;
	2984	n = pat1->split("Now is the time", fields, 1, status);
	2985	REGEX_CHECK_STATUS;
	2986	REGEX_ASSERT(n==1);
	2987	REGEX_ASSERT(fields[0]=="Now is the time");
	2988	REGEX_ASSERT(fields[1]=="*");
	2989	status = U_ZERO_ERROR;
	2990
	2991	n = pat1->split(" Now is the time ", fields, 10, status);
	2992	REGEX_CHECK_STATUS;
	2993	REGEX_ASSERT(n==6);
	2994	REGEX_ASSERT(fields[0]=="");
	2995	REGEX_ASSERT(fields[1]=="Now");
	2996	REGEX_ASSERT(fields[2]=="is");
	2997	REGEX_ASSERT(fields[3]=="the");
	2998	REGEX_ASSERT(fields[4]=="time");
	2999	REGEX_ASSERT(fields[5]=="");
	3000	REGEX_ASSERT(fields[6]=="");
	3001
	3002	fields[2] = "*";
	3003	n = pat1->split(" ", fields, 10, status);
	3004	REGEX_CHECK_STATUS;
	3005	REGEX_ASSERT(n==2);
	3006	REGEX_ASSERT(fields[0]=="");
	3007	REGEX_ASSERT(fields[1]=="");
	3008	REGEX_ASSERT(fields[2]=="*");
	3009
	3010	fields[0] = "foo";
	3011	n = pat1->split("", fields, 10, status);
	3012	REGEX_CHECK_STATUS;
	3013	REGEX_ASSERT(n==0);
	3014	REGEX_ASSERT(fields[0]=="foo");
	3015
	3016	delete pat1;
	3017
	3018	// split, with a pattern with (capture)
	3019	regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
	3020	pat1 = RegexPattern::compile(&re1, pe, status);
	3021	REGEX_CHECK_STATUS;
	3022
	3023	status = U_ZERO_ERROR;
	3024	fields[6] = fields[7] = "*";
	3025	n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
	3026	REGEX_CHECK_STATUS;
	3027	REGEX_ASSERT(n==7);
	3028	REGEX_ASSERT(fields[0]=="");
	3029	REGEX_ASSERT(fields[1]=="a");
	3030	REGEX_ASSERT(fields[2]=="Now is ");
	3031	REGEX_ASSERT(fields[3]=="b");
	3032	REGEX_ASSERT(fields[4]=="the time");
	3033	REGEX_ASSERT(fields[5]=="c");
	3034	REGEX_ASSERT(fields[6]=="");
	3035	REGEX_ASSERT(fields[7]=="*");
	3036	REGEX_ASSERT(status==U_ZERO_ERROR);
	3037
	3038	fields[6] = fields[7] = "*";
	3039	n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
	3040	REGEX_CHECK_STATUS;
	3041	REGEX_ASSERT(n==7);
	3042	REGEX_ASSERT(fields[0]==" ");
	3043	REGEX_ASSERT(fields[1]=="a");
	3044	REGEX_ASSERT(fields[2]=="Now is ");
	3045	REGEX_ASSERT(fields[3]=="b");
	3046	REGEX_ASSERT(fields[4]=="the time");
	3047	REGEX_ASSERT(fields[5]=="c");
	3048	REGEX_ASSERT(fields[6]=="");
	3049	REGEX_ASSERT(fields[7]=="*");
	3050
	3051	status = U_ZERO_ERROR;
	3052	fields[6] = "foo";
	3053	n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
	3054	REGEX_CHECK_STATUS;
	3055	REGEX_ASSERT(n==6);
	3056	REGEX_ASSERT(fields[0]==" ");
	3057	REGEX_ASSERT(fields[1]=="a");
	3058	REGEX_ASSERT(fields[2]=="Now is ");
	3059	REGEX_ASSERT(fields[3]=="b");
	3060	REGEX_ASSERT(fields[4]=="the time");
	3061	REGEX_ASSERT(fields[5]==" ");
	3062	REGEX_ASSERT(fields[6]=="foo");
	3063
	3064	status = U_ZERO_ERROR;
	3065	fields[5] = "foo";
	3066	n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
	3067	REGEX_CHECK_STATUS;
	3068	REGEX_ASSERT(n==5);
	3069	REGEX_ASSERT(fields[0]==" ");
	3070	REGEX_ASSERT(fields[1]=="a");
	3071	REGEX_ASSERT(fields[2]=="Now is ");
	3072	REGEX_ASSERT(fields[3]=="b");
	3073	REGEX_ASSERT(fields[4]=="the time<c>");
	3074	REGEX_ASSERT(fields[5]=="foo");
	3075
	3076	status = U_ZERO_ERROR;
	3077	fields[5] = "foo";
	3078	n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
	3079	REGEX_CHECK_STATUS;
	3080	REGEX_ASSERT(n==5);
	3081	REGEX_ASSERT(fields[0]==" ");
	3082	REGEX_ASSERT(fields[1]=="a");
	3083	REGEX_ASSERT(fields[2]=="Now is ");
	3084	REGEX_ASSERT(fields[3]=="b");
	3085	REGEX_ASSERT(fields[4]=="the time");
	3086	REGEX_ASSERT(fields[5]=="foo");
	3087
	3088	status = U_ZERO_ERROR;
	3089	n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
	3090	REGEX_CHECK_STATUS;
	3091	REGEX_ASSERT(n==4);
	3092	REGEX_ASSERT(fields[0]==" ");
	3093	REGEX_ASSERT(fields[1]=="a");
	3094	REGEX_ASSERT(fields[2]=="Now is ");
	3095	REGEX_ASSERT(fields[3]=="the time<c>");
	3096	status = U_ZERO_ERROR;
	3097	delete pat1;
	3098
	3099	regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
	3100	pat1 = RegexPattern::compile(&re1, pe, status);
	3101	REGEX_CHECK_STATUS;
	3102	n = pat1->split("1-10,20", fields, 10, status);
	3103	REGEX_CHECK_STATUS;
	3104	REGEX_ASSERT(n==5);
	3105	REGEX_ASSERT(fields[0]=="1");
	3106	REGEX_ASSERT(fields[1]=="-");
	3107	REGEX_ASSERT(fields[2]=="10");
	3108	REGEX_ASSERT(fields[3]==",");
	3109	REGEX_ASSERT(fields[4]=="20");
	3110	delete pat1;
	3111
	3112
	3113	//
	3114	// split of a UText based string, with library allocating output UTexts.
	3115	//
	3116	{
	3117	status = U_ZERO_ERROR;
	3118	RegexMatcher matcher(UnicodeString("(:)"), 0, status);
	3119	UnicodeString stringToSplit("first:second:third");
	3120	UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
	3121	REGEX_CHECK_STATUS;
	3122
	3123	UText *splits[10] = {NULL};
	3124	int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
	3125	REGEX_CHECK_STATUS;
	3126	REGEX_ASSERT(numFields == 5);
	3127	REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
	3128	REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
	3129	REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
	3130	REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
	3131	REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
	3132	REGEX_ASSERT(splits[5] == NULL);
	3133
	3134	for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
	3135	if (splits[i]) {
	3136	utext_close(splits[i]);
	3137	splits[i] = NULL;
	3138	}
	3139	}
	3140	utext_close(textToSplit);
	3141	}
	3142
	3143
	3144	//
	3145	// RegexPattern::pattern() and patternText()
	3146	//
	3147	pat1 = new RegexPattern();
	3148	REGEX_ASSERT(pat1->pattern() == "");
	3149	REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
	3150	delete pat1;
	3151	const char helloWorldInvariant = "(Hello, world)";
	3152	regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
	3153	pat1 = RegexPattern::compile(&re1, pe, status);
	3154	REGEX_CHECK_STATUS;
	3155	REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
	3156	REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
	3157	delete pat1;
	3158
	3159	utext_close(&re1);
	3160	}
	3161
	3162
	3163	//---------------------------------------------------------------------------
	3164	//
	3165	// Extended A more thorough check for features of regex patterns
	3166	// The test cases are in a separate data file,
	3167	// source/test/testdata/regextst.txt
	3168	// A description of the test data format is included in that file.
	3169	//
	3170	//---------------------------------------------------------------------------
	3171
	3172	const char *
	3173	RegexTest::getPath(char buffer[2048], const char *filename) {
	3174	UErrorCode status=U_ZERO_ERROR;
	3175	const char *testDataDirectory = IntlTest::getSourceTestData(status);
	3176	if (U_FAILURE(status)) {
	3177	errln("ERROR: loadTestData() failed - %s", u_errorName(status));
	3178	return NULL;
	3179	}
	3180
	3181	strcpy(buffer, testDataDirectory);
	3182	strcat(buffer, filename);
	3183	return buffer;
	3184	}
	3185
	3186	void RegexTest::Extended() {
	3187	char tdd[2048];
	3188	const char *srcPath;
	3189	UErrorCode status = U_ZERO_ERROR;
	3190	int32_t lineNum = 0;
	3191
	3192	//
	3193	// Open and read the test data file.
	3194	//
	3195	srcPath=getPath(tdd, "regextst.txt");
	3196	if(srcPath==NULL) {
	3197	return; /* something went wrong, error already output */
	3198	}
	3199
	3200	int32_t len;
	3201	UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
	3202	if (U_FAILURE(status)) {
	3203	return; /* something went wrong, error already output */
	3204	}
	3205
	3206	//
	3207	// Put the test data into a UnicodeString
	3208	//
	3209	UnicodeString testString(FALSE, testData, len);
	3210
	3211	RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s([\\'\\\"/])(.?)\\1"), 0, status);
	3212	RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s(#.)?$"), 0, status);
	3213	RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s([ixsmdteDEGLMQvabtyYzZ2-9])([:letter:]*)"), 0, status);
	3214
	3215	RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
	3216	UnicodeString testPattern; // The pattern for test from the test file.
	3217	UnicodeString testFlags; // the flags for a test.
	3218	UnicodeString matchString; // The marked up string to be used as input
	3219
	3220	if (U_FAILURE(status)){
	3221	dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
	3222	delete [] testData;
	3223	return;
	3224	}
	3225
	3226	//
	3227	// Loop over the test data file, once per line.
	3228	//
	3229	while (lineMat.find()) {
	3230	lineNum++;
	3231	if (U_FAILURE(status)) {
	3232	errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
	3233	}
	3234
	3235	status = U_ZERO_ERROR;
	3236	UnicodeString testLine = lineMat.group(1, status);
	3237	if (testLine.length() == 0) {
	3238	continue;
	3239	}
	3240
	3241	//
	3242	// Parse the test line. Skip blank and comment only lines.
	3243	// Separate out the three main fields - pattern, flags, target.
	3244	//
	3245
	3246	commentMat.reset(testLine);
	3247	if (commentMat.lookingAt(status)) {
	3248	// This line is a comment, or blank.
	3249	continue;
	3250	}
	3251
	3252	//
	3253	// Pull out the pattern field, remove it from the test file line.
	3254	//
	3255	quotedStuffMat.reset(testLine);
	3256	if (quotedStuffMat.lookingAt(status)) {
	3257	testPattern = quotedStuffMat.group(2, status);
	3258	testLine.remove(0, quotedStuffMat.end(0, status));
	3259	} else {
	3260	errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
	3261	continue;
	3262	}
	3263
	3264
	3265	//
	3266	// Pull out the flags from the test file line.
	3267	//
	3268	flagsMat.reset(testLine);
	3269	flagsMat.lookingAt(status); // Will always match, possibly an empty string.
	3270	testFlags = flagsMat.group(1, status);
	3271	if (flagsMat.group(2, status).length() > 0) {
	3272	errln("Bad Match flag at line %d. Scanning %c\n",
	3273	lineNum, flagsMat.group(2, status).charAt(0));
	3274	continue;
	3275	}
	3276	testLine.remove(0, flagsMat.end(0, status));
	3277
	3278	//
	3279	// Pull out the match string, as a whole.
	3280	// We'll process the <tags> later.
	3281	//
	3282	quotedStuffMat.reset(testLine);
	3283	if (quotedStuffMat.lookingAt(status)) {
	3284	matchString = quotedStuffMat.group(2, status);
	3285	testLine.remove(0, quotedStuffMat.end(0, status));
	3286	} else {
	3287	errln("Bad match string at test file line %d", lineNum);
	3288	continue;
	3289	}
	3290
	3291	//
	3292	// The only thing left from the input line should be an optional trailing comment.
	3293	//
	3294	commentMat.reset(testLine);
	3295	if (commentMat.lookingAt(status) == FALSE) {
	3296	errln("Line %d: unexpected characters at end of test line.", lineNum);
	3297	continue;
	3298	}
	3299
	3300	//
	3301	// Run the test
	3302	//
	3303	regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
	3304	}
	3305
	3306	delete [] testData;
	3307
	3308	}
	3309
	3310
	3311
	3312	//---------------------------------------------------------------------------
	3313	//
	3314	// regex_find(pattern, flags, inputString, lineNumber)
	3315	//
	3316	// Function to run a single test from the Extended (data driven) tests.
	3317	// See file test/testdata/regextst.txt for a description of the
	3318	// pattern and inputString fields, and the allowed flags.
	3319	// lineNumber is the source line in regextst.txt of the test.
	3320	//
	3321	//---------------------------------------------------------------------------
	3322
	3323
	3324	// Set a value into a UVector at position specified by a decimal number in
	3325	// a UnicodeString. This is a utility function needed by the actual test function,
	3326	// which follows.
	3327	static void set(UVector &vec, int32_t val, UnicodeString index) {
	3328	UErrorCode status=U_ZERO_ERROR;
	3329	int32_t idx = 0;
	3330	for (int32_t i=0; i<index.length(); i++) {
	3331	int32_t d=u_charDigitValue(index.charAt(i));
	3332	if (d<0) {return;}
	3333	idx = idx*10 + d;
	3334	}
	3335	while (vec.size()<idx+1) {vec.addElement(-1, status);}
	3336	vec.setElementAt(val, idx);
	3337	}
	3338
	3339	static void setInt(UVector &vec, int32_t val, int32_t idx) {
	3340	UErrorCode status=U_ZERO_ERROR;
	3341	while (vec.size()<idx+1) {vec.addElement(-1, status);}
	3342	vec.setElementAt(val, idx);
	3343	}
	3344
	3345	static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
	3346	{
	3347	UBool couldFind = TRUE;
	3348	UTEXT_SETNATIVEINDEX(utext, 0);
	3349	int32_t i = 0;
	3350	while (i < unistrOffset) {
	3351	UChar32 c = UTEXT_NEXT32(utext);
	3352	if (c != U_SENTINEL) {
	3353	i += U16_LENGTH(c);
	3354	} else {
	3355	couldFind = FALSE;
	3356	break;
	3357	}
	3358	}
	3359	nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
	3360	return couldFind;
	3361	}
	3362
	3363
	3364	void RegexTest::regex_find(const UnicodeString &pattern,
	3365	const UnicodeString &flags,
	3366	const UnicodeString &inputString,
	3367	const char *srcPath,
	3368	int32_t line) {
	3369	UnicodeString unEscapedInput;
	3370	UnicodeString deTaggedInput;
	3371
	3372	int32_t patternUTF8Length, inputUTF8Length;
	3373	char patternChars = NULL, inputChars = NULL;
	3374	UText patternText = UTEXT_INITIALIZER;
	3375	UText inputText = UTEXT_INITIALIZER;
	3376	UConverter *UTF8Converter = NULL;
	3377
	3378	UErrorCode status = U_ZERO_ERROR;
	3379	UParseError pe;
	3380	RegexPattern *parsePat = NULL;
	3381	RegexMatcher *parseMatcher = NULL;
	3382	RegexPattern callerPattern = NULL, UTF8Pattern = NULL;
	3383	RegexMatcher matcher = NULL, UTF8Matcher = NULL;
	3384	UVector groupStarts(status);
	3385	UVector groupEnds(status);
	3386	UVector groupStartsUTF8(status);
	3387	UVector groupEndsUTF8(status);
	3388	UBool isMatch = FALSE, isUTF8Match = FALSE;
	3389	UBool failed = FALSE;
	3390	int32_t numFinds;
	3391	int32_t i;
	3392	UBool useMatchesFunc = FALSE;
	3393	UBool useLookingAtFunc = FALSE;
	3394	int32_t regionStart = -1;
	3395	int32_t regionEnd = -1;
	3396	int32_t regionStartUTF8 = -1;
	3397	int32_t regionEndUTF8 = -1;
	3398
	3399
	3400	//
	3401	// Compile the caller's pattern
	3402	//
	3403	uint32_t bflags = 0;
	3404	if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
	3405	bflags \|= UREGEX_CASE_INSENSITIVE;
	3406	}
	3407	if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
	3408	bflags \|= UREGEX_COMMENTS;
	3409	}
	3410	if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
	3411	bflags \|= UREGEX_DOTALL;
	3412	}
	3413	if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
	3414	bflags \|= UREGEX_MULTILINE;
	3415	}
	3416
	3417	if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
	3418	bflags \|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
	3419	}
	3420	if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
	3421	bflags \|= UREGEX_UNIX_LINES;
	3422	}
	3423	if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
	3424	bflags \|= UREGEX_LITERAL;
	3425	}
	3426
	3427
	3428	callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
	3429	if (status != U_ZERO_ERROR) {
	3430	#if UCONFIG_NO_BREAK_ITERATION==1
	3431	// 'v' test flag means that the test pattern should not compile if ICU was configured
	3432	// to not include break iteration. RBBI is needed for Unicode word boundaries.
	3433	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
	3434	goto cleanupAndReturn;
	3435	}
	3436	#endif
	3437	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
	3438	// Expected pattern compilation error.
	3439	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
	3440	logln("Pattern Compile returns \"%s\"", u_errorName(status));
	3441	}
	3442	goto cleanupAndReturn;
	3443	} else {
	3444	// Unexpected pattern compilation error.
	3445	dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
	3446	goto cleanupAndReturn;
	3447	}
	3448	}
	3449
	3450	UTF8Converter = ucnv_open("UTF8", &status);
	3451	ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	3452
	3453	patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
	3454	status = U_ZERO_ERROR; // buffer overflow
	3455	patternChars = new char[patternUTF8Length+1];
	3456	pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
	3457	utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
	3458
	3459	if (status == U_ZERO_ERROR) {
	3460	UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
	3461
	3462	if (status != U_ZERO_ERROR) {
	3463	#if UCONFIG_NO_BREAK_ITERATION==1
	3464	// 'v' test flag means that the test pattern should not compile if ICU was configured
	3465	// to not include break iteration. RBBI is needed for Unicode word boundaries.
	3466	if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_ERROR) {
	3467	goto cleanupAndReturn;
	3468	}
	3469	#endif
	3470	if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
	3471	// Expected pattern compilation error.
	3472	if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
	3473	logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
	3474	}
	3475	goto cleanupAndReturn;
	3476	} else {
	3477	// Unexpected pattern compilation error.
	3478	errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
	3479	goto cleanupAndReturn;
	3480	}
	3481	}
	3482	}
	3483
	3484	if (UTF8Pattern == NULL) {
	3485	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
	3486	logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
	3487	status = U_ZERO_ERROR;
	3488	}
	3489
	3490	if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
	3491	callerPattern->dumpPattern();
	3492	}
	3493
	3494	if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
	3495	errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
	3496	goto cleanupAndReturn;
	3497	}
	3498
	3499
	3500	//
	3501	// Number of times find() should be called on the test string, default to 1
	3502	//
	3503	numFinds = 1;
	3504	for (i=2; i<=9; i++) {
	3505	if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
	3506	if (numFinds != 1) {
	3507	errln("Line %d: more than one digit flag. Scanning %d.", line, i);
	3508	goto cleanupAndReturn;
	3509	}
	3510	numFinds = i;
	3511	}
	3512	}
	3513
	3514	// 'M' flag. Use matches() instead of find()
	3515	if (flags.indexOf((UChar)0x4d) >= 0) {
	3516	useMatchesFunc = TRUE;
	3517	}
	3518	if (flags.indexOf((UChar)0x4c) >= 0) {
	3519	useLookingAtFunc = TRUE;
	3520	}
	3521
	3522	//
	3523	// Find the tags in the input data, remove them, and record the group boundary
	3524	// positions.
	3525	//
	3526	parsePat = RegexPattern::compile("<(/?)(r\|[0-9]+)>", 0, pe, status);
	3527	REGEX_CHECK_STATUS_L(line);
	3528
	3529	unEscapedInput = inputString.unescape();
	3530	parseMatcher = parsePat->matcher(unEscapedInput, status);
	3531	REGEX_CHECK_STATUS_L(line);
	3532	while(parseMatcher->find()) {
	3533	parseMatcher->appendReplacement(deTaggedInput, "", status);
	3534	REGEX_CHECK_STATUS;
	3535	UnicodeString groupNum = parseMatcher->group(2, status);
	3536	if (groupNum == "r") {
	3537	// <r> or </r>, a region specification within the string
	3538	if (parseMatcher->group(1, status) == "/") {
	3539	regionEnd = deTaggedInput.length();
	3540	} else {
	3541	regionStart = deTaggedInput.length();
	3542	}
	3543	} else {
	3544	// <digits> or </digits>, a group match boundary tag.
	3545	if (parseMatcher->group(1, status) == "/") {
	3546	set(groupEnds, deTaggedInput.length(), groupNum);
	3547	} else {
	3548	set(groupStarts, deTaggedInput.length(), groupNum);
	3549	}
	3550	}
	3551	}
	3552	parseMatcher->appendTail(deTaggedInput);
	3553	REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
	3554	if ((regionStart>=0 \|\| regionEnd>=0) && (regionStart<0 \|\| regionStart>regionEnd)) {
	3555	errln("mismatched <r> tags");
	3556	failed = TRUE;
	3557	goto cleanupAndReturn;
	3558	}
	3559
	3560	//
	3561	// Configure the matcher according to the flags specified with this test.
	3562	//
	3563	matcher = callerPattern->matcher(deTaggedInput, status);
	3564	REGEX_CHECK_STATUS_L(line);
	3565	if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
	3566	matcher->setTrace(TRUE);
	3567	}
	3568
	3569	if (UTF8Pattern != NULL) {
	3570	inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
	3571	status = U_ZERO_ERROR; // buffer overflow
	3572	inputChars = new char[inputUTF8Length+1];
	3573	deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
	3574	utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
	3575
	3576	if (status == U_ZERO_ERROR) {
	3577	UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
	3578	REGEX_CHECK_STATUS_L(line);
	3579	}
	3580
	3581	if (UTF8Matcher == NULL) {
	3582	// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
	3583	logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
	3584	status = U_ZERO_ERROR;
	3585	}
	3586	}
	3587
	3588	//
	3589	// Generate native indices for UTF8 versions of region and capture group info
	3590	//
	3591	if (UTF8Matcher != NULL) {
	3592	if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
	3593	if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
	3594
	3595	// Fill out the native index UVector info.
	3596	// Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
	3597	for (i=0; i<groupStarts.size(); i++) {
	3598	int32_t start = groupStarts.elementAti(i);
	3599	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
	3600	if (start >= 0) {
	3601	int32_t startUTF8;
	3602	if (!utextOffsetToNative(&inputText, start, startUTF8)) {
	3603	errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
	3604	failed = TRUE;
	3605	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3606	}
	3607	setInt(groupStartsUTF8, startUTF8, i);
	3608	}
	3609
	3610	int32_t end = groupEnds.elementAti(i);
	3611	// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
	3612	if (end >= 0) {
	3613	int32_t endUTF8;
	3614	if (!utextOffsetToNative(&inputText, end, endUTF8)) {
	3615	errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
	3616	failed = TRUE;
	3617	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3618	}
	3619	setInt(groupEndsUTF8, endUTF8, i);
	3620	}
	3621	}
	3622	}
	3623
	3624	if (regionStart>=0) {
	3625	matcher->region(regionStart, regionEnd, status);
	3626	REGEX_CHECK_STATUS_L(line);
	3627	if (UTF8Matcher != NULL) {
	3628	UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
	3629	REGEX_CHECK_STATUS_L(line);
	3630	}
	3631	}
	3632	if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
	3633	matcher->useAnchoringBounds(FALSE);
	3634	if (UTF8Matcher != NULL) {
	3635	UTF8Matcher->useAnchoringBounds(FALSE);
	3636	}
	3637	}
	3638	if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
	3639	matcher->useTransparentBounds(TRUE);
	3640	if (UTF8Matcher != NULL) {
	3641	UTF8Matcher->useTransparentBounds(TRUE);
	3642	}
	3643	}
	3644
	3645
	3646
	3647	//
	3648	// Do a find on the de-tagged input using the caller's pattern
	3649	// TODO: error on count>1 and not find().
	3650	// error on both matches() and lookingAt().
	3651	//
	3652	for (i=0; i<numFinds; i++) {
	3653	if (useMatchesFunc) {
	3654	isMatch = matcher->matches(status);
	3655	if (UTF8Matcher != NULL) {
	3656	isUTF8Match = UTF8Matcher->matches(status);
	3657	}
	3658	} else if (useLookingAtFunc) {
	3659	isMatch = matcher->lookingAt(status);
	3660	if (UTF8Matcher != NULL) {
	3661	isUTF8Match = UTF8Matcher->lookingAt(status);
	3662	}
	3663	} else {
	3664	isMatch = matcher->find();
	3665	if (UTF8Matcher != NULL) {
	3666	isUTF8Match = UTF8Matcher->find();
	3667	}
	3668	}
	3669	}
	3670	matcher->setTrace(FALSE);
	3671	if (U_FAILURE(status)) {
	3672	errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
	3673	}
	3674
	3675	//
	3676	// Match up the groups from the find() with the groups from the tags
	3677	//
	3678
	3679	// number of tags should match number of groups from find operation.
	3680	// matcher->groupCount does not include group 0, the entire match, hence the +1.
	3681	// G option in test means that capture group data is not available in the
	3682	// expected results, so the check needs to be suppressed.
	3683	if (isMatch == FALSE && groupStarts.size() != 0) {
	3684	dataerrln("Error at line %d: Match expected, but none found.", line);
	3685	failed = TRUE;
	3686	goto cleanupAndReturn;
	3687	} else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
	3688	errln("Error at line %d: Match expected, but none found. (UTF8)", line);
	3689	failed = TRUE;
	3690	goto cleanupAndReturn;
	3691	}
	3692
	3693	if (flags.indexOf((UChar)0x47 /G/) >= 0) {
	3694	// Only check for match / no match. Don't check capture groups.
	3695	if (isMatch && groupStarts.size() == 0) {
	3696	errln("Error at line %d: No match expected, but one found.", line);
	3697	failed = TRUE;
	3698	} else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
	3699	errln("Error at line %d: No match expected, but one found. (UTF8)", line);
	3700	failed = TRUE;
	3701	}
	3702	goto cleanupAndReturn;
	3703	}
	3704
	3705	REGEX_CHECK_STATUS_L(line);
	3706	for (i=0; i<=matcher->groupCount(); i++) {
	3707	int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
	3708	int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
	3709	if (matcher->start(i, status) != expectedStart) {
	3710	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
	3711	line, i, expectedStart, matcher->start(i, status));
	3712	failed = TRUE;
	3713	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3714	} else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
	3715	errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
	3716	line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
	3717	failed = TRUE;
	3718	goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
	3719	}
	3720
	3721	int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
	3722	int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
	3723	if (matcher->end(i, status) != expectedEnd) {
	3724	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
	3725	line, i, expectedEnd, matcher->end(i, status));
	3726	failed = TRUE;
	3727	// Error on end position; keep going; real error is probably yet to come as group
	3728	// end positions work from end of the input data towards the front.
	3729	} else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
	3730	errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
	3731	line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
	3732	failed = TRUE;
	3733	// Error on end position; keep going; real error is probably yet to come as group
	3734	// end positions work from end of the input data towards the front.
	3735	}
	3736	}
	3737	if ( matcher->groupCount()+1 < groupStarts.size()) {
	3738	errln("Error at line %d: Expected %d capture groups, found %d.",
	3739	line, groupStarts.size()-1, matcher->groupCount());
	3740	failed = TRUE;
	3741	}
	3742	else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
	3743	errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
	3744	line, groupStarts.size()-1, UTF8Matcher->groupCount());
	3745	failed = TRUE;
	3746	}
	3747
	3748	if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
	3749	matcher->requireEnd() == TRUE) {
	3750	errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
	3751	failed = TRUE;
	3752	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
	3753	UTF8Matcher->requireEnd() == TRUE) {
	3754	errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
	3755	failed = TRUE;
	3756	}
	3757
	3758	if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
	3759	matcher->requireEnd() == FALSE) {
	3760	errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
	3761	failed = TRUE;
	3762	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
	3763	UTF8Matcher->requireEnd() == FALSE) {
	3764	errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
	3765	failed = TRUE;
	3766	}
	3767
	3768	if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
	3769	matcher->hitEnd() == TRUE) {
	3770	errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
	3771	failed = TRUE;
	3772	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
	3773	UTF8Matcher->hitEnd() == TRUE) {
	3774	errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
	3775	failed = TRUE;
	3776	}
	3777
	3778	if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
	3779	matcher->hitEnd() == FALSE) {
	3780	errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
	3781	failed = TRUE;
	3782	} else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
	3783	UTF8Matcher->hitEnd() == FALSE) {
	3784	errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
	3785	failed = TRUE;
	3786	}
	3787
	3788
	3789	cleanupAndReturn:
	3790	if (failed) {
	3791	infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
	3792	+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
	3793	// callerPattern->dump();
	3794	}
	3795	delete parseMatcher;
	3796	delete parsePat;
	3797	delete UTF8Matcher;
	3798	delete UTF8Pattern;
	3799	delete matcher;
	3800	delete callerPattern;
	3801
	3802	utext_close(&inputText);
	3803	delete[] inputChars;
	3804	utext_close(&patternText);
	3805	delete[] patternChars;
	3806	ucnv_close(UTF8Converter);
	3807	}
	3808
	3809
	3810
	3811
	3812	//---------------------------------------------------------------------------
	3813	//
	3814	// Errors Check for error handling in patterns.
	3815	//
	3816	//---------------------------------------------------------------------------
	3817	void RegexTest::Errors() {
	3818	// \escape sequences that aren't implemented yet.
	3819	//REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
	3820
	3821	// Missing close parentheses
	3822	REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
	3823	REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
	3824	REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
	3825
	3826	// Extra close paren
	3827	REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
	3828	REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
	3829	REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
	3830
	3831	// Look-ahead, Look-behind
	3832	// TODO: add tests for unbounded length look-behinds.
	3833	REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
	3834
	3835	// Attempt to use non-default flags
	3836	{
	3837	UParseError pe;
	3838	UErrorCode status = U_ZERO_ERROR;
	3839	int32_t flags = UREGEX_CANON_EQ \|
	3840	UREGEX_COMMENTS \| UREGEX_DOTALL \|
	3841	UREGEX_MULTILINE;
	3842	RegexPattern pat1= RegexPattern::compile(".", flags, pe, status);
	3843	REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
	3844	delete pat1;
	3845	}
	3846
	3847
	3848	// Quantifiers are allowed only after something that can be quantified.
	3849	REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
	3850	REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
	3851	REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
	3852
	3853	// Mal-formed {min,max} quantifiers
	3854	REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
	3855	REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
	3856	REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
	3857	REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
	3858	REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
	3859	REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
	3860	REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
	3861	REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
	3862	REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
	3863
	3864	// Ticket 5389
	3865	REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
	3866
	3867	// Invalid Back Reference \0
	3868	// For ICU 3.8 and earlier
	3869	// For ICU versions newer than 3.8, \0 introduces an octal escape.
	3870	//
	3871	REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
	3872
	3873	}
	3874
	3875
	3876	//-------------------------------------------------------------------------------
	3877	//
	3878	// Read a text data file, convert it to UChars, and return the data
	3879	// in one big UChar * buffer, which the caller must delete.
	3880	//
	3881	//--------------------------------------------------------------------------------
	3882	UChar RegexTest::ReadAndConvertFile(const char fileName, int32_t &ulen,
	3883	const char *defEncoding, UErrorCode &status) {
	3884	UChar *retPtr = NULL;
	3885	char *fileBuf = NULL;
	3886	UConverter* conv = NULL;
	3887	FILE *f = NULL;
	3888
	3889	ulen = 0;
	3890	if (U_FAILURE(status)) {
	3891	return retPtr;
	3892	}
	3893
	3894	//
	3895	// Open the file.
	3896	//
	3897	f = fopen(fileName, "rb");
	3898	if (f == 0) {
	3899	dataerrln("Error opening test data file %s\n", fileName);
	3900	status = U_FILE_ACCESS_ERROR;
	3901	return NULL;
	3902	}
	3903	//
	3904	// Read it in
	3905	//
	3906	int32_t fileSize;
	3907	int32_t amt_read;
	3908
	3909	fseek( f, 0, SEEK_END);
	3910	fileSize = ftell(f);
	3911	fileBuf = new char[fileSize];
	3912	fseek(f, 0, SEEK_SET);
	3913	amt_read = fread(fileBuf, 1, fileSize, f);
	3914	if (amt_read != fileSize \|\| fileSize <= 0) {
	3915	errln("Error reading test data file.");
	3916	goto cleanUpAndReturn;
	3917	}
	3918
	3919	//
	3920	// Look for a Unicode Signature (BOM) on the data just read
	3921	//
	3922	int32_t signatureLength;
	3923	const char * fileBufC;
	3924	const char* encoding;
	3925
	3926	fileBufC = fileBuf;
	3927	encoding = ucnv_detectUnicodeSignature(
	3928	fileBuf, fileSize, &signatureLength, &status);
	3929	if(encoding!=NULL ){
	3930	fileBufC += signatureLength;
	3931	fileSize -= signatureLength;
	3932	} else {
	3933	encoding = defEncoding;
	3934	if (strcmp(encoding, "utf-8") == 0) {
	3935	errln("file %s is missing its BOM", fileName);
	3936	}
	3937	}
	3938
	3939	//
	3940	// Open a converter to take the rule file to UTF-16
	3941	//
	3942	conv = ucnv_open(encoding, &status);
	3943	if (U_FAILURE(status)) {
	3944	goto cleanUpAndReturn;
	3945	}
	3946
	3947	//
	3948	// Convert the rules to UChar.
	3949	// Preflight first to determine required buffer size.
	3950	//
	3951	ulen = ucnv_toUChars(conv,
	3952	NULL, // dest,
	3953	0, // destCapacity,
	3954	fileBufC,
	3955	fileSize,
	3956	&status);
	3957	if (status == U_BUFFER_OVERFLOW_ERROR) {
	3958	// Buffer Overflow is expected from the preflight operation.
	3959	status = U_ZERO_ERROR;
	3960
	3961	retPtr = new UChar[ulen+1];
	3962	ucnv_toUChars(conv,
	3963	retPtr, // dest,
	3964	ulen+1,
	3965	fileBufC,
	3966	fileSize,
	3967	&status);
	3968	}
	3969
	3970	cleanUpAndReturn:
	3971	fclose(f);
	3972	delete[] fileBuf;
	3973	ucnv_close(conv);
	3974	if (U_FAILURE(status)) {
	3975	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	3976	delete []retPtr;
	3977	retPtr = 0;
	3978	ulen = 0;
	3979	};
	3980	return retPtr;
	3981	}
	3982
	3983
	3984	//-------------------------------------------------------------------------------
	3985	//
	3986	// PerlTests - Run Perl's regular expression tests
	3987	// The input file for this test is re_tests, the standard regular
	3988	// expression test data distributed with the Perl source code.
	3989	//
	3990	// Here is Perl's description of the test data file:
	3991	//
	3992	// # The tests are in a separate file 't/op/re_tests'.
	3993	// # Each line in that file is a separate test.
	3994	// # There are five columns, separated by tabs.
	3995	// #
	3996	// # Column 1 contains the pattern, optionally enclosed in C<''>.
	3997	// # Modifiers can be put after the closing C<'>.
	3998	// #
	3999	// # Column 2 contains the string to be matched.
	4000	// #
	4001	// # Column 3 contains the expected result:
	4002	// # y expect a match
	4003	// # n expect no match
	4004	// # c expect an error
	4005	// # B test exposes a known bug in Perl, should be skipped
	4006	// # b test exposes a known bug in Perl, should be skipped if noamp
	4007	// #
	4008	// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
	4009	// #
	4010	// # Column 4 contains a string, usually C<$&>.
	4011	// #
	4012	// # Column 5 contains the expected result of double-quote
	4013	// # interpolating that string after the match, or start of error message.
	4014	// #
	4015	// # Column 6, if present, contains a reason why the test is skipped.
	4016	// # This is printed with "skipped", for harness to pick up.
	4017	// #
	4018	// # \n in the tests are interpolated, as are variables of the form ${\w+}.
	4019	// #
	4020	// # If you want to add a regular expression test that can't be expressed
	4021	// # in this format, don't add it here: put it in op/pat.t instead.
	4022	//
	4023	// For ICU, if field 3 contains an 'i', the test will be skipped.
	4024	// The test exposes is some known incompatibility between ICU and Perl regexps.
	4025	// (The i is in addition to whatever was there before.)
	4026	//
	4027	//-------------------------------------------------------------------------------
	4028	void RegexTest::PerlTests() {
	4029	char tdd[2048];
	4030	const char *srcPath;
	4031	UErrorCode status = U_ZERO_ERROR;
	4032	UParseError pe;
	4033
	4034	//
	4035	// Open and read the test data file.
	4036	//
	4037	srcPath=getPath(tdd, "re_tests.txt");
	4038	if(srcPath==NULL) {
	4039	return; /* something went wrong, error already output */
	4040	}
	4041
	4042	int32_t len;
	4043	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
	4044	if (U_FAILURE(status)) {
	4045	return; /* something went wrong, error already output */
	4046	}
	4047
	4048	//
	4049	// Put the test data into a UnicodeString
	4050	//
	4051	UnicodeString testDataString(FALSE, testData, len);
	4052
	4053	//
	4054	// Regex to break the input file into lines, and strip the new lines.
	4055	// One line per match, capture group one is the desired data.
	4056	//
	4057	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
	4058	if (U_FAILURE(status)) {
	4059	dataerrln("RegexPattern::compile() error");
	4060	return;
	4061	}
	4062	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
	4063
	4064	//
	4065	// Regex to split a test file line into fields.
	4066	// There are six fields, separated by tabs.
	4067	//
	4068	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
	4069
	4070	//
	4071	// Regex to identify test patterns with flag settings, and to separate them.
	4072	// Test patterns with flags look like 'pattern'i
	4073	// Test patterns without flags are not quoted: pattern
	4074	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
	4075	//
	4076	RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
	4077	RegexMatcher* flagMat = flagPat->matcher(status);
	4078
	4079	//
	4080	// The Perl tests reference several perl-isms, which are evaluated/substituted
	4081	// in the test data. Not being perl, this must be done explicitly. Here
	4082	// are string constants and REs for these constructs.
	4083	//
	4084	UnicodeString nulnulSrc("${nulnul}");
	4085	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
	4086	nulnul = nulnul.unescape();
	4087
	4088	UnicodeString ffffSrc("${ffff}");
	4089	UnicodeString ffff("\\uffff", -1, US_INV);
	4090	ffff = ffff.unescape();
	4091
	4092	// regexp for $-[0], $+[2], etc.
	4093	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
	4094	RegexMatcher *groupsMat = groupsPat->matcher(status);
	4095
	4096	// regexp for $0, $1, $2, etc.
	4097	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
	4098	RegexMatcher *cgMat = cgPat->matcher(status);
	4099
	4100
	4101	//
	4102	// Main Loop for the Perl Tests, runs once per line from the
	4103	// test data file.
	4104	//
	4105	int32_t lineNum = 0;
	4106	int32_t skippedUnimplementedCount = 0;
	4107	while (lineMat->find()) {
	4108	lineNum++;
	4109
	4110	//
	4111	// Get a line, break it into its fields, do the Perl
	4112	// variable substitutions.
	4113	//
	4114	UnicodeString line = lineMat->group(1, status);
	4115	UnicodeString fields[7];
	4116	fieldPat->split(line, fields, 7, status);
	4117
	4118	flagMat->reset(fields[0]);
	4119	flagMat->matches(status);
	4120	UnicodeString pattern = flagMat->group(2, status);
	4121	pattern.findAndReplace("${bang}", "!");
	4122	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
	4123	pattern.findAndReplace(ffffSrc, ffff);
	4124
	4125	//
	4126	// Identify patterns that include match flag settings,
	4127	// split off the flags, remove the extra quotes.
	4128	//
	4129	UnicodeString flagStr = flagMat->group(3, status);
	4130	if (U_FAILURE(status)) {
	4131	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	4132	return;
	4133	}
	4134	int32_t flags = 0;
	4135	const UChar UChar_c = 0x63; // Char constants for the flag letters.
	4136	const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
	4137	const UChar UChar_m = 0x6d;
	4138	const UChar UChar_x = 0x78;
	4139	const UChar UChar_y = 0x79;
	4140	if (flagStr.indexOf(UChar_i) != -1) {
	4141	flags \|= UREGEX_CASE_INSENSITIVE;
	4142	}
	4143	if (flagStr.indexOf(UChar_m) != -1) {
	4144	flags \|= UREGEX_MULTILINE;
	4145	}
	4146	if (flagStr.indexOf(UChar_x) != -1) {
	4147	flags \|= UREGEX_COMMENTS;
	4148	}
	4149
	4150	//
	4151	// Compile the test pattern.
	4152	//
	4153	status = U_ZERO_ERROR;
	4154	RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
	4155	if (status == U_REGEX_UNIMPLEMENTED) {
	4156	//
	4157	// Test of a feature that is planned for ICU, but not yet implemented.
	4158	// skip the test.
	4159	skippedUnimplementedCount++;
	4160	delete testPat;
	4161	status = U_ZERO_ERROR;
	4162	continue;
	4163	}
	4164
	4165	if (U_FAILURE(status)) {
	4166	// Some tests are supposed to generate errors.
	4167	// Only report an error for tests that are supposed to succeed.
	4168	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
	4169	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
	4170	{
	4171	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
	4172	}
	4173	status = U_ZERO_ERROR;
	4174	delete testPat;
	4175	continue;
	4176	}
	4177
	4178	if (fields[2].indexOf(UChar_i) >= 0) {
	4179	// ICU should skip this test.
	4180	delete testPat;
	4181	continue;
	4182	}
	4183
	4184	if (fields[2].indexOf(UChar_c) >= 0) {
	4185	// This pattern should have caused a compilation error, but didn't/
	4186	errln("line %d: Expected a pattern compile error, got success.", lineNum);
	4187	delete testPat;
	4188	continue;
	4189	}
	4190
	4191	//
	4192	// replace the Perl variables that appear in some of the
	4193	// match data strings.
	4194	//
	4195	UnicodeString matchString = fields[1];
	4196	matchString.findAndReplace(nulnulSrc, nulnul);
	4197	matchString.findAndReplace(ffffSrc, ffff);
	4198
	4199	// Replace any \n in the match string with an actual new-line char.
	4200	// Don't do full unescape, as this unescapes more than Perl does, which
	4201	// causes other spurious failures in the tests.
	4202	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4203
	4204
	4205
	4206	//
	4207	// Run the test, check for expected match/don't match result.
	4208	//
	4209	RegexMatcher *testMat = testPat->matcher(matchString, status);
	4210	UBool found = testMat->find();
	4211	UBool expected = FALSE;
	4212	if (fields[2].indexOf(UChar_y) >=0) {
	4213	expected = TRUE;
	4214	}
	4215	if (expected != found) {
	4216	errln("line %d: Expected %smatch, got %smatch",
	4217	lineNum, expected?"":"no ", found?"":"no " );
	4218	continue;
	4219	}
	4220
	4221	// Don't try to check expected results if there is no match.
	4222	// (Some have stuff in the expected fields)
	4223	if (!found) {
	4224	delete testMat;
	4225	delete testPat;
	4226	continue;
	4227	}
	4228
	4229	//
	4230	// Interpret the Perl expression from the fourth field of the data file,
	4231	// building up an ICU string from the results of the ICU match.
	4232	// The Perl expression will contain references to the results of
	4233	// a regex match, including the matched string, capture group strings,
	4234	// group starting and ending indicies, etc.
	4235	//
	4236	UnicodeString resultString;
	4237	UnicodeString perlExpr = fields[3];
	4238	#if SUPPORT_MUTATING_INPUT_STRING
	4239	groupsMat->reset(perlExpr);
	4240	cgMat->reset(perlExpr);
	4241	#endif
	4242
	4243	while (perlExpr.length() > 0) {
	4244	#if !SUPPORT_MUTATING_INPUT_STRING
	4245	// Perferred usage. Reset after any modification to input string.
	4246	groupsMat->reset(perlExpr);
	4247	cgMat->reset(perlExpr);
	4248	#endif
	4249
	4250	if (perlExpr.startsWith("$&")) {
	4251	resultString.append(testMat->group(status));
	4252	perlExpr.remove(0, 2);
	4253	}
	4254
	4255	else if (groupsMat->lookingAt(status)) {
	4256	// $-[0] $+[2] etc.
	4257	UnicodeString digitString = groupsMat->group(2, status);
	4258	int32_t t = 0;
	4259	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4260	UnicodeString plusOrMinus = groupsMat->group(1, status);
	4261	int32_t matchPosition;
	4262	if (plusOrMinus.compare("+") == 0) {
	4263	matchPosition = testMat->end(groupNum, status);
	4264	} else {
	4265	matchPosition = testMat->start(groupNum, status);
	4266	}
	4267	if (matchPosition != -1) {
	4268	ICU_Utility::appendNumber(resultString, matchPosition);
	4269	}
	4270	perlExpr.remove(0, groupsMat->end(status));
	4271	}
	4272
	4273	else if (cgMat->lookingAt(status)) {
	4274	// $1, $2, $3, etc.
	4275	UnicodeString digitString = cgMat->group(1, status);
	4276	int32_t t = 0;
	4277	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4278	if (U_SUCCESS(status)) {
	4279	resultString.append(testMat->group(groupNum, status));
	4280	status = U_ZERO_ERROR;
	4281	}
	4282	perlExpr.remove(0, cgMat->end(status));
	4283	}
	4284
	4285	else if (perlExpr.startsWith("@-")) {
	4286	int32_t i;
	4287	for (i=0; i<=testMat->groupCount(); i++) {
	4288	if (i>0) {
	4289	resultString.append(" ");
	4290	}
	4291	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
	4292	}
	4293	perlExpr.remove(0, 2);
	4294	}
	4295
	4296	else if (perlExpr.startsWith("@+")) {
	4297	int32_t i;
	4298	for (i=0; i<=testMat->groupCount(); i++) {
	4299	if (i>0) {
	4300	resultString.append(" ");
	4301	}
	4302	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
	4303	}
	4304	perlExpr.remove(0, 2);
	4305	}
	4306
	4307	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
	4308	// or as an escaped sequence (e.g. \n)
	4309	if (perlExpr.length() > 1) {
	4310	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
	4311	}
	4312	UChar c = perlExpr.charAt(0);
	4313	switch (c) {
	4314	case 'n': c = '\n'; break;
	4315	// add any other escape sequences that show up in the test expected results.
	4316	}
	4317	resultString.append(c);
	4318	perlExpr.remove(0, 1);
	4319	}
	4320
	4321	else {
	4322	// Any characters from the perl expression that we don't explicitly
	4323	// recognize before here are assumed to be literals and copied
	4324	// as-is to the expected results.
	4325	resultString.append(perlExpr.charAt(0));
	4326	perlExpr.remove(0, 1);
	4327	}
	4328
	4329	if (U_FAILURE(status)) {
	4330	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
	4331	break;
	4332	}
	4333	}
	4334
	4335	//
	4336	// Expected Results Compare
	4337	//
	4338	UnicodeString expectedS(fields[4]);
	4339	expectedS.findAndReplace(nulnulSrc, nulnul);
	4340	expectedS.findAndReplace(ffffSrc, ffff);
	4341	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4342
	4343
	4344	if (expectedS.compare(resultString) != 0) {
	4345	err("Line %d: Incorrect perl expression results.", lineNum);
	4346	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
	4347	}
	4348
	4349	delete testMat;
	4350	delete testPat;
	4351	}
	4352
	4353	//
	4354	// All done. Clean up allocated stuff.
	4355	//
	4356	delete cgMat;
	4357	delete cgPat;
	4358
	4359	delete groupsMat;
	4360	delete groupsPat;
	4361
	4362	delete flagMat;
	4363	delete flagPat;
	4364
	4365	delete lineMat;
	4366	delete linePat;
	4367
	4368	delete fieldPat;
	4369	delete [] testData;
	4370
	4371
	4372	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
	4373
	4374	}
	4375
	4376
	4377	//-------------------------------------------------------------------------------
	4378	//
	4379	// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
	4380	// (instead of using UnicodeStrings) to test the alternate engine.
	4381	// The input file for this test is re_tests, the standard regular
	4382	// expression test data distributed with the Perl source code.
	4383	// See PerlTests() for more information.
	4384	//
	4385	//-------------------------------------------------------------------------------
	4386	void RegexTest::PerlTestsUTF8() {
	4387	char tdd[2048];
	4388	const char *srcPath;
	4389	UErrorCode status = U_ZERO_ERROR;
	4390	UParseError pe;
	4391	LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
	4392	UText patternText = UTEXT_INITIALIZER;
	4393	char *patternChars = NULL;
	4394	int32_t patternLength;
	4395	int32_t patternCapacity = 0;
	4396	UText inputText = UTEXT_INITIALIZER;
	4397	char *inputChars = NULL;
	4398	int32_t inputLength;
	4399	int32_t inputCapacity = 0;
	4400
	4401	ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
	4402
	4403	//
	4404	// Open and read the test data file.
	4405	//
	4406	srcPath=getPath(tdd, "re_tests.txt");
	4407	if(srcPath==NULL) {
	4408	return; /* something went wrong, error already output */
	4409	}
	4410
	4411	int32_t len;
	4412	UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
	4413	if (U_FAILURE(status)) {
	4414	return; /* something went wrong, error already output */
	4415	}
	4416
	4417	//
	4418	// Put the test data into a UnicodeString
	4419	//
	4420	UnicodeString testDataString(FALSE, testData, len);
	4421
	4422	//
	4423	// Regex to break the input file into lines, and strip the new lines.
	4424	// One line per match, capture group one is the desired data.
	4425	//
	4426	RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
	4427	if (U_FAILURE(status)) {
	4428	dataerrln("RegexPattern::compile() error");
	4429	return;
	4430	}
	4431	RegexMatcher* lineMat = linePat->matcher(testDataString, status);
	4432
	4433	//
	4434	// Regex to split a test file line into fields.
	4435	// There are six fields, separated by tabs.
	4436	//
	4437	RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
	4438
	4439	//
	4440	// Regex to identify test patterns with flag settings, and to separate them.
	4441	// Test patterns with flags look like 'pattern'i
	4442	// Test patterns without flags are not quoted: pattern
	4443	// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
	4444	//
	4445	RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.)\\1(.*)"), 0, pe, status);
	4446	RegexMatcher* flagMat = flagPat->matcher(status);
	4447
	4448	//
	4449	// The Perl tests reference several perl-isms, which are evaluated/substituted
	4450	// in the test data. Not being perl, this must be done explicitly. Here
	4451	// are string constants and REs for these constructs.
	4452	//
	4453	UnicodeString nulnulSrc("${nulnul}");
	4454	UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
	4455	nulnul = nulnul.unescape();
	4456
	4457	UnicodeString ffffSrc("${ffff}");
	4458	UnicodeString ffff("\\uffff", -1, US_INV);
	4459	ffff = ffff.unescape();
	4460
	4461	// regexp for $-[0], $+[2], etc.
	4462	RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
	4463	RegexMatcher *groupsMat = groupsPat->matcher(status);
	4464
	4465	// regexp for $0, $1, $2, etc.
	4466	RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
	4467	RegexMatcher *cgMat = cgPat->matcher(status);
	4468
	4469
	4470	//
	4471	// Main Loop for the Perl Tests, runs once per line from the
	4472	// test data file.
	4473	//
	4474	int32_t lineNum = 0;
	4475	int32_t skippedUnimplementedCount = 0;
	4476	while (lineMat->find()) {
	4477	lineNum++;
	4478
	4479	//
	4480	// Get a line, break it into its fields, do the Perl
	4481	// variable substitutions.
	4482	//
	4483	UnicodeString line = lineMat->group(1, status);
	4484	UnicodeString fields[7];
	4485	fieldPat->split(line, fields, 7, status);
	4486
	4487	flagMat->reset(fields[0]);
	4488	flagMat->matches(status);
	4489	UnicodeString pattern = flagMat->group(2, status);
	4490	pattern.findAndReplace("${bang}", "!");
	4491	pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
	4492	pattern.findAndReplace(ffffSrc, ffff);
	4493
	4494	//
	4495	// Identify patterns that include match flag settings,
	4496	// split off the flags, remove the extra quotes.
	4497	//
	4498	UnicodeString flagStr = flagMat->group(3, status);
	4499	if (U_FAILURE(status)) {
	4500	errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	4501	return;
	4502	}
	4503	int32_t flags = 0;
	4504	const UChar UChar_c = 0x63; // Char constants for the flag letters.
	4505	const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
	4506	const UChar UChar_m = 0x6d;
	4507	const UChar UChar_x = 0x78;
	4508	const UChar UChar_y = 0x79;
	4509	if (flagStr.indexOf(UChar_i) != -1) {
	4510	flags \|= UREGEX_CASE_INSENSITIVE;
	4511	}
	4512	if (flagStr.indexOf(UChar_m) != -1) {
	4513	flags \|= UREGEX_MULTILINE;
	4514	}
	4515	if (flagStr.indexOf(UChar_x) != -1) {
	4516	flags \|= UREGEX_COMMENTS;
	4517	}
	4518
	4519	//
	4520	// Put the pattern in a UTF-8 UText
	4521	//
	4522	status = U_ZERO_ERROR;
	4523	patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
	4524	if (status == U_BUFFER_OVERFLOW_ERROR) {
	4525	status = U_ZERO_ERROR;
	4526	delete[] patternChars;
	4527	patternCapacity = patternLength + 1;
	4528	patternChars = new char[patternCapacity];
	4529	pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
	4530	}
	4531	utext_openUTF8(&patternText, patternChars, patternLength, &status);
	4532
	4533	//
	4534	// Compile the test pattern.
	4535	//
	4536	RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
	4537	if (status == U_REGEX_UNIMPLEMENTED) {
	4538	//
	4539	// Test of a feature that is planned for ICU, but not yet implemented.
	4540	// skip the test.
	4541	skippedUnimplementedCount++;
	4542	delete testPat;
	4543	status = U_ZERO_ERROR;
	4544	continue;
	4545	}
	4546
	4547	if (U_FAILURE(status)) {
	4548	// Some tests are supposed to generate errors.
	4549	// Only report an error for tests that are supposed to succeed.
	4550	if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
	4551	fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
	4552	{
	4553	errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
	4554	}
	4555	status = U_ZERO_ERROR;
	4556	delete testPat;
	4557	continue;
	4558	}
	4559
	4560	if (fields[2].indexOf(UChar_i) >= 0) {
	4561	// ICU should skip this test.
	4562	delete testPat;
	4563	continue;
	4564	}
	4565
	4566	if (fields[2].indexOf(UChar_c) >= 0) {
	4567	// This pattern should have caused a compilation error, but didn't/
	4568	errln("line %d: Expected a pattern compile error, got success.", lineNum);
	4569	delete testPat;
	4570	continue;
	4571	}
	4572
	4573
	4574	//
	4575	// replace the Perl variables that appear in some of the
	4576	// match data strings.
	4577	//
	4578	UnicodeString matchString = fields[1];
	4579	matchString.findAndReplace(nulnulSrc, nulnul);
	4580	matchString.findAndReplace(ffffSrc, ffff);
	4581
	4582	// Replace any \n in the match string with an actual new-line char.
	4583	// Don't do full unescape, as this unescapes more than Perl does, which
	4584	// causes other spurious failures in the tests.
	4585	matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4586
	4587	//
	4588	// Put the input in a UTF-8 UText
	4589	//
	4590	status = U_ZERO_ERROR;
	4591	inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
	4592	if (status == U_BUFFER_OVERFLOW_ERROR) {
	4593	status = U_ZERO_ERROR;
	4594	delete[] inputChars;
	4595	inputCapacity = inputLength + 1;
	4596	inputChars = new char[inputCapacity];
	4597	matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
	4598	}
	4599	utext_openUTF8(&inputText, inputChars, inputLength, &status);
	4600
	4601	//
	4602	// Run the test, check for expected match/don't match result.
	4603	//
	4604	RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
	4605	UBool found = testMat->find();
	4606	UBool expected = FALSE;
	4607	if (fields[2].indexOf(UChar_y) >=0) {
	4608	expected = TRUE;
	4609	}
	4610	if (expected != found) {
	4611	errln("line %d: Expected %smatch, got %smatch",
	4612	lineNum, expected?"":"no ", found?"":"no " );
	4613	continue;
	4614	}
	4615
	4616	// Don't try to check expected results if there is no match.
	4617	// (Some have stuff in the expected fields)
	4618	if (!found) {
	4619	delete testMat;
	4620	delete testPat;
	4621	continue;
	4622	}
	4623
	4624	//
	4625	// Interpret the Perl expression from the fourth field of the data file,
	4626	// building up an ICU string from the results of the ICU match.
	4627	// The Perl expression will contain references to the results of
	4628	// a regex match, including the matched string, capture group strings,
	4629	// group starting and ending indicies, etc.
	4630	//
	4631	UnicodeString resultString;
	4632	UnicodeString perlExpr = fields[3];
	4633
	4634	while (perlExpr.length() > 0) {
	4635	groupsMat->reset(perlExpr);
	4636	cgMat->reset(perlExpr);
	4637
	4638	if (perlExpr.startsWith("$&")) {
	4639	resultString.append(testMat->group(status));
	4640	perlExpr.remove(0, 2);
	4641	}
	4642
	4643	else if (groupsMat->lookingAt(status)) {
	4644	// $-[0] $+[2] etc.
	4645	UnicodeString digitString = groupsMat->group(2, status);
	4646	int32_t t = 0;
	4647	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4648	UnicodeString plusOrMinus = groupsMat->group(1, status);
	4649	int32_t matchPosition;
	4650	if (plusOrMinus.compare("+") == 0) {
	4651	matchPosition = testMat->end(groupNum, status);
	4652	} else {
	4653	matchPosition = testMat->start(groupNum, status);
	4654	}
	4655	if (matchPosition != -1) {
	4656	ICU_Utility::appendNumber(resultString, matchPosition);
	4657	}
	4658	perlExpr.remove(0, groupsMat->end(status));
	4659	}
	4660
	4661	else if (cgMat->lookingAt(status)) {
	4662	// $1, $2, $3, etc.
	4663	UnicodeString digitString = cgMat->group(1, status);
	4664	int32_t t = 0;
	4665	int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
	4666	if (U_SUCCESS(status)) {
	4667	resultString.append(testMat->group(groupNum, status));
	4668	status = U_ZERO_ERROR;
	4669	}
	4670	perlExpr.remove(0, cgMat->end(status));
	4671	}
	4672
	4673	else if (perlExpr.startsWith("@-")) {
	4674	int32_t i;
	4675	for (i=0; i<=testMat->groupCount(); i++) {
	4676	if (i>0) {
	4677	resultString.append(" ");
	4678	}
	4679	ICU_Utility::appendNumber(resultString, testMat->start(i, status));
	4680	}
	4681	perlExpr.remove(0, 2);
	4682	}
	4683
	4684	else if (perlExpr.startsWith("@+")) {
	4685	int32_t i;
	4686	for (i=0; i<=testMat->groupCount(); i++) {
	4687	if (i>0) {
	4688	resultString.append(" ");
	4689	}
	4690	ICU_Utility::appendNumber(resultString, testMat->end(i, status));
	4691	}
	4692	perlExpr.remove(0, 2);
	4693	}
	4694
	4695	else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
	4696	// or as an escaped sequence (e.g. \n)
	4697	if (perlExpr.length() > 1) {
	4698	perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
	4699	}
	4700	UChar c = perlExpr.charAt(0);
	4701	switch (c) {
	4702	case 'n': c = '\n'; break;
	4703	// add any other escape sequences that show up in the test expected results.
	4704	}
	4705	resultString.append(c);
	4706	perlExpr.remove(0, 1);
	4707	}
	4708
	4709	else {
	4710	// Any characters from the perl expression that we don't explicitly
	4711	// recognize before here are assumed to be literals and copied
	4712	// as-is to the expected results.
	4713	resultString.append(perlExpr.charAt(0));
	4714	perlExpr.remove(0, 1);
	4715	}
	4716
	4717	if (U_FAILURE(status)) {
	4718	errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
	4719	break;
	4720	}
	4721	}
	4722
	4723	//
	4724	// Expected Results Compare
	4725	//
	4726	UnicodeString expectedS(fields[4]);
	4727	expectedS.findAndReplace(nulnulSrc, nulnul);
	4728	expectedS.findAndReplace(ffffSrc, ffff);
	4729	expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
	4730
	4731
	4732	if (expectedS.compare(resultString) != 0) {
	4733	err("Line %d: Incorrect perl expression results.", lineNum);
	4734	infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
	4735	}
	4736
	4737	delete testMat;
	4738	delete testPat;
	4739	}
	4740
	4741	//
	4742	// All done. Clean up allocated stuff.
	4743	//
	4744	delete cgMat;
	4745	delete cgPat;
	4746
	4747	delete groupsMat;
	4748	delete groupsPat;
	4749
	4750	delete flagMat;
	4751	delete flagPat;
	4752
	4753	delete lineMat;
	4754	delete linePat;
	4755
	4756	delete fieldPat;
	4757	delete [] testData;
	4758
	4759	utext_close(&patternText);
	4760	utext_close(&inputText);
	4761
	4762	delete [] patternChars;
	4763	delete [] inputChars;
	4764
	4765
	4766	logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
	4767
	4768	}
	4769
	4770
	4771	//--------------------------------------------------------------
	4772	//
	4773	// Bug6149 Verify limits to heap expansion for backtrack stack.
	4774	// Use this pattern,
	4775	// "(a?){1,8000000}"
	4776	// Note: this originally had an unbounded upper bound, but that now has loop-breaking enabled.
	4777	// This test is likely to be fragile, as further optimizations stop
	4778	// more cases of pointless looping in the match engine.
	4779	//
	4780	//---------------------------------------------------------------
	4781	void RegexTest::Bug6149() {
	4782	UnicodeString pattern("(a?){1,8000000}");
	4783	UnicodeString s("xyz");
	4784	uint32_t flags = 0;
	4785	UErrorCode status = U_ZERO_ERROR;
	4786
	4787	RegexMatcher matcher(pattern, s, flags, status);
	4788	UBool result = false;
	4789	REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
	4790	REGEX_ASSERT(result == FALSE);
	4791	}
	4792
	4793
	4794	//
	4795	// Callbacks() Test the callback function.
	4796	// When set, callbacks occur periodically during matching operations,
	4797	// giving the application code the ability to abort the operation
	4798	// before its normal completion.
	4799	//
	4800
	4801	struct callBackContext {
	4802	RegexTest *test;
	4803	int32_t maxCalls;
	4804	int32_t numCalls;
	4805	int32_t lastSteps;
	4806	void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
	4807	};
	4808
	4809	U_CDECL_BEGIN
	4810	static UBool U_CALLCONV
	4811	testCallBackFn(const void *context, int32_t steps) {
	4812	callBackContext info = (callBackContext )context;
	4813	if (info->lastSteps+1 != steps) {
	4814	info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
	4815	}
	4816	info->lastSteps = steps;
	4817	info->numCalls++;
	4818	return (info->numCalls < info->maxCalls);
	4819	}
	4820	U_CDECL_END
	4821
	4822	void RegexTest::Callbacks() {
	4823	{
	4824	// Getter returns NULLs if no callback has been set
	4825
	4826	// The variables that the getter will fill in.
	4827	// Init to non-null values so that the action of the getter can be seen.
	4828	const void *returnedContext = &returnedContext;
	4829	URegexMatchCallback *returnedFn = &testCallBackFn;
	4830
	4831	UErrorCode status = U_ZERO_ERROR;
	4832	RegexMatcher matcher("x", 0, status);
	4833	REGEX_CHECK_STATUS;
	4834	matcher.getMatchCallback(returnedFn, returnedContext, status);
	4835	REGEX_CHECK_STATUS;
	4836	REGEX_ASSERT(returnedFn == NULL);
	4837	REGEX_ASSERT(returnedContext == NULL);
	4838	}
	4839
	4840	{
	4841	// Set and Get work
	4842	callBackContext cbInfo = {this, 0, 0, 0};
	4843	const void *returnedContext;
	4844	URegexMatchCallback *returnedFn;
	4845	UErrorCode status = U_ZERO_ERROR;
	4846	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
	4847	REGEX_CHECK_STATUS;
	4848	matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
	4849	REGEX_CHECK_STATUS;
	4850	matcher.getMatchCallback(returnedFn, returnedContext, status);
	4851	REGEX_CHECK_STATUS;
	4852	REGEX_ASSERT(returnedFn == testCallBackFn);
	4853	REGEX_ASSERT(returnedContext == &cbInfo);
	4854
	4855	// A short-running match shouldn't invoke the callback
	4856	status = U_ZERO_ERROR;
	4857	cbInfo.reset(1);
	4858	UnicodeString s = "xxx";
	4859	matcher.reset(s);
	4860	REGEX_ASSERT(matcher.matches(status));
	4861	REGEX_CHECK_STATUS;
	4862	REGEX_ASSERT(cbInfo.numCalls == 0);
	4863
	4864	// A medium-length match that runs long enough to invoke the
	4865	// callback, but not so long that the callback aborts it.
	4866	status = U_ZERO_ERROR;
	4867	cbInfo.reset(4);
	4868	s = "aaaaaaaaaaaaaaaaaaab";
	4869	matcher.reset(s);
	4870	REGEX_ASSERT(matcher.matches(status)==FALSE);
	4871	REGEX_CHECK_STATUS;
	4872	REGEX_ASSERT(cbInfo.numCalls > 0);
	4873
	4874	// A longer running match that the callback function will abort.
	4875	status = U_ZERO_ERROR;
	4876	cbInfo.reset(4);
	4877	s = "aaaaaaaaaaaaaaaaaaaaaaab";
	4878	matcher.reset(s);
	4879	REGEX_ASSERT(matcher.matches(status)==FALSE);
	4880	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4881	REGEX_ASSERT(cbInfo.numCalls == 4);
	4882
	4883	// A longer running find that the callback function will abort.
	4884	status = U_ZERO_ERROR;
	4885	cbInfo.reset(4);
	4886	s = "aaaaaaaaaaaaaaaaaaaaaaab";
	4887	matcher.reset(s);
	4888	REGEX_ASSERT(matcher.find(status)==FALSE);
	4889	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4890	REGEX_ASSERT(cbInfo.numCalls == 4);
	4891	}
	4892
	4893
	4894	}
	4895
	4896
	4897	//
	4898	// FindProgressCallbacks() Test the find "progress" callback function.
	4899	// When set, the find progress callback will be invoked during find operations
	4900	// after each return from a match attempt, giving the application the opportunity
	4901	// to terminate a long-running find operation before its normal completion.
	4902	//
	4903
	4904	struct progressCallBackContext {
	4905	RegexTest *test;
	4906	int64_t lastIndex;
	4907	int32_t maxCalls;
	4908	int32_t numCalls;
	4909	void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
	4910	};
	4911
	4912	// call-back function for find().
	4913	// Return TRUE to continue the find().
	4914	// Return FALSE to stop the find().
	4915	U_CDECL_BEGIN
	4916	static UBool U_CALLCONV
	4917	testProgressCallBackFn(const void *context, int64_t matchIndex) {
	4918	progressCallBackContext info = (progressCallBackContext )context;
	4919	info->numCalls++;
	4920	info->lastIndex = matchIndex;
	4921	// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
	4922	return (info->numCalls < info->maxCalls);
	4923	}
	4924	U_CDECL_END
	4925
	4926	void RegexTest::FindProgressCallbacks() {
	4927	{
	4928	// Getter returns NULLs if no callback has been set
	4929
	4930	// The variables that the getter will fill in.
	4931	// Init to non-null values so that the action of the getter can be seen.
	4932	const void *returnedContext = &returnedContext;
	4933	URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
	4934
	4935	UErrorCode status = U_ZERO_ERROR;
	4936	RegexMatcher matcher("x", 0, status);
	4937	REGEX_CHECK_STATUS;
	4938	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
	4939	REGEX_CHECK_STATUS;
	4940	REGEX_ASSERT(returnedFn == NULL);
	4941	REGEX_ASSERT(returnedContext == NULL);
	4942	}
	4943
	4944	{
	4945	// Set and Get work
	4946	progressCallBackContext cbInfo = {this, 0, 0, 0};
	4947	const void *returnedContext;
	4948	URegexFindProgressCallback *returnedFn;
	4949	UErrorCode status = U_ZERO_ERROR;
	4950	RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
	4951	REGEX_CHECK_STATUS;
	4952	matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
	4953	REGEX_CHECK_STATUS;
	4954	matcher.getFindProgressCallback(returnedFn, returnedContext, status);
	4955	REGEX_CHECK_STATUS;
	4956	REGEX_ASSERT(returnedFn == testProgressCallBackFn);
	4957	REGEX_ASSERT(returnedContext == &cbInfo);
	4958
	4959	// A find that matches on the initial position does NOT invoke the callback.
	4960	status = U_ZERO_ERROR;
	4961	cbInfo.reset(100);
	4962	UnicodeString s = "aaxxx";
	4963	matcher.reset(s);
	4964	#if 0
	4965	matcher.setTrace(TRUE);
	4966	#endif
	4967	REGEX_ASSERT(matcher.find(0, status));
	4968	REGEX_CHECK_STATUS;
	4969	REGEX_ASSERT(cbInfo.numCalls == 0);
	4970
	4971	// A medium running find() that causes matcher.find() to invoke our callback for each index,
	4972	// but not so many times that we interrupt the operation.
	4973	status = U_ZERO_ERROR;
	4974	s = "aaaaaaaaaaaaaaaaaaab";
	4975	cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
	4976	matcher.reset(s);
	4977	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4978	REGEX_CHECK_STATUS;
	4979	REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
	4980
	4981	// A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
	4982	status = U_ZERO_ERROR;
	4983	UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
	4984	cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
	4985	matcher.reset(s1);
	4986	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4987	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4988	REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
	4989
	4990	// Now a match that will succeed, but after an interruption
	4991	status = U_ZERO_ERROR;
	4992	UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
	4993	cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
	4994	matcher.reset(s2);
	4995	REGEX_ASSERT(matcher.find(0, status)==FALSE);
	4996	REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
	4997	// Now retry the match from where left off
	4998	cbInfo.maxCalls = 100; // No callback limit
	4999	status = U_ZERO_ERROR;
	5000	REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
	5001	REGEX_CHECK_STATUS;
	5002	}
	5003
	5004
	5005	}
	5006
	5007
	5008	//---------------------------------------------------------------------------
	5009	//
	5010	// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
	5011	// UTexts. The pure-C implementation of UText
	5012	// has no mutable backing stores, but we can
	5013	// use UnicodeString here to test the functionality.
	5014	//
	5015	//---------------------------------------------------------------------------
	5016	void RegexTest::PreAllocatedUTextCAPI () {
	5017	UErrorCode status = U_ZERO_ERROR;
	5018	URegularExpression *re;
	5019	UText patternText = UTEXT_INITIALIZER;
	5020	UnicodeString buffer;
	5021	UText bufferText = UTEXT_INITIALIZER;
	5022
	5023	utext_openUnicodeString(&bufferText, &buffer, &status);
	5024
	5025	/*
	5026	* getText() and getUText()
	5027	*/
	5028	{
	5029	UText text1 = UTEXT_INITIALIZER;
	5030	UText text2 = UTEXT_INITIALIZER;
	5031	UChar text2Chars[20];
	5032	UText *resultText;
	5033
	5034	status = U_ZERO_ERROR;
	5035	regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
	5036	regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
	5037	u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
	5038	utext_openUChars(&text2, text2Chars, -1, &status);
	5039
	5040	regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
	5041	re = uregex_openUText(&patternText, 0, NULL, &status);
	5042
	5043	/* First set a UText */
	5044	uregex_setUText(re, &text1, &status);
	5045	resultText = uregex_getUText(re, &bufferText, &status);
	5046	REGEX_CHECK_STATUS;
	5047	REGEX_ASSERT(resultText == &bufferText);
	5048	utext_setNativeIndex(resultText, 0);
	5049	utext_setNativeIndex(&text1, 0);
	5050	REGEX_ASSERT(testUTextEqual(resultText, &text1));
	5051
	5052	resultText = uregex_getUText(re, &bufferText, &status);
	5053	REGEX_CHECK_STATUS;
	5054	REGEX_ASSERT(resultText == &bufferText);
	5055	utext_setNativeIndex(resultText, 0);
	5056	utext_setNativeIndex(&text1, 0);
	5057	REGEX_ASSERT(testUTextEqual(resultText, &text1));
	5058
	5059	/* Then set a UChar * */
	5060	uregex_setText(re, text2Chars, 7, &status);
	5061	resultText = uregex_getUText(re, &bufferText, &status);
	5062	REGEX_CHECK_STATUS;
	5063	REGEX_ASSERT(resultText == &bufferText);
	5064	utext_setNativeIndex(resultText, 0);
	5065	utext_setNativeIndex(&text2, 0);
	5066	REGEX_ASSERT(testUTextEqual(resultText, &text2));
	5067
	5068	uregex_close(re);
	5069	utext_close(&text1);
	5070	utext_close(&text2);
	5071	}
	5072
	5073	/*
	5074	* group()
	5075	*/
	5076	{
	5077	UChar text1[80];
	5078	UText *actual;
	5079	UBool result;
	5080	int64_t length = 0;
	5081
	5082	u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
	5083	// 012345678901234567890123456789012345678901234567
	5084	// 0 1 2 3 4
	5085
	5086	status = U_ZERO_ERROR;
	5087	re = uregex_openC("abc(.*?)def", 0, NULL, &status);
	5088	REGEX_CHECK_STATUS;
	5089
	5090	uregex_setText(re, text1, -1, &status);
	5091	result = uregex_find(re, 0, &status);
	5092	REGEX_ASSERT(result==TRUE);
	5093
	5094	/* Capture Group 0, the full match. Should succeed. "abc interior def" */
	5095	status = U_ZERO_ERROR;
	5096	actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
	5097	REGEX_CHECK_STATUS;
	5098	REGEX_ASSERT(actual == &bufferText);
	5099	REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
	5100	REGEX_ASSERT(length == 16);
	5101	REGEX_ASSERT(utext_nativeLength(actual) == 47);
	5102
	5103	/* Capture group #1. Should succeed, matching " interior ". */
	5104	status = U_ZERO_ERROR;
	5105	actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
	5106	REGEX_CHECK_STATUS;
	5107	REGEX_ASSERT(actual == &bufferText);
	5108	REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
	5109	REGEX_ASSERT(length == 10);
	5110	REGEX_ASSERT(utext_nativeLength(actual) == 47);
	5111
	5112	/* Capture group out of range. Error. */
	5113	status = U_ZERO_ERROR;
	5114	actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
	5115	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	5116	REGEX_ASSERT(actual == &bufferText);
	5117	uregex_close(re);
	5118
	5119	}
	5120
	5121	/*
	5122	* replaceFirst()
	5123	*/
	5124	{
	5125	UChar text1[80];
	5126	UChar text2[80];
	5127	UText replText = UTEXT_INITIALIZER;
	5128	UText *result;
	5129	status = U_ZERO_ERROR;
	5130	utext_openUnicodeString(&bufferText, &buffer, &status);
	5131
	5132	status = U_ZERO_ERROR;
	5133	u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
	5134	u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
	5135	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
	5136
	5137	re = uregex_openC("x(.*?)x", 0, NULL, &status);
	5138	REGEX_CHECK_STATUS;
	5139
	5140	/* Normal case, with match */
	5141	uregex_setText(re, text1, -1, &status);
	5142	REGEX_CHECK_STATUS;
	5143	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5144	REGEX_CHECK_STATUS;
	5145	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5146	REGEX_CHECK_STATUS;
	5147	REGEX_ASSERT(result == &bufferText);
	5148	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
	5149
	5150	/* No match. Text should copy to output with no changes. */
	5151	uregex_setText(re, text2, -1, &status);
	5152	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5153	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5154	REGEX_CHECK_STATUS;
	5155	REGEX_ASSERT(result == &bufferText);
	5156	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
	5157
	5158	/* Unicode escapes */
	5159	uregex_setText(re, text1, -1, &status);
	5160	regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
	5161	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5162	result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
	5163	REGEX_CHECK_STATUS;
	5164	REGEX_ASSERT(result == &bufferText);
	5165	REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
	5166
	5167	uregex_close(re);
	5168	utext_close(&replText);
	5169	}
	5170
	5171
	5172	/*
	5173	* replaceAll()
	5174	*/
	5175	{
	5176	UChar text1[80];
	5177	UChar text2[80];
	5178	UText replText = UTEXT_INITIALIZER;
	5179	UText *result;
	5180
	5181	status = U_ZERO_ERROR;
	5182	u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
	5183	u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
	5184	regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
	5185
	5186	re = uregex_openC("x(.*?)x", 0, NULL, &status);
	5187	REGEX_CHECK_STATUS;
	5188
	5189	/* Normal case, with match */
	5190	uregex_setText(re, text1, -1, &status);
	5191	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5192	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
	5193	REGEX_CHECK_STATUS;
	5194	REGEX_ASSERT(result == &bufferText);
	5195	REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
	5196
	5197	/* No match. Text should copy to output with no changes. */
	5198	uregex_setText(re, text2, -1, &status);
	5199	utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
	5200	result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
	5201	REGEX_CHECK_STATUS;
	5202	REGEX_ASSERT(result == &bufferText);
	5203	REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
	5204
	5205	uregex_close(re);
	5206	utext_close(&replText);
	5207	}
	5208
	5209
	5210	/*
	5211	* splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
	5212	* so we don't need to test it here.
	5213	*/
	5214
	5215	utext_close(&bufferText);
	5216	utext_close(&patternText);
	5217	}
	5218
	5219
	5220	//--------------------------------------------------------------
	5221	//
	5222	// NamedCapture Check basic named capture group functionality
	5223	//
	5224	//--------------------------------------------------------------
	5225	void RegexTest::NamedCapture() {
	5226	UErrorCode status = U_ZERO_ERROR;
	5227	RegexPattern *pat = RegexPattern::compile(UnicodeString(
	5228	"abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
	5229	REGEX_CHECK_STATUS;
	5230	int32_t group = pat->groupNumberFromName("five", -1, status);
	5231	REGEX_CHECK_STATUS;
	5232	REGEX_ASSERT(5 == group);
	5233	group = pat->groupNumberFromName("three", -1, status);
	5234	REGEX_CHECK_STATUS;
	5235	REGEX_ASSERT(3 == group);
	5236
	5237	status = U_ZERO_ERROR;
	5238	group = pat->groupNumberFromName(UnicodeString("six"), status);
	5239	REGEX_CHECK_STATUS;
	5240	REGEX_ASSERT(6 == group);
	5241
	5242	status = U_ZERO_ERROR;
	5243	group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
	5244	U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5245
	5246	status = U_ZERO_ERROR;
	5247
	5248	// After copying a pattern, named capture should still work in the copy.
	5249	RegexPattern copiedPat = new RegexPattern(pat);
	5250	REGEX_ASSERT(copiedPat == pat);
	5251	delete pat; pat = NULL; // Delete original, copy should have no references back to it.
	5252
	5253	group = copiedPat->groupNumberFromName("five", -1, status);
	5254	REGEX_CHECK_STATUS;
	5255	REGEX_ASSERT(5 == group);
	5256	group = copiedPat->groupNumberFromName("three", -1, status);
	5257	REGEX_CHECK_STATUS;
	5258	REGEX_ASSERT(3 == group);
	5259	delete copiedPat;
	5260
	5261	// ReplaceAll with named capture group.
	5262	status = U_ZERO_ERROR;
	5263	UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
	5264	RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
	5265	REGEX_CHECK_STATUS;
	5266	// m.pattern().dumpPattern();
	5267	UnicodeString replacedText = m->replaceAll("'${mid}'", status);
	5268	REGEX_CHECK_STATUS;
	5269	REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
	5270	delete m;
	5271
	5272	// ReplaceAll, allowed capture group numbers.
	5273	text = UnicodeString("abcmxyz");
	5274	m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
	5275	REGEX_CHECK_STATUS;
	5276
	5277	status = U_ZERO_ERROR;
	5278	replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
	5279	REGEX_CHECK_STATUS;
	5280	REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
	5281
	5282	status = U_ZERO_ERROR;
	5283	replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
	5284	REGEX_CHECK_STATUS;
	5285	REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
	5286
	5287	status = U_ZERO_ERROR;
	5288	replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
	5289	REGEX_CHECK_STATUS;
	5290	REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
	5291
	5292	status = U_ZERO_ERROR;
	5293	replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
	5294	REGEX_CHECK_STATUS;
	5295	REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
	5296
	5297	status = U_ZERO_ERROR;
	5298	replacedText = m->replaceAll(UnicodeString("<$3>"), status);
	5299	REGEX_CHECK_STATUS;
	5300	REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
	5301
	5302	status = U_ZERO_ERROR;
	5303	replacedText = m->replaceAll(UnicodeString("<$4>"), status);
	5304	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	5305
	5306	status = U_ZERO_ERROR;
	5307	replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
	5308	REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
	5309	REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
	5310
	5311	status = U_ZERO_ERROR;
	5312	replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
	5313	REGEX_CHECK_STATUS; // that push group num out of range.
	5314	REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
	5315
	5316	status = U_ZERO_ERROR;
	5317	replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
	5318	REGEX_CHECK_STATUS;
	5319	REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
	5320
	5321	status = U_ZERO_ERROR;
	5322	replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
	5323	REGEX_CHECK_STATUS;
	5324	REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
	5325
	5326	status = U_ZERO_ERROR;
	5327	replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
	5328	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5329
	5330	status = U_ZERO_ERROR;
	5331	replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
	5332	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5333
	5334	status = U_ZERO_ERROR;
	5335	replacedText = m->replaceAll(UnicodeString("<${one"), status);
	5336	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5337
	5338	status = U_ZERO_ERROR;
	5339	replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
	5340	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5341
	5342	delete m;
	5343
	5344	// Repeat the above replaceAll() tests using the plain C API, which
	5345	// has a separate implementation internally.
	5346	// TODO: factor out the test data.
	5347
	5348	status = U_ZERO_ERROR;
	5349	URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
	5350	REGEX_CHECK_STATUS;
	5351	text = UnicodeString("abcmxyz");
	5352	uregex_setText(re, text.getBuffer(), text.length(), &status);
	5353	REGEX_CHECK_STATUS;
	5354
	5355	UChar resultBuf[100];
	5356	int32_t resultLength;
	5357	UnicodeString repl;
	5358
	5359	status = U_ZERO_ERROR;
	5360	repl = UnicodeString("<$0>");
	5361	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5362	REGEX_CHECK_STATUS;
	5363	REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
	5364
	5365	status = U_ZERO_ERROR;
	5366	repl = UnicodeString("<$1>");
	5367	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5368	REGEX_CHECK_STATUS;
	5369	REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
	5370
	5371	status = U_ZERO_ERROR;
	5372	repl = UnicodeString("<${one}>");
	5373	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5374	REGEX_CHECK_STATUS;
	5375	REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
	5376
	5377	status = U_ZERO_ERROR;
	5378	repl = UnicodeString("<$2>");
	5379	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5380	REGEX_CHECK_STATUS;
	5381	REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
	5382
	5383	status = U_ZERO_ERROR;
	5384	repl = UnicodeString("<$3>");
	5385	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5386	REGEX_CHECK_STATUS;
	5387	REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
	5388
	5389	status = U_ZERO_ERROR;
	5390	repl = UnicodeString("<$4>");
	5391	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5392	REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
	5393
	5394	status = U_ZERO_ERROR;
	5395	repl = UnicodeString("<$04>");
	5396	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5397	REGEX_CHECK_STATUS;
	5398	REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
	5399
	5400	status = U_ZERO_ERROR;
	5401	repl = UnicodeString("<$000016>");
	5402	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5403	REGEX_CHECK_STATUS;
	5404	REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
	5405
	5406	status = U_ZERO_ERROR;
	5407	repl = UnicodeString("<$3$2$1${one}>");
	5408	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5409	REGEX_CHECK_STATUS;
	5410	REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
	5411
	5412	status = U_ZERO_ERROR;
	5413	repl = UnicodeString("$3$2$1${one}");
	5414	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5415	REGEX_CHECK_STATUS;
	5416	REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
	5417
	5418	status = U_ZERO_ERROR;
	5419	repl = UnicodeString("<${noSuchName}>");
	5420	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5421	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5422
	5423	status = U_ZERO_ERROR;
	5424	repl = UnicodeString("<${invalid-name}>");
	5425	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5426	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5427
	5428	status = U_ZERO_ERROR;
	5429	repl = UnicodeString("<${one");
	5430	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5431	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5432
	5433	status = U_ZERO_ERROR;
	5434	repl = UnicodeString("$not a capture group");
	5435	resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
	5436	REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
	5437
	5438	uregex_close(re);
	5439	}
	5440
	5441	//--------------------------------------------------------------
	5442	//
	5443	// NamedCaptureLimits Patterns with huge numbers of named capture groups.
	5444	// The point is not so much what the exact limit is,
	5445	// but that a largish number doesn't hit bad non-linear performance,
	5446	// and that exceeding the limit fails cleanly.
	5447	//
	5448	//--------------------------------------------------------------
	5449	void RegexTest::NamedCaptureLimits() {
	5450	if (quick) {
	5451	logln("Skipping test. Runs in exhuastive mode only.");
	5452	return;
	5453	}
	5454	const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
	5455	const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
	5456	char nnbuf[100];
	5457	UnicodeString pattern;
	5458	int32_t nn;
	5459
	5460	for (nn=1; nn<goodLimit; nn++) {
	5461	sprintf(nnbuf, "(?<nn%d>)", nn);
	5462	pattern.append(UnicodeString(nnbuf, -1, US_INV));
	5463	}
	5464	UErrorCode status = U_ZERO_ERROR;
	5465	RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
	5466	REGEX_CHECK_STATUS;
	5467	for (nn=1; nn<goodLimit; nn++) {
	5468	sprintf(nnbuf, "nn%d", nn);
	5469	int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
	5470	REGEX_ASSERT(nn == groupNum);
	5471	if (nn != groupNum) {
	5472	break;
	5473	}
	5474	}
	5475	delete pat;
	5476
	5477	pattern.remove();
	5478	for (nn=1; nn<failLimit; nn++) {
	5479	sprintf(nnbuf, "(?<nn%d>)", nn);
	5480	pattern.append(UnicodeString(nnbuf, -1, US_INV));
	5481	}
	5482	status = U_ZERO_ERROR;
	5483	pat = RegexPattern::compile(pattern, 0, status);
	5484	REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
	5485	delete pat;
	5486	}
	5487
	5488
	5489	//--------------------------------------------------------------
	5490	//
	5491	// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
	5492	//
	5493	//---------------------------------------------------------------
	5494	void RegexTest::Bug7651() {
	5495	UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?\|(https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])\|\\$[A-Za-z]+)");
	5496	// The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
	5497	// It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
	5498	UnicodeString pattern2("((https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])\|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?\|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|\\$[A-Za-z]+)");
	5499	UnicodeString s("#ff @abcd This is test");
	5500	RegexPattern *REPattern = NULL;
	5501	RegexMatcher *REMatcher = NULL;
	5502	UErrorCode status = U_ZERO_ERROR;
	5503	UParseError pe;
	5504
	5505	REPattern = RegexPattern::compile(pattern1, 0, pe, status);
	5506	REGEX_CHECK_STATUS;
	5507	REMatcher = REPattern->matcher(s, status);
	5508	REGEX_CHECK_STATUS;
	5509	REGEX_ASSERT(REMatcher->find());
	5510	REGEX_ASSERT(REMatcher->start(status) == 0);
	5511	delete REPattern;
	5512	delete REMatcher;
	5513	status = U_ZERO_ERROR;
	5514
	5515	REPattern = RegexPattern::compile(pattern2, 0, pe, status);
	5516	REGEX_CHECK_STATUS;
	5517	REMatcher = REPattern->matcher(s, status);
	5518	REGEX_CHECK_STATUS;
	5519	REGEX_ASSERT(REMatcher->find());
	5520	REGEX_ASSERT(REMatcher->start(status) == 0);
	5521	delete REPattern;
	5522	delete REMatcher;
	5523	status = U_ZERO_ERROR;
	5524	}
	5525
	5526	void RegexTest::Bug7740() {
	5527	UErrorCode status = U_ZERO_ERROR;
	5528	UnicodeString pattern = "(a)";
	5529	UnicodeString text = "abcdef";
	5530	RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
	5531	REGEX_CHECK_STATUS;
	5532	REGEX_ASSERT(m->lookingAt(status));
	5533	REGEX_CHECK_STATUS;
	5534	status = U_ILLEGAL_ARGUMENT_ERROR;
	5535	UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
	5536	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	5537	REGEX_ASSERT(s == "");
	5538	delete m;
	5539	}
	5540
	5541	// Bug 8479: was crashing whith a Bogus UnicodeString as input.
	5542
	5543	void RegexTest::Bug8479() {
	5544	UErrorCode status = U_ZERO_ERROR;
	5545
	5546	RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL\|UREGEX_CASE_INSENSITIVE, status);
	5547	REGEX_CHECK_STATUS;
	5548	if (U_SUCCESS(status))
	5549	{
	5550	UnicodeString str;
	5551	str.setToBogus();
	5552	pMatcher->reset(str);
	5553	status = U_ZERO_ERROR;
	5554	pMatcher->matches(status);
	5555	REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
	5556	delete pMatcher;
	5557	}
	5558	}
	5559
	5560
	5561	// Bug 7029
	5562	void RegexTest::Bug7029() {
	5563	UErrorCode status = U_ZERO_ERROR;
	5564
	5565	RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
	5566	UnicodeString text = "abc.def";
	5567	UnicodeString splits[10];
	5568	REGEX_CHECK_STATUS;
	5569	int32_t numFields = pMatcher->split(text, splits, 10, status);
	5570	REGEX_CHECK_STATUS;
	5571	REGEX_ASSERT(numFields == 8);
	5572	delete pMatcher;
	5573	}
	5574
	5575	// Bug 9283
	5576	// This test is checking for the existance of any supplemental characters that case-fold
	5577	// to a bmp character.
	5578	//
	5579	// At the time of this writing there are none. If any should appear in a subsequent release
	5580	// of Unicode, the code in regular expressions compilation that determines the longest
	5581	// posssible match for a literal string will need to be enhanced.
	5582	//
	5583	// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
	5584	// for details on what to do in case of a failure of this test.
	5585	//
	5586	void RegexTest::Bug9283() {
	5587	#if !UCONFIG_NO_NORMALIZATION
	5588	UErrorCode status = U_ZERO_ERROR;
	5589	UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
	5590	REGEX_CHECK_STATUS;
	5591	int32_t index;
	5592	UChar32 c;
	5593	for (index=0; ; index++) {
	5594	c = supplementalsWithCaseFolding.charAt(index);
	5595	if (c == -1) {
	5596	break;
	5597	}
	5598	UnicodeString cf = UnicodeString(c).foldCase();
	5599	REGEX_ASSERT(cf.length() >= 2);
	5600	}
	5601	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	5602	}
	5603
	5604
	5605	void RegexTest::CheckInvBufSize() {
	5606	if(inv_next>=INV_BUFSIZ) {
	5607	errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
	5608	__FILE__, INV_BUFSIZ, inv_next);
	5609	} else {
	5610	logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
	5611	}
	5612	}
	5613
	5614
	5615	void RegexTest::Bug10459() {
	5616	UErrorCode status = U_ZERO_ERROR;
	5617	UnicodeString patternString("(txt)");
	5618	UnicodeString txtString("txt");
	5619
	5620	UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
	5621	REGEX_CHECK_STATUS;
	5622	UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
	5623	REGEX_CHECK_STATUS;
	5624
	5625	URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
	5626	REGEX_CHECK_STATUS;
	5627
	5628	uregex_setUText(icu_re, utext_txt, &status);
	5629	REGEX_CHECK_STATUS;
	5630
	5631	// The bug was that calling uregex_group() before doing a matching operation
	5632	// was causing a segfault. Only for Regular Expressions created from UText.
	5633	// It should set an U_REGEX_INVALID_STATE.
	5634
	5635	UChar buf[100];
	5636	int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
	5637	REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
	5638	REGEX_ASSERT(len == 0);
	5639
	5640	uregex_close(icu_re);
	5641	utext_close(utext_pat);
	5642	utext_close(utext_txt);
	5643	}
	5644
	5645	void RegexTest::TestCaseInsensitiveStarters() {
	5646	// Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
	5647	// become stale because of new Unicode characters.
	5648	// If it is stale, rerun the generation tool
	5649	// svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
	5650	// and replace the embedded data in i18n/regexcmp.cpp
	5651
	5652	for (UChar32 cp=0; cp<=0x10ffff; cp++) {
	5653	if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
	5654	continue;
	5655	}
	5656	UnicodeSet s(cp, cp);
	5657	s.closeOver(USET_CASE_INSENSITIVE);
	5658	UnicodeSetIterator setIter(s);
	5659	while (setIter.next()) {
	5660	if (!setIter.isString()) {
	5661	continue;
	5662	}
	5663	const UnicodeString &str = setIter.getString();
	5664	UChar32 firstChar = str.char32At(0);
	5665	UnicodeSet starters;
	5666	RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
	5667	if (!starters.contains(cp)) {
	5668	errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
	5669	return;
	5670	}
	5671	}
	5672	}
	5673	}
	5674
	5675
	5676	void RegexTest::TestBug11049() {
	5677	// Original bug report: pattern with match start consisting of one of several individual characters,
	5678	// and the text being matched ending with a supplementary character. find() would read past the
	5679	// end of the input text when searching for potential match starting points.
	5680
	5681	// To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
	5682	// detect the bad read.
	5683
	5684	TestCase11049("A\|B\|C", "a string \\ud800\\udc00", FALSE, __LINE__);
	5685	TestCase11049("A\|B\|C", "string matches at end C", TRUE, __LINE__);
	5686
	5687	// Test again with a pattern starting with a single character,
	5688	// which takes a different code path than starting with an OR expression,
	5689	// but with similar logic.
	5690	TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
	5691	TestCase11049("C", "string matches at end C", TRUE, __LINE__);
	5692	}
	5693
	5694	// Run a single test case from TestBug11049(). Internal function.
	5695	void RegexTest::TestCase11049(const char pattern, const char data, UBool expectMatch, int32_t lineNumber) {
	5696	UErrorCode status = U_ZERO_ERROR;
	5697	UnicodeString patternString = UnicodeString(pattern).unescape();
	5698	LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
	5699
	5700	UnicodeString dataString = UnicodeString(data).unescape();
	5701	UChar *exactBuffer = new UChar[dataString.length()];
	5702	dataString.extract(exactBuffer, dataString.length(), status);
	5703	UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
	5704
	5705	LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
	5706	REGEX_CHECK_STATUS;
	5707	matcher->reset(ut);
	5708	UBool result = matcher->find();
	5709	if (result != expectMatch) {
	5710	errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
	5711	__FILE__, lineNumber, expectMatch, result, pattern, data);
	5712	}
	5713
	5714	// Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
	5715	// off-by-one on find() with match at the last code point.
	5716	// Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
	5717	// because string.unescape() will only shrink it.
	5718	char * utf8Buffer = new char[uprv_strlen(data)+1];
	5719	u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
	5720	REGEX_CHECK_STATUS;
	5721	ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
	5722	REGEX_CHECK_STATUS;
	5723	matcher->reset(ut);
	5724	result = matcher->find();
	5725	if (result != expectMatch) {
	5726	errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
	5727	__FILE__, lineNumber, expectMatch, result, pattern, data);
	5728	}
	5729	delete [] utf8Buffer;
	5730
	5731	utext_close(ut);
	5732	delete [] exactBuffer;
	5733	}
	5734
	5735
	5736	void RegexTest::TestBug11371() {
	5737	if (quick) {
	5738	logln("Skipping test. Runs in exhuastive mode only.");
	5739	return;
	5740	}
	5741	UErrorCode status = U_ZERO_ERROR;
	5742	UnicodeString patternString;
	5743
	5744	for (int i=0; i<8000000; i++) {
	5745	patternString.append(UnicodeString("()"));
	5746	}
	5747	LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
	5748	if (status != U_REGEX_PATTERN_TOO_BIG) {
	5749	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
	5750	__FILE__, __LINE__, u_errorName(status));
	5751	}
	5752
	5753	status = U_ZERO_ERROR;
	5754	patternString = "(";
	5755	for (int i=0; i<20000000; i++) {
	5756	patternString.append(UnicodeString("A++"));
	5757	}
	5758	patternString.append(UnicodeString("){0}B++"));
	5759	LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
	5760	if (status != U_REGEX_PATTERN_TOO_BIG) {
	5761	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
	5762	__FILE__, __LINE__, u_errorName(status));
	5763	}
	5764
	5765	// Pattern with too much string data, such that string indexes overflow operand data field size
	5766	// in compiled instruction.
	5767	status = U_ZERO_ERROR;
	5768	patternString = "";
	5769	while (patternString.length() < 0x00ffffff) {
	5770	patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
	5771	}
	5772	patternString.append(UnicodeString("X? trailing string"));
	5773	LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
	5774	if (status != U_REGEX_PATTERN_TOO_BIG) {
	5775	errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
	5776	__FILE__, __LINE__, u_errorName(status));
	5777	}
	5778	}
	5779
	5780	void RegexTest::TestBug11480() {
	5781	// C API, get capture group of a group that does not participate in the match.
	5782	// (Returns a zero length string, with nul termination,
	5783	// indistinguishable from a group with a zero lenght match.)
	5784
	5785	UErrorCode status = U_ZERO_ERROR;
	5786	URegularExpression *re = uregex_openC("(A)\|(B)", 0, NULL, &status);
	5787	REGEX_CHECK_STATUS;
	5788	UnicodeString text = UNICODE_STRING_SIMPLE("A");
	5789	uregex_setText(re, text.getBuffer(), text.length(), &status);
	5790	REGEX_CHECK_STATUS;
	5791	REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
	5792	UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
	5793	int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
	5794	REGEX_ASSERT(length == 0);
	5795	REGEX_ASSERT(buf[0] == 13);
	5796	REGEX_ASSERT(buf[1] == 0);
	5797	REGEX_ASSERT(buf[2] == 13);
	5798	uregex_close(re);
	5799	}
	5800
	5801
	5802	#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */