git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/usettest.cpp

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	********************************************************************************
	5	* Copyright (C) 1999-2016 International Business Machines Corporation and
	6	* others. All Rights Reserved.
	7	********************************************************************************
	8	* Date Name Description
	9	* 10/20/99 alan Creation.
	10	* 03/22/2000 Madhu Added additional tests
	11	********************************************************************************
	12	*/
	13
	14	#include <stdio.h>
	15
	16	#include <string.h>
	17	#include "unicode/utypes.h"
	18	#include "usettest.h"
	19	#include "unicode/ucnv.h"
	20	#include "unicode/uniset.h"
	21	#include "unicode/uchar.h"
	22	#include "unicode/usetiter.h"
	23	#include "unicode/ustring.h"
	24	#include "unicode/parsepos.h"
	25	#include "unicode/symtable.h"
	26	#include "unicode/utf8.h"
	27	#include "unicode/utf16.h"
	28	#include "unicode/uversion.h"
	29	#include "cmemory.h"
	30	#include "hash.h"
	31
	// Calls dataerrln() with the current file, line and ICU error name when
	// `status` holds a failure code. Execution of the calling test continues.
	#define TEST_ASSERT_SUCCESS(status) { \
	    if (U_FAILURE(status)) { \
	        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
	                  __FILE__, __LINE__, u_errorName(status)); \
	    } \
	}
	35
	// Calls dataerrln() with the current file and line when `expr` evaluates
	// to false. Execution of the calling test continues.
	#define TEST_ASSERT(expr) { \
	    if (!(expr)) { \
	        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
	    } \
	}
	38
	39	UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
	40	UnicodeString pat;
	41	set.toPattern(pat);
	42	return left + UnicodeSetTest::escape(pat);
	43	}
	44
	45	UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
	46	}
	47
	48	UConverter *UnicodeSetTest::openUTF8Converter() {
	49	if(utf8Cnv==NULL) {
	50	UErrorCode errorCode=U_ZERO_ERROR;
	51	utf8Cnv=ucnv_open("UTF-8", &errorCode);
	52	}
	53	return utf8Cnv;
	54	}
	55
	56	UnicodeSetTest::~UnicodeSetTest() {
	57	ucnv_close(utf8Cnv);
	58	}
	59
	60	void
	61	UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
	62	const char* &name, char* /par/) {
	63	if (exec) {
	64	logln(u"TestSuite UnicodeSetTest");
	65	}
	66	TESTCASE_AUTO_BEGIN;
	67	TESTCASE_AUTO(TestPatterns);
	68	TESTCASE_AUTO(TestAddRemove);
	69	TESTCASE_AUTO(TestCategories);
	70	TESTCASE_AUTO(TestCloneEqualHash);
	71	TESTCASE_AUTO(TestMinimalRep);
	72	TESTCASE_AUTO(TestAPI);
	73	TESTCASE_AUTO(TestScriptSet);
	74	TESTCASE_AUTO(TestPropertySet);
	75	TESTCASE_AUTO(TestClone);
	76	TESTCASE_AUTO(TestExhaustive);
	77	TESTCASE_AUTO(TestToPattern);
	78	TESTCASE_AUTO(TestIndexOf);
	79	TESTCASE_AUTO(TestStrings);
	80	TESTCASE_AUTO(Testj2268);
	81	TESTCASE_AUTO(TestCloseOver);
	82	TESTCASE_AUTO(TestEscapePattern);
	83	TESTCASE_AUTO(TestInvalidCodePoint);
	84	TESTCASE_AUTO(TestSymbolTable);
	85	TESTCASE_AUTO(TestSurrogate);
	86	TESTCASE_AUTO(TestPosixClasses);
	87	TESTCASE_AUTO(TestIteration);
	88	TESTCASE_AUTO(TestFreezable);
	89	TESTCASE_AUTO(TestSpan);
	90	TESTCASE_AUTO(TestStringSpan);
	91	TESTCASE_AUTO(TestUCAUnsafeBackwards);
	92	TESTCASE_AUTO(TestIntOverflow);
	93	TESTCASE_AUTO(TestUnusedCcc);
	94	TESTCASE_AUTO(TestDeepPattern);
	95	TESTCASE_AUTO_END;
	96	}
	97
	// Sentinel placed inside the expected-string arrays handed to
	// expectToPattern() (e.g. {"aa", "ab", NOT, "ac", NULL}).
	// NOTE(review): presumably entries after NOT are strings the set must
	// NOT contain -- confirm against expectToPattern().
	98	static const char NOT[] = "%%%%";
	99
	100	/**
	101	* UVector was improperly copying contents
	102	* This code will crash this is still true
	103	*/
	104	void UnicodeSetTest::Testj2268() {
	105	UnicodeSet t;
	106	t.add(UnicodeString("abc"));
	107	UnicodeSet test(t);
	108	UnicodeString ustrPat;
	109	test.toPattern(ustrPat, TRUE);
	110	}
	111
	112	/**
	113	* Test toPattern().
	114	*/
	115	void UnicodeSetTest::TestToPattern() {
	116	UErrorCode ec = U_ZERO_ERROR;
	117
	118	// Test that toPattern() round trips with syntax characters and
	119	// whitespace.
	120	{
	121	static const char* OTHER_TOPATTERN_TESTS[] = {
	122	"[[:latin:]&[:greek:]]",
	123	"[[:latin:]-[:greek:]]",
	124	"[:nonspacing mark:]",
	125	NULL
	126	};
	127
	128	for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
	129	ec = U_ZERO_ERROR;
	130	UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
	131	if (U_FAILURE(ec)) {
	132	dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
	133	continue;
	134	}
	135	checkPat(OTHER_TOPATTERN_TESTS[j], s);
	136	}
	137
	138	for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
	139	if ((i <= 0xFF && !u_isalpha(i)) \|\| u_isspace(i)) {
	140
	141	// check various combinations to make sure they all work.
	142	if (i != 0 && !toPatternAux(i, i)){
	143	continue;
	144	}
	145	if (!toPatternAux(0, i)){
	146	continue;
	147	}
	148	if (!toPatternAux(i, 0xFFFF)){
	149	continue;
	150	}
	151	}
	152	}
	153	}
	154
	155	// Test pattern behavior of multicharacter strings.
	156	{
	157	ec = U_ZERO_ERROR;
	158	UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
	159
	160	// This loop isn't a loop. It's here to make the compiler happy.
	161	// If you're curious, try removing it and changing the 'break'
	162	// statements (except for the last) to goto's.
	163	for (;;) {
	164	if (U_FAILURE(ec)) break;
	165	const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
	166	expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
	167
	168	s->add("ac");
	169	const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
	170	expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
	171
	172	s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
	173	if (U_FAILURE(ec)) break;
	174	const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
	175	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
	176
	177	s->add("[]");
	178	const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
	179	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
	180
	181	s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
	182	if (U_FAILURE(ec)) break;
	183	const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
	184	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
	185
	186	// j2189
	187	s->clear();
	188	s->add(UnicodeString("abc", ""));
	189	s->add(UnicodeString("abc", ""));
	190	const char* exp6[] = {"abc", NOT, "ab", NULL};
	191	expectToPattern(*s, "[{abc}]", exp6);
	192
	193	break;
	194	}
	195
	196	if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
	197	delete s;
	198	}
	199
	200	// JB#3400: For 2 character ranges prefer [ab] to [a-b]
	201	UnicodeSet s;
	202	s.add((UChar)97, (UChar)98); // 'a', 'b'
	203	expectToPattern(s, "[ab]", NULL);
	204	}
	205
	206	UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
	207
	208	// use Integer.toString because Utility.hex doesn't handle ints
	209	UnicodeString pat = "";
	210	// TODO do these in hex
	211	//String source = "0x" + Integer.toString(start,16).toUpperCase();
	212	//if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
	213	UnicodeString source;
	214	source = source + (uint32_t)start;
	215	if (start != end)
	216	source = source + ".." + (uint32_t)end;
	217	UnicodeSet testSet;
	218	testSet.add(start, end);
	219	return checkPat(source, testSet);
	220	}
	221
	222	UBool UnicodeSetTest::checkPat(const UnicodeString& source,
	223	const UnicodeSet& testSet) {
	224	// What we want to make sure of is that a pattern generated
	225	// by toPattern(), with or without escaped unprintables, can
	226	// be passed back into the UnicodeSet constructor.
	227	UnicodeString pat0;
	228
	229	testSet.toPattern(pat0, TRUE);
	230
	231	if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
	232
	233	//String pat1 = unescapeLeniently(pat0);
	234	//if (!checkPat(source + " (in code)", testSet, pat1)) return false;
	235
	236	UnicodeString pat2;
	237	testSet.toPattern(pat2, FALSE);
	238	if (!checkPat(source, testSet, pat2)) return FALSE;
	239
	240	//String pat3 = unescapeLeniently(pat2);
	241	// if (!checkPat(source + " (in code)", testSet, pat3)) return false;
	242
	243	//logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
	244	logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
	245	return TRUE;
	246	}
	247
	248	UBool UnicodeSetTest::checkPat(const UnicodeString& source,
	249	const UnicodeSet& testSet,
	250	const UnicodeString& pat) {
	251	UErrorCode ec = U_ZERO_ERROR;
	252	UnicodeSet testSet2(pat, ec);
	253	if (testSet2 != testSet) {
	254	errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
	255	return FALSE;
	256	}
	257	return TRUE;
	258	}
	259
	260	void
	261	UnicodeSetTest::TestPatterns(void) {
	262	UnicodeSet set;
	263	expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
	264	expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
	265	expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
	266	expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
	267	expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
	268	expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
	269
	270	// Throw in a test of complement
	271	set.complement();
	272	UnicodeString exp;
	273	exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
	274	expectPairs(set, exp);
	275	}
	276
	277	void
	278	UnicodeSetTest::TestCategories(void) {
	279	UErrorCode status = U_ZERO_ERROR;
	280	const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
	281	UnicodeSet set(pat, status);
	282	if (U_FAILURE(status)) {
	283	dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
	284	return;
	285	} else {
	286	expectContainment(set, pat, "ABC", "abc");
	287	}
	288
	289	UChar32 i;
	290	int32_t failures = 0;
	291	// Make sure generation of L doesn't pollute cached Lu set
	292	// First generate L, then Lu
	293	set.applyPattern("[:L:]", status);
	294	if (U_FAILURE(status)) { errln("FAIL"); return; }
	295	for (i=0; i<0x200; ++i) {
	296	UBool l = u_isalpha((UChar)i);
	297	if (l != set.contains(i)) {
	298	errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
	299	set.contains(i));
	300	if (++failures == 10) break;
	301	}
	302	}
	303
	304	set.applyPattern("[:Lu:]", status);
	305	if (U_FAILURE(status)) { errln("FAIL"); return; }
	306	for (i=0; i<0x200; ++i) {
	307	UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
	308	if (lu != set.contains(i)) {
	309	errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
	310	set.contains(i));
	311	if (++failures == 20) break;
	312	}
	313	}
	314	}
	315	void
	316	UnicodeSetTest::TestCloneEqualHash(void) {
	317	UErrorCode status = U_ZERO_ERROR;
	318	// set1 and set2 used to be built with the obsolete constructor taking
	319	// UCharCategory values; replaced with pattern constructors
	320	// markus 20030502
	321	UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
	322	UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
	323	if (U_FAILURE(status)){
	324	dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
	325	return;
	326	}
	327	UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
	328	UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
	329	if (U_FAILURE(status)){
	330	errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
	331	return;
	332	}
	333
	334	if (set1 != set1a) {
	335	errln("FAIL: category constructor for Ll broken");
	336	}
	337	if (set2 != set2a) {
	338	errln("FAIL: category constructor for Nd broken");
	339	}
	340	delete set1a;
	341	delete set2a;
	342
	343	logln("Testing copy construction");
	344	UnicodeSet set1copy=new UnicodeSet(set1);
	345	if(set1 != set1copy \|\| set1 == set2 \|\|
	346	getPairs(set1) != getPairs(set1copy) \|\|
	347	set1->hashCode() != set1copy->hashCode()){
	348	errln("FAIL : Error in copy construction");
	349	return;
	350	}
	351
	352	logln("Testing =operator");
	353	UnicodeSet set1equal=*set1;
	354	UnicodeSet set2equal=*set2;
	355	if(set1equal != set1 \|\| set1equal != set1copy \|\| set2equal != *set2 \|\|
	356	set2equal == set1 \|\| set2equal == set1copy \|\| set2equal == set1equal){
	357	errln("FAIL: Error in =operator");
	358	}
	359
	360	logln("Testing clone()");
	361	UnicodeSet set1clone=(UnicodeSet)set1->clone();
	362	UnicodeSet set2clone=(UnicodeSet)set2->clone();
	363	if(set1clone != set1 \|\| set1clone != set1copy \|\| *set1clone != set1equal \|\|
	364	set2clone != set2 \|\| set2clone == set1copy \|\| *set2clone != set2equal \|\|
	365	set2clone == set1 \|\| set2clone == set1equal \|\| set2clone == *set1clone){
	366	errln("FAIL: Error in clone");
	367	}
	368
	369	logln("Testing hashcode");
	370	if(set1->hashCode() != set1equal.hashCode() \|\| set1->hashCode() != set1clone->hashCode() \|\|
	371	set2->hashCode() != set2equal.hashCode() \|\| set2->hashCode() != set2clone->hashCode() \|\|
	372	set1copy->hashCode() != set1equal.hashCode() \|\| set1copy->hashCode() != set1clone->hashCode() \|\|
	373	set1->hashCode() == set2->hashCode() \|\| set1copy->hashCode() == set2->hashCode() \|\|
	374	set2->hashCode() == set1clone->hashCode() \|\| set2->hashCode() == set1equal.hashCode() ){
	375	errln("FAIL: Error in hashCode()");
	376	}
	377
	378	delete set1;
	379	delete set1copy;
	380	delete set2;
	381	delete set1clone;
	382	delete set2clone;
	383
	384
	385	}
	386	void
	387	UnicodeSetTest::TestAddRemove(void) {
	388	UnicodeSet set; // Construct empty set
	389	doAssert(set.isEmpty() == TRUE, "set should be empty");
	390	doAssert(set.size() == 0, "size should be 0");
	391	set.complement();
	392	doAssert(set.size() == 0x110000, "size should be 0x110000");
	393	set.clear();
	394	set.add(0x0061, 0x007a);
	395	expectPairs(set, "az");
	396	doAssert(set.isEmpty() == FALSE, "set should not be empty");
	397	doAssert(set.size() != 0, "size should not be equal to 0");
	398	doAssert(set.size() == 26, "size should be equal to 26");
	399	set.remove(0x006d, 0x0070);
	400	expectPairs(set, "alqz");
	401	doAssert(set.size() == 22, "size should be equal to 22");
	402	set.remove(0x0065, 0x0067);
	403	expectPairs(set, "adhlqz");
	404	doAssert(set.size() == 19, "size should be equal to 19");
	405	set.remove(0x0064, 0x0069);
	406	expectPairs(set, "acjlqz");
	407	doAssert(set.size() == 16, "size should be equal to 16");
	408	set.remove(0x0063, 0x0072);
	409	expectPairs(set, "absz");
	410	doAssert(set.size() == 10, "size should be equal to 10");
	411	set.add(0x0066, 0x0071);
	412	expectPairs(set, "abfqsz");
	413	doAssert(set.size() == 22, "size should be equal to 22");
	414	set.remove(0x0061, 0x0067);
	415	expectPairs(set, "hqsz");
	416	set.remove(0x0061, 0x007a);
	417	expectPairs(set, "");
	418	doAssert(set.isEmpty() == TRUE, "set should be empty");
	419	doAssert(set.size() == 0, "size should be 0");
	420	set.add(0x0061);
	421	doAssert(set.isEmpty() == FALSE, "set should not be empty");
	422	doAssert(set.size() == 1, "size should not be equal to 1");
	423	set.add(0x0062);
	424	set.add(0x0063);
	425	expectPairs(set, "ac");
	426	doAssert(set.size() == 3, "size should not be equal to 3");
	427	set.add(0x0070);
	428	set.add(0x0071);
	429	expectPairs(set, "acpq");
	430	doAssert(set.size() == 5, "size should not be equal to 5");
	431	set.clear();
	432	expectPairs(set, "");
	433	doAssert(set.isEmpty() == TRUE, "set should be empty");
	434	doAssert(set.size() == 0, "size should be 0");
	435
	436	// Try removing an entire set from another set
	437	expectPattern(set, "[c-x]", "cx");
	438	UnicodeSet set2;
	439	expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
	440	set.removeAll(set2);
	441	expectPairs(set, "deluxx");
	442
	443	// Try adding an entire set to another set
	444	expectPattern(set, "[jackiemclean]", "aacceein");
	445	expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
	446	set.addAll(set2);
	447	expectPairs(set, "aacehort");
	448	doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
	449
	450	// Try retaining an set of elements contained in another set (intersection)
	451	UnicodeSet set3;
	452	expectPattern(set3, "[a-c]", "ac");
	453	doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
	454	set3.remove(0x0062);
	455	expectPairs(set3, "aacc");
	456	doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
	457	set.retainAll(set3);
	458	expectPairs(set, "aacc");
	459	doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
	460	doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
	461	set.clear();
	462	doAssert(set.size() != set3.size(), "set.size() != set3.size()");
	463
	464	// Test commutativity
	465	expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
	466	expectPattern(set2, "[jackiemclean]", "aacceein");
	467	set.addAll(set2);
	468	expectPairs(set, "aacehort");
	469	doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
	470
	471
	472
	473
	474	}
	475
	476	/**
	477	* Make sure minimal representation is maintained.
	478	*/
	479	void UnicodeSetTest::TestMinimalRep() {
	480	UErrorCode status = U_ZERO_ERROR;
	481	// This is pretty thoroughly tested by checkCanonicalRep()
	482	// run against the exhaustive operation results. Use the code
	483	// here for debugging specific spot problems.
	484
	485	// 1 overlap against 2
	486	UnicodeSet set("[h-km-q]", status);
	487	if (U_FAILURE(status)) { errln("FAIL"); return; }
	488	UnicodeSet set2("[i-o]", status);
	489	if (U_FAILURE(status)) { errln("FAIL"); return; }
	490	set.addAll(set2);
	491	expectPairs(set, "hq");
	492	// right
	493	set.applyPattern("[a-m]", status);
	494	if (U_FAILURE(status)) { errln("FAIL"); return; }
	495	set2.applyPattern("[e-o]", status);
	496	if (U_FAILURE(status)) { errln("FAIL"); return; }
	497	set.addAll(set2);
	498	expectPairs(set, "ao");
	499	// left
	500	set.applyPattern("[e-o]", status);
	501	if (U_FAILURE(status)) { errln("FAIL"); return; }
	502	set2.applyPattern("[a-m]", status);
	503	if (U_FAILURE(status)) { errln("FAIL"); return; }
	504	set.addAll(set2);
	505	expectPairs(set, "ao");
	506	// 1 overlap against 3
	507	set.applyPattern("[a-eg-mo-w]", status);
	508	if (U_FAILURE(status)) { errln("FAIL"); return; }
	509	set2.applyPattern("[d-q]", status);
	510	if (U_FAILURE(status)) { errln("FAIL"); return; }
	511	set.addAll(set2);
	512	expectPairs(set, "aw");
	513	}
	514
	515	void UnicodeSetTest::TestAPI() {
	516	UErrorCode status = U_ZERO_ERROR;
	517	// default ct
	518	UnicodeSet set;
	519	if (!set.isEmpty() \|\| set.getRangeCount() != 0) {
	520	errln((UnicodeString)"FAIL, set should be empty but isn't: " +
	521	set);
	522	}
	523
	524	// clear(), isEmpty()
	525	set.add(0x0061);
	526	if (set.isEmpty()) {
	527	errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
	528	set);
	529	}
	530	set.clear();
	531	if (!set.isEmpty()) {
	532	errln((UnicodeString)"FAIL, set should be empty but isn't: " +
	533	set);
	534	}
	535
	536	// size()
	537	set.clear();
	538	if (set.size() != 0) {
	539	errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
	540	": " + set);
	541	}
	542	set.add(0x0061);
	543	if (set.size() != 1) {
	544	errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
	545	": " + set);
	546	}
	547	set.add(0x0031, 0x0039);
	548	if (set.size() != 10) {
	549	errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
	550	": " + set);
	551	}
	552
	553	// contains(first, last)
	554	set.clear();
	555	set.applyPattern("[A-Y 1-8 b-d l-y]", status);
	556	if (U_FAILURE(status)) { errln("FAIL"); return; }
	557	for (int32_t i = 0; i<set.getRangeCount(); ++i) {
	558	UChar32 a = set.getRangeStart(i);
	559	UChar32 b = set.getRangeEnd(i);
	560	if (!set.contains(a, b)) {
	561	errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
	562	" but doesn't: " + set);
	563	}
	564	if (set.contains((UChar32)(a-1), b)) {
	565	errln((UnicodeString)"FAIL, shouldn't contain " +
	566	(unsigned short)(a-1) + '-' + (unsigned short)b +
	567	" but does: " + set);
	568	}
	569	if (set.contains(a, (UChar32)(b+1))) {
	570	errln((UnicodeString)"FAIL, shouldn't contain " +
	571	(unsigned short)a + '-' + (unsigned short)(b+1) +
	572	" but does: " + set);
	573	}
	574	}
	575
	576	// Ported InversionList test.
	577	UnicodeSet a((UChar32)3,(UChar32)10);
	578	UnicodeSet b((UChar32)7,(UChar32)15);
	579	UnicodeSet c;
	580
	581	logln((UnicodeString)"a [3-10]: " + a);
	582	logln((UnicodeString)"b [7-15]: " + b);
	583	c = a;
	584	c.addAll(b);
	585	UnicodeSet exp((UChar32)3,(UChar32)15);
	586	if (c == exp) {
	587	logln((UnicodeString)"c.set(a).add(b): " + c);
	588	} else {
	589	errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
	590	}
	591	c.complement();
	592	exp.set((UChar32)0, (UChar32)2);
	593	exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
	594	if (c == exp) {
	595	logln((UnicodeString)"c.complement(): " + c);
	596	} else {
	597	errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
	598	}
	599	c.complement();
	600	exp.set((UChar32)3, (UChar32)15);
	601	if (c == exp) {
	602	logln((UnicodeString)"c.complement(): " + c);
	603	} else {
	604	errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
	605	}
	606	c = a;
	607	c.complementAll(b);
	608	exp.set((UChar32)3,(UChar32)6);
	609	exp.add((UChar32)11,(UChar32) 15);
	610	if (c == exp) {
	611	logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
	612	} else {
	613	errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
	614	}
	615
	616	exp = c;
	617	bitsToSet(setToBits(c), c);
	618	if (c == exp) {
	619	logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
	620	} else {
	621	errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
	622	}
	623
	624	// Additional tests for coverage JB#2118
	625	//UnicodeSet::complement(class UnicodeString const &)
	626	//UnicodeSet::complementAll(class UnicodeString const &)
	627	//UnicodeSet::containsNone(class UnicodeSet const &)
	628	//UnicodeSet::containsNone(long,long)
	629	//UnicodeSet::containsSome(class UnicodeSet const &)
	630	//UnicodeSet::containsSome(long,long)
	631	//UnicodeSet::removeAll(class UnicodeString const &)
	632	//UnicodeSet::retain(long)
	633	//UnicodeSet::retainAll(class UnicodeString const &)
	634	//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
	635	//UnicodeSetIterator::getString(void)
	636	set.clear();
	637	set.complement("ab");
	638	exp.applyPattern("[{ab}]", status);
	639	if (U_FAILURE(status)) { errln("FAIL"); return; }
	640	if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
	641
	642	UnicodeSetIterator iset(set);
	643	if (!iset.next() \|\| !iset.isString()) {
	644	errln("FAIL: UnicodeSetIterator::next/isString");
	645	} else if (iset.getString() != "ab") {
	646	errln("FAIL: UnicodeSetIterator::getString");
	647	}
	648
	649	set.add((UChar32)0x61, (UChar32)0x7A);
	650	set.complementAll("alan");
	651	exp.applyPattern("[{ab}b-kmo-z]", status);
	652	if (U_FAILURE(status)) { errln("FAIL"); return; }
	653	if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
	654
	655	exp.applyPattern("[a-z]", status);
	656	if (U_FAILURE(status)) { errln("FAIL"); return; }
	657	if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
	658	if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
	659	exp.applyPattern("[aln]", status);
	660	if (U_FAILURE(status)) { errln("FAIL"); return; }
	661	if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
	662	if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
	663
	664	if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
	665	errln("FAIL: containsNone(UChar32, UChar32)");
	666	}
	667	if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
	668	errln("FAIL: containsSome(UChar32, UChar32)");
	669	}
	670	if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
	671	errln("FAIL: containsNone(UChar32, UChar32)");
	672	}
	673	if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
	674	errln("FAIL: containsSome(UChar32, UChar32)");
	675	}
	676
	677	set.removeAll("liu");
	678	exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
	679	if (U_FAILURE(status)) { errln("FAIL"); return; }
	680	if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
	681
	682	set.retainAll("star");
	683	exp.applyPattern("[rst]", status);
	684	if (U_FAILURE(status)) { errln("FAIL"); return; }
	685	if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
	686
	687	set.retain((UChar32)0x73);
	688	exp.applyPattern("[s]", status);
	689	if (U_FAILURE(status)) { errln("FAIL"); return; }
	690	if (set != exp) { errln("FAIL: retain('s')"); return; }
	691
	692	uint16_t buf[32];
	693	int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
	694	if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
	695	if (slen != 3 \|\| buf[0] != 2 \|\| buf[1] != 0x73 \|\| buf[2] != 0x74) {
	696	errln("FAIL: serialize");
	697	return;
	698	}
	699
	700	// Conversions to and from USet
	701	UnicodeSet *uniset = &set;
	702	USet *uset = uniset->toUSet();
	703	TEST_ASSERT((void )uset == (void )uniset);
	704	UnicodeSet *setx = UnicodeSet::fromUSet(uset);
	705	TEST_ASSERT((void )setx == (void )uset);
	706	const UnicodeSet *constSet = uniset;
	707	const USet *constUSet = constSet->toUSet();
	708	TEST_ASSERT((void )constUSet == (void )constSet);
	709	const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
	710	TEST_ASSERT((void )constSetx == (void )constUSet);
	711
	712	// span(UnicodeString) and spanBack(UnicodeString) convenience methods
	713	UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
	714	UnicodeSet ac(0x61, 0x63);
	715	ac.remove(0x62).freeze();
	716	if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 \|\|
	717	ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 \|\|
	718	ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 \|\|
	719	ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 \|\|
	720	ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|
	721	ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 \|\|
	722	ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 \|\|
	723	ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 \|\|
	724	ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 \|\|
	725	ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
	726	) {
	727	errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
	728	}
	729	if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 \|\|
	730	ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 \|\|
	731	ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 \|\|
	732	ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 \|\|
	733	ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|
	734	ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 \|\|
	735	ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 \|\|
	736	ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 \|\|
	737	ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 \|\|
	738	ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
	739	) {
	740	errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
	741	}
	742	}
	743
	744	void UnicodeSetTest::TestIteration() {
	745	UErrorCode ec = U_ZERO_ERROR;
	746	int i = 0;
	747	int outerLoop;
	748
	749	// 6 code points, 3 ranges, 2 strings, 8 total elements
	750	// Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
	751	UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
	752	TEST_ASSERT_SUCCESS(ec);
	753	UnicodeSetIterator it(set);
	754
	755	for (outerLoop=0; outerLoop<3; outerLoop++) {
	756	// Run the test multiple times, to check that iterator.reset() is working.
	757	for (i=0; i<10; i++) {
	758	UBool nextv = it.next();
	759	UBool isString = it.isString();
	760	int32_t codePoint = it.getCodepoint();
	761	//int32_t codePointEnd = it.getCodepointEnd();
	762	UnicodeString s = it.getString();
	763	switch (i) {
	764	case 0:
	765	TEST_ASSERT(nextv == TRUE);
	766	TEST_ASSERT(isString == FALSE);
	767	TEST_ASSERT(codePoint==0x61);
	768	TEST_ASSERT(s == "a");
	769	break;
	770	case 1:
	771	TEST_ASSERT(nextv == TRUE);
	772	TEST_ASSERT(isString == FALSE);
	773	TEST_ASSERT(codePoint==0x62);
	774	TEST_ASSERT(s == "b");
	775	break;
	776	case 2:
	777	TEST_ASSERT(nextv == TRUE);
	778	TEST_ASSERT(isString == FALSE);
	779	TEST_ASSERT(codePoint==0x63);
	780	TEST_ASSERT(s == "c");
	781	break;
	782	case 3:
	783	TEST_ASSERT(nextv == TRUE);
	784	TEST_ASSERT(isString == FALSE);
	785	TEST_ASSERT(codePoint==0x79);
	786	TEST_ASSERT(s == "y");
	787	break;
	788	case 4:
	789	TEST_ASSERT(nextv == TRUE);
	790	TEST_ASSERT(isString == FALSE);
	791	TEST_ASSERT(codePoint==0x7a);
	792	TEST_ASSERT(s == "z");
	793	break;
	794	case 5:
	795	TEST_ASSERT(nextv == TRUE);
	796	TEST_ASSERT(isString == FALSE);
	797	TEST_ASSERT(codePoint==0x1abcd);
	798	TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
	799	break;
	800	case 6:
	801	TEST_ASSERT(nextv == TRUE);
	802	TEST_ASSERT(isString == TRUE);
	803	TEST_ASSERT(s == "str1");
	804	break;
	805	case 7:
	806	TEST_ASSERT(nextv == TRUE);
	807	TEST_ASSERT(isString == TRUE);
	808	TEST_ASSERT(s == "str2");
	809	break;
	810	case 8:
	811	TEST_ASSERT(nextv == FALSE);
	812	break;
	813	case 9:
	814	TEST_ASSERT(nextv == FALSE);
	815	break;
	816	}
	817	}
	818	it.reset(); // prepare to run the iteration again.
	819	}
	820	}
	821
	822
	823
	824
	825	void UnicodeSetTest::TestStrings() {
	826	UErrorCode ec = U_ZERO_ERROR;
	827
	828	UnicodeSet* testList[] = {
	829	UnicodeSet::createFromAll("abc"),
	830	new UnicodeSet("[a-c]", ec),
	831
	832	&(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
	833	new UnicodeSet("[{ll}{ch}a-z]", ec),
	834
	835	UnicodeSet::createFrom("ab}c"),
	836	new UnicodeSet("[{ab\\}c}]", ec),
	837
	838	&((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
	839	new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
	840
	841	NULL
	842	};
	843
	844	if (U_FAILURE(ec)) {
	845	errln("FAIL: couldn't construct test sets");
	846	}
	847
	848	for (int32_t i = 0; testList[i] != NULL; i+=2) {
	849	if (U_SUCCESS(ec)) {
	850	UnicodeString pat0, pat1;
	851	testList[i]->toPattern(pat0, TRUE);
	852	testList[i+1]->toPattern(pat1, TRUE);
	853	if (testList[i] == testList[i+1]) {
	854	logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
	855	} else {
	856	logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
	857	}
	858	}
	859	delete testList[i];
	860	delete testList[i+1];
	861	}
	862	}
	863
	864	/**
	865	* Test the [:Latin:] syntax.
	866	*/
	867	void UnicodeSetTest::TestScriptSet() {
	868	expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
	869
	870	expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
	871
	872	/* Jitterbug 1423 */
	873	expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
	874
	875	}
	876
	877	/**
	878	* Test the [:Latin:] syntax.
	879	*/
	880	void UnicodeSetTest::TestPropertySet() {
	881	static const char* const DATA[] = {
	882	// Pattern, Chars IN, Chars NOT in
	883
	884	"[:Latin:]",
	885	"aA",
	886	"\\u0391\\u03B1",
	887
	888	"[\\p{Greek}]",
	889	"\\u0391\\u03B1",
	890	"aA",
	891
	892	"\\P{ GENERAL Category = upper case letter }",
	893	"abc",
	894	"ABC",
	895
	896	#if !UCONFIG_NO_NORMALIZATION
	897	// Combining class: @since ICU 2.2
	898	// Check both symbolic and numeric
	899	"\\p{ccc=Nukta}",
	900	"\\u0ABC",
	901	"abc",
	902
	903	"\\p{Canonical Combining Class = 11}",
	904	"\\u05B1",
	905	"\\u05B2",
	906
	907	"[:c c c = iota subscript :]",
	908	"\\u0345",
	909	"xyz",
	910	#endif
	911
	912	// Bidi class: @since ICU 2.2
	913	"\\p{bidiclass=lefttoright}",
	914	"abc",
	915	"\\u0671\\u0672",
	916
	917	// Binary properties: @since ICU 2.2
	918	"\\p{ideographic}",
	919	"\\u4E0A",
	920	"x",
	921
	922	"[:math=false:]",
	923	"q)*(",
	924	// weiv: )(and * were removed from math in Unicode 4.0.1
	925	//"(*+)",
	926	"+<>^",
	927
	928	// JB#1767 \N{}, \p{ASCII}
	929	"[:Ascii:]",
	930	"abc\\u0000\\u007F",
	931	"\\u0080\\u4E00",
	932
	933	"[\\N{ latin small letter a }[:name= latin small letter z:]]",
	934	"az",
	935	"qrs",
	936
	937	// JB#2015
	938	"[:any:]",
	939	"a\\U0010FFFF",
	940	"",
	941
	942	"[:nv=0.5:]",
	943	"\\u00BD\\u0F2A",
	944	"\\u00BC",
	945
	946	// JB#2653: Age
	947	"[:Age=1.1:]",
	948	"\\u03D6", // 1.1
	949	"\\u03D8\\u03D9", // 3.2
	950
	951	"[:Age=3.1:]",
	952	"\\u1800\\u3400\\U0002f800",
	953	"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
	954
	955	// JB#2350: Case_Sensitive
	956	"[:Case Sensitive:]",
	957	"A\\u1FFC\\U00010410",
	958	";\\u00B4\\U00010500",
	959
	960	// JB#2832: C99-compatibility props
	961	"[:blank:]",
	962	" \\u0009",
	963	"1-9A-Z",
	964
	965	"[:graph:]",
	966	"19AZ",
	967	" \\u0003\\u0007\\u0009\\u000A\\u000D",
	968
	969	"[:punct:]",
	970	"!@#%&*()[]{}-_\\/;:,.?'\"",
	971	"09azAZ",
	972
	973	"[:xdigit:]",
	974	"09afAF",
	975	"gG!",
	976
	977	// Regex compatibility test
	978	"[-b]", // leading '-' is literal
	979	"-b",
	980	"ac",
	981
	982	"[^-b]", // leading '-' is literal
	983	"ac",
	984	"-b",
	985
	986	"[b-]", // trailing '-' is literal
	987	"-b",
	988	"ac",
	989
	990	"[^b-]", // trailing '-' is literal
	991	"ac",
	992	"-b",
	993
	994	"[a-b-]", // trailing '-' is literal
	995	"ab-",
	996	"c=",
	997
	998	"[[a-q]&[p-z]-]", // trailing '-' is literal
	999	"pq-",
	1000	"or=",
	1001
	1002	"[\\s\|\\)\|:\|$\|\\>]", // from regex tests
	1003	"s\|):$>",
	1004	"abc",
	1005
	1006	"[\\uDC00cd]", // JB#2906: isolated trail at start
	1007	"cd\\uDC00",
	1008	"ab\\uD800\\U00010000",
	1009
	1010	"[ab\\uD800]", // JB#2906: isolated trail at start
	1011	"ab\\uD800",
	1012	"cd\\uDC00\\U00010000",
	1013
	1014	"[ab\\uD800cd]", // JB#2906: isolated lead in middle
	1015	"abcd\\uD800",
	1016	"ef\\uDC00\\U00010000",
	1017
	1018	"[ab\\uDC00cd]", // JB#2906: isolated trail in middle
	1019	"abcd\\uDC00",
	1020	"ef\\uD800\\U00010000",
	1021
	1022	#if !UCONFIG_NO_NORMALIZATION
	1023	"[:^lccc=0:]", // Lead canonical class
	1024	"\\u0300\\u0301",
	1025	"abcd\\u00c0\\u00c5",
	1026
	1027	"[:^tccc=0:]", // Trail canonical class
	1028	"\\u0300\\u0301\\u00c0\\u00c5",
	1029	"abcd",
	1030
	1031	"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
	1032	"\\u0300\\u0301\\u00c0\\u00c5",
	1033	"abcd",
	1034
	1035	"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
	1036	"",
	1037	"abcd\\u0300\\u0301\\u00c0\\u00c5",
	1038
	1039	"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
	1040	"\\u0F73\\u0F75\\u0F81",
	1041	"abcd\\u0300\\u0301\\u00c0\\u00c5",
	1042	#endif /* !UCONFIG_NO_NORMALIZATION */
	1043
	1044	"[:Assigned:]",
	1045	"A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
	1046	"\\u0888\\uFDD3\\uFFFE\\U00050005",
	1047
	1048	// Script_Extensions, new in Unicode 6.0
	1049	"[:scx=Arab:]",
	1050	"\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
	1051	"\\u061D\\uFDEF\\uFDFE",
	1052
	1053	// U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
	1054	// so scx-sc is missing U+FDF2.
	1055	"[[:Script_Extensions=Arabic:]-[:Arab:]]",
	1056	"\\u0640\\u064B\\u0650\\u0655",
	1057	"\\uFDF2"
	1058	};
	1059
	1060	static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
	1061
	1062	for (int32_t i=0; i<DATA_LEN; i+=3) {
	1063	expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
	1064	CharsToUnicodeString(DATA[i+2]));
	1065	}
	1066	}
	1067
	1068	/**
	1069	* Test that Posix style character classes [:digit:], etc.
	1070	* have the Unicode definitions from TR 18.
	1071	*/
	1072	void UnicodeSetTest::TestPosixClasses() {
	1073	{
	1074	UErrorCode status = U_ZERO_ERROR;
	1075	UnicodeSet s1("[:alpha:]", status);
	1076	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
	1077	TEST_ASSERT_SUCCESS(status);
	1078	TEST_ASSERT(s1==s2);
	1079	}
	1080	{
	1081	UErrorCode status = U_ZERO_ERROR;
	1082	UnicodeSet s1("[:lower:]", status);
	1083	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
	1084	TEST_ASSERT_SUCCESS(status);
	1085	TEST_ASSERT(s1==s2);
	1086	}
	1087	{
	1088	UErrorCode status = U_ZERO_ERROR;
	1089	UnicodeSet s1("[:upper:]", status);
	1090	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
	1091	TEST_ASSERT_SUCCESS(status);
	1092	TEST_ASSERT(s1==s2);
	1093	}
	1094	{
	1095	UErrorCode status = U_ZERO_ERROR;
	1096	UnicodeSet s1("[:punct:]", status);
	1097	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
	1098	TEST_ASSERT_SUCCESS(status);
	1099	TEST_ASSERT(s1==s2);
	1100	}
	1101	{
	1102	UErrorCode status = U_ZERO_ERROR;
	1103	UnicodeSet s1("[:digit:]", status);
	1104	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
	1105	TEST_ASSERT_SUCCESS(status);
	1106	TEST_ASSERT(s1==s2);
	1107	}
	1108	{
	1109	UErrorCode status = U_ZERO_ERROR;
	1110	UnicodeSet s1("[:xdigit:]", status);
	1111	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
	1112	TEST_ASSERT_SUCCESS(status);
	1113	TEST_ASSERT(s1==s2);
	1114	}
	1115	{
	1116	UErrorCode status = U_ZERO_ERROR;
	1117	UnicodeSet s1("[:alnum:]", status);
	1118	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
	1119	TEST_ASSERT_SUCCESS(status);
	1120	TEST_ASSERT(s1==s2);
	1121	}
	1122	{
	1123	UErrorCode status = U_ZERO_ERROR;
	1124	UnicodeSet s1("[:space:]", status);
	1125	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
	1126	TEST_ASSERT_SUCCESS(status);
	1127	TEST_ASSERT(s1==s2);
	1128	}
	1129	{
	1130	UErrorCode status = U_ZERO_ERROR;
	1131	UnicodeSet s1("[:blank:]", status);
	1132	TEST_ASSERT_SUCCESS(status);
	1133	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
	1134	status);
	1135	TEST_ASSERT_SUCCESS(status);
	1136	TEST_ASSERT(s1==s2);
	1137	}
	1138	{
	1139	UErrorCode status = U_ZERO_ERROR;
	1140	UnicodeSet s1("[:cntrl:]", status);
	1141	TEST_ASSERT_SUCCESS(status);
	1142	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
	1143	TEST_ASSERT_SUCCESS(status);
	1144	TEST_ASSERT(s1==s2);
	1145	}
	1146	{
	1147	UErrorCode status = U_ZERO_ERROR;
	1148	UnicodeSet s1("[:graph:]", status);
	1149	TEST_ASSERT_SUCCESS(status);
	1150	UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
	1151	TEST_ASSERT_SUCCESS(status);
	1152	TEST_ASSERT(s1==s2);
	1153	}
	1154	{
	1155	UErrorCode status = U_ZERO_ERROR;
	1156	UnicodeSet s1("[:print:]", status);
	1157	TEST_ASSERT_SUCCESS(status);
	1158	UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
	1159	TEST_ASSERT_SUCCESS(status);
	1160	TEST_ASSERT(s1==s2);
	1161	}
	1162	}
	1163	/**
	1164	* Test cloning of UnicodeSet. For C++, we test the copy constructor.
	1165	*/
	1166	void UnicodeSetTest::TestClone() {
	1167	UErrorCode ec = U_ZERO_ERROR;
	1168	UnicodeSet s("[abcxyz]", ec);
	1169	UnicodeSet t(s);
	1170	expectContainment(t, "abc", "def");
	1171	}
	1172
	1173	/**
	1174	* Test the indexOf() and charAt() methods.
	1175	*/
	1176	void UnicodeSetTest::TestIndexOf() {
	1177	UErrorCode ec = U_ZERO_ERROR;
	1178	UnicodeSet set("[a-cx-y3578]", ec);
	1179	if (U_FAILURE(ec)) {
	1180	errln("FAIL: UnicodeSet constructor");
	1181	return;
	1182	}
	1183	for (int32_t i=0; i<set.size(); ++i) {
	1184	UChar32 c = set.charAt(i);
	1185	if (set.indexOf(c) != i) {
	1186	errln("FAIL: charAt(%d) = %X => indexOf() => %d",
	1187	i, c, set.indexOf(c));
	1188	}
	1189	}
	1190	UChar32 c = set.charAt(set.size());
	1191	if (c != -1) {
	1192	errln("FAIL: charAt(<out of range>) = %X", c);
	1193	}
	1194	int32_t j = set.indexOf((UChar32)0x71/'q'/);
	1195	if (j != -1) {
	1196	errln((UnicodeString)"FAIL: indexOf('q') = " + j);
	1197	}
	1198	}
	1199
	1200	/**
	1201	* Test closure API.
	1202	*/
	1203	void UnicodeSetTest::TestCloseOver() {
	1204	UErrorCode ec = U_ZERO_ERROR;
	1205
	1206	char CASE[] = {(char)USET_CASE_INSENSITIVE};
	1207	char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
	1208	const char* DATA[] = {
	1209	// selector, input, output
	1210	CASE,
	1211	"[aq\\u00DF{Bc}{bC}{Fi}]",
	1212	"[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
	1213
	1214	CASE,
	1215	"[\\u01F1]", // 'DZ'
	1216	"[\\u01F1\\u01F2\\u01F3]",
	1217
	1218	CASE,
	1219	"[\\u1FB4]",
	1220	"[\\u1FB4{\\u03AC\\u03B9}]",
	1221
	1222	CASE,
	1223	"[{F\\uFB01}]",
	1224	"[\\uFB03{ffi}]",
	1225
	1226	CASE, // make sure binary search finds limits
	1227	"[a\\uFF3A]",
	1228	"[aA\\uFF3A\\uFF5A]",
	1229
	1230	CASE,
	1231	"[a-z]","[A-Za-z\\u017F\\u212A]",
	1232	CASE,
	1233	"[abc]","[A-Ca-c]",
	1234	CASE,
	1235	"[ABC]","[A-Ca-c]",
	1236
	1237	CASE, "[i]", "[iI]",
	1238
	1239	CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
	1240	CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
	1241
	1242	CASE, "[\\u0131]", "[\\u0131]", // dotless i
	1243
	1244	CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
	1245
	1246	CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
	1247
	1248	CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
	1249
	1250	CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
	1251
	1252	CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
	1253
	1254	CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
	1255	CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
	1256
	1257	CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
	1258
	1259	CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
	1260
	1261	CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
	1262
	1263	#if !UCONFIG_NO_FILE_IO
	1264	CASE_MAPPINGS,
	1265	"[aq\\u00DF{Bc}{bC}{Fi}]",
	1266	"[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
	1267	#endif
	1268
	1269	CASE_MAPPINGS,
	1270	"[\\u01F1]", // 'DZ'
	1271	"[\\u01F1\\u01F2\\u01F3]",
	1272
	1273	CASE_MAPPINGS,
	1274	"[a-z]",
	1275	"[A-Za-z]",
	1276
	1277	NULL
	1278	};
	1279
	1280	UnicodeSet s;
	1281	UnicodeSet t;
	1282	UnicodeString buf;
	1283	for (int32_t i=0; DATA[i]!=NULL; i+=3) {
	1284	int32_t selector = DATA[i][0];
	1285	UnicodeString pat(DATA[i+1], -1, US_INV);
	1286	UnicodeString exp(DATA[i+2], -1, US_INV);
	1287	s.applyPattern(pat, ec);
	1288	s.closeOver(selector);
	1289	t.applyPattern(exp, ec);
	1290	if (U_FAILURE(ec)) {
	1291	errln("FAIL: applyPattern failed");
	1292	continue;
	1293	}
	1294	if (s == t) {
	1295	logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
	1296	} else {
	1297	dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
	1298	s.toPattern(buf, TRUE) + ", expected " + exp);
	1299	}
	1300	}
	1301
	1302	#if 0
	1303	/*
	1304	* Unused test code.
	1305	* This was used to compare the old implementation (using USET_CASE)
	1306	* with the new one (using 0x100 temporarily)
	1307	* while transitioning from hardcoded case closure tables in uniset.cpp
	1308	* (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
	1309	* and using ucase.c functions for closure.
	1310	* See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
	1311	*
	1312	* Note: The old and new implementation never fully matched because
	1313	* the old implementation turned out to not map U+0130 and U+0131 correctly
	1314	* (dotted I and dotless i) and because the old implementation's data tables
	1315	* were outdated compared to Unicode 4.0.1 at the time of the change to the
	1316	* new implementation. (So sigmas and some other characters were not handled
	1317	* according to the newer Unicode version.)
	1318	*/
	1319	UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
	1320	UnicodeSetIterator si(sens);
	1321	UnicodeString str, buf2;
	1322	const UnicodeString *pStr;
	1323	UChar32 c;
	1324	while(si.next()) {
	1325	if(!si.isString()) {
	1326	c=si.getCodepoint();
	1327	s.clear();
	1328	s.add(c);
	1329
	1330	str.setTo(c);
	1331	str.foldCase();
	1332	sens2.add(str);
	1333
	1334	t=s;
	1335	s.closeOver(USET_CASE);
	1336	t.closeOver(0x100);
	1337	if(s!=t) {
	1338	errln("FAIL: closeOver(U+%04x) differs: ", c);
	1339	errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
	1340	}
	1341	}
	1342	}
	1343	// remove all code points
	1344	// should contain all full case folding mapping strings
	1345	sens2.remove(0, 0x10ffff);
	1346	si.reset(sens2);
	1347	while(si.next()) {
	1348	if(si.isString()) {
	1349	pStr=&si.getString();
	1350	s.clear();
	1351	s.add(*pStr);
	1352	t=s2=s;
	1353	s.closeOver(USET_CASE);
	1354	t.closeOver(0x100);
	1355	if(s!=t) {
	1356	errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
	1357	errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
	1358	}
	1359	}
	1360	}
	1361	#endif
	1362
	1363	// Test the pattern API
	1364	s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
	1365	if (U_FAILURE(ec)) {
	1366	errln("FAIL: applyPattern failed");
	1367	} else {
	1368	expectContainment(s, "abcABC", "defDEF");
	1369	}
	1370	UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
	1371	if (U_FAILURE(ec)) {
	1372	errln("FAIL: constructor failed");
	1373	} else {
	1374	expectContainment(v, "defDEF", "abcABC");
	1375	}
	1376	UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
	1377	if (U_FAILURE(ec)) {
	1378	errln("FAIL: construct w/case mappings failed");
	1379	} else {
	1380	expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
	1381	}
	1382	}
	1383
	1384	void UnicodeSetTest::TestEscapePattern() {
	1385	const char pattern[] =
	1386	"[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
	1387	const char exp[] =
	1388	"[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
	1389	// We test this with two passes; in the second pass we
	1390	// pre-unescape the pattern. Since U+200E is Pattern_White_Space,
	1391	// this fails -- which is what we expect.
	1392	for (int32_t pass=1; pass<=2; ++pass) {
	1393	UErrorCode ec = U_ZERO_ERROR;
	1394	UnicodeString pat(pattern, -1, US_INV);
	1395	if (pass==2) {
	1396	pat = pat.unescape();
	1397	}
	1398	// Pattern is only good for pass 1
	1399	UBool isPatternValid = (pass==1);
	1400
	1401	UnicodeSet set(pat, ec);
	1402	if (U_SUCCESS(ec) != isPatternValid){
	1403	errln((UnicodeString)"FAIL: applyPattern(" +
	1404	escape(pat) + ") => " +
	1405	u_errorName(ec));
	1406	continue;
	1407	}
	1408	if (U_FAILURE(ec)) {
	1409	continue;
	1410	}
	1411	if (set.contains((UChar)0x0644)){
	1412	errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
	1413	}
	1414
	1415	UnicodeString newpat;
	1416	set.toPattern(newpat, TRUE);
	1417	if (newpat == UnicodeString(exp, -1, US_INV)) {
	1418	logln(escape(pat) + " => " + newpat);
	1419	} else {
	1420	errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
	1421	}
	1422
	1423	for (int32_t i=0; i<set.getRangeCount(); ++i) {
	1424	UnicodeString str("Range ");
	1425	str.append((UChar)(0x30 + i))
	1426	.append(": ")
	1427	.append((UChar32)set.getRangeStart(i))
	1428	.append(" - ")
	1429	.append((UChar32)set.getRangeEnd(i));
	1430	str = str + " (" + set.getRangeStart(i) + " - " +
	1431	set.getRangeEnd(i) + ")";
	1432	if (set.getRangeStart(i) < 0) {
	1433	errln((UnicodeString)"FAIL: " + escape(str));
	1434	} else {
	1435	logln(escape(str));
	1436	}
	1437	}
	1438	}
	1439	}
	1440
	1441	void UnicodeSetTest::expectRange(const UnicodeString& label,
	1442	const UnicodeSet& set,
	1443	UChar32 start, UChar32 end) {
	1444	UnicodeSet exp(start, end);
	1445	UnicodeString pat;
	1446	if (set == exp) {
	1447	logln(label + " => " + set.toPattern(pat, TRUE));
	1448	} else {
	1449	UnicodeString xpat;
	1450	errln((UnicodeString)"FAIL: " + label + " => " +
	1451	set.toPattern(pat, TRUE) +
	1452	", expected " + exp.toPattern(xpat, TRUE));
	1453	}
	1454	}
	1455
	1456	void UnicodeSetTest::TestInvalidCodePoint() {
	1457
	1458	const UChar32 DATA[] = {
	1459	// Test range Expected range
	1460	0, 0x10FFFF, 0, 0x10FFFF,
	1461	(UChar32)-1, 8, 0, 8,
	1462	8, 0x110000, 8, 0x10FFFF
	1463	};
	1464	const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
	1465
	1466	UnicodeString pat;
	1467	int32_t i;
	1468
	1469	for (i=0; i<DATA_LENGTH; i+=4) {
	1470	UChar32 start = DATA[i];
	1471	UChar32 end = DATA[i+1];
	1472	UChar32 xstart = DATA[i+2];
	1473	UChar32 xend = DATA[i+3];
	1474
	1475	// Try various API using the test code points
	1476
	1477	UnicodeSet set(start, end);
	1478	expectRange((UnicodeString)"ct(" + start + "," + end + ")",
	1479	set, xstart, xend);
	1480
	1481	set.clear();
	1482	set.set(start, end);
	1483	expectRange((UnicodeString)"set(" + start + "," + end + ")",
	1484	set, xstart, xend);
	1485
	1486	UBool b = set.contains(start);
	1487	b = set.contains(start, end);
	1488	b = set.containsNone(start, end);
	1489	b = set.containsSome(start, end);
	1490	(void)b; // Suppress set but not used warning.
	1491
	1492	/int32_t index = set.indexOf(start);/
	1493
	1494	set.clear();
	1495	set.add(start);
	1496	set.add(start, end);
	1497	expectRange((UnicodeString)"add(" + start + "," + end + ")",
	1498	set, xstart, xend);
	1499
	1500	set.set(0, 0x10FFFF);
	1501	set.retain(start, end);
	1502	expectRange((UnicodeString)"retain(" + start + "," + end + ")",
	1503	set, xstart, xend);
	1504	set.retain(start);
	1505
	1506	set.set(0, 0x10FFFF);
	1507	set.remove(start);
	1508	set.remove(start, end);
	1509	set.complement();
	1510	expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
	1511	set, xstart, xend);
	1512
	1513	set.set(0, 0x10FFFF);
	1514	set.complement(start, end);
	1515	set.complement();
	1516	expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
	1517	set, xstart, xend);
	1518	set.complement(start);
	1519	}
	1520
	1521	const UChar32 DATA2[] = {
	1522	0,
	1523	0x10FFFF,
	1524	(UChar32)-1,
	1525	0x110000
	1526	};
	1527	const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
	1528
	1529	for (i=0; i<DATA2_LENGTH; ++i) {
	1530	UChar32 c = DATA2[i], end = 0x10FFFF;
	1531	UBool valid = (c >= 0 && c <= 0x10FFFF);
	1532
	1533	UnicodeSet set(0, 0x10FFFF);
	1534
	1535	// For single-codepoint contains, invalid codepoints are NOT contained
	1536	UBool b = set.contains(c);
	1537	if (b == valid) {
	1538	logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
	1539	") = " + b);
	1540	} else {
	1541	errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
	1542	") = " + b);
	1543	}
	1544
	1545	// For codepoint range contains, containsNone, and containsSome,
	1546	// invalid or empty (start > end) ranges have UNDEFINED behavior.
	1547	b = set.contains(c, end);
	1548	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
	1549	"," + end + ") = " + b);
	1550
	1551	b = set.containsNone(c, end);
	1552	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
	1553	"," + end + ") = " + b);
	1554
	1555	b = set.containsSome(c, end);
	1556	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
	1557	"," + end + ") = " + b);
	1558
	1559	int32_t index = set.indexOf(c);
	1560	if ((index >= 0) == valid) {
	1561	logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
	1562	") = " + index);
	1563	} else {
	1564	errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
	1565	") = " + index);
	1566	}
	1567	}
	1568	}
	1569
	1570	// Used by TestSymbolTable
	1571	class TokenSymbolTable : public SymbolTable {
	1572	public:
	1573	Hashtable contents;
	1574
	1575	TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
	1576	contents.setValueDeleter(uprv_deleteUObject);
	1577	}
	1578
	1579	~TokenSymbolTable() {}
	1580
	1581	/**
	1582	* (Non-SymbolTable API) Add the given variable and value to
	1583	* the table. Variable should NOT contain leading '$'.
	1584	*/
	1585	void add(const UnicodeString& var, const UnicodeString& value,
	1586	UErrorCode& ec) {
	1587	if (U_SUCCESS(ec)) {
	1588	contents.put(var, new UnicodeString(value), ec);
	1589	}
	1590	}
	1591
	1592	/**
	1593	* SymbolTable API
	1594	*/
	1595	virtual const UnicodeString* lookup(const UnicodeString& s) const {
	1596	return (const UnicodeString*) contents.get(s);
	1597	}
	1598
	1599	/**
	1600	* SymbolTable API
	1601	*/
	1602	virtual const UnicodeFunctor* lookupMatcher(UChar32 /ch/) const {
	1603	return NULL;
	1604	}
	1605
	1606	/**
	1607	* SymbolTable API
	1608	*/
	1609	virtual UnicodeString parseReference(const UnicodeString& text,
	1610	ParsePosition& pos, int32_t limit) const {
	1611	int32_t start = pos.getIndex();
	1612	int32_t i = start;
	1613	UnicodeString result;
	1614	while (i < limit) {
	1615	UChar c = text.charAt(i);
	1616	if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {
	1617	break;
	1618	}
	1619	++i;
	1620	}
	1621	if (i == start) { // No valid name chars
	1622	return result; // Indicate failure with empty string
	1623	}
	1624	pos.setIndex(i);
	1625	text.extractBetween(start, i, result);
	1626	return result;
	1627	}
	1628	};
	1629
	1630	void UnicodeSetTest::TestSymbolTable() {
	1631	// Multiple test cases can be set up here. Each test case
	1632	// is terminated by null:
	1633	// var, value, var, value,..., input pat., exp. output pat., null
	1634	const char* DATA[] = {
	1635	"us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
	1636	"us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
	1637	"us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
	1638	NULL
	1639	};
	1640
	1641	for (int32_t i=0; DATA[i]!=NULL; ++i) {
	1642	UErrorCode ec = U_ZERO_ERROR;
	1643	TokenSymbolTable sym(ec);
	1644	if (U_FAILURE(ec)) {
	1645	errln("FAIL: couldn't construct TokenSymbolTable");
	1646	continue;
	1647	}
	1648
	1649	// Set up variables
	1650	while (DATA[i+2] != NULL) {
	1651	sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
	1652	if (U_FAILURE(ec)) {
	1653	errln("FAIL: couldn't add to TokenSymbolTable");
	1654	continue;
	1655	}
	1656	i += 2;
	1657	}
	1658
	1659	// Input pattern and expected output pattern
	1660	UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
	1661	i += 2;
	1662
	1663	ParsePosition pos(0);
	1664	UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
	1665	if (U_FAILURE(ec)) {
	1666	errln("FAIL: couldn't construct UnicodeSet");
	1667	continue;
	1668	}
	1669
	1670	// results
	1671	if (pos.getIndex() != inpat.length()) {
	1672	errln((UnicodeString)"Failed to read to end of string \""
	1673	+ inpat + "\": read to "
	1674	+ pos.getIndex() + ", length is "
	1675	+ inpat.length());
	1676	}
	1677
	1678	UnicodeSet us2(exppat, ec);
	1679	if (U_FAILURE(ec)) {
	1680	errln("FAIL: couldn't construct expected UnicodeSet");
	1681	continue;
	1682	}
	1683
	1684	UnicodeString a, b;
	1685	if (us != us2) {
	1686	errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
	1687	", expected " + us2.toPattern(b, TRUE));
	1688	} else {
	1689	logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
	1690	}
	1691	}
	1692	}
	1693
	1694	void UnicodeSetTest::TestSurrogate() {
	1695	const char* DATA[] = {
	1696	// These should all behave identically
	1697	"[abc\\uD800\\uDC00]",
	1698	// "[abc\uD800\uDC00]", // Can't do this on C -- only Java
	1699	"[abc\\U00010000]",
	1700	0
	1701	};
	1702	for (int i=0; DATA[i] != 0; ++i) {
	1703	UErrorCode ec = U_ZERO_ERROR;
	1704	logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
	1705	UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
	1706	UnicodeSet set(str, ec);
	1707	if (U_FAILURE(ec)) {
	1708	errln("FAIL: UnicodeSet constructor");
	1709	continue;
	1710	}
	1711	expectContainment(set,
	1712	CharsToUnicodeString("abc\\U00010000"),
	1713	CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
	1714	if (set.size() != 4) {
	1715	errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
	1716	set.size() + ", expected 4");
	1717	}
	1718
	1719	{
	1720	UErrorCode subErr = U_ZERO_ERROR;
	1721	checkRoundTrip(set);
	1722	checkSerializeRoundTrip(set, subErr);
	1723	}
	1724	}
	1725	}
	1726
	1727	void UnicodeSetTest::TestExhaustive() {
	1728	// exhaustive tests. Simulate UnicodeSets with integers.
	1729	// That gives us very solid tests (except for large memory tests).
	1730
	1731	int32_t limit = 128;
	1732
	1733	UnicodeSet x, y, z, aa;
	1734
	1735	for (int32_t i = 0; i < limit; ++i) {
	1736	bitsToSet(i, x);
	1737	logln((UnicodeString)"Testing " + i + ", " + x);
	1738	_testComplement(i, x, y);
	1739
	1740	UnicodeSet &toTest = bitsToSet(i, aa);
	1741
	1742	// AS LONG AS WE ARE HERE, check roundtrip
	1743	checkRoundTrip(toTest);
	1744	UErrorCode ec = U_ZERO_ERROR;
	1745	checkSerializeRoundTrip(toTest, ec);
	1746
	1747	for (int32_t j = 0; j < limit; ++j) {
	1748	_testAdd(i,j, x,y,z);
	1749	_testXor(i,j, x,y,z);
	1750	_testRetain(i,j, x,y,z);
	1751	_testRemove(i,j, x,y,z);
	1752	}
	1753	}
	1754	}
	1755
	1756	void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
	1757	bitsToSet(a, x);
	1758	z = x;
	1759	z.complement();
	1760	int32_t c = setToBits(z);
	1761	if (c != (~a)) {
	1762	errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
	1763	errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
	1764	}
	1765	checkCanonicalRep(z, (UnicodeString)"complement " + a);
	1766	}
	1767
	1768	void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1769	bitsToSet(a, x);
	1770	bitsToSet(b, y);
	1771	z = x;
	1772	z.addAll(y);
	1773	int32_t c = setToBits(z);
	1774	if (c != (a \| b)) {
	1775	errln((UnicodeString)"FAILED: add: " + x + " \| " + y + " != " + z);
	1776	errln((UnicodeString)"FAILED: add: " + a + " \| " + b + " != " + c);
	1777	}
	1778	checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
	1779	}
	1780
	1781	void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1782	bitsToSet(a, x);
	1783	bitsToSet(b, y);
	1784	z = x;
	1785	z.retainAll(y);
	1786	int32_t c = setToBits(z);
	1787	if (c != (a & b)) {
	1788	errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
	1789	errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
	1790	}
	1791	checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
	1792	}
	1793
	1794	void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1795	bitsToSet(a, x);
	1796	bitsToSet(b, y);
	1797	z = x;
	1798	z.removeAll(y);
	1799	int32_t c = setToBits(z);
	1800	if (c != (a &~ b)) {
	1801	errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
	1802	errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
	1803	}
	1804	checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
	1805	}
	1806
	1807	void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1808	bitsToSet(a, x);
	1809	bitsToSet(b, y);
	1810	z = x;
	1811	z.complementAll(y);
	1812	int32_t c = setToBits(z);
	1813	if (c != (a ^ b)) {
	1814	errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
	1815	errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
	1816	}
	1817	checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
	1818	}
	1819
	1820	/**
	1821	* Check that ranges are monotonically increasing and non-
	1822	* overlapping.
	1823	*/
	1824	void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
	1825	int32_t n = set.getRangeCount();
	1826	if (n < 0) {
	1827	errln((UnicodeString)"FAIL result of " + msg +
	1828	": range count should be >= 0 but is " +
	1829	n /+ " for " + set.toPattern())/);
	1830	return;
	1831	}
	1832	UChar32 last = 0;
	1833	for (int32_t i=0; i<n; ++i) {
	1834	UChar32 start = set.getRangeStart(i);
	1835	UChar32 end = set.getRangeEnd(i);
	1836	if (start > end) {
	1837	errln((UnicodeString)"FAIL result of " + msg +
	1838	": range " + (i+1) +
	1839	" start > end: " + (int)start + ", " + (int)end +
	1840	" for " + set);
	1841	}
	1842	if (i > 0 && start <= last) {
	1843	errln((UnicodeString)"FAIL result of " + msg +
	1844	": range " + (i+1) +
	1845	" overlaps previous range: " + (int)start + ", " + (int)end +
	1846	" for " + set);
	1847	}
	1848	last = end;
	1849	}
	1850	}
	1851
	1852	/**
	1853	* Convert a bitmask to a UnicodeSet.
	1854	*/
	1855	UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
	1856	result.clear();
	1857	for (UChar32 i = 0; i < 32; ++i) {
	1858	if ((a & (1<<i)) != 0) {
	1859	result.add(i);
	1860	}
	1861	}
	1862	return result;
	1863	}
	1864
	1865	/**
	1866	* Convert a UnicodeSet to a bitmask. Only the characters
	1867	* U+0000 to U+0020 are represented in the bitmask.
	1868	*/
	1869	int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
	1870	int32_t result = 0;
	1871	for (int32_t i = 0; i < 32; ++i) {
	1872	if (x.contains((UChar32)i)) {
	1873	result \|= (1<<i);
	1874	}
	1875	}
	1876	return result;
	1877	}
	1878
	1879	/**
	1880	* Return the representation of an inversion list based UnicodeSet
	1881	* as a pairs list. Ranges are listed in ascending Unicode order.
	1882	* For example, the set [a-zA-M3] is represented as "33AMaz".
	1883	*/
	1884	UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
	1885	UnicodeString pairs;
	1886	for (int32_t i=0; i<set.getRangeCount(); ++i) {
	1887	UChar32 start = set.getRangeStart(i);
	1888	UChar32 end = set.getRangeEnd(i);
	1889	if (end > 0xFFFF) {
	1890	end = 0xFFFF;
	1891	i = set.getRangeCount(); // Should be unnecessary
	1892	}
	1893	pairs.append((UChar)start).append((UChar)end);
	1894	}
	1895	return pairs;
	1896	}
	1897
	1898	/**
	1899	* Basic consistency check for a few items.
	1900	* That the iterator works, and that we can create a pattern and
	1901	* get the same thing back
	1902	*/
	1903	void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
	1904	{
	1905	UnicodeSet t(s);
	1906	checkEqual(s, t, "copy ct");
	1907	}
	1908
	1909	{
	1910	UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
	1911	t = s;
	1912	checkEqual(s, t, "operator=");
	1913	}
	1914
	1915	{
	1916	UnicodeSet t;
	1917	copyWithIterator(t, s, FALSE);
	1918	checkEqual(s, t, "iterator roundtrip");
	1919	}
	1920
	1921	{
	1922	UnicodeSet t;
	1923	copyWithIterator(t, s, TRUE); // try range
	1924	checkEqual(s, t, "iterator roundtrip");
	1925	}
	1926
	1927	{
	1928	UnicodeSet t;
	1929	UnicodeString pat;
	1930	UErrorCode ec = U_ZERO_ERROR;
	1931	s.toPattern(pat, FALSE);
	1932	t.applyPattern(pat, ec);
	1933	if (U_FAILURE(ec)) {
	1934	errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
	1935	return;
	1936	} else {
	1937	checkEqual(s, t, "toPattern(false)");
	1938	}
	1939	}
	1940
	1941	{
	1942	UnicodeSet t;
	1943	UnicodeString pat;
	1944	UErrorCode ec = U_ZERO_ERROR;
	1945	s.toPattern(pat, TRUE);
	1946	t.applyPattern(pat, ec);
	1947	if (U_FAILURE(ec)) {
	1948	errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
	1949	return;
	1950	} else {
	1951	checkEqual(s, t, "toPattern(true)");
	1952	}
	1953	}
	1954	}
	1955
	1956	void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
	1957	if(U_FAILURE(status)) return;
	1958	int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
	1959	if(status == U_BUFFER_OVERFLOW_ERROR) {
	1960	status = U_ZERO_ERROR;
	1961	serializeBuffer.resize(len);
	1962	len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
	1963	// let 2nd error stand
	1964	}
	1965	if(U_FAILURE(status)) {
	1966	errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
	1967	return;
	1968	}
	1969	UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
	1970	if(U_FAILURE(status)) {
	1971	errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
	1972	return;
	1973	}
	1974
	1975	checkEqual(t, deserialized, "Set was unequal when deserialized");
	1976	}
	1977
	1978	void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
	1979	t.clear();
	1980	UnicodeSetIterator it(s);
	1981	if (withRange) {
	1982	while (it.nextRange()) {
	1983	if (it.isString()) {
	1984	t.add(it.getString());
	1985	} else {
	1986	t.add(it.getCodepoint(), it.getCodepointEnd());
	1987	}
	1988	}
	1989	} else {
	1990	while (it.next()) {
	1991	if (it.isString()) {
	1992	t.add(it.getString());
	1993	} else {
	1994	t.add(it.getCodepoint());
	1995	}
	1996	}
	1997	}
	1998	}
	1999
	2000	UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
	2001	assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
	2002	assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
	2003	UnicodeString source; s.toPattern(source, TRUE);
	2004	UnicodeString result; t.toPattern(result, TRUE);
	2005	if (s != t) {
	2006	errln((UnicodeString)"FAIL: " + message
	2007	+ "; source = " + source
	2008	+ "; result = " + result
	2009	);
	2010	return FALSE;
	2011	} else {
	2012	logln((UnicodeString)"Ok: " + message
	2013	+ "; source = " + source
	2014	+ "; result = " + result
	2015	);
	2016	}
	2017	return TRUE;
	2018	}
	2019
	2020	void
	2021	UnicodeSetTest::expectContainment(const UnicodeString& pat,
	2022	const UnicodeString& charsIn,
	2023	const UnicodeString& charsOut) {
	2024	UErrorCode ec = U_ZERO_ERROR;
	2025	UnicodeSet set(pat, ec);
	2026	if (U_FAILURE(ec)) {
	2027	dataerrln((UnicodeString)"FAIL: pattern \"" +
	2028	pat + "\" => " + u_errorName(ec));
	2029	return;
	2030	}
	2031	expectContainment(set, pat, charsIn, charsOut);
	2032	}
	2033
	2034	void
	2035	UnicodeSetTest::expectContainment(const UnicodeSet& set,
	2036	const UnicodeString& charsIn,
	2037	const UnicodeString& charsOut) {
	2038	UnicodeString pat;
	2039	set.toPattern(pat);
	2040	expectContainment(set, pat, charsIn, charsOut);
	2041	}
	2042
	2043	void
	2044	UnicodeSetTest::expectContainment(const UnicodeSet& set,
	2045	const UnicodeString& setName,
	2046	const UnicodeString& charsIn,
	2047	const UnicodeString& charsOut) {
	2048	UnicodeString bad;
	2049	UChar32 c;
	2050	int32_t i;
	2051
	2052	for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
	2053	c = charsIn.char32At(i);
	2054	if (!set.contains(c)) {
	2055	bad.append(c);
	2056	}
	2057	}
	2058	if (bad.length() > 0) {
	2059	errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
	2060	", expected containment of " + prettify(charsIn));
	2061	} else {
	2062	logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
	2063	}
	2064
	2065	bad.truncate(0);
	2066	for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
	2067	c = charsOut.char32At(i);
	2068	if (set.contains(c)) {
	2069	bad.append(c);
	2070	}
	2071	}
	2072	if (bad.length() > 0) {
	2073	errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
	2074	", expected non-containment of " + prettify(charsOut));
	2075	} else {
	2076	logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
	2077	}
	2078	}
	2079
	2080	void
	2081	UnicodeSetTest::expectPattern(UnicodeSet& set,
	2082	const UnicodeString& pattern,
	2083	const UnicodeString& expectedPairs){
	2084	UErrorCode status = U_ZERO_ERROR;
	2085	set.applyPattern(pattern, status);
	2086	if (U_FAILURE(status)) {
	2087	errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
	2088	"\") failed");
	2089	return;
	2090	} else {
	2091	if (getPairs(set) != expectedPairs ) {
	2092	errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
	2093	"\") => pairs \"" +
	2094	escape(getPairs(set)) + "\", expected \"" +
	2095	escape(expectedPairs) + "\"");
	2096	} else {
	2097	logln(UnicodeString("Ok: applyPattern(\"") + pattern +
	2098	"\") => pairs \"" +
	2099	escape(getPairs(set)) + "\"");
	2100	}
	2101	}
	2102	// the result of calling set.toPattern(), which is the string representation of
	2103	// this set(set), is passed to a UnicodeSet constructor, and tested that it
	2104	// will produce another set that is equal to this one.
	2105	UnicodeString temppattern;
	2106	set.toPattern(temppattern);
	2107	UnicodeSet *tempset=new UnicodeSet(temppattern, status);
	2108	if (U_FAILURE(status)) {
	2109	errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
	2110	return;
	2111	}
	2112	if(tempset != set \|\| getPairs(tempset) != getPairs(set)){
	2113	errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
	2114	escape(getPairs(set)) + "\""));
	2115	} else{
	2116	logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
	2117	}
	2118
	2119	delete tempset;
	2120
	2121	}
	2122
	2123	void
	2124	UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
	2125	if (getPairs(set) != expectedPairs) {
	2126	errln(UnicodeString("FAIL: Expected pair list \"") +
	2127	escape(expectedPairs) + "\", got \"" +
	2128	escape(getPairs(set)) + "\"");
	2129	}
	2130	}
	2131
	2132	void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
	2133	const UnicodeString& expPat,
	2134	const char** expStrings) {
	2135	UnicodeString pat;
	2136	set.toPattern(pat, TRUE);
	2137	if (pat == expPat) {
	2138	logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
	2139	} else {
	2140	errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
	2141	return;
	2142	}
	2143	if (expStrings == NULL) {
	2144	return;
	2145	}
	2146	UBool in = TRUE;
	2147	for (int32_t i=0; expStrings[i] != NULL; ++i) {
	2148	if (expStrings[i] == NOT) { // sic; pointer comparison
	2149	in = FALSE;
	2150	continue;
	2151	}
	2152	UnicodeString s = CharsToUnicodeString(expStrings[i]);
	2153	UBool contained = set.contains(s);
	2154	if (contained == in) {
	2155	logln((UnicodeString)"Ok: " + expPat +
	2156	(contained ? " contains {" : " does not contain {") +
	2157	escape(expStrings[i]) + "}");
	2158	} else {
	2159	errln((UnicodeString)"FAIL: " + expPat +
	2160	(contained ? " contains {" : " does not contain {") +
	2161	escape(expStrings[i]) + "}");
	2162	}
	2163	}
	2164	}
	2165
	2166	static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
	2167
	2168	void
	2169	UnicodeSetTest::doAssert(UBool condition, const char *message)
	2170	{
	2171	if (!condition) {
	2172	errln(UnicodeString("ERROR : ") + message);
	2173	}
	2174	}
	2175
	2176	UnicodeString
	2177	UnicodeSetTest::escape(const UnicodeString& s) {
	2178	UnicodeString buf;
	2179	for (int32_t i=0; i<s.length(); )
	2180	{
	2181	UChar32 c = s.char32At(i);
	2182	if (0x0020 <= c && c <= 0x007F) {
	2183	buf += c;
	2184	} else {
	2185	if (c <= 0xFFFF) {
	2186	buf += (UChar)0x5c; buf += (UChar)0x75;
	2187	} else {
	2188	buf += (UChar)0x5c; buf += (UChar)0x55;
	2189	buf += toHexString((c & 0xF0000000) >> 28);
	2190	buf += toHexString((c & 0x0F000000) >> 24);
	2191	buf += toHexString((c & 0x00F00000) >> 20);
	2192	buf += toHexString((c & 0x000F0000) >> 16);
	2193	}
	2194	buf += toHexString((c & 0xF000) >> 12);
	2195	buf += toHexString((c & 0x0F00) >> 8);
	2196	buf += toHexString((c & 0x00F0) >> 4);
	2197	buf += toHexString(c & 0x000F);
	2198	}
	2199	i += U16_LENGTH(c);
	2200	}
	2201	return buf;
	2202	}
	2203
	2204	void UnicodeSetTest::TestFreezable() {
	2205	UErrorCode errorCode=U_ZERO_ERROR;
	2206	UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
	2207	UnicodeSet idSet(idPattern, errorCode);
	2208	if(U_FAILURE(errorCode)) {
	2209	dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
	2210	return;
	2211	}
	2212
	2213	UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
	2214	UnicodeSet wsSet(wsPattern, errorCode);
	2215	if(U_FAILURE(errorCode)) {
	2216	dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
	2217	return;
	2218	}
	2219
	2220	idSet.add(idPattern);
	2221	UnicodeSet frozen(idSet);
	2222	frozen.freeze();
	2223
	2224	if(idSet.isFrozen() \|\| !frozen.isFrozen()) {
	2225	errln("FAIL: isFrozen() is wrong");
	2226	}
	2227	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2228	errln("FAIL: a copy-constructed frozen set differs from its original");
	2229	}
	2230
	2231	frozen=wsSet;
	2232	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2233	errln("FAIL: a frozen set was modified by operator=");
	2234	}
	2235
	2236	UnicodeSet frozen2(frozen);
	2237	if(frozen2!=frozen \|\| frozen2!=idSet) {
	2238	errln("FAIL: a copied frozen set differs from its frozen original");
	2239	}
	2240	if(!frozen2.isFrozen()) {
	2241	errln("FAIL: copy-constructing a frozen set results in a thawed one");
	2242	}
	2243	UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
	2244	if(frozen3.contains(0, 4) \|\| !frozen3.contains(5, 55) \|\| frozen3.contains(56, 0x10ffff)) {
	2245	errln("FAIL: UnicodeSet(5, 55) failed");
	2246	}
	2247	frozen3=frozen;
	2248	if(!frozen3.isFrozen()) {
	2249	errln("FAIL: copying a frozen set results in a thawed one");
	2250	}
	2251
	2252	UnicodeSet cloned=(UnicodeSet )frozen.clone();
	2253	if(!cloned->isFrozen() \|\| *cloned!=frozen \|\| cloned->containsSome(0xd802, 0xd805)) {
	2254	errln("FAIL: clone() failed");
	2255	}
	2256	cloned->add(0xd802, 0xd805);
	2257	if(cloned->containsSome(0xd802, 0xd805)) {
	2258	errln("FAIL: unable to modify clone");
	2259	}
	2260	delete cloned;
	2261
	2262	UnicodeSet thawed=(UnicodeSet )frozen.cloneAsThawed();
	2263	if(thawed->isFrozen() \|\| *thawed!=frozen \|\| thawed->containsSome(0xd802, 0xd805)) {
	2264	errln("FAIL: cloneAsThawed() failed");
	2265	}
	2266	thawed->add(0xd802, 0xd805);
	2267	if(!thawed->contains(0xd802, 0xd805)) {
	2268	errln("FAIL: unable to modify thawed clone");
	2269	}
	2270	delete thawed;
	2271
	2272	frozen.set(5, 55);
	2273	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2274	errln("FAIL: UnicodeSet::set() modified a frozen set");
	2275	}
	2276
	2277	frozen.clear();
	2278	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2279	errln("FAIL: UnicodeSet::clear() modified a frozen set");
	2280	}
	2281
	2282	frozen.closeOver(USET_CASE_INSENSITIVE);
	2283	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2284	errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
	2285	}
	2286
	2287	frozen.compact();
	2288	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2289	errln("FAIL: UnicodeSet::compact() modified a frozen set");
	2290	}
	2291
	2292	ParsePosition pos;
	2293	frozen.
	2294	applyPattern(wsPattern, errorCode).
	2295	applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
	2296	applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
	2297	applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
	2298	applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
	2299	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2300	errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
	2301	}
	2302
	2303	frozen.
	2304	add(0xd800).
	2305	add(0xd802, 0xd805).
	2306	add(wsPattern).
	2307	addAll(idPattern).
	2308	addAll(wsSet);
	2309	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2310	errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
	2311	}
	2312
	2313	frozen.
	2314	retain(0x62).
	2315	retain(0x64, 0x69).
	2316	retainAll(wsPattern).
	2317	retainAll(wsSet);
	2318	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2319	errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
	2320	}
	2321
	2322	frozen.
	2323	remove(0x62).
	2324	remove(0x64, 0x69).
	2325	remove(idPattern).
	2326	removeAll(idPattern).
	2327	removeAll(idSet);
	2328	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2329	errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
	2330	}
	2331
	2332	frozen.
	2333	complement().
	2334	complement(0x62).
	2335	complement(0x64, 0x69).
	2336	complement(idPattern).
	2337	complementAll(idPattern).
	2338	complementAll(idSet);
	2339	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2340	errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
	2341	}
	2342	}
	2343
	2344	// Test span() etc. -------------------------------------------------------- ***
	2345
	2346	// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
	2347	static int32_t
	2348	appendUTF8(const UChar s, int32_t length, char t, int32_t capacity) {
	2349	UErrorCode errorCode=U_ZERO_ERROR;
	2350	int32_t length8=0;
	2351	u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
	2352	if(U_SUCCESS(errorCode)) {
	2353	return length8;
	2354	} else {
	2355	// The string contains an unpaired surrogate.
	2356	// Ignore this string.
	2357	return 0;
	2358	}
	2359	}
	2360
	2361	class UnicodeSetWithStringsIterator;
	2362
	2363	// Make the strings in a UnicodeSet easily accessible.
	2364	class UnicodeSetWithStrings {
	2365	public:
	2366	UnicodeSetWithStrings(const UnicodeSet &normalSet) :
	2367	set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
	2368	int32_t size=set.size();
	2369	if(size>0 && set.charAt(size-1)<0) {
	2370	// If a set's last element is not a code point, then it must contain strings.
	2371	// Iterate over the set, skip all code point ranges, and cache the strings.
	2372	// Convert them to UTF-8 for spanUTF8().
	2373	UnicodeSetIterator iter(set);
	2374	const UnicodeString *s;
	2375	char *s8=utf8;
	2376	int32_t length8, utf8Count=0;
	2377	while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
	2378	if(iter.isString()) {
	2379	// Store the pointer to the set's string element
	2380	// which we happen to know is a stable pointer.
	2381	strings[stringsLength]=s=&iter.getString();
	2382	utf8Count+=
	2383	utf8Lengths[stringsLength]=length8=
	2384	appendUTF8(s->getBuffer(), s->length(),
	2385	s8, (int32_t)(sizeof(utf8)-utf8Count));
	2386	if(length8==0) {
	2387	hasSurrogates=TRUE; // Contains unpaired surrogates.
	2388	}
	2389	s8+=length8;
	2390	++stringsLength;
	2391	}
	2392	}
	2393	}
	2394	}
	2395
	2396	const UnicodeSet &getSet() const {
	2397	return set;
	2398	}
	2399
	2400	UBool hasStrings() const {
	2401	return (UBool)(stringsLength>0);
	2402	}
	2403
	2404	UBool hasStringsWithSurrogates() const {
	2405	return hasSurrogates;
	2406	}
	2407
	2408	private:
	2409	friend class UnicodeSetWithStringsIterator;
	2410
	2411	const UnicodeSet &set;
	2412
	2413	const UnicodeString *strings[20];
	2414	int32_t stringsLength;
	2415	UBool hasSurrogates;
	2416
	2417	char utf8[1024];
	2418	int32_t utf8Lengths[20];
	2419	};
	2420
	2421	class UnicodeSetWithStringsIterator {
	2422	public:
	2423	UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
	2424	fSet(set), nextStringIndex(0), nextUTF8Start(0) {
	2425	}
	2426
	2427	void reset() {
	2428	nextStringIndex=nextUTF8Start=0;
	2429	}
	2430
	2431	const UnicodeString *nextString() {
	2432	if(nextStringIndex<fSet.stringsLength) {
	2433	return fSet.strings[nextStringIndex++];
	2434	} else {
	2435	return NULL;
	2436	}
	2437	}
	2438
	2439	// Do not mix with calls to nextString().
	2440	const char *nextUTF8(int32_t &length) {
	2441	if(nextStringIndex<fSet.stringsLength) {
	2442	const char *s8=fSet.utf8+nextUTF8Start;
	2443	nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
	2444	return s8;
	2445	} else {
	2446	length=0;
	2447	return NULL;
	2448	}
	2449	}
	2450
	2451	private:
	2452	const UnicodeSetWithStrings &fSet;
	2453	int32_t nextStringIndex;
	2454	int32_t nextUTF8Start;
	2455	};
	2456
	2457	// Compare 16-bit Unicode strings (which may be malformed UTF-16)
	2458	// at code point boundaries.
	2459	// That is, each edge of a match must not be in the middle of a surrogate pair.
	2460	static inline UBool
	2461	matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
	2462	s+=start;
	2463	limit-=start;
	2464	int32_t length=t.length();
	2465	return 0==t.compare(s, length) &&
	2466	!(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
	2467	!(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
	2468	}
	2469
	2470	// Implement span() with contains() for comparison.
	2471	static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
	2472	USetSpanCondition spanCondition) {
	2473	const UnicodeSet &realSet(set.getSet());
	2474	if(!set.hasStrings()) {
	2475	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2476	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2477	}
	2478
	2479	UChar32 c;
	2480	int32_t start=0, prev;
	2481	while((prev=start)<length) {
	2482	U16_NEXT(s, start, length, c);
	2483	if(realSet.contains(c)!=spanCondition) {
	2484	break;
	2485	}
	2486	}
	2487	return prev;
	2488	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2489	UnicodeSetWithStringsIterator iter(set);
	2490	UChar32 c;
	2491	int32_t start, next;
	2492	for(start=next=0; start<length;) {
	2493	U16_NEXT(s, next, length, c);
	2494	if(realSet.contains(c)) {
	2495	break;
	2496	}
	2497	const UnicodeString *str;
	2498	iter.reset();
	2499	while((str=iter.nextString())!=NULL) {
	2500	if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
	2501	// spanNeedsStrings=TRUE;
	2502	return start;
	2503	}
	2504	}
	2505	start=next;
	2506	}
	2507	return start;
	2508	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2509	UnicodeSetWithStringsIterator iter(set);
	2510	UChar32 c;
	2511	int32_t start, next, maxSpanLimit=0;
	2512	for(start=next=0; start<length;) {
	2513	U16_NEXT(s, next, length, c);
	2514	if(!realSet.contains(c)) {
	2515	next=start; // Do not span this single, not-contained code point.
	2516	}
	2517	const UnicodeString *str;
	2518	iter.reset();
	2519	while((str=iter.nextString())!=NULL) {
	2520	if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
	2521	// spanNeedsStrings=TRUE;
	2522	int32_t matchLimit=start+str->length();
	2523	if(matchLimit==length) {
	2524	return length;
	2525	}
	2526	if(spanCondition==USET_SPAN_CONTAINED) {
	2527	// Iterate for the shortest match at each position.
	2528	// Recurse for each but the shortest match.
	2529	if(next==start) {
	2530	next=matchLimit; // First match from start.
	2531	} else {
	2532	if(matchLimit<next) {
	2533	// Remember shortest match from start for iteration.
	2534	int32_t temp=next;
	2535	next=matchLimit;
	2536	matchLimit=temp;
	2537	}
	2538	// Recurse for non-shortest match from start.
	2539	int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
	2540	USET_SPAN_CONTAINED);
	2541	if((matchLimit+spanLength)>maxSpanLimit) {
	2542	maxSpanLimit=matchLimit+spanLength;
	2543	if(maxSpanLimit==length) {
	2544	return length;
	2545	}
	2546	}
	2547	}
	2548	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2549	if(matchLimit>next) {
	2550	// Remember longest match from start.
	2551	next=matchLimit;
	2552	}
	2553	}
	2554	}
	2555	}
	2556	if(next==start) {
	2557	break; // No match from start.
	2558	}
	2559	start=next;
	2560	}
	2561	if(start>maxSpanLimit) {
	2562	return start;
	2563	} else {
	2564	return maxSpanLimit;
	2565	}
	2566	}
	2567	}
	2568
	2569	static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
	2570	USetSpanCondition spanCondition) {
	2571	if(length==0) {
	2572	return 0;
	2573	}
	2574	const UnicodeSet &realSet(set.getSet());
	2575	if(!set.hasStrings()) {
	2576	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2577	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2578	}
	2579
	2580	UChar32 c;
	2581	int32_t prev=length;
	2582	do {
	2583	U16_PREV(s, 0, length, c);
	2584	if(realSet.contains(c)!=spanCondition) {
	2585	break;
	2586	}
	2587	} while((prev=length)>0);
	2588	return prev;
	2589	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2590	UnicodeSetWithStringsIterator iter(set);
	2591	UChar32 c;
	2592	int32_t prev=length, length0=length;
	2593	do {
	2594	U16_PREV(s, 0, length, c);
	2595	if(realSet.contains(c)) {
	2596	break;
	2597	}
	2598	const UnicodeString *str;
	2599	iter.reset();
	2600	while((str=iter.nextString())!=NULL) {
	2601	if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
	2602	// spanNeedsStrings=TRUE;
	2603	return prev;
	2604	}
	2605	}
	2606	} while((prev=length)>0);
	2607	return prev;
	2608	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2609	UnicodeSetWithStringsIterator iter(set);
	2610	UChar32 c;
	2611	int32_t prev=length, minSpanStart=length, length0=length;
	2612	do {
	2613	U16_PREV(s, 0, length, c);
	2614	if(!realSet.contains(c)) {
	2615	length=prev; // Do not span this single, not-contained code point.
	2616	}
	2617	const UnicodeString *str;
	2618	iter.reset();
	2619	while((str=iter.nextString())!=NULL) {
	2620	if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
	2621	// spanNeedsStrings=TRUE;
	2622	int32_t matchStart=prev-str->length();
	2623	if(matchStart==0) {
	2624	return 0;
	2625	}
	2626	if(spanCondition==USET_SPAN_CONTAINED) {
	2627	// Iterate for the shortest match at each position.
	2628	// Recurse for each but the shortest match.
	2629	if(length==prev) {
	2630	length=matchStart; // First match from prev.
	2631	} else {
	2632	if(matchStart>length) {
	2633	// Remember shortest match from prev for iteration.
	2634	int32_t temp=length;
	2635	length=matchStart;
	2636	matchStart=temp;
	2637	}
	2638	// Recurse for non-shortest match from prev.
	2639	int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
	2640	USET_SPAN_CONTAINED);
	2641	if(spanStart<minSpanStart) {
	2642	minSpanStart=spanStart;
	2643	if(minSpanStart==0) {
	2644	return 0;
	2645	}
	2646	}
	2647	}
	2648	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2649	if(matchStart<length) {
	2650	// Remember longest match from prev.
	2651	length=matchStart;
	2652	}
	2653	}
	2654	}
	2655	}
	2656	if(length==prev) {
	2657	break; // No match from prev.
	2658	}
	2659	} while((prev=length)>0);
	2660	if(prev<minSpanStart) {
	2661	return prev;
	2662	} else {
	2663	return minSpanStart;
	2664	}
	2665	}
	2666	}
	2667
	2668	static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
	2669	USetSpanCondition spanCondition) {
	2670	const UnicodeSet &realSet(set.getSet());
	2671	if(!set.hasStrings()) {
	2672	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2673	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2674	}
	2675
	2676	UChar32 c;
	2677	int32_t start=0, prev;
	2678	while((prev=start)<length) {
	2679	U8_NEXT_OR_FFFD(s, start, length, c);
	2680	if(realSet.contains(c)!=spanCondition) {
	2681	break;
	2682	}
	2683	}
	2684	return prev;
	2685	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2686	UnicodeSetWithStringsIterator iter(set);
	2687	UChar32 c;
	2688	int32_t start, next;
	2689	for(start=next=0; start<length;) {
	2690	U8_NEXT_OR_FFFD(s, next, length, c);
	2691	if(realSet.contains(c)) {
	2692	break;
	2693	}
	2694	const char *s8;
	2695	int32_t length8;
	2696	iter.reset();
	2697	while((s8=iter.nextUTF8(length8))!=NULL) {
	2698	if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
	2699	// spanNeedsStrings=TRUE;
	2700	return start;
	2701	}
	2702	}
	2703	start=next;
	2704	}
	2705	return start;
	2706	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2707	UnicodeSetWithStringsIterator iter(set);
	2708	UChar32 c;
	2709	int32_t start, next, maxSpanLimit=0;
	2710	for(start=next=0; start<length;) {
	2711	U8_NEXT_OR_FFFD(s, next, length, c);
	2712	if(!realSet.contains(c)) {
	2713	next=start; // Do not span this single, not-contained code point.
	2714	}
	2715	const char *s8;
	2716	int32_t length8;
	2717	iter.reset();
	2718	while((s8=iter.nextUTF8(length8))!=NULL) {
	2719	if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
	2720	// spanNeedsStrings=TRUE;
	2721	int32_t matchLimit=start+length8;
	2722	if(matchLimit==length) {
	2723	return length;
	2724	}
	2725	if(spanCondition==USET_SPAN_CONTAINED) {
	2726	// Iterate for the shortest match at each position.
	2727	// Recurse for each but the shortest match.
	2728	if(next==start) {
	2729	next=matchLimit; // First match from start.
	2730	} else {
	2731	if(matchLimit<next) {
	2732	// Remember shortest match from start for iteration.
	2733	int32_t temp=next;
	2734	next=matchLimit;
	2735	matchLimit=temp;
	2736	}
	2737	// Recurse for non-shortest match from start.
	2738	int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
	2739	USET_SPAN_CONTAINED);
	2740	if((matchLimit+spanLength)>maxSpanLimit) {
	2741	maxSpanLimit=matchLimit+spanLength;
	2742	if(maxSpanLimit==length) {
	2743	return length;
	2744	}
	2745	}
	2746	}
	2747	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2748	if(matchLimit>next) {
	2749	// Remember longest match from start.
	2750	next=matchLimit;
	2751	}
	2752	}
	2753	}
	2754	}
	2755	if(next==start) {
	2756	break; // No match from start.
	2757	}
	2758	start=next;
	2759	}
	2760	if(start>maxSpanLimit) {
	2761	return start;
	2762	} else {
	2763	return maxSpanLimit;
	2764	}
	2765	}
	2766	}
	2767
	2768	static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
	2769	USetSpanCondition spanCondition) {
	2770	if(length==0) {
	2771	return 0;
	2772	}
	2773	const UnicodeSet &realSet(set.getSet());
	2774	if(!set.hasStrings()) {
	2775	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2776	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2777	}
	2778
	2779	UChar32 c;
	2780	int32_t prev=length;
	2781	do {
	2782	U8_PREV_OR_FFFD(s, 0, length, c);
	2783	if(realSet.contains(c)!=spanCondition) {
	2784	break;
	2785	}
	2786	} while((prev=length)>0);
	2787	return prev;
	2788	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2789	UnicodeSetWithStringsIterator iter(set);
	2790	UChar32 c;
	2791	int32_t prev=length;
	2792	do {
	2793	U8_PREV_OR_FFFD(s, 0, length, c);
	2794	if(realSet.contains(c)) {
	2795	break;
	2796	}
	2797	const char *s8;
	2798	int32_t length8;
	2799	iter.reset();
	2800	while((s8=iter.nextUTF8(length8))!=NULL) {
	2801	if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
	2802	// spanNeedsStrings=TRUE;
	2803	return prev;
	2804	}
	2805	}
	2806	} while((prev=length)>0);
	2807	return prev;
	2808	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2809	UnicodeSetWithStringsIterator iter(set);
	2810	UChar32 c;
	2811	int32_t prev=length, minSpanStart=length;
	2812	do {
	2813	U8_PREV_OR_FFFD(s, 0, length, c);
	2814	if(!realSet.contains(c)) {
	2815	length=prev; // Do not span this single, not-contained code point.
	2816	}
	2817	const char *s8;
	2818	int32_t length8;
	2819	iter.reset();
	2820	while((s8=iter.nextUTF8(length8))!=NULL) {
	2821	if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
	2822	// spanNeedsStrings=TRUE;
	2823	int32_t matchStart=prev-length8;
	2824	if(matchStart==0) {
	2825	return 0;
	2826	}
	2827	if(spanCondition==USET_SPAN_CONTAINED) {
	2828	// Iterate for the shortest match at each position.
	2829	// Recurse for each but the shortest match.
	2830	if(length==prev) {
	2831	length=matchStart; // First match from prev.
	2832	} else {
	2833	if(matchStart>length) {
	2834	// Remember shortest match from prev for iteration.
	2835	int32_t temp=length;
	2836	length=matchStart;
	2837	matchStart=temp;
	2838	}
	2839	// Recurse for non-shortest match from prev.
	2840	int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
	2841	USET_SPAN_CONTAINED);
	2842	if(spanStart<minSpanStart) {
	2843	minSpanStart=spanStart;
	2844	if(minSpanStart==0) {
	2845	return 0;
	2846	}
	2847	}
	2848	}
	2849	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2850	if(matchStart<length) {
	2851	// Remember longest match from prev.
	2852	length=matchStart;
	2853	}
	2854	}
	2855	}
	2856	}
	2857	if(length==prev) {
	2858	break; // No match from prev.
	2859	}
	2860	} while((prev=length)>0);
	2861	if(prev<minSpanStart) {
	2862	return prev;
	2863	} else {
	2864	return minSpanStart;
	2865	}
	2866	}
	2867	}
	2868
	2869	// spans to be performed and compared
	2870	enum {
	2871	SPAN_UTF16 =1,
	2872	SPAN_UTF8 =2,
	2873	SPAN_UTFS =3,
	2874
	2875	SPAN_SET =4,
	2876	SPAN_COMPLEMENT =8,
	2877	SPAN_POLARITY =0xc,
	2878
	2879	SPAN_FWD =0x10,
	2880	SPAN_BACK =0x20,
	2881	SPAN_DIRS =0x30,
	2882
	2883	SPAN_CONTAINED =0x100,
	2884	SPAN_SIMPLE =0x200,
	2885	SPAN_CONDITION =0x300,
	2886
	2887	SPAN_ALL =0x33f
	2888	};
	2889
	2890	static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
	2891	return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
	2892	}
	2893
	2894	static inline int32_t slen(const void *s, UBool isUTF16) {
	2895	return isUTF16 ? u_strlen((const UChar )s) : static_cast<int32_t>(strlen((const char )s));
	2896	}
	2897
	2898	/*
	2899	* Count spans on a string with the method according to type and set the span limits.
	2900	* The set may be the complement of the original.
	2901	* When using spanBack() and comparing with span(), use a span condition for the first spanBack()
	2902	* according to the expected number of spans.
	2903	* Sets typeName to an empty string if there is no such type.
	2904	* Returns -1 if the span option is filtered out.
	2905	*/
	2906	static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
	2907	const void *s, int32_t length, UBool isUTF16,
	2908	uint32_t whichSpans,
	2909	int type, const char *&typeName,
	2910	int32_t limits[], int32_t limitsCapacity,
	2911	int32_t expectCount) {
	2912	const UnicodeSet &realSet(set.getSet());
	2913	int32_t start, count;
	2914	USetSpanCondition spanCondition, firstSpanCondition, contained;
	2915	UBool isForward;
	2916
	2917	if(type<0 \|\| 7<type) {
	2918	typeName="";
	2919	return 0;
	2920	}
	2921
	2922	static const char *const typeNames16[]={
	2923	"contains", "contains(LM)",
	2924	"span", "span(LM)",
	2925	"containsBack", "containsBack(LM)",
	2926	"spanBack", "spanBack(LM)"
	2927	};
	2928
	2929	static const char *const typeNames8[]={
	2930	"containsUTF8", "containsUTF8(LM)",
	2931	"spanUTF8", "spanUTF8(LM)",
	2932	"containsBackUTF8", "containsBackUTF8(LM)", // not implemented
	2933	"spanBackUTF8", "spanBackUTF8(LM)"
	2934	};
	2935
	2936	typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
	2937
	2938	// filter span options
	2939	if(type<=3) {
	2940	// span forward
	2941	if((whichSpans&SPAN_FWD)==0) {
	2942	return -1;
	2943	}
	2944	isForward=TRUE;
	2945	} else {
	2946	// span backward
	2947	if((whichSpans&SPAN_BACK)==0) {
	2948	return -1;
	2949	}
	2950	isForward=FALSE;
	2951	}
	2952	if((type&1)==0) {
	2953	// use USET_SPAN_CONTAINED
	2954	if((whichSpans&SPAN_CONTAINED)==0) {
	2955	return -1;
	2956	}
	2957	contained=USET_SPAN_CONTAINED;
	2958	} else {
	2959	// use USET_SPAN_SIMPLE
	2960	if((whichSpans&SPAN_SIMPLE)==0) {
	2961	return -1;
	2962	}
	2963	contained=USET_SPAN_SIMPLE;
	2964	}
	2965
	2966	// Default first span condition for going forward with an uncomplemented set.
	2967	spanCondition=USET_SPAN_NOT_CONTAINED;
	2968	if(isComplement) {
	2969	spanCondition=invertSpanCondition(spanCondition, contained);
	2970	}
	2971
	2972	// First span condition for span(), used to terminate the spanBack() iteration.
	2973	firstSpanCondition=spanCondition;
	2974
	2975	// spanBack(): Its initial span condition is span()'s last span condition,
	2976	// which is the opposite of span()'s first span condition
	2977	// if we expect an even number of spans.
	2978	// (The loop inverts spanCondition (expectCount-1) times
	2979	// before the expectCount'th span() call.)
	2980	// If we do not compare forward and backward directions, then we do not have an
	2981	// expectCount and just start with firstSpanCondition.
	2982	if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
	2983	spanCondition=invertSpanCondition(spanCondition, contained);
	2984	}
	2985
	2986	count=0;
	2987	switch(type) {
	2988	case 0:
	2989	case 1:
	2990	start=0;
	2991	if(length<0) {
	2992	length=slen(s, isUTF16);
	2993	}
	2994	for(;;) {
	2995	start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
	2996	containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
	2997	if(count<limitsCapacity) {
	2998	limits[count]=start;
	2999	}
	3000	++count;
	3001	if(start>=length) {
	3002	break;
	3003	}
	3004	spanCondition=invertSpanCondition(spanCondition, contained);
	3005	}
	3006	break;
	3007	case 2:
	3008	case 3:
	3009	start=0;
	3010	for(;;) {
	3011	start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
	3012	realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
	3013	if(count<limitsCapacity) {
	3014	limits[count]=start;
	3015	}
	3016	++count;
	3017	if(length>=0 ? start>=length :
	3018	isUTF16 ? ((const UChar *)s)[start]==0 :
	3019	((const char *)s)[start]==0
	3020	) {
	3021	break;
	3022	}
	3023	spanCondition=invertSpanCondition(spanCondition, contained);
	3024	}
	3025	break;
	3026	case 4:
	3027	case 5:
	3028	if(length<0) {
	3029	length=slen(s, isUTF16);
	3030	}
	3031	for(;;) {
	3032	++count;
	3033	if(count<=limitsCapacity) {
	3034	limits[limitsCapacity-count]=length;
	3035	}
	3036	length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
	3037	containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
	3038	if(length==0 && spanCondition==firstSpanCondition) {
	3039	break;
	3040	}
	3041	spanCondition=invertSpanCondition(spanCondition, contained);
	3042	}
	3043	if(count<limitsCapacity) {
	3044	memmove(limits, limits+(limitsCapacity-count), count*4);
	3045	}
	3046	break;
	3047	case 6:
	3048	case 7:
	3049	for(;;) {
	3050	++count;
	3051	if(count<=limitsCapacity) {
	3052	limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
	3053	}
	3054	// Note: Length<0 is tested only for the first spanBack().
	3055	// If we wanted to keep length<0 for all spanBack()s, we would have to
	3056	// temporarily modify the string by placing a NUL where the previous spanBack() stopped.
	3057	length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
	3058	realSet.spanBackUTF8((const char *)s, length, spanCondition);
	3059	if(length==0 && spanCondition==firstSpanCondition) {
	3060	break;
	3061	}
	3062	spanCondition=invertSpanCondition(spanCondition, contained);
	3063	}
	3064	if(count<limitsCapacity) {
	3065	memmove(limits, limits+(limitsCapacity-count), count*4);
	3066	}
	3067	break;
	3068	default:
	3069	typeName="";
	3070	return -1;
	3071	}
	3072
	3073	return count;
	3074	}
	3075
	// Slots for the sets under test. Each even slot holds an original set and the
	// odd slot right after it holds that set's complement, so (slot&1)==isComplement.
	enum {
	    SLOW = 0,      // unfrozen set
	    SLOW_NOT = 1,  // complement of the unfrozen set
	    FAST = 2,      // frozen set
	    FAST_NOT = 3,  // complement of the frozen set
	    SET_COUNT = 4  // number of slots
	};

	// Names of the slots above, indexed by slot; they label failure messages.
	static const char *const setNames[SET_COUNT]={
	    "slow", "slow.not", "fast", "fast.not"
	};
	3091
	3092	/*
	3093	* Verify that we get the same results whether we look at text with contains(),
	3094	* span() or spanBack(), using unfrozen or frozen versions of the set,
	3095	* and using the set or its complement (switching the spanConditions accordingly).
	3096	* The latter verifies that
	3097	* set.span(spanCondition) == set.complement().span(!spanCondition).
	3098	*
	3099	* The expectLimits[] are either provided by the caller (with expectCount>=0)
	3100	* or returned to the caller (with an input expectCount<0).
	3101	*/
	3102	void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
	3103	const void *s, int32_t length, UBool isUTF16,
	3104	uint32_t whichSpans,
	3105	int32_t expectLimits[], int32_t &expectCount,
	3106	const char *testName, int32_t index) {
	3107	int32_t limits[500];
	3108	int32_t limitsCount;
	3109	int i, j;
	3110
	3111	const char *typeName;
	3112	int type;
	3113
	3114	for(i=0; i<SET_COUNT; ++i) {
	3115	if((i&1)==0) {
	3116	// Even-numbered sets are original, uncomplemented sets.
	3117	if((whichSpans&SPAN_SET)==0) {
	3118	continue;
	3119	}
	3120	} else {
	3121	// Odd-numbered sets are complemented.
	3122	if((whichSpans&SPAN_COMPLEMENT)==0) {
	3123	continue;
	3124	}
	3125	}
	3126	for(type=0;; ++type) {
	3127	limitsCount=getSpans(*sets[i], (UBool)(i&1),
	3128	s, length, isUTF16,
	3129	whichSpans,
	3130	type, typeName,
	3131	limits, UPRV_LENGTHOF(limits), expectCount);
	3132	if(typeName[0]==0) {
	3133	break; // All types tried.
	3134	}
	3135	if(limitsCount<0) {
	3136	continue; // Span option filtered out.
	3137	}
	3138	if(expectCount<0) {
	3139	expectCount=limitsCount;
	3140	if(limitsCount>UPRV_LENGTHOF(limits)) {
	3141	errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
	3142	testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
	3143	return;
	3144	}
	3145	memcpy(expectLimits, limits, limitsCount*4);
	3146	} else if(limitsCount!=expectCount) {
	3147	errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
	3148	testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
	3149	} else {
	3150	for(j=0; j<limitsCount; ++j) {
	3151	if(limits[j]!=expectLimits[j]) {
	3152	errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
	3153	testName, (long)index, setNames[i], typeName, (long)limitsCount,
	3154	j, (long)limits[j], (long)expectLimits[j]);
	3155	break;
	3156	}
	3157	}
	3158	}
	3159	}
	3160	}
	3161
	3162	// Compare span() with containsAll()/containsNone(),
	3163	// but only if we have expectLimits[] from the uncomplemented set.
	3164	if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
	3165	const UChar s16=(const UChar )s;
	3166	UnicodeString string;
	3167	int32_t prev=0, limit, length;
	3168	for(i=0; i<expectCount; ++i) {
	3169	limit=expectLimits[i];
	3170	length=limit-prev;
	3171	if(length>0) {
	3172	string.setTo(FALSE, s16+prev, length); // read-only alias
	3173	if(i&1) {
	3174	if(!sets[SLOW]->getSet().containsAll(string)) {
	3175	errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
	3176	testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
	3177	return;
	3178	}
	3179	if(!sets[FAST]->getSet().containsAll(string)) {
	3180	errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
	3181	testName, (long)index, setNames[FAST], (long)prev, (long)limit);
	3182	return;
	3183	}
	3184	} else {
	3185	if(!sets[SLOW]->getSet().containsNone(string)) {
	3186	errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
	3187	testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
	3188	return;
	3189	}
	3190	if(!sets[FAST]->getSet().containsNone(string)) {
	3191	errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
	3192	testName, (long)index, setNames[FAST], (long)prev, (long)limit);
	3193	return;
	3194	}
	3195	}
	3196	}
	3197	prev=limit;
	3198	}
	3199	}
	3200	}
	3201
	3202	// Specifically test either UTF-16 or UTF-8.
	3203	void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
	3204	const void *s, int32_t length, UBool isUTF16,
	3205	uint32_t whichSpans,
	3206	const char *testName, int32_t index) {
	3207	int32_t expectLimits[500];
	3208	int32_t expectCount=-1;
	3209	testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
	3210	}
	3211
	3212	UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
	3213	UChar c, c2;
	3214
	3215	if(length>=0) {
	3216	while(length>0) {
	3217	c=*s++;
	3218	--length;
	3219	if(0xd800<=c && c<0xe000) {
	3220	if(c>=0xdc00 \|\| length==0 \|\| !U16_IS_TRAIL(c2=*s++)) {
	3221	return TRUE;
	3222	}
	3223	--length;
	3224	}
	3225	}
	3226	} else {
	3227	while((c=*s++)!=0) {
	3228	if(0xd800<=c && c<0xe000) {
	3229	if(c>=0xdc00 \|\| !U16_IS_TRAIL(c2=*s++)) {
	3230	return TRUE;
	3231	}
	3232	}
	3233	}
	3234	}
	3235	return FALSE;
	3236	}
	3237
	3238	// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
	3239	// unless either UTF is turned off in whichSpans.
	3240	// Testing UTF-16 and UTF-8 together requires that surrogate code points
	3241	// have the same contains(c) value as U+FFFD.
	3242	void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
	3243	const UChar *s16, int32_t length16,
	3244	uint32_t whichSpans,
	3245	const char *testName, int32_t index) {
	3246	int32_t expectLimits[500];
	3247	int32_t expectCount;
	3248
	3249	expectCount=-1; // Get expectLimits[] from testSpan().
	3250
	3251	if((whichSpans&SPAN_UTF16)!=0) {
	3252	testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
	3253	}
	3254	if((whichSpans&SPAN_UTF8)==0) {
	3255	return;
	3256	}
	3257
	3258	// Convert s16[] and expectLimits[] to UTF-8.
	3259	uint8_t s8[3000];
	3260	int32_t offsets[3000];
	3261
	3262	const UChar *s16Limit=s16+length16;
	3263	char t=(char )s8;
	3264	char *tLimit=t+sizeof(s8);
	3265	int32_t *o=offsets;
	3266	UErrorCode errorCode=U_ZERO_ERROR;
	3267
	3268	// Convert with substitution: Turn unpaired surrogates into U+FFFD.
	3269	ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
	3270	if(U_FAILURE(errorCode)) {
	3271	errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
	3272	testName, (long)index, u_errorName(errorCode));
	3273	ucnv_resetFromUnicode(utf8Cnv);
	3274	return;
	3275	}
	3276	int32_t length8=(int32_t)(t-(char *)s8);
	3277
	3278	// Convert expectLimits[].
	3279	int32_t i, j, expect;
	3280	for(i=j=0; i<expectCount; ++i) {
	3281	expect=expectLimits[i];
	3282	if(expect==length16) {
	3283	expectLimits[i]=length8;
	3284	} else {
	3285	while(offsets[j]<expect) {
	3286	++j;
	3287	}
	3288	expectLimits[i]=j;
	3289	}
	3290	}
	3291
	3292	testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
	3293	}
	3294
	3295	static UChar32 nextCodePoint(UChar32 c) {
	3296	// Skip some large and boring ranges.
	3297	switch(c) {
	3298	case 0x3441:
	3299	return 0x4d7f;
	3300	case 0x5100:
	3301	return 0x9f00;
	3302	case 0xb040:
	3303	return 0xd780;
	3304	case 0xe041:
	3305	return 0xf8fe;
	3306	case 0x10100:
	3307	return 0x20000;
	3308	case 0x20041:
	3309	return 0xe0000;
	3310	case 0xe0101:
	3311	return 0x10fffd;
	3312	default:
	3313	return c+1;
	3314	}
	3315	}
	3316
	3317	// Verify that all implementations represent the same set.
	3318	void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3319	// contains(U+FFFD) is inconsistent with contains(some surrogates),
	3320	// or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
	3321	// Skip the UTF-8 part of the test - if the string contains surrogates -
	3322	// because it is likely to produce a different result.
	3323	UBool inconsistentSurrogates=
	3324	(!(sets[0]->getSet().contains(0xfffd) ?
	3325	sets[0]->getSet().contains(0xd800, 0xdfff) :
	3326	sets[0]->getSet().containsNone(0xd800, 0xdfff)) \|\|
	3327	sets[0]->hasStringsWithSurrogates());
	3328
	3329	UChar s[1000];
	3330	int32_t length=0;
	3331	uint32_t localWhichSpans;
	3332
	3333	UChar32 c, first;
	3334	for(first=c=0;; c=nextCodePoint(c)) {
	3335	if(c>0x10ffff \|\| length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
	3336	localWhichSpans=whichSpans;
	3337	if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
	3338	localWhichSpans&=~SPAN_UTF8;
	3339	}
	3340	testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
	3341	if(c>0x10ffff) {
	3342	break;
	3343	}
	3344	length=0;
	3345	first=c;
	3346	}
	3347	U16_APPEND_UNSAFE(s, length, c);
	3348	}
	3349	}
	3350
	3351	// Test with a particular, interesting string.
	3352	// Specify length and try NUL-termination.
	3353	void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3354	static const UChar s[]={
	3355	0x61, 0x62, 0x20, // Latin, space
	3356	0x3b1, 0x3b2, 0x3b3, // Greek
	3357	0xd900, // lead surrogate
	3358	0x3000, 0x30ab, 0x30ad, // wide space, Katakana
	3359	0xdc05, // trail surrogate
	3360	0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
	3361	0xd900, 0xdc05, // unassigned supplementary
	3362	0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
	3363	0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
	3364	0 // NUL
	3365	};
	3366
	3367	if((whichSpans&SPAN_UTF16)==0) {
	3368	return;
	3369	}
	3370	testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
	3371	testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
	3372	}
	3373
	3374	void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3375	static const char s[]={
	3376	"abc" // Latin
	3377
	3378	/* trail byte in lead position */
	3379	"\x80"
	3380
	3381	" " // space
	3382
	3383	/* truncated multi-byte sequences */
	3384	"\xd0"
	3385	"\xe0"
	3386	"\xe1"
	3387	"\xed"
	3388	"\xee"
	3389	"\xf0"
	3390	"\xf1"
	3391	"\xf4"
	3392	"\xf8"
	3393	"\xfc"
	3394
	3395	"\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
	3396
	3397	/* trail byte in lead position */
	3398	"\x80"
	3399
	3400	"\xe0\x80"
	3401	"\xe0\xa0"
	3402	"\xe1\x80"
	3403	"\xed\x80"
	3404	"\xed\xa0"
	3405	"\xee\x80"
	3406	"\xf0\x80"
	3407	"\xf0\x90"
	3408	"\xf1\x80"
	3409	"\xf4\x80"
	3410	"\xf4\x90"
	3411	"\xf8\x80"
	3412	"\xfc\x80"
	3413
	3414	"\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
	3415
	3416	/* trail byte in lead position */
	3417	"\x80"
	3418
	3419	"\xf0\x80\x80"
	3420	"\xf0\x90\x80"
	3421	"\xf1\x80\x80"
	3422	"\xf4\x80\x80"
	3423	"\xf4\x90\x80"
	3424	"\xf8\x80\x80"
	3425	"\xfc\x80\x80"
	3426
	3427	"\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
	3428
	3429	/* trail byte in lead position */
	3430	"\x80"
	3431
	3432	"\xf8\x80\x80\x80"
	3433	"\xfc\x80\x80\x80"
	3434
	3435	"\xF1\x90\x80\x85" // unassigned supplementary
	3436
	3437	/* trail byte in lead position */
	3438	"\x80"
	3439
	3440	"\xfc\x80\x80\x80\x80"
	3441
	3442	"\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
	3443
	3444	/* trail byte in lead position */
	3445	"\x80"
	3446
	3447	/* complete sequences but non-shortest forms or out of range etc. */
	3448	"\xc0\x80"
	3449	"\xe0\x80\x80"
	3450	"\xed\xa0\x80"
	3451	"\xf0\x80\x80\x80"
	3452	"\xf4\x90\x80\x80"
	3453	"\xf8\x80\x80\x80\x80"
	3454	"\xfc\x80\x80\x80\x80\x80"
	3455	"\xfe"
	3456	"\xff"
	3457
	3458	/* trail byte in lead position */
	3459	"\x80"
	3460
	3461	"\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
	3462	};
	3463
	3464	if((whichSpans&SPAN_UTF8)==0) {
	3465	return;
	3466	}
	3467	testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
	3468	testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
	3469	}
	3470
	// Take a set of span options and multiply them so that
	// each portion only has one of the options a, b and c.
	// If b==0, then the set of options is just modified with mask and a.
	// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
	//
	// whichSpans[] is modified in place: entry i keeps only its bits in mask and
	// gets a; its b-variant is stored at [whichSpansCount+i] and its c-variant at
	// [2*whichSpansCount+i]. The array must therefore have room for up to
	// 3*whichSpansCount entries.
	// Returns the new number of entries: 1x, 2x or 3x whichSpansCount.
	static int32_t
	addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
	               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
	    uint32_t s;
	    int32_t i;

	    for(i=0; i<whichSpansCount; ++i) {
	        s=whichSpans[i]&mask;
	        whichSpans[i]=s|a;
	        if(b!=0) {
	            whichSpans[whichSpansCount+i]=s|b;
	            if(c!=0) {
	                whichSpans[2*whichSpansCount+i]=s|c;
	            }
	        }
	    }
	    return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
	}
	3493
	3494	#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
	3495	#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
	3496	#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
	3497	#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
	3498
	3499	void UnicodeSetTest::TestSpan() {
	3500	// "[...]" is a UnicodeSet pattern.
	3501	// "*" performs tests on all Unicode code points and on a selection of
	3502	// malformed UTF-8/16 strings.
	3503	// "-options" limits the scope of testing for the current set.
	3504	// By default, the test verifies that equivalent boundaries are found
	3505	// for UTF-16 and UTF-8, going forward and backward,
	3506	// alternating USET_SPAN_NOT_CONTAINED with
	3507	// either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
	3508	// Single-character options:
	3509	// 8 -- UTF-16 and UTF-8 boundaries may differ.
	3510	// Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
	3511	// or the set contains strings with unpaired surrogates
	3512	// which do not translate to valid UTF-8.
	3513	// c -- set.span() and set.complement().span() boundaries may differ.
	3514	// Cause: Set strings are not complemented.
	3515	// b -- span() and spanBack() boundaries may differ.
	3516	// Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
	3517	// and spanBack(USET_SPAN_SIMPLE) are defined to
	3518	// match with non-overlapping substrings.
	3519	// For example, with a set containing "ab" and "ba",
	3520	// span() of "aba" yields boundaries { 0, 2, 3 }
	3521	// because the initial "ab" matches from 0 to 2,
	3522	// while spanBack() yields boundaries { 0, 1, 3 }
	3523	// because the final "ba" matches from 1 to 3.
	3524	// l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
	3525	// Cause: Strings in the set overlap, and a longer match may
	3526	// require a sequence including non-longest substrings.
	3527	// For example, with a set containing "ab", "abc" and "cd",
	3528	// span(contained) of "abcd" spans the entire string
	3529	// but span(longest match) only spans the first 3 characters.
	3530	// Each "-options" first resets all options and then applies the specified options.
	3531	// A "-" without options resets the options.
	3532	// The options are also reset for each new set.
	3533	// Other strings will be spanned.
	3534	static const char *const testdata[]={
	3535	"[:ID_Continue:]",
	3536	"*",
	3537	"[:White_Space:]",
	3538	"*",
	3539	"[]",
	3540	"*",
	3541	"[\\u0000-\\U0010FFFF]",
	3542	"*",
	3543	"[\\u0000\\u0080\\u0800\\U00010000]",
	3544	"*",
	3545	"[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
	3546	"*",
	3547	"[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
	3548	"-c",
	3549	"*",
	3550	"[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
	3551	"-c",
	3552	"*",
	3553
	3554	// Overlapping strings cause overlapping attempts to match.
	3555	"[x{xy}{xya}{axy}{ax}]",
	3556	"-cl",
	3557
	3558	// More repetitions of "xya" would take too long with the recursive
	3559	// reference implementation.
	3560	// containsAll()=FALSE
	3561	// test_string 0x14
	3562	"xx"
	3563	"xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
	3564	"xx" // set.complement().span(contained) will stop between the two 'x'es.
	3565	"xyaxyaxyaxya"
	3566	"xx"
	3567	"xyaxyaxyaxya" // span() ends here.
	3568	"aaa",
	3569
	3570	// containsAll()=TRUE
	3571	// test_string 0x15
	3572	"xx"
	3573	"xyaxyaxyaxya"
	3574	"xx"
	3575	"xyaxyaxyaxya"
	3576	"xx"
	3577	"xyaxyaxyaxy",
	3578
	3579	"-bc",
	3580	// test_string 0x17
	3581	"byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
	3582	"-c",
	3583	"byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
	3584	"byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
	3585	"-",
	3586	"byaya", // span() -> { 5 }
	3587	"byay", // span() -> { 4 }
	3588	"bya", // span() -> { 3 }
	3589
	3590	// span(longest match) will not span the whole string.
	3591	"[a{ab}{bc}]",
	3592	"-cl",
	3593	// test_string 0x21
	3594	"abc",
	3595
	3596	"[a{ab}{abc}{cd}]",
	3597	"-cl",
	3598	"acdabcdabccd",
	3599
	3600	// spanBack(longest match) will not span the whole string.
	3601	"[c{ab}{bc}]",
	3602	"-cl",
	3603	"abc",
	3604
	3605	"[d{cd}{bcd}{ab}]",
	3606	"-cl",
	3607	"abbcdabcdabd",
	3608
	3609	// Test with non-ASCII set strings - test proper handling of surrogate pairs
	3610	// and UTF-8 trail bytes.
	3611	// Copies of above test sets and strings, but transliterated to have
	3612	// different code points with similar trail units.
	3613	// Previous: a b c d
	3614	// Unicode: 042B 30AB 200AB 204AB
	3615	// UTF-16: 042B 30AB D840 DCAB D841 DCAB
	3616	// UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
	3617	"[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
	3618	"-cl",
	3619	"\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
	3620
	3621	"[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
	3622	"-cl",
	3623	"\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
	3624
	3625	// Stress bookkeeping and recursion.
	3626	// The following strings are barely doable with the recursive
	3627	// reference implementation.
	3628	// The not-contained character at the end prevents an early exit from the span().
	3629	"[b{bb}]",
	3630	"-c",
	3631	// test_string 0x33
	3632	"bbbbbbbbbbbbbbbbbbbbbbbb-",
	3633	// On complement sets, span() and spanBack() get different results
	3634	// because b is not in the complement set and there is an odd number of b's
	3635	// in the test string.
	3636	"-bc",
	3637	"bbbbbbbbbbbbbbbbbbbbbbbbb-",
	3638
	3639	// Test with set strings with an initial or final code point span
	3640	// longer than 254.
	3641	"[a{" _64_a _64_a _64_a _64_a "b}"
	3642	"{a" _64_b _64_b _64_b _64_b "}]",
	3643	"-c",
	3644	_64_a _64_a _64_a _63_a "b",
	3645	_64_a _64_a _64_a _64_a "b",
	3646	_64_a _64_a _64_a _64_a "aaaabbbb",
	3647	"a" _64_b _64_b _64_b _63_b,
	3648	"a" _64_b _64_b _64_b _64_b,
	3649	"aaaabbbb" _64_b _64_b _64_b _64_b,
	3650
	3651	// Test with strings containing unpaired surrogates.
	3652	// They are not representable in UTF-8, and a leading trail surrogate
	3653	// and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
	3654	// U+20001 == \\uD840\\uDC01
	3655	// U+20400 == \\uD841\\uDC00
	3656	"[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
	3657	"-8cl",
	3658	"aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
	3659	};
	3660	uint32_t whichSpans[96]={ SPAN_ALL };
	3661	int32_t whichSpansCount=1;
	3662
	3663	UnicodeSet *sets[SET_COUNT]={ NULL };
	3664	const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
	3665
	3666	char testName[1024];
	3667	char *testNameLimit=testName;
	3668
	3669	int32_t i, j;
	3670	for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
	3671	const char *s=testdata[i];
	3672	if(s[0]=='[') {
	3673	// Create new test sets from this pattern.
	3674	for(j=0; j<SET_COUNT; ++j) {
	3675	delete sets_with_str[j];
	3676	delete sets[j];
	3677	}
	3678	UErrorCode errorCode=U_ZERO_ERROR;
	3679	sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
	3680	if(U_FAILURE(errorCode)) {
	3681	dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
	3682	break;
	3683	}
	3684	sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
	3685	sets[SLOW_NOT]->complement();
	3686	// Intermediate set: Test cloning of a frozen set.
	3687	UnicodeSet fast=new UnicodeSet(sets[SLOW]);
	3688	fast->freeze();
	3689	sets[FAST]=(UnicodeSet *)fast->clone();
	3690	delete fast;
	3691	UnicodeSet fastNot=new UnicodeSet(sets[SLOW_NOT]);
	3692	fastNot->freeze();
	3693	sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
	3694	delete fastNot;
	3695
	3696	for(j=0; j<SET_COUNT; ++j) {
	3697	sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
	3698	}
	3699
	3700	strcpy(testName, s);
	3701	testNameLimit=strchr(testName, 0);
	3702	*testNameLimit++=':';
	3703	*testNameLimit=0;
	3704
	3705	whichSpans[0]=SPAN_ALL;
	3706	whichSpansCount=1;
	3707	} else if(s[0]=='-') {
	3708	whichSpans[0]=SPAN_ALL;
	3709	whichSpansCount=1;
	3710
	3711	while(*++s!=0) {
	3712	switch(*s) {
	3713	case 'c':
	3714	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3715	~SPAN_POLARITY,
	3716	SPAN_SET,
	3717	SPAN_COMPLEMENT,
	3718	0);
	3719	break;
	3720	case 'b':
	3721	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3722	~SPAN_DIRS,
	3723	SPAN_FWD,
	3724	SPAN_BACK,
	3725	0);
	3726	break;
	3727	case 'l':
	3728	// test USET_SPAN_CONTAINED FWD & BACK, and separately
	3729	// USET_SPAN_SIMPLE only FWD, and separately
	3730	// USET_SPAN_SIMPLE only BACK
	3731	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3732	~(SPAN_DIRS\|SPAN_CONDITION),
	3733	SPAN_DIRS\|SPAN_CONTAINED,
	3734	SPAN_FWD\|SPAN_SIMPLE,
	3735	SPAN_BACK\|SPAN_SIMPLE);
	3736	break;
	3737	case '8':
	3738	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3739	~SPAN_UTFS,
	3740	SPAN_UTF16,
	3741	SPAN_UTF8,
	3742	0);
	3743	break;
	3744	default:
	3745	errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
	3746	break;
	3747	}
	3748	}
	3749	} else if(0==strcmp(s, "*")) {
	3750	strcpy(testNameLimit, "bad_string");
	3751	for(j=0; j<whichSpansCount; ++j) {
	3752	if(whichSpansCount>1) {
	3753	sprintf(testNameLimit+10 /* strlen("bad_string") */,
	3754	"%%0x%3x",
	3755	whichSpans[j]);
	3756	}
	3757	testSpanUTF16String(sets_with_str, whichSpans[j], testName);
	3758	testSpanUTF8String(sets_with_str, whichSpans[j], testName);
	3759	}
	3760
	3761	strcpy(testNameLimit, "contents");
	3762	for(j=0; j<whichSpansCount; ++j) {
	3763	if(whichSpansCount>1) {
	3764	sprintf(testNameLimit+8 /* strlen("contents") */,
	3765	"%%0x%3x",
	3766	whichSpans[j]);
	3767	}
	3768	testSpanContents(sets_with_str, whichSpans[j], testName);
	3769	}
	3770	} else {
	3771	UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
	3772	strcpy(testNameLimit, "test_string");
	3773	for(j=0; j<whichSpansCount; ++j) {
	3774	if(whichSpansCount>1) {
	3775	sprintf(testNameLimit+11 /* strlen("test_string") */,
	3776	"%%0x%3x",
	3777	whichSpans[j]);
	3778	}
	3779	testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
	3780	}
	3781	}
	3782	}
	3783	for(j=0; j<SET_COUNT; ++j) {
	3784	delete sets_with_str[j];
	3785	delete sets[j];
	3786	}
	3787	}
	3788
	3789	// Test select patterns and strings, and test USET_SPAN_SIMPLE.
	3790	void UnicodeSetTest::TestStringSpan() {
	3791	static const char *pattern="[x{xy}{xya}{axy}{ax}]";
	3792	static const char *const string=
	3793	"xx"
	3794	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
	3795	"xx"
	3796	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
	3797	"xx"
	3798	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
	3799	"aaaa";
	3800
	3801	UErrorCode errorCode=U_ZERO_ERROR;
	3802	UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
	3803	UnicodeSet set(pattern16, errorCode);
	3804	if(U_FAILURE(errorCode)) {
	3805	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3806	return;
	3807	}
	3808
	3809	UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
	3810
	3811	if(set.containsAll(string16)) {
	3812	errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
	3813	}
	3814
	3815	// Remove trailing "aaaa".
	3816	string16.truncate(string16.length()-4);
	3817	if(!set.containsAll(string16)) {
	3818	errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
	3819	}
	3820
	3821	string16=UNICODE_STRING_SIMPLE("byayaxya");
	3822	const UChar *s16=string16.getBuffer();
	3823	int32_t length16=string16.length();
	3824	(void)length16; // Suppress set but not used warning.
	3825	if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3826	set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3827	set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3828	set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 \|\|
	3829	set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3830	set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
	3831	) {
	3832	errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
	3833	}
	3834
	3835	pattern="[a{ab}{abc}{cd}]";
	3836	pattern16=UnicodeString(pattern, -1, US_INV);
	3837	set.applyPattern(pattern16, errorCode);
	3838	if(U_FAILURE(errorCode)) {
	3839	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3840	return;
	3841	}
	3842	string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
	3843	s16=string16.getBuffer();
	3844	length16=string16.length();
	3845	if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 \|\|
	3846	set.span(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|
	3847	set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
	3848	) {
	3849	errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
	3850	}
	3851
	3852	pattern="[d{cd}{bcd}{ab}]";
	3853	pattern16=UnicodeString(pattern, -1, US_INV);
	3854	set.applyPattern(pattern16, errorCode).freeze();
	3855	if(U_FAILURE(errorCode)) {
	3856	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3857	return;
	3858	}
	3859	string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
	3860	s16=string16.getBuffer();
	3861	length16=string16.length();
	3862	if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 \|\|
	3863	set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|
	3864	set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
	3865	) {
	3866	errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
	3867	}
	3868	}
	3869
	3870	/**
	3871	* Including collationroot.h fails here with
	3872	1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
	3873	* .. so, we skip this test on Windows.
	3874	*
	3875	* the cause is that intltest builds with /Za which disables language extensions - which means
	3876	* windows header files can't be used.
	3877	*/
	3878	#if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
	3879	#include "collationroot.h"
	3880	#include "collationtailoring.h"
	3881	#endif
	3882
	3883	void UnicodeSetTest::TestUCAUnsafeBackwards() {
	3884	#if U_PLATFORM_HAS_WIN32_API
	3885	infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
	3886	#elif !UCONFIG_NO_COLLATION
	3887	UErrorCode errorCode = U_ZERO_ERROR;
	3888
	3889	// Get the unsafeBackwardsSet
	3890	const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
	3891	if(U_FAILURE(errorCode)) {
	3892	dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
	3893	return;
	3894	}
	3895	//const UVersionInfo &version = rootEntry->tailoring->version;
	3896	const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
	3897
	3898	checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
	3899
	3900	if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
	3901	// simple test case
	3902	// TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
	3903	// TODO(ticket #11891): Port test to Java. Is this a bug there, too?
	3904	UnicodeSet surrogates;
	3905	surrogates.add(0xd83a); // a lead surrogate
	3906	surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
	3907	UnicodeString pat;
	3908	surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
	3909	// TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
	3910	// so that at least one type of surrogate code points are escaped,
	3911	// or (minimally) so that adjacent lead+trail surrogate code points are escaped.
	3912	errorCode = U_ZERO_ERROR;
	3913	UnicodeSet s2;
	3914	s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
	3915	if(U_FAILURE(errorCode)) {
	3916	errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
	3917	} else {
	3918	checkEqual(surrogates, s2, "surrogates to/from pattern");
	3919	}
	3920	// This occurs in the UCA unsafe-backwards set.
	3921	checkRoundTrip(*unsafeBackwardSet);
	3922	}
	3923	#endif
	3924	}
	3925
	3926	void UnicodeSetTest::TestIntOverflow() {
	3927	// This test triggers undefined double->int conversion behavior
	3928	// if the implementation is not careful.
	3929	IcuTestErrorCode errorCode(*this, "TestIntOverflow");
	3930	UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
	3931	assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
	3932	assertEquals("[:ccc=int_overflow:] -> illegal argument",
	3933	U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
	3934	}
	3935
	3936	void UnicodeSetTest::TestUnusedCcc() {
	3937	#if !UCONFIG_NO_NORMALIZATION
	3938	// All numeric ccc values 0..255 are valid, but many are unused.
	3939	IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
	3940	UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
	3941	assertSuccess("[:ccc=2:]", errorCode);
	3942	assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
	3943
	3944	UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
	3945	assertSuccess("[:ccc=255:]", errorCode);
	3946	assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
	3947
	3948	// Non-integer values and values outside 0..255 are invalid.
	3949	UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
	3950	assertEquals("[:ccc=-1:] -> illegal argument",
	3951	U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
	3952	assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
	3953
	3954	UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
	3955	assertEquals("[:ccc=256:] -> illegal argument",
	3956	U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
	3957	assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
	3958
	3959	UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
	3960	assertEquals("[:ccc=1.1:] -> illegal argument",
	3961	U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
	3962	assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
	3963	#endif
	3964	}
	3965
	3966	void UnicodeSetTest::TestDeepPattern() {
	3967	IcuTestErrorCode errorCode(*this, "TestDeepPattern");
	3968	// Nested ranges are parsed via recursion which can use a lot of stack space.
	3969	// After a reasonable limit, we should get an error.
	3970	constexpr int32_t DEPTH = 20000;
	3971	UnicodeString pattern, suffix;
	3972	for (int32_t i = 0; i < DEPTH; ++i) {
	3973	pattern.append(u"[a", 2);
	3974	suffix.append(']');
	3975	}
	3976	pattern.append(suffix);
	3977	UnicodeSet set(pattern, errorCode);
	3978	assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
	3979	errorCode.reset();
	3980	}