git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/usettest.cpp

... / ...

Commit	Line	Data
	1	/*
	2	********************************************************************************
	3	* Copyright (C) 1999-2012 International Business Machines Corporation and
	4	* others. All Rights Reserved.
	5	********************************************************************************
	6	* Date Name Description
	7	* 10/20/99 alan Creation.
	8	* 03/22/2000 Madhu Added additional tests
	9	********************************************************************************
	10	*/
	11
	12	#include <stdio.h>
	13
	14	#include <string.h>
	15	#include "unicode/utypes.h"
	16	#include "usettest.h"
	17	#include "unicode/ucnv.h"
	18	#include "unicode/uniset.h"
	19	#include "unicode/uchar.h"
	20	#include "unicode/usetiter.h"
	21	#include "unicode/ustring.h"
	22	#include "unicode/parsepos.h"
	23	#include "unicode/symtable.h"
	24	#include "unicode/uversion.h"
	25	#include "hash.h"
	26
	/**
	 * Number of elements in a C array, as an int32_t.
	 * The expansion is fully parenthesized so that it behaves as a single
	 * operand next to higher-precedence operators (e.g. `sizeof LENGTHOF(a)`).
	 * Only valid for true arrays, not for pointers.
	 */
	#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
	28
	// When `status` is a failing UErrorCode, calls dataerrln() with the current
	// file, line and the symbolic name of the error. Does nothing on success.
	#define TEST_ASSERT_SUCCESS(status) { \
	    if (U_FAILURE(status)) { \
	        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
	                  __FILE__, __LINE__, u_errorName(status)); \
	    } \
	}
	32
	// When `expr` evaluates to false, calls dataerrln() with the current file
	// and line. `expr` is evaluated exactly once.
	#define TEST_ASSERT(expr) { \
	    if (!(expr)) { \
	        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
	    } \
	}
	35
	36	UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
	37	UnicodeString pat;
	38	set.toPattern(pat);
	39	return left + UnicodeSetTest::escape(pat);
	40	}
	41
	// Expands to one `case` of the switch in runIndexedTest(): stores the test's
	// name in `name` and, when `exec` is set, logs a banner line plus an empty
	// line and then runs the test. The caller supplies the trailing semicolon.
	#define CASE(id,test) \
	    case id: \
	        name = #test; \
	        if (exec) { \
	            logln(#test "---"); \
	            logln(); \
	            test(); \
	        } \
	        break
	50
	51	UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
	52	}
	53
	54	UConverter *UnicodeSetTest::openUTF8Converter() {
	55	if(utf8Cnv==NULL) {
	56	UErrorCode errorCode=U_ZERO_ERROR;
	57	utf8Cnv=ucnv_open("UTF-8", &errorCode);
	58	}
	59	return utf8Cnv;
	60	}
	61
	62	UnicodeSetTest::~UnicodeSetTest() {
	63	ucnv_close(utf8Cnv);
	64	}
	65
	66	void
	67	UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
	68	const char* &name, char* /par/) {
	69	// if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
	70	switch (index) {
	71	CASE(0,TestPatterns);
	72	CASE(1,TestAddRemove);
	73	CASE(2,TestCategories);
	74	CASE(3,TestCloneEqualHash);
	75	CASE(4,TestMinimalRep);
	76	CASE(5,TestAPI);
	77	CASE(6,TestScriptSet);
	78	CASE(7,TestPropertySet);
	79	CASE(8,TestClone);
	80	CASE(9,TestExhaustive);
	81	CASE(10,TestToPattern);
	82	CASE(11,TestIndexOf);
	83	CASE(12,TestStrings);
	84	CASE(13,Testj2268);
	85	CASE(14,TestCloseOver);
	86	CASE(15,TestEscapePattern);
	87	CASE(16,TestInvalidCodePoint);
	88	CASE(17,TestSymbolTable);
	89	CASE(18,TestSurrogate);
	90	CASE(19,TestPosixClasses);
	91	CASE(20,TestIteration);
	92	CASE(21,TestFreezable);
	93	CASE(22,TestSpan);
	94	CASE(23,TestStringSpan);
	95	default: name = ""; break;
	96	}
	97	}
	98
	// Marker placed inside the expected-string lists passed to expectToPattern().
	// NOTE(review): it appears to separate strings that must be in the set from
	// strings that must not be; confirm against expectToPattern().
	static const char NOT[] = "%%%%";
	100
	101	/**
	102	* UVector was improperly copying contents
	103	* This code will crash this is still true
	104	*/
	105	void UnicodeSetTest::Testj2268() {
	106	UnicodeSet t;
	107	t.add(UnicodeString("abc"));
	108	UnicodeSet test(t);
	109	UnicodeString ustrPat;
	110	test.toPattern(ustrPat, TRUE);
	111	}
	112
	113	/**
	114	* Test toPattern().
	115	*/
	116	void UnicodeSetTest::TestToPattern() {
	117	UErrorCode ec = U_ZERO_ERROR;
	118
	119	// Test that toPattern() round trips with syntax characters and
	120	// whitespace.
	121	{
	122	static const char* OTHER_TOPATTERN_TESTS[] = {
	123	"[[:latin:]&[:greek:]]",
	124	"[[:latin:]-[:greek:]]",
	125	"[:nonspacing mark:]",
	126	NULL
	127	};
	128
	129	for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
	130	ec = U_ZERO_ERROR;
	131	UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
	132	if (U_FAILURE(ec)) {
	133	dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
	134	continue;
	135	}
	136	checkPat(OTHER_TOPATTERN_TESTS[j], s);
	137	}
	138
	139	for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
	140	if ((i <= 0xFF && !u_isalpha(i)) \|\| u_isspace(i)) {
	141
	142	// check various combinations to make sure they all work.
	143	if (i != 0 && !toPatternAux(i, i)){
	144	continue;
	145	}
	146	if (!toPatternAux(0, i)){
	147	continue;
	148	}
	149	if (!toPatternAux(i, 0xFFFF)){
	150	continue;
	151	}
	152	}
	153	}
	154	}
	155
	156	// Test pattern behavior of multicharacter strings.
	157	{
	158	ec = U_ZERO_ERROR;
	159	UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
	160
	161	// This loop isn't a loop. It's here to make the compiler happy.
	162	// If you're curious, try removing it and changing the 'break'
	163	// statements (except for the last) to goto's.
	164	for (;;) {
	165	if (U_FAILURE(ec)) break;
	166	const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
	167	expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
	168
	169	s->add("ac");
	170	const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
	171	expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
	172
	173	s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
	174	if (U_FAILURE(ec)) break;
	175	const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
	176	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
	177
	178	s->add("[]");
	179	const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
	180	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
	181
	182	s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
	183	if (U_FAILURE(ec)) break;
	184	const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
	185	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
	186
	187	// j2189
	188	s->clear();
	189	s->add(UnicodeString("abc", ""));
	190	s->add(UnicodeString("abc", ""));
	191	const char* exp6[] = {"abc", NOT, "ab", NULL};
	192	expectToPattern(*s, "[{abc}]", exp6);
	193
	194	break;
	195	}
	196
	197	if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
	198	delete s;
	199	}
	200
	201	// JB#3400: For 2 character ranges prefer [ab] to [a-b]
	202	UnicodeSet s;
	203	s.add((UChar)97, (UChar)98); // 'a', 'b'
	204	expectToPattern(s, "[ab]", NULL);
	205	}
	206
	207	UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
	208
	209	// use Integer.toString because Utility.hex doesn't handle ints
	210	UnicodeString pat = "";
	211	// TODO do these in hex
	212	//String source = "0x" + Integer.toString(start,16).toUpperCase();
	213	//if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
	214	UnicodeString source;
	215	source = source + (uint32_t)start;
	216	if (start != end)
	217	source = source + ".." + (uint32_t)end;
	218	UnicodeSet testSet;
	219	testSet.add(start, end);
	220	return checkPat(source, testSet);
	221	}
	222
	223	UBool UnicodeSetTest::checkPat(const UnicodeString& source,
	224	const UnicodeSet& testSet) {
	225	// What we want to make sure of is that a pattern generated
	226	// by toPattern(), with or without escaped unprintables, can
	227	// be passed back into the UnicodeSet constructor.
	228	UnicodeString pat0;
	229
	230	testSet.toPattern(pat0, TRUE);
	231
	232	if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
	233
	234	//String pat1 = unescapeLeniently(pat0);
	235	//if (!checkPat(source + " (in code)", testSet, pat1)) return false;
	236
	237	UnicodeString pat2;
	238	testSet.toPattern(pat2, FALSE);
	239	if (!checkPat(source, testSet, pat2)) return FALSE;
	240
	241	//String pat3 = unescapeLeniently(pat2);
	242	// if (!checkPat(source + " (in code)", testSet, pat3)) return false;
	243
	244	//logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
	245	logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
	246	return TRUE;
	247	}
	248
	249	UBool UnicodeSetTest::checkPat(const UnicodeString& source,
	250	const UnicodeSet& testSet,
	251	const UnicodeString& pat) {
	252	UErrorCode ec = U_ZERO_ERROR;
	253	UnicodeSet testSet2(pat, ec);
	254	if (testSet2 != testSet) {
	255	errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
	256	return FALSE;
	257	}
	258	return TRUE;
	259	}
	260
	261	void
	262	UnicodeSetTest::TestPatterns(void) {
	263	UnicodeSet set;
	264	expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
	265	expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
	266	expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
	267	expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
	268	expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
	269	expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
	270
	271	// Throw in a test of complement
	272	set.complement();
	273	UnicodeString exp;
	274	exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
	275	expectPairs(set, exp);
	276	}
	277
	278	void
	279	UnicodeSetTest::TestCategories(void) {
	280	UErrorCode status = U_ZERO_ERROR;
	281	const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
	282	UnicodeSet set(pat, status);
	283	if (U_FAILURE(status)) {
	284	dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
	285	return;
	286	} else {
	287	expectContainment(set, pat, "ABC", "abc");
	288	}
	289
	290	UChar32 i;
	291	int32_t failures = 0;
	292	// Make sure generation of L doesn't pollute cached Lu set
	293	// First generate L, then Lu
	294	set.applyPattern("[:L:]", status);
	295	if (U_FAILURE(status)) { errln("FAIL"); return; }
	296	for (i=0; i<0x200; ++i) {
	297	UBool l = u_isalpha((UChar)i);
	298	if (l != set.contains(i)) {
	299	errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
	300	set.contains(i));
	301	if (++failures == 10) break;
	302	}
	303	}
	304
	305	set.applyPattern("[:Lu:]", status);
	306	if (U_FAILURE(status)) { errln("FAIL"); return; }
	307	for (i=0; i<0x200; ++i) {
	308	UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
	309	if (lu != set.contains(i)) {
	310	errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
	311	set.contains(i));
	312	if (++failures == 20) break;
	313	}
	314	}
	315	}
	316	void
	317	UnicodeSetTest::TestCloneEqualHash(void) {
	318	UErrorCode status = U_ZERO_ERROR;
	319	// set1 and set2 used to be built with the obsolete constructor taking
	320	// UCharCategory values; replaced with pattern constructors
	321	// markus 20030502
	322	UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
	323	UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
	324	if (U_FAILURE(status)){
	325	dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
	326	return;
	327	}
	328	UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
	329	UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
	330	if (U_FAILURE(status)){
	331	errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
	332	return;
	333	}
	334
	335	if (set1 != set1a) {
	336	errln("FAIL: category constructor for Ll broken");
	337	}
	338	if (set2 != set2a) {
	339	errln("FAIL: category constructor for Nd broken");
	340	}
	341	delete set1a;
	342	delete set2a;
	343
	344	logln("Testing copy construction");
	345	UnicodeSet set1copy=new UnicodeSet(set1);
	346	if(set1 != set1copy \|\| set1 == set2 \|\|
	347	getPairs(set1) != getPairs(set1copy) \|\|
	348	set1->hashCode() != set1copy->hashCode()){
	349	errln("FAIL : Error in copy construction");
	350	return;
	351	}
	352
	353	logln("Testing =operator");
	354	UnicodeSet set1equal=*set1;
	355	UnicodeSet set2equal=*set2;
	356	if(set1equal != set1 \|\| set1equal != set1copy \|\| set2equal != *set2 \|\|
	357	set2equal == set1 \|\| set2equal == set1copy \|\| set2equal == set1equal){
	358	errln("FAIL: Error in =operator");
	359	}
	360
	361	logln("Testing clone()");
	362	UnicodeSet set1clone=(UnicodeSet)set1->clone();
	363	UnicodeSet set2clone=(UnicodeSet)set2->clone();
	364	if(set1clone != set1 \|\| set1clone != set1copy \|\| *set1clone != set1equal \|\|
	365	set2clone != set2 \|\| set2clone == set1copy \|\| *set2clone != set2equal \|\|
	366	set2clone == set1 \|\| set2clone == set1equal \|\| set2clone == *set1clone){
	367	errln("FAIL: Error in clone");
	368	}
	369
	370	logln("Testing hashcode");
	371	if(set1->hashCode() != set1equal.hashCode() \|\| set1->hashCode() != set1clone->hashCode() \|\|
	372	set2->hashCode() != set2equal.hashCode() \|\| set2->hashCode() != set2clone->hashCode() \|\|
	373	set1copy->hashCode() != set1equal.hashCode() \|\| set1copy->hashCode() != set1clone->hashCode() \|\|
	374	set1->hashCode() == set2->hashCode() \|\| set1copy->hashCode() == set2->hashCode() \|\|
	375	set2->hashCode() == set1clone->hashCode() \|\| set2->hashCode() == set1equal.hashCode() ){
	376	errln("FAIL: Error in hashCode()");
	377	}
	378
	379	delete set1;
	380	delete set1copy;
	381	delete set2;
	382	delete set1clone;
	383	delete set2clone;
	384
	385
	386	}
	387	void
	388	UnicodeSetTest::TestAddRemove(void) {
	389	UnicodeSet set; // Construct empty set
	390	doAssert(set.isEmpty() == TRUE, "set should be empty");
	391	doAssert(set.size() == 0, "size should be 0");
	392	set.complement();
	393	doAssert(set.size() == 0x110000, "size should be 0x110000");
	394	set.clear();
	395	set.add(0x0061, 0x007a);
	396	expectPairs(set, "az");
	397	doAssert(set.isEmpty() == FALSE, "set should not be empty");
	398	doAssert(set.size() != 0, "size should not be equal to 0");
	399	doAssert(set.size() == 26, "size should be equal to 26");
	400	set.remove(0x006d, 0x0070);
	401	expectPairs(set, "alqz");
	402	doAssert(set.size() == 22, "size should be equal to 22");
	403	set.remove(0x0065, 0x0067);
	404	expectPairs(set, "adhlqz");
	405	doAssert(set.size() == 19, "size should be equal to 19");
	406	set.remove(0x0064, 0x0069);
	407	expectPairs(set, "acjlqz");
	408	doAssert(set.size() == 16, "size should be equal to 16");
	409	set.remove(0x0063, 0x0072);
	410	expectPairs(set, "absz");
	411	doAssert(set.size() == 10, "size should be equal to 10");
	412	set.add(0x0066, 0x0071);
	413	expectPairs(set, "abfqsz");
	414	doAssert(set.size() == 22, "size should be equal to 22");
	415	set.remove(0x0061, 0x0067);
	416	expectPairs(set, "hqsz");
	417	set.remove(0x0061, 0x007a);
	418	expectPairs(set, "");
	419	doAssert(set.isEmpty() == TRUE, "set should be empty");
	420	doAssert(set.size() == 0, "size should be 0");
	421	set.add(0x0061);
	422	doAssert(set.isEmpty() == FALSE, "set should not be empty");
	423	doAssert(set.size() == 1, "size should not be equal to 1");
	424	set.add(0x0062);
	425	set.add(0x0063);
	426	expectPairs(set, "ac");
	427	doAssert(set.size() == 3, "size should not be equal to 3");
	428	set.add(0x0070);
	429	set.add(0x0071);
	430	expectPairs(set, "acpq");
	431	doAssert(set.size() == 5, "size should not be equal to 5");
	432	set.clear();
	433	expectPairs(set, "");
	434	doAssert(set.isEmpty() == TRUE, "set should be empty");
	435	doAssert(set.size() == 0, "size should be 0");
	436
	437	// Try removing an entire set from another set
	438	expectPattern(set, "[c-x]", "cx");
	439	UnicodeSet set2;
	440	expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
	441	set.removeAll(set2);
	442	expectPairs(set, "deluxx");
	443
	444	// Try adding an entire set to another set
	445	expectPattern(set, "[jackiemclean]", "aacceein");
	446	expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
	447	set.addAll(set2);
	448	expectPairs(set, "aacehort");
	449	doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
	450
	451	// Try retaining an set of elements contained in another set (intersection)
	452	UnicodeSet set3;
	453	expectPattern(set3, "[a-c]", "ac");
	454	doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
	455	set3.remove(0x0062);
	456	expectPairs(set3, "aacc");
	457	doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
	458	set.retainAll(set3);
	459	expectPairs(set, "aacc");
	460	doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
	461	doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
	462	set.clear();
	463	doAssert(set.size() != set3.size(), "set.size() != set3.size()");
	464
	465	// Test commutativity
	466	expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
	467	expectPattern(set2, "[jackiemclean]", "aacceein");
	468	set.addAll(set2);
	469	expectPairs(set, "aacehort");
	470	doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
	471
	472
	473
	474
	475	}
	476
	477	/**
	478	* Make sure minimal representation is maintained.
	479	*/
	480	void UnicodeSetTest::TestMinimalRep() {
	481	UErrorCode status = U_ZERO_ERROR;
	482	// This is pretty thoroughly tested by checkCanonicalRep()
	483	// run against the exhaustive operation results. Use the code
	484	// here for debugging specific spot problems.
	485
	486	// 1 overlap against 2
	487	UnicodeSet set("[h-km-q]", status);
	488	if (U_FAILURE(status)) { errln("FAIL"); return; }
	489	UnicodeSet set2("[i-o]", status);
	490	if (U_FAILURE(status)) { errln("FAIL"); return; }
	491	set.addAll(set2);
	492	expectPairs(set, "hq");
	493	// right
	494	set.applyPattern("[a-m]", status);
	495	if (U_FAILURE(status)) { errln("FAIL"); return; }
	496	set2.applyPattern("[e-o]", status);
	497	if (U_FAILURE(status)) { errln("FAIL"); return; }
	498	set.addAll(set2);
	499	expectPairs(set, "ao");
	500	// left
	501	set.applyPattern("[e-o]", status);
	502	if (U_FAILURE(status)) { errln("FAIL"); return; }
	503	set2.applyPattern("[a-m]", status);
	504	if (U_FAILURE(status)) { errln("FAIL"); return; }
	505	set.addAll(set2);
	506	expectPairs(set, "ao");
	507	// 1 overlap against 3
	508	set.applyPattern("[a-eg-mo-w]", status);
	509	if (U_FAILURE(status)) { errln("FAIL"); return; }
	510	set2.applyPattern("[d-q]", status);
	511	if (U_FAILURE(status)) { errln("FAIL"); return; }
	512	set.addAll(set2);
	513	expectPairs(set, "aw");
	514	}
	515
	516	void UnicodeSetTest::TestAPI() {
	517	UErrorCode status = U_ZERO_ERROR;
	518	// default ct
	519	UnicodeSet set;
	520	if (!set.isEmpty() \|\| set.getRangeCount() != 0) {
	521	errln((UnicodeString)"FAIL, set should be empty but isn't: " +
	522	set);
	523	}
	524
	525	// clear(), isEmpty()
	526	set.add(0x0061);
	527	if (set.isEmpty()) {
	528	errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
	529	set);
	530	}
	531	set.clear();
	532	if (!set.isEmpty()) {
	533	errln((UnicodeString)"FAIL, set should be empty but isn't: " +
	534	set);
	535	}
	536
	537	// size()
	538	set.clear();
	539	if (set.size() != 0) {
	540	errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
	541	": " + set);
	542	}
	543	set.add(0x0061);
	544	if (set.size() != 1) {
	545	errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
	546	": " + set);
	547	}
	548	set.add(0x0031, 0x0039);
	549	if (set.size() != 10) {
	550	errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
	551	": " + set);
	552	}
	553
	554	// contains(first, last)
	555	set.clear();
	556	set.applyPattern("[A-Y 1-8 b-d l-y]", status);
	557	if (U_FAILURE(status)) { errln("FAIL"); return; }
	558	for (int32_t i = 0; i<set.getRangeCount(); ++i) {
	559	UChar32 a = set.getRangeStart(i);
	560	UChar32 b = set.getRangeEnd(i);
	561	if (!set.contains(a, b)) {
	562	errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
	563	" but doesn't: " + set);
	564	}
	565	if (set.contains((UChar32)(a-1), b)) {
	566	errln((UnicodeString)"FAIL, shouldn't contain " +
	567	(unsigned short)(a-1) + '-' + (unsigned short)b +
	568	" but does: " + set);
	569	}
	570	if (set.contains(a, (UChar32)(b+1))) {
	571	errln((UnicodeString)"FAIL, shouldn't contain " +
	572	(unsigned short)a + '-' + (unsigned short)(b+1) +
	573	" but does: " + set);
	574	}
	575	}
	576
	577	// Ported InversionList test.
	578	UnicodeSet a((UChar32)3,(UChar32)10);
	579	UnicodeSet b((UChar32)7,(UChar32)15);
	580	UnicodeSet c;
	581
	582	logln((UnicodeString)"a [3-10]: " + a);
	583	logln((UnicodeString)"b [7-15]: " + b);
	584	c = a;
	585	c.addAll(b);
	586	UnicodeSet exp((UChar32)3,(UChar32)15);
	587	if (c == exp) {
	588	logln((UnicodeString)"c.set(a).add(b): " + c);
	589	} else {
	590	errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
	591	}
	592	c.complement();
	593	exp.set((UChar32)0, (UChar32)2);
	594	exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
	595	if (c == exp) {
	596	logln((UnicodeString)"c.complement(): " + c);
	597	} else {
	598	errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
	599	}
	600	c.complement();
	601	exp.set((UChar32)3, (UChar32)15);
	602	if (c == exp) {
	603	logln((UnicodeString)"c.complement(): " + c);
	604	} else {
	605	errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
	606	}
	607	c = a;
	608	c.complementAll(b);
	609	exp.set((UChar32)3,(UChar32)6);
	610	exp.add((UChar32)11,(UChar32) 15);
	611	if (c == exp) {
	612	logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
	613	} else {
	614	errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
	615	}
	616
	617	exp = c;
	618	bitsToSet(setToBits(c), c);
	619	if (c == exp) {
	620	logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
	621	} else {
	622	errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
	623	}
	624
	625	// Additional tests for coverage JB#2118
	626	//UnicodeSet::complement(class UnicodeString const &)
	627	//UnicodeSet::complementAll(class UnicodeString const &)
	628	//UnicodeSet::containsNone(class UnicodeSet const &)
	629	//UnicodeSet::containsNone(long,long)
	630	//UnicodeSet::containsSome(class UnicodeSet const &)
	631	//UnicodeSet::containsSome(long,long)
	632	//UnicodeSet::removeAll(class UnicodeString const &)
	633	//UnicodeSet::retain(long)
	634	//UnicodeSet::retainAll(class UnicodeString const &)
	635	//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
	636	//UnicodeSetIterator::getString(void)
	637	set.clear();
	638	set.complement("ab");
	639	exp.applyPattern("[{ab}]", status);
	640	if (U_FAILURE(status)) { errln("FAIL"); return; }
	641	if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
	642
	643	UnicodeSetIterator iset(set);
	644	if (!iset.next() \|\| !iset.isString()) {
	645	errln("FAIL: UnicodeSetIterator::next/isString");
	646	} else if (iset.getString() != "ab") {
	647	errln("FAIL: UnicodeSetIterator::getString");
	648	}
	649
	650	set.add((UChar32)0x61, (UChar32)0x7A);
	651	set.complementAll("alan");
	652	exp.applyPattern("[{ab}b-kmo-z]", status);
	653	if (U_FAILURE(status)) { errln("FAIL"); return; }
	654	if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
	655
	656	exp.applyPattern("[a-z]", status);
	657	if (U_FAILURE(status)) { errln("FAIL"); return; }
	658	if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
	659	if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
	660	exp.applyPattern("[aln]", status);
	661	if (U_FAILURE(status)) { errln("FAIL"); return; }
	662	if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
	663	if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
	664
	665	if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
	666	errln("FAIL: containsNone(UChar32, UChar32)");
	667	}
	668	if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
	669	errln("FAIL: containsSome(UChar32, UChar32)");
	670	}
	671	if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
	672	errln("FAIL: containsNone(UChar32, UChar32)");
	673	}
	674	if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
	675	errln("FAIL: containsSome(UChar32, UChar32)");
	676	}
	677
	678	set.removeAll("liu");
	679	exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
	680	if (U_FAILURE(status)) { errln("FAIL"); return; }
	681	if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
	682
	683	set.retainAll("star");
	684	exp.applyPattern("[rst]", status);
	685	if (U_FAILURE(status)) { errln("FAIL"); return; }
	686	if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
	687
	688	set.retain((UChar32)0x73);
	689	exp.applyPattern("[s]", status);
	690	if (U_FAILURE(status)) { errln("FAIL"); return; }
	691	if (set != exp) { errln("FAIL: retain('s')"); return; }
	692
	693	uint16_t buf[32];
	694	int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
	695	if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
	696	if (slen != 3 \|\| buf[0] != 2 \|\| buf[1] != 0x73 \|\| buf[2] != 0x74) {
	697	errln("FAIL: serialize");
	698	return;
	699	}
	700
	701	// Conversions to and from USet
	702	UnicodeSet *uniset = &set;
	703	USet *uset = uniset->toUSet();
	704	TEST_ASSERT((void )uset == (void )uniset);
	705	UnicodeSet *setx = UnicodeSet::fromUSet(uset);
	706	TEST_ASSERT((void )setx == (void )uset);
	707	const UnicodeSet *constSet = uniset;
	708	const USet *constUSet = constSet->toUSet();
	709	TEST_ASSERT((void )constUSet == (void )constSet);
	710	const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
	711	TEST_ASSERT((void )constSetx == (void )constUSet);
	712
	713	// span(UnicodeString) and spanBack(UnicodeString) convenience methods
	714	UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
	715	UnicodeSet ac(0x61, 0x63);
	716	ac.remove(0x62).freeze();
	717	if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 \|\|
	718	ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 \|\|
	719	ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 \|\|
	720	ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 \|\|
	721	ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|
	722	ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 \|\|
	723	ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 \|\|
	724	ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 \|\|
	725	ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 \|\|
	726	ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
	727	) {
	728	errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
	729	}
	730	if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 \|\|
	731	ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 \|\|
	732	ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 \|\|
	733	ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 \|\|
	734	ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|
	735	ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 \|\|
	736	ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 \|\|
	737	ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 \|\|
	738	ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 \|\|
	739	ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
	740	) {
	741	errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
	742	}
	743	}
	744
	745	void UnicodeSetTest::TestIteration() {
	746	UErrorCode ec = U_ZERO_ERROR;
	747	int i = 0;
	748	int outerLoop;
	749
	750	// 6 code points, 3 ranges, 2 strings, 8 total elements
	751	// Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
	752	UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
	753	TEST_ASSERT_SUCCESS(ec);
	754	UnicodeSetIterator it(set);
	755
	756	for (outerLoop=0; outerLoop<3; outerLoop++) {
	757	// Run the test multiple times, to check that iterator.reset() is working.
	758	for (i=0; i<10; i++) {
	759	UBool nextv = it.next();
	760	UBool isString = it.isString();
	761	int32_t codePoint = it.getCodepoint();
	762	//int32_t codePointEnd = it.getCodepointEnd();
	763	UnicodeString s = it.getString();
	764	switch (i) {
	765	case 0:
	766	TEST_ASSERT(nextv == TRUE);
	767	TEST_ASSERT(isString == FALSE);
	768	TEST_ASSERT(codePoint==0x61);
	769	TEST_ASSERT(s == "a");
	770	break;
	771	case 1:
	772	TEST_ASSERT(nextv == TRUE);
	773	TEST_ASSERT(isString == FALSE);
	774	TEST_ASSERT(codePoint==0x62);
	775	TEST_ASSERT(s == "b");
	776	break;
	777	case 2:
	778	TEST_ASSERT(nextv == TRUE);
	779	TEST_ASSERT(isString == FALSE);
	780	TEST_ASSERT(codePoint==0x63);
	781	TEST_ASSERT(s == "c");
	782	break;
	783	case 3:
	784	TEST_ASSERT(nextv == TRUE);
	785	TEST_ASSERT(isString == FALSE);
	786	TEST_ASSERT(codePoint==0x79);
	787	TEST_ASSERT(s == "y");
	788	break;
	789	case 4:
	790	TEST_ASSERT(nextv == TRUE);
	791	TEST_ASSERT(isString == FALSE);
	792	TEST_ASSERT(codePoint==0x7a);
	793	TEST_ASSERT(s == "z");
	794	break;
	795	case 5:
	796	TEST_ASSERT(nextv == TRUE);
	797	TEST_ASSERT(isString == FALSE);
	798	TEST_ASSERT(codePoint==0x1abcd);
	799	TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
	800	break;
	801	case 6:
	802	TEST_ASSERT(nextv == TRUE);
	803	TEST_ASSERT(isString == TRUE);
	804	TEST_ASSERT(s == "str1");
	805	break;
	806	case 7:
	807	TEST_ASSERT(nextv == TRUE);
	808	TEST_ASSERT(isString == TRUE);
	809	TEST_ASSERT(s == "str2");
	810	break;
	811	case 8:
	812	TEST_ASSERT(nextv == FALSE);
	813	break;
	814	case 9:
	815	TEST_ASSERT(nextv == FALSE);
	816	break;
	817	}
	818	}
	819	it.reset(); // prepare to run the iteration again.
	820	}
	821	}
	822
	823
	824
	825
	826	void UnicodeSetTest::TestStrings() {
	827	UErrorCode ec = U_ZERO_ERROR;
	828
	829	UnicodeSet* testList[] = {
	830	UnicodeSet::createFromAll("abc"),
	831	new UnicodeSet("[a-c]", ec),
	832
	833	&(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
	834	new UnicodeSet("[{ll}{ch}a-z]", ec),
	835
	836	UnicodeSet::createFrom("ab}c"),
	837	new UnicodeSet("[{ab\\}c}]", ec),
	838
	839	&((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
	840	new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
	841
	842	NULL
	843	};
	844
	845	if (U_FAILURE(ec)) {
	846	errln("FAIL: couldn't construct test sets");
	847	}
	848
	849	for (int32_t i = 0; testList[i] != NULL; i+=2) {
	850	if (U_SUCCESS(ec)) {
	851	UnicodeString pat0, pat1;
	852	testList[i]->toPattern(pat0, TRUE);
	853	testList[i+1]->toPattern(pat1, TRUE);
	854	if (testList[i] == testList[i+1]) {
	855	logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
	856	} else {
	857	logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
	858	}
	859	}
	860	delete testList[i];
	861	delete testList[i+1];
	862	}
	863	}
	864
	865	/**
	866	* Test the [:Latin:] syntax.
	867	*/
	868	void UnicodeSetTest::TestScriptSet() {
	869	expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
	870
	871	expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
	872
	873	/* Jitterbug 1423 */
	874	expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
	875
	876	}
	877
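	878	/**
	879	* Test property-based set patterns. DATA lists, for each pattern, characters expected in the set and characters expected not in it.
	880	*/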
	881	void UnicodeSetTest::TestPropertySet() {
	882	static const char* const DATA[] = {
	883	// Pattern, Chars IN, Chars NOT in
	884
	885	"[:Latin:]",
	886	"aA",
	887	"\\u0391\\u03B1",
	888
	889	"[\\p{Greek}]",
	890	"\\u0391\\u03B1",
	891	"aA",
	892
	893	"\\P{ GENERAL Category = upper case letter }",
	894	"abc",
	895	"ABC",
	896
	897	#if !UCONFIG_NO_NORMALIZATION
	898	// Combining class: @since ICU 2.2
	899	// Check both symbolic and numeric
	900	"\\p{ccc=Nukta}",
	901	"\\u0ABC",
	902	"abc",
	903
	904	"\\p{Canonical Combining Class = 11}",
	905	"\\u05B1",
	906	"\\u05B2",
	907
	908	"[:c c c = iota subscript :]",
	909	"\\u0345",
	910	"xyz",
	911	#endif
	912
	913	// Bidi class: @since ICU 2.2
	914	"\\p{bidiclass=lefttoright}",
	915	"abc",
	916	"\\u0671\\u0672",
	917
	918	// Binary properties: @since ICU 2.2
	919	"\\p{ideographic}",
	920	"\\u4E0A",
	921	"x",
	922
	923	"[:math=false:]",
	924	"q)*(",
	925	// weiv: )(and * were removed from math in Unicode 4.0.1
	926	//"(*+)",
	927	"+<>^",
	928
	929	// JB#1767 \N{}, \p{ASCII}
	930	"[:Ascii:]",
	931	"abc\\u0000\\u007F",
	932	"\\u0080\\u4E00",
	933
	934	"[\\N{ latin small letter a }[:name= latin small letter z:]]",
	935	"az",
	936	"qrs",
	937
	938	// JB#2015
	939	"[:any:]",
	940	"a\\U0010FFFF",
	941	"",
	942
	943	"[:nv=0.5:]",
	944	"\\u00BD\\u0F2A",
	945	"\\u00BC",
	946
	947	// JB#2653: Age
	948	"[:Age=1.1:]",
	949	"\\u03D6", // 1.1
	950	"\\u03D8\\u03D9", // 3.2
	951
	952	"[:Age=3.1:]",
	953	"\\u1800\\u3400\\U0002f800",
	954	"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
	955
	956	// JB#2350: Case_Sensitive
	957	"[:Case Sensitive:]",
	958	"A\\u1FFC\\U00010410",
	959	";\\u00B4\\U00010500",
	960
	961	// JB#2832: C99-compatibility props
	962	"[:blank:]",
	963	" \\u0009",
	964	"1-9A-Z",
	965
	966	"[:graph:]",
	967	"19AZ",
	968	" \\u0003\\u0007\\u0009\\u000A\\u000D",
	969
	970	"[:punct:]",
	971	"!@#%&*()[]{}-_\\/;:,.?'\"",
	972	"09azAZ",
	973
	974	"[:xdigit:]",
	975	"09afAF",
	976	"gG!",
	977
	978	// Regex compatibility test
	979	"[-b]", // leading '-' is literal
	980	"-b",
	981	"ac",
	982
	983	"[^-b]", // leading '-' is literal
	984	"ac",
	985	"-b",
	986
	987	"[b-]", // trailing '-' is literal
	988	"-b",
	989	"ac",
	990
	991	"[^b-]", // trailing '-' is literal
	992	"ac",
	993	"-b",
	994
	995	"[a-b-]", // trailing '-' is literal
	996	"ab-",
	997	"c=",
	998
	999	"[[a-q]&[p-z]-]", // trailing '-' is literal
	1000	"pq-",
	1001	"or=",
	1002
	1003	"[\\s\|\\)\|:\|$\|\\>]", // from regex tests
	1004	"s\|):$>",
	1005	"abc",
	1006
	1007	"[\\uDC00cd]", // JB#2906: isolated trail at start
	1008	"cd\\uDC00",
	1009	"ab\\uD800\\U00010000",
	1010
	1011	"[ab\\uD800]", // JB#2906: isolated trail at start
	1012	"ab\\uD800",
	1013	"cd\\uDC00\\U00010000",
	1014
	1015	"[ab\\uD800cd]", // JB#2906: isolated lead in middle
	1016	"abcd\\uD800",
	1017	"ef\\uDC00\\U00010000",
	1018
	1019	"[ab\\uDC00cd]", // JB#2906: isolated trail in middle
	1020	"abcd\\uDC00",
	1021	"ef\\uD800\\U00010000",
	1022
	1023	#if !UCONFIG_NO_NORMALIZATION
	1024	"[:^lccc=0:]", // Lead canonical class
	1025	"\\u0300\\u0301",
	1026	"abcd\\u00c0\\u00c5",
	1027
	1028	"[:^tccc=0:]", // Trail canonical class
	1029	"\\u0300\\u0301\\u00c0\\u00c5",
	1030	"abcd",
	1031
	1032	"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
	1033	"\\u0300\\u0301\\u00c0\\u00c5",
	1034	"abcd",
	1035
	1036	"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
	1037	"",
	1038	"abcd\\u0300\\u0301\\u00c0\\u00c5",
	1039
	1040	"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
	1041	"\\u0F73\\u0F75\\u0F81",
	1042	"abcd\\u0300\\u0301\\u00c0\\u00c5",
	1043	#endif /* !UCONFIG_NO_NORMALIZATION */
	1044
	1045	"[:Assigned:]",
	1046	"A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
	1047	"\\u0888\\uFDD3\\uFFFE\\U00050005",
	1048
	1049	// Script_Extensions, new in Unicode 6.0
	1050	"[:scx=Arab:]",
	1051	"\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
	1052	"\\u061D\\uFDEF\\uFDFE",
	1053
	1054	// U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
	1055	// so scx-sc is missing U+FDF2.
	1056	"[[:Script_Extensions=Arabic:]-[:Arab:]]",
	1057	"\\u0640\\u064B\\u0650\\u0655\\uFDFD",
	1058	"\\uFDF2"
	1059	};
	1060
	1061	static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
	1062
	1063	for (int32_t i=0; i<DATA_LEN; i+=3) {
	1064	expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
	1065	CharsToUnicodeString(DATA[i+2]));
	1066	}
	1067	}
	1068
	1069	/**
	1070	* Test that Posix style character classes [:digit:], etc.
	1071	* have the Unicode definitions from TR 18.
	1072	*/
	1073	void UnicodeSetTest::TestPosixClasses() {
	1074	{
	1075	UErrorCode status = U_ZERO_ERROR;
	1076	UnicodeSet s1("[:alpha:]", status);
	1077	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
	1078	TEST_ASSERT_SUCCESS(status);
	1079	TEST_ASSERT(s1==s2);
	1080	}
	1081	{
	1082	UErrorCode status = U_ZERO_ERROR;
	1083	UnicodeSet s1("[:lower:]", status);
	1084	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
	1085	TEST_ASSERT_SUCCESS(status);
	1086	TEST_ASSERT(s1==s2);
	1087	}
	1088	{
	1089	UErrorCode status = U_ZERO_ERROR;
	1090	UnicodeSet s1("[:upper:]", status);
	1091	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
	1092	TEST_ASSERT_SUCCESS(status);
	1093	TEST_ASSERT(s1==s2);
	1094	}
	1095	{
	1096	UErrorCode status = U_ZERO_ERROR;
	1097	UnicodeSet s1("[:punct:]", status);
	1098	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
	1099	TEST_ASSERT_SUCCESS(status);
	1100	TEST_ASSERT(s1==s2);
	1101	}
	1102	{
	1103	UErrorCode status = U_ZERO_ERROR;
	1104	UnicodeSet s1("[:digit:]", status);
	1105	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
	1106	TEST_ASSERT_SUCCESS(status);
	1107	TEST_ASSERT(s1==s2);
	1108	}
	1109	{
	1110	UErrorCode status = U_ZERO_ERROR;
	1111	UnicodeSet s1("[:xdigit:]", status);
	1112	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
	1113	TEST_ASSERT_SUCCESS(status);
	1114	TEST_ASSERT(s1==s2);
	1115	}
	1116	{
	1117	UErrorCode status = U_ZERO_ERROR;
	1118	UnicodeSet s1("[:alnum:]", status);
	1119	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
	1120	TEST_ASSERT_SUCCESS(status);
	1121	TEST_ASSERT(s1==s2);
	1122	}
	1123	{
	1124	UErrorCode status = U_ZERO_ERROR;
	1125	UnicodeSet s1("[:space:]", status);
	1126	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
	1127	TEST_ASSERT_SUCCESS(status);
	1128	TEST_ASSERT(s1==s2);
	1129	}
	1130	{
	1131	UErrorCode status = U_ZERO_ERROR;
	1132	UnicodeSet s1("[:blank:]", status);
	1133	TEST_ASSERT_SUCCESS(status);
	1134	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
	1135	status);
	1136	TEST_ASSERT_SUCCESS(status);
	1137	TEST_ASSERT(s1==s2);
	1138	}
	1139	{
	1140	UErrorCode status = U_ZERO_ERROR;
	1141	UnicodeSet s1("[:cntrl:]", status);
	1142	TEST_ASSERT_SUCCESS(status);
	1143	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
	1144	TEST_ASSERT_SUCCESS(status);
	1145	TEST_ASSERT(s1==s2);
	1146	}
	1147	{
	1148	UErrorCode status = U_ZERO_ERROR;
	1149	UnicodeSet s1("[:graph:]", status);
	1150	TEST_ASSERT_SUCCESS(status);
	1151	UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
	1152	TEST_ASSERT_SUCCESS(status);
	1153	TEST_ASSERT(s1==s2);
	1154	}
	1155	{
	1156	UErrorCode status = U_ZERO_ERROR;
	1157	UnicodeSet s1("[:print:]", status);
	1158	TEST_ASSERT_SUCCESS(status);
	1159	UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
	1160	TEST_ASSERT_SUCCESS(status);
	1161	TEST_ASSERT(s1==s2);
	1162	}
	1163	}
	1164	/**
	1165	* Test cloning of UnicodeSet. For C++, we test the copy constructor.
	1166	*/
	1167	void UnicodeSetTest::TestClone() {
	1168	UErrorCode ec = U_ZERO_ERROR;
	1169	UnicodeSet s("[abcxyz]", ec);
	1170	UnicodeSet t(s);
	1171	expectContainment(t, "abc", "def");
	1172	}
	1173
	1174	/**
	1175	* Test the indexOf() and charAt() methods.
	1176	*/
	1177	void UnicodeSetTest::TestIndexOf() {
	1178	UErrorCode ec = U_ZERO_ERROR;
	1179	UnicodeSet set("[a-cx-y3578]", ec);
	1180	if (U_FAILURE(ec)) {
	1181	errln("FAIL: UnicodeSet constructor");
	1182	return;
	1183	}
	1184	for (int32_t i=0; i<set.size(); ++i) {
	1185	UChar32 c = set.charAt(i);
	1186	if (set.indexOf(c) != i) {
	1187	errln("FAIL: charAt(%d) = %X => indexOf() => %d",
	1188	i, c, set.indexOf(c));
	1189	}
	1190	}
	1191	UChar32 c = set.charAt(set.size());
	1192	if (c != -1) {
	1193	errln("FAIL: charAt(<out of range>) = %X", c);
	1194	}
	1195	int32_t j = set.indexOf((UChar32)0x71/'q'/);
	1196	if (j != -1) {
	1197	errln((UnicodeString)"FAIL: indexOf('q') = " + j);
	1198	}
	1199	}
	1200
	1201	/**
	1202	* Test closure API.
	1203	*/
	1204	void UnicodeSetTest::TestCloseOver() {
	1205	UErrorCode ec = U_ZERO_ERROR;
	1206
	1207	char CASE[] = {(char)USET_CASE_INSENSITIVE};
	1208	char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
	1209	const char* DATA[] = {
	1210	// selector, input, output
	1211	CASE,
	1212	"[aq\\u00DF{Bc}{bC}{Fi}]",
	1213	"[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
	1214
	1215	CASE,
	1216	"[\\u01F1]", // 'DZ'
	1217	"[\\u01F1\\u01F2\\u01F3]",
	1218
	1219	CASE,
	1220	"[\\u1FB4]",
	1221	"[\\u1FB4{\\u03AC\\u03B9}]",
	1222
	1223	CASE,
	1224	"[{F\\uFB01}]",
	1225	"[\\uFB03{ffi}]",
	1226
	1227	CASE, // make sure binary search finds limits
	1228	"[a\\uFF3A]",
	1229	"[aA\\uFF3A\\uFF5A]",
	1230
	1231	CASE,
	1232	"[a-z]","[A-Za-z\\u017F\\u212A]",
	1233	CASE,
	1234	"[abc]","[A-Ca-c]",
	1235	CASE,
	1236	"[ABC]","[A-Ca-c]",
	1237
	1238	CASE, "[i]", "[iI]",
	1239
	1240	CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
	1241	CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
	1242
	1243	CASE, "[\\u0131]", "[\\u0131]", // dotless i
	1244
	1245	CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
	1246
	1247	CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
	1248
	1249	CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
	1250
	1251	CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
	1252
	1253	CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
	1254
	1255	CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
	1256	CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
	1257
	1258	CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
	1259
	1260	CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
	1261
	1262	CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
	1263
	1264	#if !UCONFIG_NO_FILE_IO
	1265	CASE_MAPPINGS,
	1266	"[aq\\u00DF{Bc}{bC}{Fi}]",
	1267	"[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
	1268	#endif
	1269
	1270	CASE_MAPPINGS,
	1271	"[\\u01F1]", // 'DZ'
	1272	"[\\u01F1\\u01F2\\u01F3]",
	1273
	1274	CASE_MAPPINGS,
	1275	"[a-z]",
	1276	"[A-Za-z]",
	1277
	1278	NULL
	1279	};
	1280
	1281	UnicodeSet s;
	1282	UnicodeSet t;
	1283	UnicodeString buf;
	1284	for (int32_t i=0; DATA[i]!=NULL; i+=3) {
	1285	int32_t selector = DATA[i][0];
	1286	UnicodeString pat(DATA[i+1], -1, US_INV);
	1287	UnicodeString exp(DATA[i+2], -1, US_INV);
	1288	s.applyPattern(pat, ec);
	1289	s.closeOver(selector);
	1290	t.applyPattern(exp, ec);
	1291	if (U_FAILURE(ec)) {
	1292	errln("FAIL: applyPattern failed");
	1293	continue;
	1294	}
	1295	if (s == t) {
	1296	logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
	1297	} else {
	1298	dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
	1299	s.toPattern(buf, TRUE) + ", expected " + exp);
	1300	}
	1301	}
	1302
	1303	#if 0
	1304	/*
	1305	* Unused test code.
	1306	* This was used to compare the old implementation (using USET_CASE)
	1307	* with the new one (using 0x100 temporarily)
	1308	* while transitioning from hardcoded case closure tables in uniset.cpp
	1309	* (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
	1310	* and using ucase.c functions for closure.
	1311	* See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
	1312	*
	1313	* Note: The old and new implementation never fully matched because
	1314	* the old implementation turned out to not map U+0130 and U+0131 correctly
	1315	* (dotted I and dotless i) and because the old implementation's data tables
	1316	* were outdated compared to Unicode 4.0.1 at the time of the change to the
	1317	* new implementation. (So sigmas and some other characters were not handled
	1318	* according to the newer Unicode version.)
	1319	*/
	1320	UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
	1321	UnicodeSetIterator si(sens);
	1322	UnicodeString str, buf2;
	1323	const UnicodeString *pStr;
	1324	UChar32 c;
	1325	while(si.next()) {
	1326	if(!si.isString()) {
	1327	c=si.getCodepoint();
	1328	s.clear();
	1329	s.add(c);
	1330
	1331	str.setTo(c);
	1332	str.foldCase();
	1333	sens2.add(str);
	1334
	1335	t=s;
	1336	s.closeOver(USET_CASE);
	1337	t.closeOver(0x100);
	1338	if(s!=t) {
	1339	errln("FAIL: closeOver(U+%04x) differs: ", c);
	1340	errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
	1341	}
	1342	}
	1343	}
	1344	// remove all code points
	1345	// should contain all full case folding mapping strings
	1346	sens2.remove(0, 0x10ffff);
	1347	si.reset(sens2);
	1348	while(si.next()) {
	1349	if(si.isString()) {
	1350	pStr=&si.getString();
	1351	s.clear();
	1352	s.add(*pStr);
	1353	t=s2=s;
	1354	s.closeOver(USET_CASE);
	1355	t.closeOver(0x100);
	1356	if(s!=t) {
	1357	errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
	1358	errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
	1359	}
	1360	}
	1361	}
	1362	#endif
	1363
	1364	// Test the pattern API
	1365	s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
	1366	if (U_FAILURE(ec)) {
	1367	errln("FAIL: applyPattern failed");
	1368	} else {
	1369	expectContainment(s, "abcABC", "defDEF");
	1370	}
	1371	UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
	1372	if (U_FAILURE(ec)) {
	1373	errln("FAIL: constructor failed");
	1374	} else {
	1375	expectContainment(v, "defDEF", "abcABC");
	1376	}
	1377	UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
	1378	if (U_FAILURE(ec)) {
	1379	errln("FAIL: construct w/case mappings failed");
	1380	} else {
	1381	expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
	1382	}
	1383	}
	1384
	1385	void UnicodeSetTest::TestEscapePattern() {
	1386	const char pattern[] =
	1387	"[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
	1388	const char exp[] =
	1389	"[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
	1390	// We test this with two passes; in the second pass we
	1391	// pre-unescape the pattern. Since U+200E is Pattern_White_Space,
	1392	// this fails -- which is what we expect.
	1393	for (int32_t pass=1; pass<=2; ++pass) {
	1394	UErrorCode ec = U_ZERO_ERROR;
	1395	UnicodeString pat(pattern, -1, US_INV);
	1396	if (pass==2) {
	1397	pat = pat.unescape();
	1398	}
	1399	// Pattern is only good for pass 1
	1400	UBool isPatternValid = (pass==1);
	1401
	1402	UnicodeSet set(pat, ec);
	1403	if (U_SUCCESS(ec) != isPatternValid){
	1404	errln((UnicodeString)"FAIL: applyPattern(" +
	1405	escape(pat) + ") => " +
	1406	u_errorName(ec));
	1407	continue;
	1408	}
	1409	if (U_FAILURE(ec)) {
	1410	continue;
	1411	}
	1412	if (set.contains((UChar)0x0644)){
	1413	errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
	1414	}
	1415
	1416	UnicodeString newpat;
	1417	set.toPattern(newpat, TRUE);
	1418	if (newpat == UnicodeString(exp, -1, US_INV)) {
	1419	logln(escape(pat) + " => " + newpat);
	1420	} else {
	1421	errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
	1422	}
	1423
	1424	for (int32_t i=0; i<set.getRangeCount(); ++i) {
	1425	UnicodeString str("Range ");
	1426	str.append((UChar)(0x30 + i))
	1427	.append(": ")
	1428	.append((UChar32)set.getRangeStart(i))
	1429	.append(" - ")
	1430	.append((UChar32)set.getRangeEnd(i));
	1431	str = str + " (" + set.getRangeStart(i) + " - " +
	1432	set.getRangeEnd(i) + ")";
	1433	if (set.getRangeStart(i) < 0) {
	1434	errln((UnicodeString)"FAIL: " + escape(str));
	1435	} else {
	1436	logln(escape(str));
	1437	}
	1438	}
	1439	}
	1440	}
	1441
	1442	void UnicodeSetTest::expectRange(const UnicodeString& label,
	1443	const UnicodeSet& set,
	1444	UChar32 start, UChar32 end) {
	1445	UnicodeSet exp(start, end);
	1446	UnicodeString pat;
	1447	if (set == exp) {
	1448	logln(label + " => " + set.toPattern(pat, TRUE));
	1449	} else {
	1450	UnicodeString xpat;
	1451	errln((UnicodeString)"FAIL: " + label + " => " +
	1452	set.toPattern(pat, TRUE) +
	1453	", expected " + exp.toPattern(xpat, TRUE));
	1454	}
	1455	}
	1456
	1457	void UnicodeSetTest::TestInvalidCodePoint() {
	1458
	1459	const UChar32 DATA[] = {
	1460	// Test range Expected range
	1461	0, 0x10FFFF, 0, 0x10FFFF,
	1462	(UChar32)-1, 8, 0, 8,
	1463	8, 0x110000, 8, 0x10FFFF
	1464	};
	1465	const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
	1466
	1467	UnicodeString pat;
	1468	int32_t i;
	1469
	1470	for (i=0; i<DATA_LENGTH; i+=4) {
	1471	UChar32 start = DATA[i];
	1472	UChar32 end = DATA[i+1];
	1473	UChar32 xstart = DATA[i+2];
	1474	UChar32 xend = DATA[i+3];
	1475
	1476	// Try various API using the test code points
	1477
	1478	UnicodeSet set(start, end);
	1479	expectRange((UnicodeString)"ct(" + start + "," + end + ")",
	1480	set, xstart, xend);
	1481
	1482	set.clear();
	1483	set.set(start, end);
	1484	expectRange((UnicodeString)"set(" + start + "," + end + ")",
	1485	set, xstart, xend);
	1486
	1487	UBool b = set.contains(start);
	1488	b = set.contains(start, end);
	1489	b = set.containsNone(start, end);
	1490	b = set.containsSome(start, end);
	1491
	1492	/int32_t index = set.indexOf(start);/
	1493
	1494	set.clear();
	1495	set.add(start);
	1496	set.add(start, end);
	1497	expectRange((UnicodeString)"add(" + start + "," + end + ")",
	1498	set, xstart, xend);
	1499
	1500	set.set(0, 0x10FFFF);
	1501	set.retain(start, end);
	1502	expectRange((UnicodeString)"retain(" + start + "," + end + ")",
	1503	set, xstart, xend);
	1504	set.retain(start);
	1505
	1506	set.set(0, 0x10FFFF);
	1507	set.remove(start);
	1508	set.remove(start, end);
	1509	set.complement();
	1510	expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
	1511	set, xstart, xend);
	1512
	1513	set.set(0, 0x10FFFF);
	1514	set.complement(start, end);
	1515	set.complement();
	1516	expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
	1517	set, xstart, xend);
	1518	set.complement(start);
	1519	}
	1520
	1521	const UChar32 DATA2[] = {
	1522	0,
	1523	0x10FFFF,
	1524	(UChar32)-1,
	1525	0x110000
	1526	};
	1527	const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
	1528
	1529	for (i=0; i<DATA2_LENGTH; ++i) {
	1530	UChar32 c = DATA2[i], end = 0x10FFFF;
	1531	UBool valid = (c >= 0 && c <= 0x10FFFF);
	1532
	1533	UnicodeSet set(0, 0x10FFFF);
	1534
	1535	// For single-codepoint contains, invalid codepoints are NOT contained
	1536	UBool b = set.contains(c);
	1537	if (b == valid) {
	1538	logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
	1539	") = " + b);
	1540	} else {
	1541	errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
	1542	") = " + b);
	1543	}
	1544
	1545	// For codepoint range contains, containsNone, and containsSome,
	1546	// invalid or empty (start > end) ranges have UNDEFINED behavior.
	1547	b = set.contains(c, end);
	1548	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
	1549	"," + end + ") = " + b);
	1550
	1551	b = set.containsNone(c, end);
	1552	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
	1553	"," + end + ") = " + b);
	1554
	1555	b = set.containsSome(c, end);
	1556	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
	1557	"," + end + ") = " + b);
	1558
	1559	int32_t index = set.indexOf(c);
	1560	if ((index >= 0) == valid) {
	1561	logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
	1562	") = " + index);
	1563	} else {
	1564	errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
	1565	") = " + index);
	1566	}
	1567	}
	1568	}
	1569
	1570	// Used by TestSymbolTable
	1571	class TokenSymbolTable : public SymbolTable {
	1572	public:
	1573	Hashtable contents;
	1574
	1575	TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
	1576	contents.setValueDeleter(uprv_deleteUObject);
	1577	}
	1578
	1579	~TokenSymbolTable() {}
	1580
	1581	/**
	1582	* (Non-SymbolTable API) Add the given variable and value to
	1583	* the table. Variable should NOT contain leading '$'.
	1584	*/
	1585	void add(const UnicodeString& var, const UnicodeString& value,
	1586	UErrorCode& ec) {
	1587	if (U_SUCCESS(ec)) {
	1588	contents.put(var, new UnicodeString(value), ec);
	1589	}
	1590	}
	1591
	1592	/**
	1593	* SymbolTable API
	1594	*/
	1595	virtual const UnicodeString* lookup(const UnicodeString& s) const {
	1596	return (const UnicodeString*) contents.get(s);
	1597	}
	1598
	1599	/**
	1600	* SymbolTable API
	1601	*/
	1602	virtual const UnicodeFunctor* lookupMatcher(UChar32 /ch/) const {
	1603	return NULL;
	1604	}
	1605
	1606	/**
	1607	* SymbolTable API
	1608	*/
	1609	virtual UnicodeString parseReference(const UnicodeString& text,
	1610	ParsePosition& pos, int32_t limit) const {
	1611	int32_t start = pos.getIndex();
	1612	int32_t i = start;
	1613	UnicodeString result;
	1614	while (i < limit) {
	1615	UChar c = text.charAt(i);
	1616	if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {
	1617	break;
	1618	}
	1619	++i;
	1620	}
	1621	if (i == start) { // No valid name chars
	1622	return result; // Indicate failure with empty string
	1623	}
	1624	pos.setIndex(i);
	1625	text.extractBetween(start, i, result);
	1626	return result;
	1627	}
	1628	};
	1629
	1630	void UnicodeSetTest::TestSymbolTable() {
	1631	// Multiple test cases can be set up here. Each test case
	1632	// is terminated by null:
	1633	// var, value, var, value,..., input pat., exp. output pat., null
	1634	const char* DATA[] = {
	1635	"us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
	1636	"us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
	1637	"us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
	1638	NULL
	1639	};
	1640
	1641	for (int32_t i=0; DATA[i]!=NULL; ++i) {
	1642	UErrorCode ec = U_ZERO_ERROR;
	1643	TokenSymbolTable sym(ec);
	1644	if (U_FAILURE(ec)) {
	1645	errln("FAIL: couldn't construct TokenSymbolTable");
	1646	continue;
	1647	}
	1648
	1649	// Set up variables
	1650	while (DATA[i+2] != NULL) {
	1651	sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
	1652	if (U_FAILURE(ec)) {
	1653	errln("FAIL: couldn't add to TokenSymbolTable");
	1654	continue;
	1655	}
	1656	i += 2;
	1657	}
	1658
	1659	// Input pattern and expected output pattern
	1660	UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
	1661	i += 2;
	1662
	1663	ParsePosition pos(0);
	1664	UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
	1665	if (U_FAILURE(ec)) {
	1666	errln("FAIL: couldn't construct UnicodeSet");
	1667	continue;
	1668	}
	1669
	1670	// results
	1671	if (pos.getIndex() != inpat.length()) {
	1672	errln((UnicodeString)"Failed to read to end of string \""
	1673	+ inpat + "\": read to "
	1674	+ pos.getIndex() + ", length is "
	1675	+ inpat.length());
	1676	}
	1677
	1678	UnicodeSet us2(exppat, ec);
	1679	if (U_FAILURE(ec)) {
	1680	errln("FAIL: couldn't construct expected UnicodeSet");
	1681	continue;
	1682	}
	1683
	1684	UnicodeString a, b;
	1685	if (us != us2) {
	1686	errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
	1687	", expected " + us2.toPattern(b, TRUE));
	1688	} else {
	1689	logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
	1690	}
	1691	}
	1692	}
	1693
	1694	void UnicodeSetTest::TestSurrogate() {
	1695	const char* DATA[] = {
	1696	// These should all behave identically
	1697	"[abc\\uD800\\uDC00]",
	1698	// "[abc\uD800\uDC00]", // Can't do this on C -- only Java
	1699	"[abc\\U00010000]",
	1700	0
	1701	};
	1702	for (int i=0; DATA[i] != 0; ++i) {
	1703	UErrorCode ec = U_ZERO_ERROR;
	1704	logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
	1705	UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
	1706	UnicodeSet set(str, ec);
	1707	if (U_FAILURE(ec)) {
	1708	errln("FAIL: UnicodeSet constructor");
	1709	continue;
	1710	}
	1711	expectContainment(set,
	1712	CharsToUnicodeString("abc\\U00010000"),
	1713	CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
	1714	if (set.size() != 4) {
	1715	errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
	1716	set.size() + ", expected 4");
	1717	}
	1718	}
	1719	}
	1720
	1721	void UnicodeSetTest::TestExhaustive() {
	1722	// exhaustive tests. Simulate UnicodeSets with integers.
	1723	// That gives us very solid tests (except for large memory tests).
	1724
	1725	int32_t limit = 128;
	1726
	1727	UnicodeSet x, y, z, aa;
	1728
	1729	for (int32_t i = 0; i < limit; ++i) {
	1730	bitsToSet(i, x);
	1731	logln((UnicodeString)"Testing " + i + ", " + x);
	1732	_testComplement(i, x, y);
	1733
	1734	// AS LONG AS WE ARE HERE, check roundtrip
	1735	checkRoundTrip(bitsToSet(i, aa));
	1736
	1737	for (int32_t j = 0; j < limit; ++j) {
	1738	_testAdd(i,j, x,y,z);
	1739	_testXor(i,j, x,y,z);
	1740	_testRetain(i,j, x,y,z);
	1741	_testRemove(i,j, x,y,z);
	1742	}
	1743	}
	1744	}
	1745
	1746	void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
	1747	bitsToSet(a, x);
	1748	z = x;
	1749	z.complement();
	1750	int32_t c = setToBits(z);
	1751	if (c != (~a)) {
	1752	errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
	1753	errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
	1754	}
	1755	checkCanonicalRep(z, (UnicodeString)"complement " + a);
	1756	}
	1757
	1758	void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1759	bitsToSet(a, x);
	1760	bitsToSet(b, y);
	1761	z = x;
	1762	z.addAll(y);
	1763	int32_t c = setToBits(z);
	1764	if (c != (a \| b)) {
	1765	errln((UnicodeString)"FAILED: add: " + x + " \| " + y + " != " + z);
	1766	errln((UnicodeString)"FAILED: add: " + a + " \| " + b + " != " + c);
	1767	}
	1768	checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
	1769	}
	1770
	1771	void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1772	bitsToSet(a, x);
	1773	bitsToSet(b, y);
	1774	z = x;
	1775	z.retainAll(y);
	1776	int32_t c = setToBits(z);
	1777	if (c != (a & b)) {
	1778	errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
	1779	errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
	1780	}
	1781	checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
	1782	}
	1783
	1784	void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1785	bitsToSet(a, x);
	1786	bitsToSet(b, y);
	1787	z = x;
	1788	z.removeAll(y);
	1789	int32_t c = setToBits(z);
	1790	if (c != (a &~ b)) {
	1791	errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
	1792	errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
	1793	}
	1794	checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
	1795	}
	1796
	1797	void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1798	bitsToSet(a, x);
	1799	bitsToSet(b, y);
	1800	z = x;
	1801	z.complementAll(y);
	1802	int32_t c = setToBits(z);
	1803	if (c != (a ^ b)) {
	1804	errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
	1805	errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
	1806	}
	1807	checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
	1808	}
	1809
	1810	/**
	1811	* Check that ranges are monotonically increasing and non-
	1812	* overlapping.
	1813	*/
	1814	void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
	1815	int32_t n = set.getRangeCount();
	1816	if (n < 0) {
	1817	errln((UnicodeString)"FAIL result of " + msg +
	1818	": range count should be >= 0 but is " +
	1819	n /+ " for " + set.toPattern())/);
	1820	return;
	1821	}
	1822	UChar32 last = 0;
	1823	for (int32_t i=0; i<n; ++i) {
	1824	UChar32 start = set.getRangeStart(i);
	1825	UChar32 end = set.getRangeEnd(i);
	1826	if (start > end) {
	1827	errln((UnicodeString)"FAIL result of " + msg +
	1828	": range " + (i+1) +
	1829	" start > end: " + (int)start + ", " + (int)end +
	1830	" for " + set);
	1831	}
	1832	if (i > 0 && start <= last) {
	1833	errln((UnicodeString)"FAIL result of " + msg +
	1834	": range " + (i+1) +
	1835	" overlaps previous range: " + (int)start + ", " + (int)end +
	1836	" for " + set);
	1837	}
	1838	last = end;
	1839	}
	1840	}
	1841
	1842	/**
	1843	* Convert a bitmask to a UnicodeSet.
	1844	*/
	1845	UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
	1846	result.clear();
	1847	for (UChar32 i = 0; i < 32; ++i) {
	1848	if ((a & (1<<i)) != 0) {
	1849	result.add(i);
	1850	}
	1851	}
	1852	return result;
	1853	}
	1854
	1855	/**
	1856	* Convert a UnicodeSet to a bitmask. Only the characters
	1857	* U+0000 to U+0020 are represented in the bitmask.
	1858	*/
	1859	int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
	1860	int32_t result = 0;
	1861	for (int32_t i = 0; i < 32; ++i) {
	1862	if (x.contains((UChar32)i)) {
	1863	result \|= (1<<i);
	1864	}
	1865	}
	1866	return result;
	1867	}
	1868
	1869	/**
	1870	* Return the representation of an inversion list based UnicodeSet
	1871	* as a pairs list. Ranges are listed in ascending Unicode order.
	1872	* For example, the set [a-zA-M3] is represented as "33AMaz".
	1873	*/
	1874	UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
	1875	UnicodeString pairs;
	1876	for (int32_t i=0; i<set.getRangeCount(); ++i) {
	1877	UChar32 start = set.getRangeStart(i);
	1878	UChar32 end = set.getRangeEnd(i);
	1879	if (end > 0xFFFF) {
	1880	end = 0xFFFF;
	1881	i = set.getRangeCount(); // Should be unnecessary
	1882	}
	1883	pairs.append((UChar)start).append((UChar)end);
	1884	}
	1885	return pairs;
	1886	}
	1887
	1888	/**
	1889	* Basic consistency check for a few items.
	1890	* That the iterator works, and that we can create a pattern and
	1891	* get the same thing back
	1892	*/
	1893	void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
	1894	UErrorCode ec = U_ZERO_ERROR;
	1895
	1896	UnicodeSet t(s);
	1897	checkEqual(s, t, "copy ct");
	1898
	1899	t = s;
	1900	checkEqual(s, t, "operator=");
	1901
	1902	copyWithIterator(t, s, FALSE);
	1903	checkEqual(s, t, "iterator roundtrip");
	1904
	1905	copyWithIterator(t, s, TRUE); // try range
	1906	checkEqual(s, t, "iterator roundtrip");
	1907
	1908	UnicodeString pat; s.toPattern(pat, FALSE);
	1909	t.applyPattern(pat, ec);
	1910	if (U_FAILURE(ec)) {
	1911	errln("FAIL: applyPattern");
	1912	return;
	1913	} else {
	1914	checkEqual(s, t, "toPattern(false)");
	1915	}
	1916
	1917	s.toPattern(pat, TRUE);
	1918	t.applyPattern(pat, ec);
	1919	if (U_FAILURE(ec)) {
	1920	errln("FAIL: applyPattern");
	1921	return;
	1922	} else {
	1923	checkEqual(s, t, "toPattern(true)");
	1924	}
	1925	}
	1926
	1927	void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
	1928	t.clear();
	1929	UnicodeSetIterator it(s);
	1930	if (withRange) {
	1931	while (it.nextRange()) {
	1932	if (it.isString()) {
	1933	t.add(it.getString());
	1934	} else {
	1935	t.add(it.getCodepoint(), it.getCodepointEnd());
	1936	}
	1937	}
	1938	} else {
	1939	while (it.next()) {
	1940	if (it.isString()) {
	1941	t.add(it.getString());
	1942	} else {
	1943	t.add(it.getCodepoint());
	1944	}
	1945	}
	1946	}
	1947	}
	1948
	1949	UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
	1950	UnicodeString source; s.toPattern(source, TRUE);
	1951	UnicodeString result; t.toPattern(result, TRUE);
	1952	if (s != t) {
	1953	errln((UnicodeString)"FAIL: " + message
	1954	+ "; source = " + source
	1955	+ "; result = " + result
	1956	);
	1957	return FALSE;
	1958	} else {
	1959	logln((UnicodeString)"Ok: " + message
	1960	+ "; source = " + source
	1961	+ "; result = " + result
	1962	);
	1963	}
	1964	return TRUE;
	1965	}
	1966
	1967	void
	1968	UnicodeSetTest::expectContainment(const UnicodeString& pat,
	1969	const UnicodeString& charsIn,
	1970	const UnicodeString& charsOut) {
	1971	UErrorCode ec = U_ZERO_ERROR;
	1972	UnicodeSet set(pat, ec);
	1973	if (U_FAILURE(ec)) {
	1974	dataerrln((UnicodeString)"FAIL: pattern \"" +
	1975	pat + "\" => " + u_errorName(ec));
	1976	return;
	1977	}
	1978	expectContainment(set, pat, charsIn, charsOut);
	1979	}
	1980
	1981	void
	1982	UnicodeSetTest::expectContainment(const UnicodeSet& set,
	1983	const UnicodeString& charsIn,
	1984	const UnicodeString& charsOut) {
	1985	UnicodeString pat;
	1986	set.toPattern(pat);
	1987	expectContainment(set, pat, charsIn, charsOut);
	1988	}
	1989
	1990	void
	1991	UnicodeSetTest::expectContainment(const UnicodeSet& set,
	1992	const UnicodeString& setName,
	1993	const UnicodeString& charsIn,
	1994	const UnicodeString& charsOut) {
	1995	UnicodeString bad;
	1996	UChar32 c;
	1997	int32_t i;
	1998
	1999	for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
	2000	c = charsIn.char32At(i);
	2001	if (!set.contains(c)) {
	2002	bad.append(c);
	2003	}
	2004	}
	2005	if (bad.length() > 0) {
	2006	errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
	2007	", expected containment of " + prettify(charsIn));
	2008	} else {
	2009	logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
	2010	}
	2011
	2012	bad.truncate(0);
	2013	for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
	2014	c = charsOut.char32At(i);
	2015	if (set.contains(c)) {
	2016	bad.append(c);
	2017	}
	2018	}
	2019	if (bad.length() > 0) {
	2020	errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
	2021	", expected non-containment of " + prettify(charsOut));
	2022	} else {
	2023	logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
	2024	}
	2025	}
	2026
	2027	void
	2028	UnicodeSetTest::expectPattern(UnicodeSet& set,
	2029	const UnicodeString& pattern,
	2030	const UnicodeString& expectedPairs){
	2031	UErrorCode status = U_ZERO_ERROR;
	2032	set.applyPattern(pattern, status);
	2033	if (U_FAILURE(status)) {
	2034	errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
	2035	"\") failed");
	2036	return;
	2037	} else {
	2038	if (getPairs(set) != expectedPairs ) {
	2039	errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
	2040	"\") => pairs \"" +
	2041	escape(getPairs(set)) + "\", expected \"" +
	2042	escape(expectedPairs) + "\"");
	2043	} else {
	2044	logln(UnicodeString("Ok: applyPattern(\"") + pattern +
	2045	"\") => pairs \"" +
	2046	escape(getPairs(set)) + "\"");
	2047	}
	2048	}
	2049	// the result of calling set.toPattern(), which is the string representation of
	2050	// this set(set), is passed to a UnicodeSet constructor, and tested that it
	2051	// will produce another set that is equal to this one.
	2052	UnicodeString temppattern;
	2053	set.toPattern(temppattern);
	2054	UnicodeSet *tempset=new UnicodeSet(temppattern, status);
	2055	if (U_FAILURE(status)) {
	2056	errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
	2057	return;
	2058	}
	2059	if(tempset != set \|\| getPairs(tempset) != getPairs(set)){
	2060	errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
	2061	escape(getPairs(set)) + "\""));
	2062	} else{
	2063	logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
	2064	}
	2065
	2066	delete tempset;
	2067
	2068	}
	2069
	2070	void
	2071	UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
	2072	if (getPairs(set) != expectedPairs) {
	2073	errln(UnicodeString("FAIL: Expected pair list \"") +
	2074	escape(expectedPairs) + "\", got \"" +
	2075	escape(getPairs(set)) + "\"");
	2076	}
	2077	}
	2078
	2079	void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
	2080	const UnicodeString& expPat,
	2081	const char** expStrings) {
	2082	UnicodeString pat;
	2083	set.toPattern(pat, TRUE);
	2084	if (pat == expPat) {
	2085	logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
	2086	} else {
	2087	errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
	2088	return;
	2089	}
	2090	if (expStrings == NULL) {
	2091	return;
	2092	}
	2093	UBool in = TRUE;
	2094	for (int32_t i=0; expStrings[i] != NULL; ++i) {
	2095	if (expStrings[i] == NOT) { // sic; pointer comparison
	2096	in = FALSE;
	2097	continue;
	2098	}
	2099	UnicodeString s = CharsToUnicodeString(expStrings[i]);
	2100	UBool contained = set.contains(s);
	2101	if (contained == in) {
	2102	logln((UnicodeString)"Ok: " + expPat +
	2103	(contained ? " contains {" : " does not contain {") +
	2104	escape(expStrings[i]) + "}");
	2105	} else {
	2106	errln((UnicodeString)"FAIL: " + expPat +
	2107	(contained ? " contains {" : " does not contain {") +
	2108	escape(expStrings[i]) + "}");
	2109	}
	2110	}
	2111	}
	2112
	2113	static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
	2114
	2115	void
	2116	UnicodeSetTest::doAssert(UBool condition, const char *message)
	2117	{
	2118	if (!condition) {
	2119	errln(UnicodeString("ERROR : ") + message);
	2120	}
	2121	}
	2122
	2123	UnicodeString
	2124	UnicodeSetTest::escape(const UnicodeString& s) {
	2125	UnicodeString buf;
	2126	for (int32_t i=0; i<s.length(); )
	2127	{
	2128	UChar32 c = s.char32At(i);
	2129	if (0x0020 <= c && c <= 0x007F) {
	2130	buf += c;
	2131	} else {
	2132	if (c <= 0xFFFF) {
	2133	buf += (UChar)0x5c; buf += (UChar)0x75;
	2134	} else {
	2135	buf += (UChar)0x5c; buf += (UChar)0x55;
	2136	buf += toHexString((c & 0xF0000000) >> 28);
	2137	buf += toHexString((c & 0x0F000000) >> 24);
	2138	buf += toHexString((c & 0x00F00000) >> 20);
	2139	buf += toHexString((c & 0x000F0000) >> 16);
	2140	}
	2141	buf += toHexString((c & 0xF000) >> 12);
	2142	buf += toHexString((c & 0x0F00) >> 8);
	2143	buf += toHexString((c & 0x00F0) >> 4);
	2144	buf += toHexString(c & 0x000F);
	2145	}
	2146	i += U16_LENGTH(c);
	2147	}
	2148	return buf;
	2149	}
	2150
	2151	void UnicodeSetTest::TestFreezable() {
	2152	UErrorCode errorCode=U_ZERO_ERROR;
	2153	UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
	2154	UnicodeSet idSet(idPattern, errorCode);
	2155	if(U_FAILURE(errorCode)) {
	2156	dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
	2157	return;
	2158	}
	2159
	2160	UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
	2161	UnicodeSet wsSet(wsPattern, errorCode);
	2162	if(U_FAILURE(errorCode)) {
	2163	dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
	2164	return;
	2165	}
	2166
	2167	idSet.add(idPattern);
	2168	UnicodeSet frozen(idSet);
	2169	frozen.freeze();
	2170
	2171	if(idSet.isFrozen() \|\| !frozen.isFrozen()) {
	2172	errln("FAIL: isFrozen() is wrong");
	2173	}
	2174	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2175	errln("FAIL: a copy-constructed frozen set differs from its original");
	2176	}
	2177
	2178	frozen=wsSet;
	2179	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2180	errln("FAIL: a frozen set was modified by operator=");
	2181	}
	2182
	2183	UnicodeSet frozen2(frozen);
	2184	if(frozen2!=frozen \|\| frozen2!=idSet) {
	2185	errln("FAIL: a copied frozen set differs from its frozen original");
	2186	}
	2187	if(!frozen2.isFrozen()) {
	2188	errln("FAIL: copy-constructing a frozen set results in a thawed one");
	2189	}
	2190	UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
	2191	if(frozen3.contains(0, 4) \|\| !frozen3.contains(5, 55) \|\| frozen3.contains(56, 0x10ffff)) {
	2192	errln("FAIL: UnicodeSet(5, 55) failed");
	2193	}
	2194	frozen3=frozen;
	2195	if(!frozen3.isFrozen()) {
	2196	errln("FAIL: copying a frozen set results in a thawed one");
	2197	}
	2198
	2199	UnicodeSet cloned=(UnicodeSet )frozen.clone();
	2200	if(!cloned->isFrozen() \|\| *cloned!=frozen \|\| cloned->containsSome(0xd802, 0xd805)) {
	2201	errln("FAIL: clone() failed");
	2202	}
	2203	cloned->add(0xd802, 0xd805);
	2204	if(cloned->containsSome(0xd802, 0xd805)) {
	2205	errln("FAIL: unable to modify clone");
	2206	}
	2207	delete cloned;
	2208
	2209	UnicodeSet thawed=(UnicodeSet )frozen.cloneAsThawed();
	2210	if(thawed->isFrozen() \|\| *thawed!=frozen \|\| thawed->containsSome(0xd802, 0xd805)) {
	2211	errln("FAIL: cloneAsThawed() failed");
	2212	}
	2213	thawed->add(0xd802, 0xd805);
	2214	if(!thawed->contains(0xd802, 0xd805)) {
	2215	errln("FAIL: unable to modify thawed clone");
	2216	}
	2217	delete thawed;
	2218
	2219	frozen.set(5, 55);
	2220	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2221	errln("FAIL: UnicodeSet::set() modified a frozen set");
	2222	}
	2223
	2224	frozen.clear();
	2225	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2226	errln("FAIL: UnicodeSet::clear() modified a frozen set");
	2227	}
	2228
	2229	frozen.closeOver(USET_CASE_INSENSITIVE);
	2230	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2231	errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
	2232	}
	2233
	2234	frozen.compact();
	2235	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2236	errln("FAIL: UnicodeSet::compact() modified a frozen set");
	2237	}
	2238
	2239	ParsePosition pos;
	2240	frozen.
	2241	applyPattern(wsPattern, errorCode).
	2242	applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
	2243	applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
	2244	applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
	2245	applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
	2246	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2247	errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
	2248	}
	2249
	2250	frozen.
	2251	add(0xd800).
	2252	add(0xd802, 0xd805).
	2253	add(wsPattern).
	2254	addAll(idPattern).
	2255	addAll(wsSet);
	2256	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2257	errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
	2258	}
	2259
	2260	frozen.
	2261	retain(0x62).
	2262	retain(0x64, 0x69).
	2263	retainAll(wsPattern).
	2264	retainAll(wsSet);
	2265	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2266	errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
	2267	}
	2268
	2269	frozen.
	2270	remove(0x62).
	2271	remove(0x64, 0x69).
	2272	remove(idPattern).
	2273	removeAll(idPattern).
	2274	removeAll(idSet);
	2275	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2276	errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
	2277	}
	2278
	2279	frozen.
	2280	complement().
	2281	complement(0x62).
	2282	complement(0x64, 0x69).
	2283	complement(idPattern).
	2284	complementAll(idPattern).
	2285	complementAll(idSet);
	2286	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2287	errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
	2288	}
	2289	}
	2290
	2291	// Test span() etc. -------------------------------------------------------- ***
	2292
	2293	// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
	2294	static int32_t
	2295	appendUTF8(const UChar s, int32_t length, char t, int32_t capacity) {
	2296	UErrorCode errorCode=U_ZERO_ERROR;
	2297	int32_t length8=0;
	2298	u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
	2299	if(U_SUCCESS(errorCode)) {
	2300	return length8;
	2301	} else {
	2302	// The string contains an unpaired surrogate.
	2303	// Ignore this string.
	2304	return 0;
	2305	}
	2306	}
	2307
	2308	class UnicodeSetWithStringsIterator;
	2309
	2310	// Make the strings in a UnicodeSet easily accessible.
	2311	class UnicodeSetWithStrings {
	2312	public:
	2313	UnicodeSetWithStrings(const UnicodeSet &normalSet) :
	2314	set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
	2315	int32_t size=set.size();
	2316	if(size>0 && set.charAt(size-1)<0) {
	2317	// If a set's last element is not a code point, then it must contain strings.
	2318	// Iterate over the set, skip all code point ranges, and cache the strings.
	2319	// Convert them to UTF-8 for spanUTF8().
	2320	UnicodeSetIterator iter(set);
	2321	const UnicodeString *s;
	2322	char *s8=utf8;
	2323	int32_t length8, utf8Count=0;
	2324	while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
	2325	if(iter.isString()) {
	2326	// Store the pointer to the set's string element
	2327	// which we happen to know is a stable pointer.
	2328	strings[stringsLength]=s=&iter.getString();
	2329	utf8Count+=
	2330	utf8Lengths[stringsLength]=length8=
	2331	appendUTF8(s->getBuffer(), s->length(),
	2332	s8, (int32_t)(sizeof(utf8)-utf8Count));
	2333	if(length8==0) {
	2334	hasSurrogates=TRUE; // Contains unpaired surrogates.
	2335	}
	2336	s8+=length8;
	2337	++stringsLength;
	2338	}
	2339	}
	2340	}
	2341	}
	2342
	2343	const UnicodeSet &getSet() const {
	2344	return set;
	2345	}
	2346
	2347	UBool hasStrings() const {
	2348	return (UBool)(stringsLength>0);
	2349	}
	2350
	2351	UBool hasStringsWithSurrogates() const {
	2352	return hasSurrogates;
	2353	}
	2354
	2355	private:
	2356	friend class UnicodeSetWithStringsIterator;
	2357
	2358	const UnicodeSet &set;
	2359
	2360	const UnicodeString *strings[20];
	2361	int32_t stringsLength;
	2362	UBool hasSurrogates;
	2363
	2364	char utf8[1024];
	2365	int32_t utf8Lengths[20];
	2366
	2367	int32_t nextStringIndex;
	2368	int32_t nextUTF8Start;
	2369	};
	2370
	2371	class UnicodeSetWithStringsIterator {
	2372	public:
	2373	UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
	2374	fSet(set), nextStringIndex(0), nextUTF8Start(0) {
	2375	}
	2376
	2377	void reset() {
	2378	nextStringIndex=nextUTF8Start=0;
	2379	}
	2380
	2381	const UnicodeString *nextString() {
	2382	if(nextStringIndex<fSet.stringsLength) {
	2383	return fSet.strings[nextStringIndex++];
	2384	} else {
	2385	return NULL;
	2386	}
	2387	}
	2388
	2389	// Do not mix with calls to nextString().
	2390	const char *nextUTF8(int32_t &length) {
	2391	if(nextStringIndex<fSet.stringsLength) {
	2392	const char *s8=fSet.utf8+nextUTF8Start;
	2393	nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
	2394	return s8;
	2395	} else {
	2396	length=0;
	2397	return NULL;
	2398	}
	2399	}
	2400
	2401	private:
	2402	const UnicodeSetWithStrings &fSet;
	2403	int32_t nextStringIndex;
	2404	int32_t nextUTF8Start;
	2405	};
	2406
	2407	// Compare 16-bit Unicode strings (which may be malformed UTF-16)
	2408	// at code point boundaries.
	2409	// That is, each edge of a match must not be in the middle of a surrogate pair.
	2410	static inline UBool
	2411	matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
	2412	s+=start;
	2413	limit-=start;
	2414	int32_t length=t.length();
	2415	return 0==t.compare(s, length) &&
	2416	!(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
	2417	!(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
	2418	}
	2419
	2420	// Implement span() with contains() for comparison.
	//
	// Forward span over the UTF-16 text s[0..length), computed only with
	// UnicodeSet::contains() for code points and matches16CPB() for the set's
	// strings. Returns the number of code units spanned from the start of s.
	//  - Set without cached strings: code points are consumed while contains(c)
	//    agrees with spanCondition; every condition other than
	//    USET_SPAN_NOT_CONTAINED is handled as USET_SPAN_CONTAINED.
	//  - USET_SPAN_NOT_CONTAINED with strings: stops at the first position where
	//    a contained code point or one of the set's strings starts.
	//  - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE with strings: at each position
	//    the code point and all matching strings are candidates; SIMPLE follows
	//    the longest candidate only, CONTAINED iterates along the shortest one
	//    and recurses for every other one, returning the farthest limit found.
	2421	static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
	2422	USetSpanCondition spanCondition) {
	2423	const UnicodeSet &realSet(set.getSet());
	2424	if(!set.hasStrings()) {
	2425	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2426	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2427	}
	2428
	2429	UChar32 c;
	2430	int32_t start=0, prev;
	// prev is the index before the code point that is read next; it is the
	// result when that code point ends the span.
	2431	while((prev=start)<length) {
	2432	U16_NEXT(s, start, length, c);
	// Compares the UBool result with the pinned condition value
	// (presumably NOT_CONTAINED==0 and CONTAINED==1, per the comment above).
	2433	if(realSet.contains(c)!=spanCondition) {
	2434	break;
	2435	}
	2436	}
	2437	return prev;
	2438	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2439	UnicodeSetWithStringsIterator iter(set);
	2440	UChar32 c;
	2441	int32_t start, next;
	2442	for(start=next=0; start<length;) {
	2443	U16_NEXT(s, next, length, c);
	2444	if(realSet.contains(c)) {
	2445	break;
	2446	}
	2447	const UnicodeString *str;
	2448	iter.reset();
	// A set string that matches at start also ends the not-contained span.
	2449	while((str=iter.nextString())!=NULL) {
	2450	if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
	2451	// spanNeedsStrings=TRUE;
	2452	return start;
	2453	}
	2454	}
	2455	start=next;
	2456	}
	2457	return start;
	2458	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2459	UnicodeSetWithStringsIterator iter(set);
	2460	UChar32 c;
	// maxSpanLimit: farthest limit reached by any recursive call so far.
	2461	int32_t start, next, maxSpanLimit=0;
	2462	for(start=next=0; start<length;) {
	2463	U16_NEXT(s, next, length, c);
	2464	if(!realSet.contains(c)) {
	2465	next=start; // Do not span this single, not-contained code point.
	2466	}
	2467	const UnicodeString *str;
	2468	iter.reset();
	2469	while((str=iter.nextString())!=NULL) {
	2470	if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
	2471	// spanNeedsStrings=TRUE;
	2472	int32_t matchLimit=start+str->length();
	2473	if(matchLimit==length) {
	2474	return length;
	2475	}
	2476	if(spanCondition==USET_SPAN_CONTAINED) {
	2477	// Iterate for the shortest match at each position.
	2478	// Recurse for each but the shortest match.
	2479	if(next==start) {
	2480	next=matchLimit; // First match from start.
	2481	} else {
	2482	if(matchLimit<next) {
	2483	// Remember shortest match from start for iteration.
	2484	int32_t temp=next;
	2485	next=matchLimit;
	2486	matchLimit=temp;
	2487	}
	2488	// Recurse for non-shortest match from start.
	2489	int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
	2490	USET_SPAN_CONTAINED);
	2491	if((matchLimit+spanLength)>maxSpanLimit) {
	2492	maxSpanLimit=matchLimit+spanLength;
	2493	if(maxSpanLimit==length) {
	2494	return length;
	2495	}
	2496	}
	2497	}
	2498	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2499	if(matchLimit>next) {
	2500	// Remember longest match from start.
	2501	next=matchLimit;
	2502	}
	2503	}
	2504	}
	2505	}
	2506	if(next==start) {
	2507	break; // No match from start.
	2508	}
	2509	start=next;
	2510	}
	// The result is the farther of the iterated position and the best
	// limit found by recursion.
	2511	if(start>maxSpanLimit) {
	2512	return start;
	2513	} else {
	2514	return maxSpanLimit;
	2515	}
	2516	}
	2517	}
	2518
	// Backward counterpart of containsSpanUTF16(): spans from the end of the
	// UTF-16 text s[0..length) toward its start, using only
	// UnicodeSet::contains() and matches16CPB(). Returns the index in s at which
	// the span begins (0 when it reaches the start of the text; an empty text
	// yields 0 immediately).
	// The three branches mirror the forward function: no strings, NOT_CONTAINED
	// with strings, and CONTAINED/SIMPLE with strings.
	// Inside the loops the parameter length is reused as the moving position;
	// length0 keeps the original text length for matches16CPB().
	2519	static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
	2520	USetSpanCondition spanCondition) {
	2521	if(length==0) {
	2522	return 0;
	2523	}
	2524	const UnicodeSet &realSet(set.getSet());
	2525	if(!set.hasStrings()) {
	2526	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2527	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2528	}
	2529
	2530	UChar32 c;
	// prev is the position after the code point that is read next
	// (reading goes backward, U16_PREV decrements length).
	2531	int32_t prev=length;
	2532	do {
	2533	U16_PREV(s, 0, length, c);
	2534	if(realSet.contains(c)!=spanCondition) {
	2535	break;
	2536	}
	2537	} while((prev=length)>0);
	2538	return prev;
	2539	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2540	UnicodeSetWithStringsIterator iter(set);
	2541	UChar32 c;
	2542	int32_t prev=length, length0=length;
	2543	do {
	2544	U16_PREV(s, 0, length, c);
	2545	if(realSet.contains(c)) {
	2546	break;
	2547	}
	2548	const UnicodeString *str;
	2549	iter.reset();
	// A set string that ends at prev also ends the not-contained span.
	2550	while((str=iter.nextString())!=NULL) {
	2551	if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
	2552	// spanNeedsStrings=TRUE;
	2553	return prev;
	2554	}
	2555	}
	2556	} while((prev=length)>0);
	2557	return prev;
	2558	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2559	UnicodeSetWithStringsIterator iter(set);
	2560	UChar32 c;
	// minSpanStart: smallest start index reached by any recursive call so far.
	2561	int32_t prev=length, minSpanStart=length, length0=length;
	2562	do {
	2563	U16_PREV(s, 0, length, c);
	2564	if(!realSet.contains(c)) {
	2565	length=prev; // Do not span this single, not-contained code point.
	2566	}
	2567	const UnicodeString *str;
	2568	iter.reset();
	2569	while((str=iter.nextString())!=NULL) {
	2570	if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
	2571	// spanNeedsStrings=TRUE;
	2572	int32_t matchStart=prev-str->length();
	2573	if(matchStart==0) {
	2574	return 0;
	2575	}
	2576	if(spanCondition==USET_SPAN_CONTAINED) {
	2577	// Iterate for the shortest match at each position.
	2578	// Recurse for each but the shortest match.
	2579	if(length==prev) {
	2580	length=matchStart; // First match from prev.
	2581	} else {
	2582	if(matchStart>length) {
	2583	// Remember shortest match from prev for iteration.
	2584	int32_t temp=length;
	2585	length=matchStart;
	2586	matchStart=temp;
	2587	}
	2588	// Recurse for non-shortest match from prev.
	2589	int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
	2590	USET_SPAN_CONTAINED);
	2591	if(spanStart<minSpanStart) {
	2592	minSpanStart=spanStart;
	2593	if(minSpanStart==0) {
	2594	return 0;
	2595	}
	2596	}
	2597	}
	2598	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2599	if(matchStart<length) {
	2600	// Remember longest match from prev.
	2601	length=matchStart;
	2602	}
	2603	}
	2604	}
	2605	}
	2606	if(length==prev) {
	2607	break; // No match from prev.
	2608	}
	2609	} while((prev=length)>0);
	// The result is the smaller of the iterated position and the best
	// start found by recursion.
	2610	if(prev<minSpanStart) {
	2611	return prev;
	2612	} else {
	2613	return minSpanStart;
	2614	}
	2615	}
	2616	}
	2617
	// UTF-8 version of containsSpanUTF16(): forward span over the bytes
	// s[0..length), computed with UnicodeSet::contains() for code points (read
	// with U8_NEXT_OR_FFFD) and memcmp() against the cached UTF-8 forms of the
	// set's strings. Returns the number of bytes spanned from the start of s.
	// Cached strings with a UTF-8 length of 0 are skipped (length8!=0 checks);
	// UnicodeSetWithStrings stores 0 for strings it could not convert.
	// Branch structure and the shortest-match/recursion scheme are the same as
	// in containsSpanUTF16().
	2618	static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
	2619	USetSpanCondition spanCondition) {
	2620	const UnicodeSet &realSet(set.getSet());
	2621	if(!set.hasStrings()) {
	2622	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2623	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2624	}
	2625
	2626	UChar32 c;
	2627	int32_t start=0, prev;
	// prev is the byte index before the code point that is read next.
	2628	while((prev=start)<length) {
	2629	U8_NEXT_OR_FFFD(s, start, length, c);
	2630	if(realSet.contains(c)!=spanCondition) {
	2631	break;
	2632	}
	2633	}
	2634	return prev;
	2635	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2636	UnicodeSetWithStringsIterator iter(set);
	2637	UChar32 c;
	2638	int32_t start, next;
	2639	for(start=next=0; start<length;) {
	2640	U8_NEXT_OR_FFFD(s, next, length, c);
	2641	if(realSet.contains(c)) {
	2642	break;
	2643	}
	2644	const char *s8;
	2645	int32_t length8;
	2646	iter.reset();
	// A set string whose UTF-8 form matches at start ends the span.
	2647	while((s8=iter.nextUTF8(length8))!=NULL) {
	2648	if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
	2649	// spanNeedsStrings=TRUE;
	2650	return start;
	2651	}
	2652	}
	2653	start=next;
	2654	}
	2655	return start;
	2656	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2657	UnicodeSetWithStringsIterator iter(set);
	2658	UChar32 c;
	// maxSpanLimit: farthest limit reached by any recursive call so far.
	2659	int32_t start, next, maxSpanLimit=0;
	2660	for(start=next=0; start<length;) {
	2661	U8_NEXT_OR_FFFD(s, next, length, c);
	2662	if(!realSet.contains(c)) {
	2663	next=start; // Do not span this single, not-contained code point.
	2664	}
	2665	const char *s8;
	2666	int32_t length8;
	2667	iter.reset();
	2668	while((s8=iter.nextUTF8(length8))!=NULL) {
	2669	if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
	2670	// spanNeedsStrings=TRUE;
	2671	int32_t matchLimit=start+length8;
	2672	if(matchLimit==length) {
	2673	return length;
	2674	}
	2675	if(spanCondition==USET_SPAN_CONTAINED) {
	2676	// Iterate for the shortest match at each position.
	2677	// Recurse for each but the shortest match.
	2678	if(next==start) {
	2679	next=matchLimit; // First match from start.
	2680	} else {
	2681	if(matchLimit<next) {
	2682	// Remember shortest match from start for iteration.
	2683	int32_t temp=next;
	2684	next=matchLimit;
	2685	matchLimit=temp;
	2686	}
	2687	// Recurse for non-shortest match from start.
	2688	int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
	2689	USET_SPAN_CONTAINED);
	2690	if((matchLimit+spanLength)>maxSpanLimit) {
	2691	maxSpanLimit=matchLimit+spanLength;
	2692	if(maxSpanLimit==length) {
	2693	return length;
	2694	}
	2695	}
	2696	}
	2697	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2698	if(matchLimit>next) {
	2699	// Remember longest match from start.
	2700	next=matchLimit;
	2701	}
	2702	}
	2703	}
	2704	}
	2705	if(next==start) {
	2706	break; // No match from start.
	2707	}
	2708	start=next;
	2709	}
	// The result is the farther of the iterated position and the best
	// limit found by recursion.
	2710	if(start>maxSpanLimit) {
	2711	return start;
	2712	} else {
	2713	return maxSpanLimit;
	2714	}
	2715	}
	2716	}
	2717
	// UTF-8 version of containsSpanBackUTF16(): spans from the end of the bytes
	// s[0..length) toward the start, using UnicodeSet::contains() for code
	// points (read with U8_PREV_OR_FFFD) and memcmp() against the cached UTF-8
	// forms of the set's strings. Returns the byte index in s at which the span
	// begins (0 when it reaches the start; an empty text yields 0 immediately).
	// Cached strings with a UTF-8 length of 0 are skipped (length8!=0 checks).
	// Inside the loops the parameter length is reused as the moving position.
	2718	static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
	2719	USetSpanCondition spanCondition) {
	2720	if(length==0) {
	2721	return 0;
	2722	}
	2723	const UnicodeSet &realSet(set.getSet());
	2724	if(!set.hasStrings()) {
	2725	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2726	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2727	}
	2728
	2729	UChar32 c;
	// prev is the byte position after the code point that is read next
	// (reading goes backward, U8_PREV_OR_FFFD decrements length).
	2730	int32_t prev=length;
	2731	do {
	2732	U8_PREV_OR_FFFD(s, 0, length, c);
	2733	if(realSet.contains(c)!=spanCondition) {
	2734	break;
	2735	}
	2736	} while((prev=length)>0);
	2737	return prev;
	2738	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2739	UnicodeSetWithStringsIterator iter(set);
	2740	UChar32 c;
	2741	int32_t prev=length;
	2742	do {
	2743	U8_PREV_OR_FFFD(s, 0, length, c);
	2744	if(realSet.contains(c)) {
	2745	break;
	2746	}
	2747	const char *s8;
	2748	int32_t length8;
	2749	iter.reset();
	// A set string whose UTF-8 form ends at prev ends the span.
	2750	while((s8=iter.nextUTF8(length8))!=NULL) {
	2751	if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
	2752	// spanNeedsStrings=TRUE;
	2753	return prev;
	2754	}
	2755	}
	2756	} while((prev=length)>0);
	2757	return prev;
	2758	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2759	UnicodeSetWithStringsIterator iter(set);
	2760	UChar32 c;
	// minSpanStart: smallest start index reached by any recursive call so far.
	2761	int32_t prev=length, minSpanStart=length;
	2762	do {
	2763	U8_PREV_OR_FFFD(s, 0, length, c);
	2764	if(!realSet.contains(c)) {
	2765	length=prev; // Do not span this single, not-contained code point.
	2766	}
	2767	const char *s8;
	2768	int32_t length8;
	2769	iter.reset();
	2770	while((s8=iter.nextUTF8(length8))!=NULL) {
	2771	if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
	2772	// spanNeedsStrings=TRUE;
	2773	int32_t matchStart=prev-length8;
	2774	if(matchStart==0) {
	2775	return 0;
	2776	}
	2777	if(spanCondition==USET_SPAN_CONTAINED) {
	2778	// Iterate for the shortest match at each position.
	2779	// Recurse for each but the shortest match.
	2780	if(length==prev) {
	2781	length=matchStart; // First match from prev.
	2782	} else {
	2783	if(matchStart>length) {
	2784	// Remember shortest match from prev for iteration.
	2785	int32_t temp=length;
	2786	length=matchStart;
	2787	matchStart=temp;
	2788	}
	2789	// Recurse for non-shortest match from prev.
	2790	int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
	2791	USET_SPAN_CONTAINED);
	2792	if(spanStart<minSpanStart) {
	2793	minSpanStart=spanStart;
	2794	if(minSpanStart==0) {
	2795	return 0;
	2796	}
	2797	}
	2798	}
	2799	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2800	if(matchStart<length) {
	2801	// Remember longest match from prev.
	2802	length=matchStart;
	2803	}
	2804	}
	2805	}
	2806	}
	2807	if(length==prev) {
	2808	break; // No match from prev.
	2809	}
	2810	} while((prev=length)>0);
	// The result is the smaller of the iterated position and the best
	// start found by recursion.
	2811	if(prev<minSpanStart) {
	2812	return prev;
	2813	} else {
	2814	return minSpanStart;
	2815	}
	2816	}
	2817	}
	2818
	// spans to be performed and compared
	// Bit flags in four groups; each group ends with the mask of its two bits,
	// and SPAN_ALL selects everything.
	enum {
	    // text encodings
	    SPAN_UTF16      = 1,
	    SPAN_UTF8       = 2,
	    SPAN_UTFS       = SPAN_UTF16 | SPAN_UTF8,

	    // set polarity
	    SPAN_SET        = 4,
	    SPAN_COMPLEMENT = 8,
	    SPAN_POLARITY   = SPAN_SET | SPAN_COMPLEMENT,

	    // directions
	    SPAN_FWD        = 0x10,
	    SPAN_BACK       = 0x20,
	    SPAN_DIRS       = SPAN_FWD | SPAN_BACK,

	    // span conditions
	    SPAN_CONTAINED  = 0x100,
	    SPAN_SIMPLE     = 0x200,
	    SPAN_CONDITION  = SPAN_CONTAINED | SPAN_SIMPLE,

	    SPAN_ALL        = SPAN_UTFS | SPAN_POLARITY | SPAN_DIRS | SPAN_CONDITION
	};
	2839
	2840	static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
	2841	return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
	2842	}
	2843
	2844	static inline int32_t slen(const void *s, UBool isUTF16) {
	2845	return isUTF16 ? u_strlen((const UChar )s) : strlen((const char )s);
	2846	}
	2847
	2848	/*
	2849	* Count spans on a string with the method according to type and set the span limits.
	2850	* The set may be the complement of the original.
	2851	* When using spanBack() and comparing with span(), use a span condition for the first spanBack()
	2852	* according to the expected number of spans.
	2853	* Sets typeName to an empty string if there is no such type.
	2854	* Returns -1 if the span option is filtered out.
	2855	*/
	2856	static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
	2857	const void *s, int32_t length, UBool isUTF16,
	2858	uint32_t whichSpans,
	2859	int type, const char *&typeName,
	2860	int32_t limits[], int32_t limitsCapacity,
	2861	int32_t expectCount) {
	2862	const UnicodeSet &realSet(set.getSet());
	2863	int32_t start, count;
	2864	USetSpanCondition spanCondition, firstSpanCondition, contained;
	2865	UBool isForward;
	2866
	2867	if(type<0 \|\| 7<type) {
	2868	typeName="";
	2869	return 0;
	2870	}
	2871
	2872	static const char *const typeNames16[]={
	2873	"contains", "contains(LM)",
	2874	"span", "span(LM)",
	2875	"containsBack", "containsBack(LM)",
	2876	"spanBack", "spanBack(LM)"
	2877	};
	2878
	2879	static const char *const typeNames8[]={
	2880	"containsUTF8", "containsUTF8(LM)",
	2881	"spanUTF8", "spanUTF8(LM)",
	2882	"containsBackUTF8", "containsBackUTF8(LM)", // not implemented
	2883	"spanBackUTF8", "spanBackUTF8(LM)"
	2884	};
	2885
	2886	typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
	2887
	2888	// filter span options
	2889	if(type<=3) {
	2890	// span forward
	2891	if((whichSpans&SPAN_FWD)==0) {
	2892	return -1;
	2893	}
	2894	isForward=TRUE;
	2895	} else {
	2896	// span backward
	2897	if((whichSpans&SPAN_BACK)==0) {
	2898	return -1;
	2899	}
	2900	isForward=FALSE;
	2901	}
	2902	if((type&1)==0) {
	2903	// use USET_SPAN_CONTAINED
	2904	if((whichSpans&SPAN_CONTAINED)==0) {
	2905	return -1;
	2906	}
	2907	contained=USET_SPAN_CONTAINED;
	2908	} else {
	2909	// use USET_SPAN_SIMPLE
	2910	if((whichSpans&SPAN_SIMPLE)==0) {
	2911	return -1;
	2912	}
	2913	contained=USET_SPAN_SIMPLE;
	2914	}
	2915
	2916	// Default first span condition for going forward with an uncomplemented set.
	2917	spanCondition=USET_SPAN_NOT_CONTAINED;
	2918	if(isComplement) {
	2919	spanCondition=invertSpanCondition(spanCondition, contained);
	2920	}
	2921
	2922	// First span condition for span(), used to terminate the spanBack() iteration.
	2923	firstSpanCondition=spanCondition;
	2924
	2925	// spanBack(): Its initial span condition is span()'s last span condition,
	2926	// which is the opposite of span()'s first span condition
	2927	// if we expect an even number of spans.
	2928	// (The loop inverts spanCondition (expectCount-1) times
	2929	// before the expectCount'th span() call.)
	2930	// If we do not compare forward and backward directions, then we do not have an
	2931	// expectCount and just start with firstSpanCondition.
	2932	if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
	2933	spanCondition=invertSpanCondition(spanCondition, contained);
	2934	}
	2935
	2936	count=0;
	2937	switch(type) {
	2938	case 0:
	2939	case 1:
	2940	start=0;
	2941	if(length<0) {
	2942	length=slen(s, isUTF16);
	2943	}
	2944	for(;;) {
	2945	start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
	2946	containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
	2947	if(count<limitsCapacity) {
	2948	limits[count]=start;
	2949	}
	2950	++count;
	2951	if(start>=length) {
	2952	break;
	2953	}
	2954	spanCondition=invertSpanCondition(spanCondition, contained);
	2955	}
	2956	break;
	2957	case 2:
	2958	case 3:
	2959	start=0;
	2960	for(;;) {
	2961	start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
	2962	realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
	2963	if(count<limitsCapacity) {
	2964	limits[count]=start;
	2965	}
	2966	++count;
	2967	if(length>=0 ? start>=length :
	2968	isUTF16 ? ((const UChar *)s)[start]==0 :
	2969	((const char *)s)[start]==0
	2970	) {
	2971	break;
	2972	}
	2973	spanCondition=invertSpanCondition(spanCondition, contained);
	2974	}
	2975	break;
	2976	case 4:
	2977	case 5:
	2978	if(length<0) {
	2979	length=slen(s, isUTF16);
	2980	}
	2981	for(;;) {
	2982	++count;
	2983	if(count<=limitsCapacity) {
	2984	limits[limitsCapacity-count]=length;
	2985	}
	2986	length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
	2987	containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
	2988	if(length==0 && spanCondition==firstSpanCondition) {
	2989	break;
	2990	}
	2991	spanCondition=invertSpanCondition(spanCondition, contained);
	2992	}
	2993	if(count<limitsCapacity) {
	2994	memmove(limits, limits+(limitsCapacity-count), count*4);
	2995	}
	2996	break;
	2997	case 6:
	2998	case 7:
	2999	for(;;) {
	3000	++count;
	3001	if(count<=limitsCapacity) {
	3002	limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
	3003	}
	3004	// Note: Length<0 is tested only for the first spanBack().
	3005	// If we wanted to keep length<0 for all spanBack()s, we would have to
	3006	// temporarily modify the string by placing a NUL where the previous spanBack() stopped.
	3007	length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
	3008	realSet.spanBackUTF8((const char *)s, length, spanCondition);
	3009	if(length==0 && spanCondition==firstSpanCondition) {
	3010	break;
	3011	}
	3012	spanCondition=invertSpanCondition(spanCondition, contained);
	3013	}
	3014	if(count<limitsCapacity) {
	3015	memmove(limits, limits+(limitsCapacity-count), count*4);
	3016	}
	3017	break;
	3018	default:
	3019	typeName="";
	3020	return -1;
	3021	}
	3022
	3023	return count;
	3024	}
	3025
	// Indexes of the set variants passed around as sets[4].
	// An odd index (low bit set) means "this is the complement of the set before it".
	enum {
	    SLOW=0,
	    SLOW_NOT=1,
	    FAST=2,
	    FAST_NOT=3,
	    SET_COUNT=4
	};

	// Labels for the set variants, indexed by the constants above;
	// used when composing failure messages.
	static const char *const setNames[SET_COUNT]={
	    "slow", "slow.not", "fast", "fast.not"
	};
	3041
	3042	/*
	3043	* Verify that we get the same results whether we look at text with contains(),
	3044	* span() or spanBack(), using unfrozen or frozen versions of the set,
	3045	* and using the set or its complement (switching the spanConditions accordingly).
	3046	* The latter verifies that
	3047	* set.span(spanCondition) == set.complement().span(!spanCondition).
	3048	*
	3049	* The expectLimits[] are either provided by the caller (with expectCount>=0)
	3050	* or returned to the caller (with an input expectCount<0).
	3051	*/
	3052	void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
	3053	const void *s, int32_t length, UBool isUTF16,
	3054	uint32_t whichSpans,
	3055	int32_t expectLimits[], int32_t &expectCount,
	3056	const char *testName, int32_t index) {
	3057	int32_t limits[500];
	3058	int32_t limitsCount;
	3059	int i, j;
	3060
	3061	const char *typeName;
	3062	int type;
	3063
	3064	for(i=0; i<SET_COUNT; ++i) {
	3065	if((i&1)==0) {
	3066	// Even-numbered sets are original, uncomplemented sets.
	3067	if((whichSpans&SPAN_SET)==0) {
	3068	continue;
	3069	}
	3070	} else {
	3071	// Odd-numbered sets are complemented.
	3072	if((whichSpans&SPAN_COMPLEMENT)==0) {
	3073	continue;
	3074	}
	3075	}
	3076	for(type=0;; ++type) {
	3077	limitsCount=getSpans(*sets[i], (UBool)(i&1),
	3078	s, length, isUTF16,
	3079	whichSpans,
	3080	type, typeName,
	3081	limits, LENGTHOF(limits), expectCount);
	3082	if(typeName[0]==0) {
	3083	break; // All types tried.
	3084	}
	3085	if(limitsCount<0) {
	3086	continue; // Span option filtered out.
	3087	}
	3088	if(expectCount<0) {
	3089	expectCount=limitsCount;
	3090	if(limitsCount>LENGTHOF(limits)) {
	3091	errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
	3092	testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
	3093	return;
	3094	}
	3095	memcpy(expectLimits, limits, limitsCount*4);
	3096	} else if(limitsCount!=expectCount) {
	3097	errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
	3098	testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
	3099	} else {
	3100	for(j=0; j<limitsCount; ++j) {
	3101	if(limits[j]!=expectLimits[j]) {
	3102	errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
	3103	testName, (long)index, setNames[i], typeName, (long)limitsCount,
	3104	j, (long)limits[j], (long)expectLimits[j]);
	3105	break;
	3106	}
	3107	}
	3108	}
	3109	}
	3110	}
	3111
	3112	// Compare span() with containsAll()/containsNone(),
	3113	// but only if we have expectLimits[] from the uncomplemented set.
	3114	if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
	3115	const UChar s16=(const UChar )s;
	3116	UnicodeString string;
	3117	int32_t prev=0, limit, length;
	3118	for(i=0; i<expectCount; ++i) {
	3119	limit=expectLimits[i];
	3120	length=limit-prev;
	3121	if(length>0) {
	3122	string.setTo(FALSE, s16+prev, length); // read-only alias
	3123	if(i&1) {
	3124	if(!sets[SLOW]->getSet().containsAll(string)) {
	3125	errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
	3126	testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
	3127	return;
	3128	}
	3129	if(!sets[FAST]->getSet().containsAll(string)) {
	3130	errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
	3131	testName, (long)index, setNames[FAST], (long)prev, (long)limit);
	3132	return;
	3133	}
	3134	} else {
	3135	if(!sets[SLOW]->getSet().containsNone(string)) {
	3136	errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
	3137	testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
	3138	return;
	3139	}
	3140	if(!sets[FAST]->getSet().containsNone(string)) {
	3141	errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
	3142	testName, (long)index, setNames[FAST], (long)prev, (long)limit);
	3143	return;
	3144	}
	3145	}
	3146	}
	3147	prev=limit;
	3148	}
	3149	}
	3150	}
	3151
	3152	// Specifically test either UTF-16 or UTF-8.
	3153	void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
	3154	const void *s, int32_t length, UBool isUTF16,
	3155	uint32_t whichSpans,
	3156	const char *testName, int32_t index) {
	3157	int32_t expectLimits[500];
	3158	int32_t expectCount=-1;
	3159	testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
	3160	}
	3161
	3162	UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
	3163	UChar c, c2;
	3164
	3165	if(length>=0) {
	3166	while(length>0) {
	3167	c=*s++;
	3168	--length;
	3169	if(0xd800<=c && c<0xe000) {
	3170	if(c>=0xdc00 \|\| length==0 \|\| !U16_IS_TRAIL(c2=*s++)) {
	3171	return TRUE;
	3172	}
	3173	--length;
	3174	}
	3175	}
	3176	} else {
	3177	while((c=*s++)!=0) {
	3178	if(0xd800<=c && c<0xe000) {
	3179	if(c>=0xdc00 \|\| !U16_IS_TRAIL(c2=*s++)) {
	3180	return TRUE;
	3181	}
	3182	}
	3183	}
	3184	}
	3185	return FALSE;
	3186	}
	3187
	3188	// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
	3189	// unless either UTF is turned off in whichSpans.
	3190	// Testing UTF-16 and UTF-8 together requires that surrogate code points
	3191	// have the same contains(c) value as U+FFFD.
	3192	void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
	3193	const UChar *s16, int32_t length16,
	3194	uint32_t whichSpans,
	3195	const char *testName, int32_t index) {
	3196	int32_t expectLimits[500];
	3197	int32_t expectCount;
	3198
	3199	expectCount=-1; // Get expectLimits[] from testSpan().
	3200
	3201	if((whichSpans&SPAN_UTF16)!=0) {
	3202	testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
	3203	}
	3204	if((whichSpans&SPAN_UTF8)==0) {
	3205	return;
	3206	}
	3207
	3208	// Convert s16[] and expectLimits[] to UTF-8.
	3209	uint8_t s8[3000];
	3210	int32_t offsets[3000];
	3211
	3212	const UChar *s16Limit=s16+length16;
	3213	char t=(char )s8;
	3214	char *tLimit=t+sizeof(s8);
	3215	int32_t *o=offsets;
	3216	UErrorCode errorCode=U_ZERO_ERROR;
	3217
	3218	// Convert with substitution: Turn unpaired surrogates into U+FFFD.
	3219	ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
	3220	if(U_FAILURE(errorCode)) {
	3221	errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
	3222	testName, (long)index, u_errorName(errorCode));
	3223	ucnv_resetFromUnicode(utf8Cnv);
	3224	return;
	3225	}
	3226	int32_t length8=(int32_t)(t-(char *)s8);
	3227
	3228	// Convert expectLimits[].
	3229	int32_t i, j, expect;
	3230	for(i=j=0; i<expectCount; ++i) {
	3231	expect=expectLimits[i];
	3232	if(expect==length16) {
	3233	expectLimits[i]=length8;
	3234	} else {
	3235	while(offsets[j]<expect) {
	3236	++j;
	3237	}
	3238	expectLimits[i]=j;
	3239	}
	3240	}
	3241
	3242	testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
	3243	}
	3244
	3245	static UChar32 nextCodePoint(UChar32 c) {
	3246	// Skip some large and boring ranges.
	3247	switch(c) {
	3248	case 0x3441:
	3249	return 0x4d7f;
	3250	case 0x5100:
	3251	return 0x9f00;
	3252	case 0xb040:
	3253	return 0xd780;
	3254	case 0xe041:
	3255	return 0xf8fe;
	3256	case 0x10100:
	3257	return 0x20000;
	3258	case 0x20041:
	3259	return 0xe0000;
	3260	case 0xe0101:
	3261	return 0x10fffd;
	3262	default:
	3263	return c+1;
	3264	}
	3265	}
	3266
	3267	// Verify that all implementations represent the same set.
	3268	void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3269	// contains(U+FFFD) is inconsistent with contains(some surrogates),
	3270	// or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
	3271	// Skip the UTF-8 part of the test - if the string contains surrogates -
	3272	// because it is likely to produce a different result.
	3273	UBool inconsistentSurrogates=
	3274	(!(sets[0]->getSet().contains(0xfffd) ?
	3275	sets[0]->getSet().contains(0xd800, 0xdfff) :
	3276	sets[0]->getSet().containsNone(0xd800, 0xdfff)) \|\|
	3277	sets[0]->hasStringsWithSurrogates());
	3278
	3279	UChar s[1000];
	3280	int32_t length=0;
	3281	uint32_t localWhichSpans;
	3282
	3283	UChar32 c, first;
	3284	for(first=c=0;; c=nextCodePoint(c)) {
	3285	if(c>0x10ffff \|\| length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
	3286	localWhichSpans=whichSpans;
	3287	if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
	3288	localWhichSpans&=~SPAN_UTF8;
	3289	}
	3290	testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
	3291	if(c>0x10ffff) {
	3292	break;
	3293	}
	3294	length=0;
	3295	first=c;
	3296	}
	3297	U16_APPEND_UNSAFE(s, length, c);
	3298	}
	3299	}
	3300
	3301	// Test with a particular, interesting string.
	3302	// Specify length and try NUL-termination.
	3303	void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3304	static const UChar s[]={
	3305	0x61, 0x62, 0x20, // Latin, space
	3306	0x3b1, 0x3b2, 0x3b3, // Greek
	3307	0xd900, // lead surrogate
	3308	0x3000, 0x30ab, 0x30ad, // wide space, Katakana
	3309	0xdc05, // trail surrogate
	3310	0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
	3311	0xd900, 0xdc05, // unassigned supplementary
	3312	0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
	3313	0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
	3314	0 // NUL
	3315	};
	3316
	3317	if((whichSpans&SPAN_UTF16)==0) {
	3318	return;
	3319	}
	3320	testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
	3321	testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
	3322	}
	3323
	3324	void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3325	static const char s[]={
	3326	"abc" // Latin
	3327
	3328	/* trail byte in lead position */
	3329	"\x80"
	3330
	3331	" " // space
	3332
	3333	/* truncated multi-byte sequences */
	3334	"\xd0"
	3335	"\xe0"
	3336	"\xe1"
	3337	"\xed"
	3338	"\xee"
	3339	"\xf0"
	3340	"\xf1"
	3341	"\xf4"
	3342	"\xf8"
	3343	"\xfc"
	3344
	3345	"\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
	3346
	3347	/* trail byte in lead position */
	3348	"\x80"
	3349
	3350	"\xe0\x80"
	3351	"\xe0\xa0"
	3352	"\xe1\x80"
	3353	"\xed\x80"
	3354	"\xed\xa0"
	3355	"\xee\x80"
	3356	"\xf0\x80"
	3357	"\xf0\x90"
	3358	"\xf1\x80"
	3359	"\xf4\x80"
	3360	"\xf4\x90"
	3361	"\xf8\x80"
	3362	"\xfc\x80"
	3363
	3364	"\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
	3365
	3366	/* trail byte in lead position */
	3367	"\x80"
	3368
	3369	"\xf0\x80\x80"
	3370	"\xf0\x90\x80"
	3371	"\xf1\x80\x80"
	3372	"\xf4\x80\x80"
	3373	"\xf4\x90\x80"
	3374	"\xf8\x80\x80"
	3375	"\xfc\x80\x80"
	3376
	3377	"\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
	3378
	3379	/* trail byte in lead position */
	3380	"\x80"
	3381
	3382	"\xf8\x80\x80\x80"
	3383	"\xfc\x80\x80\x80"
	3384
	3385	"\xF1\x90\x80\x85" // unassigned supplementary
	3386
	3387	/* trail byte in lead position */
	3388	"\x80"
	3389
	3390	"\xfc\x80\x80\x80\x80"
	3391
	3392	"\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
	3393
	3394	/* trail byte in lead position */
	3395	"\x80"
	3396
	3397	/* complete sequences but non-shortest forms or out of range etc. */
	3398	"\xc0\x80"
	3399	"\xe0\x80\x80"
	3400	"\xed\xa0\x80"
	3401	"\xf0\x80\x80\x80"
	3402	"\xf4\x90\x80\x80"
	3403	"\xf8\x80\x80\x80\x80"
	3404	"\xfc\x80\x80\x80\x80\x80"
	3405	"\xfe"
	3406	"\xff"
	3407
	3408	/* trail byte in lead position */
	3409	"\x80"
	3410
	3411	"\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
	3412	};
	3413
	3414	if((whichSpans&SPAN_UTF8)==0) {
	3415	return;
	3416	}
	3417	testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
	3418	testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
	3419	}
	3420
	// Take a set of span options and multiply them so that
	// each portion only has one of the options a, b and c.
	// If b==0, then the set of options is just modified with mask and a.
	// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
	//
	// whichSpans[]    - in/out: the first whichSpansCount entries are the current option sets
	// whichSpansCount - number of valid entries on input
	// mask            - bits to keep from each existing entry before OR-ing in a, b or c
	// Returns the new number of valid entries: 1x, 2x or 3x whichSpansCount.
	// The caller must provide room for that many entries in whichSpans[].
	static int32_t
	addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
	               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
	    uint32_t s;
	    int32_t i;

	    for(i=0; i<whichSpansCount; ++i) {
	        s=whichSpans[i]&mask;
	        whichSpans[i]=s|a;
	        if(b!=0) {
	            // The b-variants go into the second third, the c-variants into the last third.
	            whichSpans[whichSpansCount+i]=s|b;
	            if(c!=0) {
	                whichSpans[2*whichSpansCount+i]=s|c;
	            }
	        }
	    }
	    return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
	}
	3443
	// Runs of 63 and 64 identical letters, meant to be pasted next to other string literals.
	// The 63-letter runs are assembled from 16+16+16+15 letters;
	// the 64-letter runs append one more letter.
	#define _63_a "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaa"
	#define _64_a _63_a "a"
	#define _63_b "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbb"
	#define _64_b _63_b "b"
	3448
	3449	void UnicodeSetTest::TestSpan() {
	3450	// "[...]" is a UnicodeSet pattern.
	3451	// "*" performs tests on all Unicode code points and on a selection of
	3452	// malformed UTF-8/16 strings.
	3453	// "-options" limits the scope of testing for the current set.
	3454	// By default, the test verifies that equivalent boundaries are found
	3455	// for UTF-16 and UTF-8, going forward and backward,
	3456	// alternating USET_SPAN_NOT_CONTAINED with
	3457	// either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
	3458	// Single-character options:
	3459	// 8 -- UTF-16 and UTF-8 boundaries may differ.
	3460	// Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
	3461	// or the set contains strings with unpaired surrogates
	3462	// which do not translate to valid UTF-8.
	3463	// c -- set.span() and set.complement().span() boundaries may differ.
	3464	// Cause: Set strings are not complemented.
	3465	// b -- span() and spanBack() boundaries may differ.
	3466	// Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
	3467	// and spanBack(USET_SPAN_SIMPLE) are defined to
	3468	// match with non-overlapping substrings.
	3469	// For example, with a set containing "ab" and "ba",
	3470	// span() of "aba" yields boundaries { 0, 2, 3 }
	3471	// because the initial "ab" matches from 0 to 2,
	3472	// while spanBack() yields boundaries { 0, 1, 3 }
	3473	// because the final "ba" matches from 1 to 3.
	3474	// l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
	3475	// Cause: Strings in the set overlap, and a longer match may
	3476	// require a sequence including non-longest substrings.
	3477	// For example, with a set containing "ab", "abc" and "cd",
	3478	// span(contained) of "abcd" spans the entire string
	3479	// but span(longest match) only spans the first 3 characters.
	3480	// Each "-options" first resets all options and then applies the specified options.
	3481	// A "-" without options resets the options.
	3482	// The options are also reset for each new set.
	3483	// Other strings will be spanned.
	3484	static const char *const testdata[]={
	3485	"[:ID_Continue:]",
	3486	"*",
	3487	"[:White_Space:]",
	3488	"*",
	3489	"[]",
	3490	"*",
	3491	"[\\u0000-\\U0010FFFF]",
	3492	"*",
	3493	"[\\u0000\\u0080\\u0800\\U00010000]",
	3494	"*",
	3495	"[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
	3496	"*",
	3497	"[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
	3498	"-c",
	3499	"*",
	3500	"[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
	3501	"-c",
	3502	"*",
	3503
	3504	// Overlapping strings cause overlapping attempts to match.
	3505	"[x{xy}{xya}{axy}{ax}]",
	3506	"-cl",
	3507
	3508	// More repetitions of "xya" would take too long with the recursive
	3509	// reference implementation.
	3510	// containsAll()=FALSE
	3511	// test_string 0x14
	3512	"xx"
	3513	"xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
	3514	"xx" // set.complement().span(contained) will stop between the two 'x'es.
	3515	"xyaxyaxyaxya"
	3516	"xx"
	3517	"xyaxyaxyaxya" // span() ends here.
	3518	"aaa",
	3519
	3520	// containsAll()=TRUE
	3521	// test_string 0x15
	3522	"xx"
	3523	"xyaxyaxyaxya"
	3524	"xx"
	3525	"xyaxyaxyaxya"
	3526	"xx"
	3527	"xyaxyaxyaxy",
	3528
	3529	"-bc",
	3530	// test_string 0x17
	3531	"byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
	3532	"-c",
	3533	"byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
	3534	"byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
	3535	"-",
	3536	"byaya", // span() -> { 5 }
	3537	"byay", // span() -> { 4 }
	3538	"bya", // span() -> { 3 }
	3539
	3540	// span(longest match) will not span the whole string.
	3541	"[a{ab}{bc}]",
	3542	"-cl",
	3543	// test_string 0x21
	3544	"abc",
	3545
	3546	"[a{ab}{abc}{cd}]",
	3547	"-cl",
	3548	"acdabcdabccd",
	3549
	3550	// spanBack(longest match) will not span the whole string.
	3551	"[c{ab}{bc}]",
	3552	"-cl",
	3553	"abc",
	3554
	3555	"[d{cd}{bcd}{ab}]",
	3556	"-cl",
	3557	"abbcdabcdabd",
	3558
	3559	// Test with non-ASCII set strings - test proper handling of surrogate pairs
	3560	// and UTF-8 trail bytes.
	3561	// Copies of above test sets and strings, but transliterated to have
	3562	// different code points with similar trail units.
	3563	// Previous: a b c d
	3564	// Unicode: 042B 30AB 200AB 204AB
	3565	// UTF-16: 042B 30AB D840 DCAB D841 DCAB
	3566	// UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
	3567	"[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
	3568	"-cl",
	3569	"\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
	3570
	3571	"[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
	3572	"-cl",
	3573	"\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
	3574
	3575	// Stress bookkeeping and recursion.
	3576	// The following strings are barely doable with the recursive
	3577	// reference implementation.
	3578	// The not-contained character at the end prevents an early exit from the span().
	3579	"[b{bb}]",
	3580	"-c",
	3581	// test_string 0x33
	3582	"bbbbbbbbbbbbbbbbbbbbbbbb-",
	3583	// On complement sets, span() and spanBack() get different results
	3584	// because b is not in the complement set and there is an odd number of b's
	3585	// in the test string.
	3586	"-bc",
	3587	"bbbbbbbbbbbbbbbbbbbbbbbbb-",
	3588
	3589	// Test with set strings with an initial or final code point span
	3590	// longer than 254.
	3591	"[a{" _64_a _64_a _64_a _64_a "b}"
	3592	"{a" _64_b _64_b _64_b _64_b "}]",
	3593	"-c",
	3594	_64_a _64_a _64_a _63_a "b",
	3595	_64_a _64_a _64_a _64_a "b",
	3596	_64_a _64_a _64_a _64_a "aaaabbbb",
	3597	"a" _64_b _64_b _64_b _63_b,
	3598	"a" _64_b _64_b _64_b _64_b,
	3599	"aaaabbbb" _64_b _64_b _64_b _64_b,
	3600
	3601	// Test with strings containing unpaired surrogates.
	3602	// They are not representable in UTF-8, and a leading trail surrogate
	3603	// and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
	3604	// U+20001 == \\uD840\\uDC01
	3605	// U+20400 == \\uD841\\uDC00
	3606	"[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
	3607	"-8cl",
	3608	"aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
	3609	};
	3610	uint32_t whichSpans[96]={ SPAN_ALL };
	3611	int32_t whichSpansCount=1;
	3612
	3613	UnicodeSet *sets[SET_COUNT]={ NULL };
	3614	const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
	3615
	3616	char testName[1024];
	3617	char *testNameLimit=testName;
	3618
	3619	int32_t i, j;
	3620	for(i=0; i<LENGTHOF(testdata); ++i) {
	3621	const char *s=testdata[i];
	3622	if(s[0]=='[') {
	3623	// Create new test sets from this pattern.
	3624	for(j=0; j<SET_COUNT; ++j) {
	3625	delete sets_with_str[j];
	3626	delete sets[j];
	3627	}
	3628	UErrorCode errorCode=U_ZERO_ERROR;
	3629	sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
	3630	if(U_FAILURE(errorCode)) {
	3631	dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
	3632	break;
	3633	}
	3634	sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
	3635	sets[SLOW_NOT]->complement();
	3636	// Intermediate set: Test cloning of a frozen set.
	3637	UnicodeSet fast=new UnicodeSet(sets[SLOW]);
	3638	fast->freeze();
	3639	sets[FAST]=(UnicodeSet *)fast->clone();
	3640	delete fast;
	3641	UnicodeSet fastNot=new UnicodeSet(sets[SLOW_NOT]);
	3642	fastNot->freeze();
	3643	sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
	3644	delete fastNot;
	3645
	3646	for(j=0; j<SET_COUNT; ++j) {
	3647	sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
	3648	}
	3649
	3650	strcpy(testName, s);
	3651	testNameLimit=strchr(testName, 0);
	3652	*testNameLimit++=':';
	3653	*testNameLimit=0;
	3654
	3655	whichSpans[0]=SPAN_ALL;
	3656	whichSpansCount=1;
	3657	} else if(s[0]=='-') {
	3658	whichSpans[0]=SPAN_ALL;
	3659	whichSpansCount=1;
	3660
	3661	while(*++s!=0) {
	3662	switch(*s) {
	3663	case 'c':
	3664	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3665	~SPAN_POLARITY,
	3666	SPAN_SET,
	3667	SPAN_COMPLEMENT,
	3668	0);
	3669	break;
	3670	case 'b':
	3671	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3672	~SPAN_DIRS,
	3673	SPAN_FWD,
	3674	SPAN_BACK,
	3675	0);
	3676	break;
	3677	case 'l':
	3678	// test USET_SPAN_CONTAINED FWD & BACK, and separately
	3679	// USET_SPAN_SIMPLE only FWD, and separately
	3680	// USET_SPAN_SIMPLE only BACK
	3681	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3682	~(SPAN_DIRS\|SPAN_CONDITION),
	3683	SPAN_DIRS\|SPAN_CONTAINED,
	3684	SPAN_FWD\|SPAN_SIMPLE,
	3685	SPAN_BACK\|SPAN_SIMPLE);
	3686	break;
	3687	case '8':
	3688	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3689	~SPAN_UTFS,
	3690	SPAN_UTF16,
	3691	SPAN_UTF8,
	3692	0);
	3693	break;
	3694	default:
	3695	errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
	3696	break;
	3697	}
	3698	}
	3699	} else if(0==strcmp(s, "*")) {
	3700	strcpy(testNameLimit, "bad_string");
	3701	for(j=0; j<whichSpansCount; ++j) {
	3702	if(whichSpansCount>1) {
	3703	sprintf(testNameLimit+10 /* strlen("bad_string") */,
	3704	"%%0x%3x",
	3705	whichSpans[j]);
	3706	}
	3707	testSpanUTF16String(sets_with_str, whichSpans[j], testName);
	3708	testSpanUTF8String(sets_with_str, whichSpans[j], testName);
	3709	}
	3710
	3711	strcpy(testNameLimit, "contents");
	3712	for(j=0; j<whichSpansCount; ++j) {
	3713	if(whichSpansCount>1) {
	3714	sprintf(testNameLimit+8 /* strlen("contents") */,
	3715	"%%0x%3x",
	3716	whichSpans[j]);
	3717	}
	3718	testSpanContents(sets_with_str, whichSpans[j], testName);
	3719	}
	3720	} else {
	3721	UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
	3722	strcpy(testNameLimit, "test_string");
	3723	for(j=0; j<whichSpansCount; ++j) {
	3724	if(whichSpansCount>1) {
	3725	sprintf(testNameLimit+11 /* strlen("test_string") */,
	3726	"%%0x%3x",
	3727	whichSpans[j]);
	3728	}
	3729	testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
	3730	}
	3731	}
	3732	}
	3733	for(j=0; j<SET_COUNT; ++j) {
	3734	delete sets_with_str[j];
	3735	delete sets[j];
	3736	}
	3737	}
	3738
	3739	// Test select patterns and strings, and test USET_SPAN_SIMPLE.
	3740	void UnicodeSetTest::TestStringSpan() {
	3741	static const char *pattern="[x{xy}{xya}{axy}{ax}]";
	3742	static const char *const string=
	3743	"xx"
	3744	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
	3745	"xx"
	3746	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
	3747	"xx"
	3748	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
	3749	"aaaa";
	3750
	3751	UErrorCode errorCode=U_ZERO_ERROR;
	3752	UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
	3753	UnicodeSet set(pattern16, errorCode);
	3754	if(U_FAILURE(errorCode)) {
	3755	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3756	return;
	3757	}
	3758
	3759	UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
	3760
	3761	if(set.containsAll(string16)) {
	3762	errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
	3763	}
	3764
	3765	// Remove trailing "aaaa".
	3766	string16.truncate(string16.length()-4);
	3767	if(!set.containsAll(string16)) {
	3768	errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
	3769	}
	3770
	3771	string16=UNICODE_STRING_SIMPLE("byayaxya");
	3772	const UChar *s16=string16.getBuffer();
	3773	int32_t length16=string16.length();
	3774	if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3775	set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3776	set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3777	set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 \|\|
	3778	set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3779	set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
	3780	) {
	3781	errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
	3782	}
	3783
	3784	pattern="[a{ab}{abc}{cd}]";
	3785	pattern16=UnicodeString(pattern, -1, US_INV);
	3786	set.applyPattern(pattern16, errorCode);
	3787	if(U_FAILURE(errorCode)) {
	3788	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3789	return;
	3790	}
	3791	string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
	3792	s16=string16.getBuffer();
	3793	length16=string16.length();
	3794	if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 \|\|
	3795	set.span(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|
	3796	set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
	3797	) {
	3798	errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
	3799	}
	3800
	3801	pattern="[d{cd}{bcd}{ab}]";
	3802	pattern16=UnicodeString(pattern, -1, US_INV);
	3803	set.applyPattern(pattern16, errorCode).freeze();
	3804	if(U_FAILURE(errorCode)) {
	3805	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3806	return;
	3807	}
	3808	string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
	3809	s16=string16.getBuffer();
	3810	length16=string16.length();
	3811	if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 \|\|
	3812	set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|
	3813	set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
	3814	) {
	3815	errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
	3816	}
	3817	}