git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/usettest.cpp

... / ...

Commit	Line	Data
	1	/*
	2	********************************************************************************
	3	* Copyright (C) 1999-2008 International Business Machines Corporation and
	4	* others. All Rights Reserved.
	5	********************************************************************************
	6	* Date Name Description
	7	* 10/20/99 alan Creation.
	8	* 03/22/2000 Madhu Added additional tests
	9	********************************************************************************
	10	*/
	11
	12	#include <stdio.h>
	13
	14	#include <string.h>
	15	#include "unicode/utypes.h"
	16	#include "usettest.h"
	17	#include "unicode/ucnv.h"
	18	#include "unicode/uniset.h"
	19	#include "unicode/uchar.h"
	20	#include "unicode/usetiter.h"
	21	#include "unicode/ustring.h"
	22	#include "unicode/parsepos.h"
	23	#include "unicode/symtable.h"
	24	#include "unicode/uversion.h"
	25	#include "hash.h"
	26
	27	#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
	28
	29	#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
	30	errln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
	31	u_errorName(status));}}
	32
	33	#define TEST_ASSERT(expr) {if (!(expr)) { \
	34	errln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
	35
	36	UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
	37	UnicodeString pat;
	38	set.toPattern(pat);
	39	return left + UnicodeSetTest::escape(pat);
	40	}
	41
	42	#define CASE(id,test) case id: \
	43	name = #test; \
	44	if (exec) { \
	45	logln(#test "---"); \
	46	logln(); \
	47	test(); \
	48	} \
	49	break
	50
	51	UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
	52	}
	53
	54	UConverter *UnicodeSetTest::openUTF8Converter() {
	55	if(utf8Cnv==NULL) {
	56	UErrorCode errorCode=U_ZERO_ERROR;
	57	utf8Cnv=ucnv_open("UTF-8", &errorCode);
	58	}
	59	return utf8Cnv;
	60	}
	61
	62	UnicodeSetTest::~UnicodeSetTest() {
	63	ucnv_close(utf8Cnv);
	64	}
	65
	66	void
	67	UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
	68	const char* &name, char* /par/) {
	69	// if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
	70	switch (index) {
	71	CASE(0,TestPatterns);
	72	CASE(1,TestAddRemove);
	73	CASE(2,TestCategories);
	74	CASE(3,TestCloneEqualHash);
	75	CASE(4,TestMinimalRep);
	76	CASE(5,TestAPI);
	77	CASE(6,TestScriptSet);
	78	CASE(7,TestPropertySet);
	79	CASE(8,TestClone);
	80	CASE(9,TestExhaustive);
	81	CASE(10,TestToPattern);
	82	CASE(11,TestIndexOf);
	83	CASE(12,TestStrings);
	84	CASE(13,Testj2268);
	85	CASE(14,TestCloseOver);
	86	CASE(15,TestEscapePattern);
	87	CASE(16,TestInvalidCodePoint);
	88	CASE(17,TestSymbolTable);
	89	CASE(18,TestSurrogate);
	90	CASE(19,TestPosixClasses);
	91	CASE(20,TestIteration);
	92	CASE(21,TestFreezable);
	93	CASE(22,TestSpan);
	94	CASE(23,TestStringSpan);
	95	default: name = ""; break;
	96	}
	97	}
	98
	99	static const char NOT[] = "%%%%";
	100
	101	/**
	102	* UVector was improperly copying contents
	103	* This code will crash this is still true
	104	*/
	105	void UnicodeSetTest::Testj2268() {
	106	UnicodeSet t;
	107	t.add(UnicodeString("abc"));
	108	UnicodeSet test(t);
	109	UnicodeString ustrPat;
	110	test.toPattern(ustrPat, TRUE);
	111	}
	112
	113	/**
	114	* Test toPattern().
	115	*/
	116	void UnicodeSetTest::TestToPattern() {
	117	UErrorCode ec = U_ZERO_ERROR;
	118
	119	// Test that toPattern() round trips with syntax characters and
	120	// whitespace.
	121	{
	122	static const char* OTHER_TOPATTERN_TESTS[] = {
	123	"[[:latin:]&[:greek:]]",
	124	"[[:latin:]-[:greek:]]",
	125	"[:nonspacing mark:]",
	126	NULL
	127	};
	128
	129	for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
	130	ec = U_ZERO_ERROR;
	131	UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
	132	if (U_FAILURE(ec)) {
	133	errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
	134	continue;
	135	}
	136	checkPat(OTHER_TOPATTERN_TESTS[j], s);
	137	}
	138
	139	for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
	140	if ((i <= 0xFF && !u_isalpha(i)) \|\| u_isspace(i)) {
	141
	142	// check various combinations to make sure they all work.
	143	if (i != 0 && !toPatternAux(i, i)){
	144	continue;
	145	}
	146	if (!toPatternAux(0, i)){
	147	continue;
	148	}
	149	if (!toPatternAux(i, 0xFFFF)){
	150	continue;
	151	}
	152	}
	153	}
	154	}
	155
	156	// Test pattern behavior of multicharacter strings.
	157	{
	158	ec = U_ZERO_ERROR;
	159	UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
	160
	161	// This loop isn't a loop. It's here to make the compiler happy.
	162	// If you're curious, try removing it and changing the 'break'
	163	// statements (except for the last) to goto's.
	164	for (;;) {
	165	if (U_FAILURE(ec)) break;
	166	const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
	167	expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
	168
	169	s->add("ac");
	170	const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
	171	expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
	172
	173	s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
	174	if (U_FAILURE(ec)) break;
	175	const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
	176	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
	177
	178	s->add("[]");
	179	const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
	180	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
	181
	182	s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
	183	if (U_FAILURE(ec)) break;
	184	const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
	185	expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
	186
	187	// j2189
	188	s->clear();
	189	s->add(UnicodeString("abc", ""));
	190	s->add(UnicodeString("abc", ""));
	191	const char* exp6[] = {"abc", NOT, "ab", NULL};
	192	expectToPattern(*s, "[{abc}]", exp6);
	193
	194	break;
	195	}
	196
	197	if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
	198	delete s;
	199	}
	200
	201	// JB#3400: For 2 character ranges prefer [ab] to [a-b]
	202	UnicodeSet s;
	203	s.add((UChar)97, (UChar)98); // 'a', 'b'
	204	expectToPattern(s, "[ab]", NULL);
	205	}
	206
	207	UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
	208
	209	// use Integer.toString because Utility.hex doesn't handle ints
	210	UnicodeString pat = "";
	211	// TODO do these in hex
	212	//String source = "0x" + Integer.toString(start,16).toUpperCase();
	213	//if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
	214	UnicodeString source;
	215	source = source + (uint32_t)start;
	216	if (start != end)
	217	source = source + ".." + (uint32_t)end;
	218	UnicodeSet testSet;
	219	testSet.add(start, end);
	220	return checkPat(source, testSet);
	221	}
	222
	223	UBool UnicodeSetTest::checkPat(const UnicodeString& source,
	224	const UnicodeSet& testSet) {
	225	// What we want to make sure of is that a pattern generated
	226	// by toPattern(), with or without escaped unprintables, can
	227	// be passed back into the UnicodeSet constructor.
	228	UnicodeString pat0;
	229
	230	testSet.toPattern(pat0, TRUE);
	231
	232	if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
	233
	234	//String pat1 = unescapeLeniently(pat0);
	235	//if (!checkPat(source + " (in code)", testSet, pat1)) return false;
	236
	237	UnicodeString pat2;
	238	testSet.toPattern(pat2, FALSE);
	239	if (!checkPat(source, testSet, pat2)) return FALSE;
	240
	241	//String pat3 = unescapeLeniently(pat2);
	242	// if (!checkPat(source + " (in code)", testSet, pat3)) return false;
	243
	244	//logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
	245	logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
	246	return TRUE;
	247	}
	248
	249	UBool UnicodeSetTest::checkPat(const UnicodeString& source,
	250	const UnicodeSet& testSet,
	251	const UnicodeString& pat) {
	252	UErrorCode ec = U_ZERO_ERROR;
	253	UnicodeSet testSet2(pat, ec);
	254	if (testSet2 != testSet) {
	255	errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
	256	return FALSE;
	257	}
	258	return TRUE;
	259	}
	260
	261	void
	262	UnicodeSetTest::TestPatterns(void) {
	263	UnicodeSet set;
	264	expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
	265	expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
	266	expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
	267	expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
	268	expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
	269	expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
	270
	271	// Throw in a test of complement
	272	set.complement();
	273	UnicodeString exp;
	274	exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
	275	expectPairs(set, exp);
	276	}
	277
	278	void
	279	UnicodeSetTest::TestCategories(void) {
	280	UErrorCode status = U_ZERO_ERROR;
	281	const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
	282	UnicodeSet set(pat, status);
	283	if (U_FAILURE(status)) {
	284	errln((UnicodeString)"Fail: Can't construct set with " + pat);
	285	} else {
	286	expectContainment(set, pat, "ABC", "abc");
	287	}
	288
	289	UChar32 i;
	290	int32_t failures = 0;
	291	// Make sure generation of L doesn't pollute cached Lu set
	292	// First generate L, then Lu
	293	set.applyPattern("[:L:]", status);
	294	if (U_FAILURE(status)) { errln("FAIL"); return; }
	295	for (i=0; i<0x200; ++i) {
	296	UBool l = u_isalpha((UChar)i);
	297	if (l != set.contains(i)) {
	298	errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
	299	set.contains(i));
	300	if (++failures == 10) break;
	301	}
	302	}
	303
	304	set.applyPattern("[:Lu:]", status);
	305	if (U_FAILURE(status)) { errln("FAIL"); return; }
	306	for (i=0; i<0x200; ++i) {
	307	UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
	308	if (lu != set.contains(i)) {
	309	errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
	310	set.contains(i));
	311	if (++failures == 20) break;
	312	}
	313	}
	314	}
	315	void
	316	UnicodeSetTest::TestCloneEqualHash(void) {
	317	UErrorCode status = U_ZERO_ERROR;
	318	// set1 and set2 used to be built with the obsolete constructor taking
	319	// UCharCategory values; replaced with pattern constructors
	320	// markus 20030502
	321	UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
	322	UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
	323	if (U_FAILURE(status)){
	324	errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
	325	return;
	326	}
	327	UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
	328	UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
	329	if (U_FAILURE(status)){
	330	errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
	331	return;
	332	}
	333
	334	if (set1 != set1a) {
	335	errln("FAIL: category constructor for Ll broken");
	336	}
	337	if (set2 != set2a) {
	338	errln("FAIL: category constructor for Nd broken");
	339	}
	340	delete set1a;
	341	delete set2a;
	342
	343	logln("Testing copy construction");
	344	UnicodeSet set1copy=new UnicodeSet(set1);
	345	if(set1 != set1copy \|\| set1 == set2 \|\|
	346	getPairs(set1) != getPairs(set1copy) \|\|
	347	set1->hashCode() != set1copy->hashCode()){
	348	errln("FAIL : Error in copy construction");
	349	return;
	350	}
	351
	352	logln("Testing =operator");
	353	UnicodeSet set1equal=*set1;
	354	UnicodeSet set2equal=*set2;
	355	if(set1equal != set1 \|\| set1equal != set1copy \|\| set2equal != *set2 \|\|
	356	set2equal == set1 \|\| set2equal == set1copy \|\| set2equal == set1equal){
	357	errln("FAIL: Error in =operator");
	358	}
	359
	360	logln("Testing clone()");
	361	UnicodeSet set1clone=(UnicodeSet)set1->clone();
	362	UnicodeSet set2clone=(UnicodeSet)set2->clone();
	363	if(set1clone != set1 \|\| set1clone != set1copy \|\| *set1clone != set1equal \|\|
	364	set2clone != set2 \|\| set2clone == set1copy \|\| *set2clone != set2equal \|\|
	365	set2clone == set1 \|\| set2clone == set1equal \|\| set2clone == *set1clone){
	366	errln("FAIL: Error in clone");
	367	}
	368
	369	logln("Testing hashcode");
	370	if(set1->hashCode() != set1equal.hashCode() \|\| set1->hashCode() != set1clone->hashCode() \|\|
	371	set2->hashCode() != set2equal.hashCode() \|\| set2->hashCode() != set2clone->hashCode() \|\|
	372	set1copy->hashCode() != set1equal.hashCode() \|\| set1copy->hashCode() != set1clone->hashCode() \|\|
	373	set1->hashCode() == set2->hashCode() \|\| set1copy->hashCode() == set2->hashCode() \|\|
	374	set2->hashCode() == set1clone->hashCode() \|\| set2->hashCode() == set1equal.hashCode() ){
	375	errln("FAIL: Error in hashCode()");
	376	}
	377
	378	delete set1;
	379	delete set1copy;
	380	delete set2;
	381	delete set1clone;
	382	delete set2clone;
	383
	384
	385	}
	386	void
	387	UnicodeSetTest::TestAddRemove(void) {
	388	UnicodeSet set; // Construct empty set
	389	doAssert(set.isEmpty() == TRUE, "set should be empty");
	390	doAssert(set.size() == 0, "size should be 0");
	391	set.complement();
	392	doAssert(set.size() == 0x110000, "size should be 0x110000");
	393	set.clear();
	394	set.add(0x0061, 0x007a);
	395	expectPairs(set, "az");
	396	doAssert(set.isEmpty() == FALSE, "set should not be empty");
	397	doAssert(set.size() != 0, "size should not be equal to 0");
	398	doAssert(set.size() == 26, "size should be equal to 26");
	399	set.remove(0x006d, 0x0070);
	400	expectPairs(set, "alqz");
	401	doAssert(set.size() == 22, "size should be equal to 22");
	402	set.remove(0x0065, 0x0067);
	403	expectPairs(set, "adhlqz");
	404	doAssert(set.size() == 19, "size should be equal to 19");
	405	set.remove(0x0064, 0x0069);
	406	expectPairs(set, "acjlqz");
	407	doAssert(set.size() == 16, "size should be equal to 16");
	408	set.remove(0x0063, 0x0072);
	409	expectPairs(set, "absz");
	410	doAssert(set.size() == 10, "size should be equal to 10");
	411	set.add(0x0066, 0x0071);
	412	expectPairs(set, "abfqsz");
	413	doAssert(set.size() == 22, "size should be equal to 22");
	414	set.remove(0x0061, 0x0067);
	415	expectPairs(set, "hqsz");
	416	set.remove(0x0061, 0x007a);
	417	expectPairs(set, "");
	418	doAssert(set.isEmpty() == TRUE, "set should be empty");
	419	doAssert(set.size() == 0, "size should be 0");
	420	set.add(0x0061);
	421	doAssert(set.isEmpty() == FALSE, "set should not be empty");
	422	doAssert(set.size() == 1, "size should not be equal to 1");
	423	set.add(0x0062);
	424	set.add(0x0063);
	425	expectPairs(set, "ac");
	426	doAssert(set.size() == 3, "size should not be equal to 3");
	427	set.add(0x0070);
	428	set.add(0x0071);
	429	expectPairs(set, "acpq");
	430	doAssert(set.size() == 5, "size should not be equal to 5");
	431	set.clear();
	432	expectPairs(set, "");
	433	doAssert(set.isEmpty() == TRUE, "set should be empty");
	434	doAssert(set.size() == 0, "size should be 0");
	435
	436	// Try removing an entire set from another set
	437	expectPattern(set, "[c-x]", "cx");
	438	UnicodeSet set2;
	439	expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
	440	set.removeAll(set2);
	441	expectPairs(set, "deluxx");
	442
	443	// Try adding an entire set to another set
	444	expectPattern(set, "[jackiemclean]", "aacceein");
	445	expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
	446	set.addAll(set2);
	447	expectPairs(set, "aacehort");
	448	doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
	449
	450	// Try retaining an set of elements contained in another set (intersection)
	451	UnicodeSet set3;
	452	expectPattern(set3, "[a-c]", "ac");
	453	doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
	454	set3.remove(0x0062);
	455	expectPairs(set3, "aacc");
	456	doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
	457	set.retainAll(set3);
	458	expectPairs(set, "aacc");
	459	doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
	460	doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
	461	set.clear();
	462	doAssert(set.size() != set3.size(), "set.size() != set3.size()");
	463
	464	// Test commutativity
	465	expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
	466	expectPattern(set2, "[jackiemclean]", "aacceein");
	467	set.addAll(set2);
	468	expectPairs(set, "aacehort");
	469	doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
	470
	471
	472
	473
	474	}
	475
	476	/**
	477	* Make sure minimal representation is maintained.
	478	*/
	479	void UnicodeSetTest::TestMinimalRep() {
	480	UErrorCode status = U_ZERO_ERROR;
	481	// This is pretty thoroughly tested by checkCanonicalRep()
	482	// run against the exhaustive operation results. Use the code
	483	// here for debugging specific spot problems.
	484
	485	// 1 overlap against 2
	486	UnicodeSet set("[h-km-q]", status);
	487	if (U_FAILURE(status)) { errln("FAIL"); return; }
	488	UnicodeSet set2("[i-o]", status);
	489	if (U_FAILURE(status)) { errln("FAIL"); return; }
	490	set.addAll(set2);
	491	expectPairs(set, "hq");
	492	// right
	493	set.applyPattern("[a-m]", status);
	494	if (U_FAILURE(status)) { errln("FAIL"); return; }
	495	set2.applyPattern("[e-o]", status);
	496	if (U_FAILURE(status)) { errln("FAIL"); return; }
	497	set.addAll(set2);
	498	expectPairs(set, "ao");
	499	// left
	500	set.applyPattern("[e-o]", status);
	501	if (U_FAILURE(status)) { errln("FAIL"); return; }
	502	set2.applyPattern("[a-m]", status);
	503	if (U_FAILURE(status)) { errln("FAIL"); return; }
	504	set.addAll(set2);
	505	expectPairs(set, "ao");
	506	// 1 overlap against 3
	507	set.applyPattern("[a-eg-mo-w]", status);
	508	if (U_FAILURE(status)) { errln("FAIL"); return; }
	509	set2.applyPattern("[d-q]", status);
	510	if (U_FAILURE(status)) { errln("FAIL"); return; }
	511	set.addAll(set2);
	512	expectPairs(set, "aw");
	513	}
	514
	515	void UnicodeSetTest::TestAPI() {
	516	UErrorCode status = U_ZERO_ERROR;
	517	// default ct
	518	UnicodeSet set;
	519	if (!set.isEmpty() \|\| set.getRangeCount() != 0) {
	520	errln((UnicodeString)"FAIL, set should be empty but isn't: " +
	521	set);
	522	}
	523
	524	// clear(), isEmpty()
	525	set.add(0x0061);
	526	if (set.isEmpty()) {
	527	errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
	528	set);
	529	}
	530	set.clear();
	531	if (!set.isEmpty()) {
	532	errln((UnicodeString)"FAIL, set should be empty but isn't: " +
	533	set);
	534	}
	535
	536	// size()
	537	set.clear();
	538	if (set.size() != 0) {
	539	errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
	540	": " + set);
	541	}
	542	set.add(0x0061);
	543	if (set.size() != 1) {
	544	errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
	545	": " + set);
	546	}
	547	set.add(0x0031, 0x0039);
	548	if (set.size() != 10) {
	549	errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
	550	": " + set);
	551	}
	552
	553	// contains(first, last)
	554	set.clear();
	555	set.applyPattern("[A-Y 1-8 b-d l-y]", status);
	556	if (U_FAILURE(status)) { errln("FAIL"); return; }
	557	for (int32_t i = 0; i<set.getRangeCount(); ++i) {
	558	UChar32 a = set.getRangeStart(i);
	559	UChar32 b = set.getRangeEnd(i);
	560	if (!set.contains(a, b)) {
	561	errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
	562	" but doesn't: " + set);
	563	}
	564	if (set.contains((UChar32)(a-1), b)) {
	565	errln((UnicodeString)"FAIL, shouldn't contain " +
	566	(unsigned short)(a-1) + '-' + (unsigned short)b +
	567	" but does: " + set);
	568	}
	569	if (set.contains(a, (UChar32)(b+1))) {
	570	errln((UnicodeString)"FAIL, shouldn't contain " +
	571	(unsigned short)a + '-' + (unsigned short)(b+1) +
	572	" but does: " + set);
	573	}
	574	}
	575
	576	// Ported InversionList test.
	577	UnicodeSet a((UChar32)3,(UChar32)10);
	578	UnicodeSet b((UChar32)7,(UChar32)15);
	579	UnicodeSet c;
	580
	581	logln((UnicodeString)"a [3-10]: " + a);
	582	logln((UnicodeString)"b [7-15]: " + b);
	583	c = a;
	584	c.addAll(b);
	585	UnicodeSet exp((UChar32)3,(UChar32)15);
	586	if (c == exp) {
	587	logln((UnicodeString)"c.set(a).add(b): " + c);
	588	} else {
	589	errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
	590	}
	591	c.complement();
	592	exp.set((UChar32)0, (UChar32)2);
	593	exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
	594	if (c == exp) {
	595	logln((UnicodeString)"c.complement(): " + c);
	596	} else {
	597	errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
	598	}
	599	c.complement();
	600	exp.set((UChar32)3, (UChar32)15);
	601	if (c == exp) {
	602	logln((UnicodeString)"c.complement(): " + c);
	603	} else {
	604	errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
	605	}
	606	c = a;
	607	c.complementAll(b);
	608	exp.set((UChar32)3,(UChar32)6);
	609	exp.add((UChar32)11,(UChar32) 15);
	610	if (c == exp) {
	611	logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
	612	} else {
	613	errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
	614	}
	615
	616	exp = c;
	617	bitsToSet(setToBits(c), c);
	618	if (c == exp) {
	619	logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
	620	} else {
	621	errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
	622	}
	623
	624	// Additional tests for coverage JB#2118
	625	//UnicodeSet::complement(class UnicodeString const &)
	626	//UnicodeSet::complementAll(class UnicodeString const &)
	627	//UnicodeSet::containsNone(class UnicodeSet const &)
	628	//UnicodeSet::containsNone(long,long)
	629	//UnicodeSet::containsSome(class UnicodeSet const &)
	630	//UnicodeSet::containsSome(long,long)
	631	//UnicodeSet::removeAll(class UnicodeString const &)
	632	//UnicodeSet::retain(long)
	633	//UnicodeSet::retainAll(class UnicodeString const &)
	634	//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
	635	//UnicodeSetIterator::getString(void)
	636	set.clear();
	637	set.complement("ab");
	638	exp.applyPattern("[{ab}]", status);
	639	if (U_FAILURE(status)) { errln("FAIL"); return; }
	640	if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
	641
	642	UnicodeSetIterator iset(set);
	643	if (!iset.next() \|\| !iset.isString()) {
	644	errln("FAIL: UnicodeSetIterator::next/isString");
	645	} else if (iset.getString() != "ab") {
	646	errln("FAIL: UnicodeSetIterator::getString");
	647	}
	648
	649	set.add((UChar32)0x61, (UChar32)0x7A);
	650	set.complementAll("alan");
	651	exp.applyPattern("[{ab}b-kmo-z]", status);
	652	if (U_FAILURE(status)) { errln("FAIL"); return; }
	653	if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
	654
	655	exp.applyPattern("[a-z]", status);
	656	if (U_FAILURE(status)) { errln("FAIL"); return; }
	657	if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
	658	if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
	659	exp.applyPattern("[aln]", status);
	660	if (U_FAILURE(status)) { errln("FAIL"); return; }
	661	if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
	662	if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
	663
	664	if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
	665	errln("FAIL: containsNone(UChar32, UChar32)");
	666	}
	667	if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
	668	errln("FAIL: containsSome(UChar32, UChar32)");
	669	}
	670	if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
	671	errln("FAIL: containsNone(UChar32, UChar32)");
	672	}
	673	if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
	674	errln("FAIL: containsSome(UChar32, UChar32)");
	675	}
	676
	677	set.removeAll("liu");
	678	exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
	679	if (U_FAILURE(status)) { errln("FAIL"); return; }
	680	if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
	681
	682	set.retainAll("star");
	683	exp.applyPattern("[rst]", status);
	684	if (U_FAILURE(status)) { errln("FAIL"); return; }
	685	if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
	686
	687	set.retain((UChar32)0x73);
	688	exp.applyPattern("[s]", status);
	689	if (U_FAILURE(status)) { errln("FAIL"); return; }
	690	if (set != exp) { errln("FAIL: retain('s')"); return; }
	691
	692	uint16_t buf[32];
	693	int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
	694	if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
	695	if (slen != 3 \|\| buf[0] != 2 \|\| buf[1] != 0x73 \|\| buf[2] != 0x74) {
	696	errln("FAIL: serialize");
	697	return;
	698	}
	699	}
	700
	701	void UnicodeSetTest::TestIteration() {
	702	UErrorCode ec = U_ZERO_ERROR;
	703	int i = 0;
	704	int outerLoop;
	705
	706	// 6 code points, 3 ranges, 2 strings, 8 total elements
	707	// Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
	708	UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
	709	TEST_ASSERT_SUCCESS(ec);
	710	UnicodeSetIterator it(set);
	711
	712	for (outerLoop=0; outerLoop<3; outerLoop++) {
	713	// Run the test multiple times, to check that iterator.reset() is working.
	714	for (i=0; i<10; i++) {
	715	UBool nextv = it.next();
	716	UBool isString = it.isString();
	717	int32_t codePoint = it.getCodepoint();
	718	//int32_t codePointEnd = it.getCodepointEnd();
	719	UnicodeString s = it.getString();
	720	switch (i) {
	721	case 0:
	722	TEST_ASSERT(nextv == TRUE);
	723	TEST_ASSERT(isString == FALSE);
	724	TEST_ASSERT(codePoint==0x61);
	725	TEST_ASSERT(s == "a");
	726	break;
	727	case 1:
	728	TEST_ASSERT(nextv == TRUE);
	729	TEST_ASSERT(isString == FALSE);
	730	TEST_ASSERT(codePoint==0x62);
	731	TEST_ASSERT(s == "b");
	732	break;
	733	case 2:
	734	TEST_ASSERT(nextv == TRUE);
	735	TEST_ASSERT(isString == FALSE);
	736	TEST_ASSERT(codePoint==0x63);
	737	TEST_ASSERT(s == "c");
	738	break;
	739	case 3:
	740	TEST_ASSERT(nextv == TRUE);
	741	TEST_ASSERT(isString == FALSE);
	742	TEST_ASSERT(codePoint==0x79);
	743	TEST_ASSERT(s == "y");
	744	break;
	745	case 4:
	746	TEST_ASSERT(nextv == TRUE);
	747	TEST_ASSERT(isString == FALSE);
	748	TEST_ASSERT(codePoint==0x7a);
	749	TEST_ASSERT(s == "z");
	750	break;
	751	case 5:
	752	TEST_ASSERT(nextv == TRUE);
	753	TEST_ASSERT(isString == FALSE);
	754	TEST_ASSERT(codePoint==0x1abcd);
	755	TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
	756	break;
	757	case 6:
	758	TEST_ASSERT(nextv == TRUE);
	759	TEST_ASSERT(isString == TRUE);
	760	TEST_ASSERT(s == "str1");
	761	break;
	762	case 7:
	763	TEST_ASSERT(nextv == TRUE);
	764	TEST_ASSERT(isString == TRUE);
	765	TEST_ASSERT(s == "str2");
	766	break;
	767	case 8:
	768	TEST_ASSERT(nextv == FALSE);
	769	break;
	770	case 9:
	771	TEST_ASSERT(nextv == FALSE);
	772	break;
	773	}
	774	}
	775	it.reset(); // prepare to run the iteration again.
	776	}
	777	}
	778
	779
	780
	781
	782	void UnicodeSetTest::TestStrings() {
	783	UErrorCode ec = U_ZERO_ERROR;
	784
	785	UnicodeSet* testList[] = {
	786	UnicodeSet::createFromAll("abc"),
	787	new UnicodeSet("[a-c]", ec),
	788
	789	&(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
	790	new UnicodeSet("[{ll}{ch}a-z]", ec),
	791
	792	UnicodeSet::createFrom("ab}c"),
	793	new UnicodeSet("[{ab\\}c}]", ec),
	794
	795	&((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
	796	new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
	797
	798	NULL
	799	};
	800
	801	if (U_FAILURE(ec)) {
	802	errln("FAIL: couldn't construct test sets");
	803	}
	804
	805	for (int32_t i = 0; testList[i] != NULL; i+=2) {
	806	if (U_SUCCESS(ec)) {
	807	UnicodeString pat0, pat1;
	808	testList[i]->toPattern(pat0, TRUE);
	809	testList[i+1]->toPattern(pat1, TRUE);
	810	if (testList[i] == testList[i+1]) {
	811	logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
	812	} else {
	813	logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
	814	}
	815	}
	816	delete testList[i];
	817	delete testList[i+1];
	818	}
	819	}
	820
	821	/**
	822	* Test the [:Latin:] syntax.
	823	*/
	824	void UnicodeSetTest::TestScriptSet() {
	825	expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
	826
	827	expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
	828
	829	/* Jitterbug 1423 */
	830	expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
	831
	832	}
	833
	834	/**
	835	* Test the [:Latin:] syntax.
	836	*/
	837	void UnicodeSetTest::TestPropertySet() {
	838	static const char* const DATA[] = {
	839	// Pattern, Chars IN, Chars NOT in
	840
	841	"[:Latin:]",
	842	"aA",
	843	"\\u0391\\u03B1",
	844
	845	"[\\p{Greek}]",
	846	"\\u0391\\u03B1",
	847	"aA",
	848
	849	"\\P{ GENERAL Category = upper case letter }",
	850	"abc",
	851	"ABC",
	852
	853	// Combining class: @since ICU 2.2
	854	// Check both symbolic and numeric
	855	"\\p{ccc=Nukta}",
	856	"\\u0ABC",
	857	"abc",
	858
	859	"\\p{Canonical Combining Class = 11}",
	860	"\\u05B1",
	861	"\\u05B2",
	862
	863	"[:c c c = iota subscript :]",
	864	"\\u0345",
	865	"xyz",
	866
	867	// Bidi class: @since ICU 2.2
	868	"\\p{bidiclass=lefttoright}",
	869	"abc",
	870	"\\u0671\\u0672",
	871
	872	// Binary properties: @since ICU 2.2
	873	"\\p{ideographic}",
	874	"\\u4E0A",
	875	"x",
	876
	877	"[:math=false:]",
	878	"q)*(",
	879	// weiv: )(and * were removed from math in Unicode 4.0.1
	880	//"(*+)",
	881	"+<>^",
	882
	883	// JB#1767 \N{}, \p{ASCII}
	884	"[:Ascii:]",
	885	"abc\\u0000\\u007F",
	886	"\\u0080\\u4E00",
	887
	888	"[\\N{ latin small letter a }[:name= latin small letter z:]]",
	889	"az",
	890	"qrs",
	891
	892	// JB#2015
	893	"[:any:]",
	894	"a\\U0010FFFF",
	895	"",
	896
	897	"[:nv=0.5:]",
	898	"\\u00BD\\u0F2A",
	899	"\\u00BC",
	900
	901	// JB#2653: Age
	902	"[:Age=1.1:]",
	903	"\\u03D6", // 1.1
	904	"\\u03D8\\u03D9", // 3.2
	905
	906	"[:Age=3.1:]",
	907	"\\u1800\\u3400\\U0002f800",
	908	"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
	909
	910	// JB#2350: Case_Sensitive
	911	"[:Case Sensitive:]",
	912	"A\\u1FFC\\U00010410",
	913	";\\u00B4\\U00010500",
	914
	915	// JB#2832: C99-compatibility props
	916	"[:blank:]",
	917	" \\u0009",
	918	"1-9A-Z",
	919
	920	"[:graph:]",
	921	"19AZ",
	922	" \\u0003\\u0007\\u0009\\u000A\\u000D",
	923
	924	"[:punct:]",
	925	"!@#%&*()[]{}-_\\/;:,.?'\"",
	926	"09azAZ",
	927
	928	"[:xdigit:]",
	929	"09afAF",
	930	"gG!",
	931
	932	// Regex compatibility test
	933	"[-b]", // leading '-' is literal
	934	"-b",
	935	"ac",
	936
	937	"[^-b]", // leading '-' is literal
	938	"ac",
	939	"-b",
	940
	941	"[b-]", // trailing '-' is literal
	942	"-b",
	943	"ac",
	944
	945	"[^b-]", // trailing '-' is literal
	946	"ac",
	947	"-b",
	948
	949	"[a-b-]", // trailing '-' is literal
	950	"ab-",
	951	"c=",
	952
	953	"[[a-q]&[p-z]-]", // trailing '-' is literal
	954	"pq-",
	955	"or=",
	956
	957	"[\\s\|\\)\|:\|$\|\\>]", // from regex tests
	958	"s\|):$>",
	959	"abc",
	960
	961	"[\\uDC00cd]", // JB#2906: isolated trail at start
	962	"cd\\uDC00",
	963	"ab\\uD800\\U00010000",
	964
	965	"[ab\\uD800]", // JB#2906: isolated trail at start
	966	"ab\\uD800",
	967	"cd\\uDC00\\U00010000",
	968
	969	"[ab\\uD800cd]", // JB#2906: isolated lead in middle
	970	"abcd\\uD800",
	971	"ef\\uDC00\\U00010000",
	972
	973	"[ab\\uDC00cd]", // JB#2906: isolated trail in middle
	974	"abcd\\uDC00",
	975	"ef\\uD800\\U00010000",
	976
	977	"[:^lccc=0:]", // Lead canonical class
	978	"\\u0300\\u0301",
	979	"abcd\\u00c0\\u00c5",
	980
	981	"[:^tccc=0:]", // Trail canonical class
	982	"\\u0300\\u0301\\u00c0\\u00c5",
	983	"abcd",
	984
	985	"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
	986	"\\u0300\\u0301\\u00c0\\u00c5",
	987	"abcd",
	988
	989	"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
	990	"",
	991	"abcd\\u0300\\u0301\\u00c0\\u00c5",
	992
	993	"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
	994	"\\u0F73\\u0F75\\u0F81",
	995	"abcd\\u0300\\u0301\\u00c0\\u00c5",
	996
	997	"[:Assigned:]",
	998	"A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
	999	"\\u0888\\uFDD3\\uFFFE\\U00050005"
	1000	};
	1001
	1002	static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
	1003
	1004	for (int32_t i=0; i<DATA_LEN; i+=3) {
	1005	expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
	1006	CharsToUnicodeString(DATA[i+2]));
	1007	}
	1008	}
	1009
	1010	/**
	1011	* Test that Posix style character classes [:digit:], etc.
	1012	* have the Unicode definitions from TR 18.
	1013	*/
	1014	void UnicodeSetTest::TestPosixClasses() {
	1015	{
	1016	UErrorCode status = U_ZERO_ERROR;
	1017	UnicodeSet s1("[:alpha:]", status);
	1018	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
	1019	TEST_ASSERT_SUCCESS(status);
	1020	TEST_ASSERT(s1==s2);
	1021	}
	1022	{
	1023	UErrorCode status = U_ZERO_ERROR;
	1024	UnicodeSet s1("[:lower:]", status);
	1025	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
	1026	TEST_ASSERT_SUCCESS(status);
	1027	TEST_ASSERT(s1==s2);
	1028	}
	1029	{
	1030	UErrorCode status = U_ZERO_ERROR;
	1031	UnicodeSet s1("[:upper:]", status);
	1032	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
	1033	TEST_ASSERT_SUCCESS(status);
	1034	TEST_ASSERT(s1==s2);
	1035	}
	1036	{
	1037	UErrorCode status = U_ZERO_ERROR;
	1038	UnicodeSet s1("[:punct:]", status);
	1039	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
	1040	TEST_ASSERT_SUCCESS(status);
	1041	TEST_ASSERT(s1==s2);
	1042	}
	1043	{
	1044	UErrorCode status = U_ZERO_ERROR;
	1045	UnicodeSet s1("[:digit:]", status);
	1046	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
	1047	TEST_ASSERT_SUCCESS(status);
	1048	TEST_ASSERT(s1==s2);
	1049	}
	1050	{
	1051	UErrorCode status = U_ZERO_ERROR;
	1052	UnicodeSet s1("[:xdigit:]", status);
	1053	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
	1054	TEST_ASSERT_SUCCESS(status);
	1055	TEST_ASSERT(s1==s2);
	1056	}
	1057	{
	1058	UErrorCode status = U_ZERO_ERROR;
	1059	UnicodeSet s1("[:alnum:]", status);
	1060	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
	1061	TEST_ASSERT_SUCCESS(status);
	1062	TEST_ASSERT(s1==s2);
	1063	}
	1064	{
	1065	UErrorCode status = U_ZERO_ERROR;
	1066	UnicodeSet s1("[:space:]", status);
	1067	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
	1068	TEST_ASSERT_SUCCESS(status);
	1069	TEST_ASSERT(s1==s2);
	1070	}
	1071	{
	1072	UErrorCode status = U_ZERO_ERROR;
	1073	UnicodeSet s1("[:blank:]", status);
	1074	TEST_ASSERT_SUCCESS(status);
	1075	UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
	1076	status);
	1077	TEST_ASSERT_SUCCESS(status);
	1078	TEST_ASSERT(s1==s2);
	1079	}
	1080	{
	1081	UErrorCode status = U_ZERO_ERROR;
	1082	UnicodeSet s1("[:cntrl:]", status);
	1083	TEST_ASSERT_SUCCESS(status);
	1084	UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
	1085	TEST_ASSERT_SUCCESS(status);
	1086	TEST_ASSERT(s1==s2);
	1087	}
	1088	{
	1089	UErrorCode status = U_ZERO_ERROR;
	1090	UnicodeSet s1("[:graph:]", status);
	1091	TEST_ASSERT_SUCCESS(status);
	1092	UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
	1093	TEST_ASSERT_SUCCESS(status);
	1094	TEST_ASSERT(s1==s2);
	1095	}
	1096	{
	1097	UErrorCode status = U_ZERO_ERROR;
	1098	UnicodeSet s1("[:print:]", status);
	1099	TEST_ASSERT_SUCCESS(status);
	1100	UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
	1101	TEST_ASSERT_SUCCESS(status);
	1102	TEST_ASSERT(s1==s2);
	1103	}
	1104	}
	1105	/**
	1106	* Test cloning of UnicodeSet. For C++, we test the copy constructor.
	1107	*/
	1108	void UnicodeSetTest::TestClone() {
	1109	UErrorCode ec = U_ZERO_ERROR;
	1110	UnicodeSet s("[abcxyz]", ec);
	1111	UnicodeSet t(s);
	1112	expectContainment(t, "abc", "def");
	1113	}
	1114
	1115	/**
	1116	* Test the indexOf() and charAt() methods.
	1117	*/
	1118	void UnicodeSetTest::TestIndexOf() {
	1119	UErrorCode ec = U_ZERO_ERROR;
	1120	UnicodeSet set("[a-cx-y3578]", ec);
	1121	if (U_FAILURE(ec)) {
	1122	errln("FAIL: UnicodeSet constructor");
	1123	return;
	1124	}
	1125	for (int32_t i=0; i<set.size(); ++i) {
	1126	UChar32 c = set.charAt(i);
	1127	if (set.indexOf(c) != i) {
	1128	errln("FAIL: charAt(%d) = %X => indexOf() => %d",
	1129	i, c, set.indexOf(c));
	1130	}
	1131	}
	1132	UChar32 c = set.charAt(set.size());
	1133	if (c != -1) {
	1134	errln("FAIL: charAt(<out of range>) = %X", c);
	1135	}
	1136	int32_t j = set.indexOf((UChar32)0x71/'q'/);
	1137	if (j != -1) {
	1138	errln((UnicodeString)"FAIL: indexOf('q') = " + j);
	1139	}
	1140	}
	1141
	1142	/**
	1143	* Test closure API.
	1144	*/
	1145	void UnicodeSetTest::TestCloseOver() {
	1146	UErrorCode ec = U_ZERO_ERROR;
	1147
	1148	char CASE[] = {(char)USET_CASE_INSENSITIVE};
	1149	char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
	1150	const char* DATA[] = {
	1151	// selector, input, output
	1152	CASE,
	1153	"[aq\\u00DF{Bc}{bC}{Fi}]",
	1154	"[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
	1155
	1156	CASE,
	1157	"[\\u01F1]", // 'DZ'
	1158	"[\\u01F1\\u01F2\\u01F3]",
	1159
	1160	CASE,
	1161	"[\\u1FB4]",
	1162	"[\\u1FB4{\\u03AC\\u03B9}]",
	1163
	1164	CASE,
	1165	"[{F\\uFB01}]",
	1166	"[\\uFB03{ffi}]",
	1167
	1168	CASE, // make sure binary search finds limits
	1169	"[a\\uFF3A]",
	1170	"[aA\\uFF3A\\uFF5A]",
	1171
	1172	CASE,
	1173	"[a-z]","[A-Za-z\\u017F\\u212A]",
	1174	CASE,
	1175	"[abc]","[A-Ca-c]",
	1176	CASE,
	1177	"[ABC]","[A-Ca-c]",
	1178
	1179	CASE, "[i]", "[iI]",
	1180
	1181	CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
	1182	CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
	1183
	1184	CASE, "[\\u0131]", "[\\u0131]", // dotless i
	1185
	1186	CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
	1187
	1188	CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
	1189
	1190	CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
	1191
	1192	CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
	1193
	1194	CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
	1195
	1196	CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
	1197	CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
	1198
	1199	CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
	1200
	1201	CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
	1202
	1203	CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
	1204
	1205	CASE_MAPPINGS,
	1206	"[aq\\u00DF{Bc}{bC}{Fi}]",
	1207	"[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
	1208
	1209	CASE_MAPPINGS,
	1210	"[\\u01F1]", // 'DZ'
	1211	"[\\u01F1\\u01F2\\u01F3]",
	1212
	1213	CASE_MAPPINGS,
	1214	"[a-z]",
	1215	"[A-Za-z]",
	1216
	1217	NULL
	1218	};
	1219
	1220	UnicodeSet s;
	1221	UnicodeSet t;
	1222	UnicodeString buf;
	1223	for (int32_t i=0; DATA[i]!=NULL; i+=3) {
	1224	int32_t selector = DATA[i][0];
	1225	UnicodeString pat(DATA[i+1], -1, US_INV);
	1226	UnicodeString exp(DATA[i+2], -1, US_INV);
	1227	s.applyPattern(pat, ec);
	1228	s.closeOver(selector);
	1229	t.applyPattern(exp, ec);
	1230	if (U_FAILURE(ec)) {
	1231	errln("FAIL: applyPattern failed");
	1232	continue;
	1233	}
	1234	if (s == t) {
	1235	logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
	1236	} else {
	1237	errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
	1238	s.toPattern(buf, TRUE) + ", expected " + exp);
	1239	}
	1240	}
	1241
	1242	#if 0
	1243	/*
	1244	* Unused test code.
	1245	* This was used to compare the old implementation (using USET_CASE)
	1246	* with the new one (using 0x100 temporarily)
	1247	* while transitioning from hardcoded case closure tables in uniset.cpp
	1248	* (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
	1249	* and using ucase.c functions for closure.
	1250	* See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
	1251	*
	1252	* Note: The old and new implementation never fully matched because
	1253	* the old implementation turned out to not map U+0130 and U+0131 correctly
	1254	* (dotted I and dotless i) and because the old implementation's data tables
	1255	* were outdated compared to Unicode 4.0.1 at the time of the change to the
	1256	* new implementation. (So sigmas and some other characters were not handled
	1257	* according to the newer Unicode version.)
	1258	*/
	1259	UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
	1260	UnicodeSetIterator si(sens);
	1261	UnicodeString str, buf2;
	1262	const UnicodeString *pStr;
	1263	UChar32 c;
	1264	while(si.next()) {
	1265	if(!si.isString()) {
	1266	c=si.getCodepoint();
	1267	s.clear();
	1268	s.add(c);
	1269
	1270	str.setTo(c);
	1271	str.foldCase();
	1272	sens2.add(str);
	1273
	1274	t=s;
	1275	s.closeOver(USET_CASE);
	1276	t.closeOver(0x100);
	1277	if(s!=t) {
	1278	errln("FAIL: closeOver(U+%04x) differs: ", c);
	1279	errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
	1280	}
	1281	}
	1282	}
	1283	// remove all code points
	1284	// should contain all full case folding mapping strings
	1285	sens2.remove(0, 0x10ffff);
	1286	si.reset(sens2);
	1287	while(si.next()) {
	1288	if(si.isString()) {
	1289	pStr=&si.getString();
	1290	s.clear();
	1291	s.add(*pStr);
	1292	t=s2=s;
	1293	s.closeOver(USET_CASE);
	1294	t.closeOver(0x100);
	1295	if(s!=t) {
	1296	errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
	1297	errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
	1298	}
	1299	}
	1300	}
	1301	#endif
	1302
	1303	// Test the pattern API
	1304	s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
	1305	if (U_FAILURE(ec)) {
	1306	errln("FAIL: applyPattern failed");
	1307	} else {
	1308	expectContainment(s, "abcABC", "defDEF");
	1309	}
	1310	UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
	1311	if (U_FAILURE(ec)) {
	1312	errln("FAIL: constructor failed");
	1313	} else {
	1314	expectContainment(v, "defDEF", "abcABC");
	1315	}
	1316	UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
	1317	if (U_FAILURE(ec)) {
	1318	errln("FAIL: construct w/case mappings failed");
	1319	} else {
	1320	expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
	1321	}
	1322	}
	1323
	1324	void UnicodeSetTest::TestEscapePattern() {
	1325	const char pattern[] =
	1326	"[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
	1327	const char exp[] =
	1328	"[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
	1329	// We test this with two passes; in the second pass we
	1330	// pre-unescape the pattern. Since U+200E is rule whitespace,
	1331	// this fails -- which is what we expect.
	1332	for (int32_t pass=1; pass<=2; ++pass) {
	1333	UErrorCode ec = U_ZERO_ERROR;
	1334	UnicodeString pat(pattern, -1, US_INV);
	1335	if (pass==2) {
	1336	pat = pat.unescape();
	1337	}
	1338	// Pattern is only good for pass 1
	1339	UBool isPatternValid = (pass==1);
	1340
	1341	UnicodeSet set(pat, ec);
	1342	if (U_SUCCESS(ec) != isPatternValid){
	1343	errln((UnicodeString)"FAIL: applyPattern(" +
	1344	escape(pat) + ") => " +
	1345	u_errorName(ec));
	1346	continue;
	1347	}
	1348	if (U_FAILURE(ec)) {
	1349	continue;
	1350	}
	1351	if (set.contains((UChar)0x0644)){
	1352	errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
	1353	}
	1354
	1355	UnicodeString newpat;
	1356	set.toPattern(newpat, TRUE);
	1357	if (newpat == UnicodeString(exp, -1, US_INV)) {
	1358	logln(escape(pat) + " => " + newpat);
	1359	} else {
	1360	errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
	1361	}
	1362
	1363	for (int32_t i=0; i<set.getRangeCount(); ++i) {
	1364	UnicodeString str("Range ");
	1365	str.append((UChar)(0x30 + i))
	1366	.append(": ")
	1367	.append((UChar32)set.getRangeStart(i))
	1368	.append(" - ")
	1369	.append((UChar32)set.getRangeEnd(i));
	1370	str = str + " (" + set.getRangeStart(i) + " - " +
	1371	set.getRangeEnd(i) + ")";
	1372	if (set.getRangeStart(i) < 0) {
	1373	errln((UnicodeString)"FAIL: " + escape(str));
	1374	} else {
	1375	logln(escape(str));
	1376	}
	1377	}
	1378	}
	1379	}
	1380
	1381	void UnicodeSetTest::expectRange(const UnicodeString& label,
	1382	const UnicodeSet& set,
	1383	UChar32 start, UChar32 end) {
	1384	UnicodeSet exp(start, end);
	1385	UnicodeString pat;
	1386	if (set == exp) {
	1387	logln(label + " => " + set.toPattern(pat, TRUE));
	1388	} else {
	1389	UnicodeString xpat;
	1390	errln((UnicodeString)"FAIL: " + label + " => " +
	1391	set.toPattern(pat, TRUE) +
	1392	", expected " + exp.toPattern(xpat, TRUE));
	1393	}
	1394	}
	1395
	1396	void UnicodeSetTest::TestInvalidCodePoint() {
	1397
	1398	const UChar32 DATA[] = {
	1399	// Test range Expected range
	1400	0, 0x10FFFF, 0, 0x10FFFF,
	1401	(UChar32)-1, 8, 0, 8,
	1402	8, 0x110000, 8, 0x10FFFF
	1403	};
	1404	const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
	1405
	1406	UnicodeString pat;
	1407	int32_t i;
	1408
	1409	for (i=0; i<DATA_LENGTH; i+=4) {
	1410	UChar32 start = DATA[i];
	1411	UChar32 end = DATA[i+1];
	1412	UChar32 xstart = DATA[i+2];
	1413	UChar32 xend = DATA[i+3];
	1414
	1415	// Try various API using the test code points
	1416
	1417	UnicodeSet set(start, end);
	1418	expectRange((UnicodeString)"ct(" + start + "," + end + ")",
	1419	set, xstart, xend);
	1420
	1421	set.clear();
	1422	set.set(start, end);
	1423	expectRange((UnicodeString)"set(" + start + "," + end + ")",
	1424	set, xstart, xend);
	1425
	1426	UBool b = set.contains(start);
	1427	b = set.contains(start, end);
	1428	b = set.containsNone(start, end);
	1429	b = set.containsSome(start, end);
	1430
	1431	/int32_t index = set.indexOf(start);/
	1432
	1433	set.clear();
	1434	set.add(start);
	1435	set.add(start, end);
	1436	expectRange((UnicodeString)"add(" + start + "," + end + ")",
	1437	set, xstart, xend);
	1438
	1439	set.set(0, 0x10FFFF);
	1440	set.retain(start, end);
	1441	expectRange((UnicodeString)"retain(" + start + "," + end + ")",
	1442	set, xstart, xend);
	1443	set.retain(start);
	1444
	1445	set.set(0, 0x10FFFF);
	1446	set.remove(start);
	1447	set.remove(start, end);
	1448	set.complement();
	1449	expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
	1450	set, xstart, xend);
	1451
	1452	set.set(0, 0x10FFFF);
	1453	set.complement(start, end);
	1454	set.complement();
	1455	expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
	1456	set, xstart, xend);
	1457	set.complement(start);
	1458	}
	1459
	1460	const UChar32 DATA2[] = {
	1461	0,
	1462	0x10FFFF,
	1463	(UChar32)-1,
	1464	0x110000
	1465	};
	1466	const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
	1467
	1468	for (i=0; i<DATA2_LENGTH; ++i) {
	1469	UChar32 c = DATA2[i], end = 0x10FFFF;
	1470	UBool valid = (c >= 0 && c <= 0x10FFFF);
	1471
	1472	UnicodeSet set(0, 0x10FFFF);
	1473
	1474	// For single-codepoint contains, invalid codepoints are NOT contained
	1475	UBool b = set.contains(c);
	1476	if (b == valid) {
	1477	logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
	1478	") = " + b);
	1479	} else {
	1480	errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
	1481	") = " + b);
	1482	}
	1483
	1484	// For codepoint range contains, containsNone, and containsSome,
	1485	// invalid or empty (start > end) ranges have UNDEFINED behavior.
	1486	b = set.contains(c, end);
	1487	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
	1488	"," + end + ") = " + b);
	1489
	1490	b = set.containsNone(c, end);
	1491	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
	1492	"," + end + ") = " + b);
	1493
	1494	b = set.containsSome(c, end);
	1495	logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
	1496	"," + end + ") = " + b);
	1497
	1498	int32_t index = set.indexOf(c);
	1499	if ((index >= 0) == valid) {
	1500	logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
	1501	") = " + index);
	1502	} else {
	1503	errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
	1504	") = " + index);
	1505	}
	1506	}
	1507	}
	1508
	1509	// Used by TestSymbolTable
	1510	class TokenSymbolTable : public SymbolTable {
	1511	public:
	1512	Hashtable contents;
	1513
	1514	TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
	1515	contents.setValueDeleter(uhash_deleteUnicodeString);
	1516	}
	1517
	1518	~TokenSymbolTable() {}
	1519
	1520	/**
	1521	* (Non-SymbolTable API) Add the given variable and value to
	1522	* the table. Variable should NOT contain leading '$'.
	1523	*/
	1524	void add(const UnicodeString& var, const UnicodeString& value,
	1525	UErrorCode& ec) {
	1526	if (U_SUCCESS(ec)) {
	1527	contents.put(var, new UnicodeString(value), ec);
	1528	}
	1529	}
	1530
	1531	/**
	1532	* SymbolTable API
	1533	*/
	1534	virtual const UnicodeString* lookup(const UnicodeString& s) const {
	1535	return (const UnicodeString*) contents.get(s);
	1536	}
	1537
	1538	/**
	1539	* SymbolTable API
	1540	*/
	1541	virtual const UnicodeFunctor* lookupMatcher(UChar32 /ch/) const {
	1542	return NULL;
	1543	}
	1544
	1545	/**
	1546	* SymbolTable API
	1547	*/
	1548	virtual UnicodeString parseReference(const UnicodeString& text,
	1549	ParsePosition& pos, int32_t limit) const {
	1550	int32_t start = pos.getIndex();
	1551	int32_t i = start;
	1552	UnicodeString result;
	1553	while (i < limit) {
	1554	UChar c = text.charAt(i);
	1555	if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {
	1556	break;
	1557	}
	1558	++i;
	1559	}
	1560	if (i == start) { // No valid name chars
	1561	return result; // Indicate failure with empty string
	1562	}
	1563	pos.setIndex(i);
	1564	text.extractBetween(start, i, result);
	1565	return result;
	1566	}
	1567	};
	1568
	1569	void UnicodeSetTest::TestSymbolTable() {
	1570	// Multiple test cases can be set up here. Each test case
	1571	// is terminated by null:
	1572	// var, value, var, value,..., input pat., exp. output pat., null
	1573	const char* DATA[] = {
	1574	"us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
	1575	"us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
	1576	"us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
	1577	NULL
	1578	};
	1579
	1580	for (int32_t i=0; DATA[i]!=NULL; ++i) {
	1581	UErrorCode ec = U_ZERO_ERROR;
	1582	TokenSymbolTable sym(ec);
	1583	if (U_FAILURE(ec)) {
	1584	errln("FAIL: couldn't construct TokenSymbolTable");
	1585	continue;
	1586	}
	1587
	1588	// Set up variables
	1589	while (DATA[i+2] != NULL) {
	1590	sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
	1591	if (U_FAILURE(ec)) {
	1592	errln("FAIL: couldn't add to TokenSymbolTable");
	1593	continue;
	1594	}
	1595	i += 2;
	1596	}
	1597
	1598	// Input pattern and expected output pattern
	1599	UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
	1600	i += 2;
	1601
	1602	ParsePosition pos(0);
	1603	UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
	1604	if (U_FAILURE(ec)) {
	1605	errln("FAIL: couldn't construct UnicodeSet");
	1606	continue;
	1607	}
	1608
	1609	// results
	1610	if (pos.getIndex() != inpat.length()) {
	1611	errln((UnicodeString)"Failed to read to end of string \""
	1612	+ inpat + "\": read to "
	1613	+ pos.getIndex() + ", length is "
	1614	+ inpat.length());
	1615	}
	1616
	1617	UnicodeSet us2(exppat, ec);
	1618	if (U_FAILURE(ec)) {
	1619	errln("FAIL: couldn't construct expected UnicodeSet");
	1620	continue;
	1621	}
	1622
	1623	UnicodeString a, b;
	1624	if (us != us2) {
	1625	errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
	1626	", expected " + us2.toPattern(b, TRUE));
	1627	} else {
	1628	logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
	1629	}
	1630	}
	1631	}
	1632
	1633	void UnicodeSetTest::TestSurrogate() {
	1634	const char* DATA[] = {
	1635	// These should all behave identically
	1636	"[abc\\uD800\\uDC00]",
	1637	// "[abc\uD800\uDC00]", // Can't do this on C -- only Java
	1638	"[abc\\U00010000]",
	1639	0
	1640	};
	1641	for (int i=0; DATA[i] != 0; ++i) {
	1642	UErrorCode ec = U_ZERO_ERROR;
	1643	logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
	1644	UnicodeSet set(UnicodeString(DATA[i], -1, US_INV), ec);
	1645	if (U_FAILURE(ec)) {
	1646	errln("FAIL: UnicodeSet constructor");
	1647	continue;
	1648	}
	1649	expectContainment(set,
	1650	CharsToUnicodeString("abc\\U00010000"),
	1651	CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
	1652	if (set.size() != 4) {
	1653	errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
	1654	set.size() + ", expected 4");
	1655	}
	1656	}
	1657	}
	1658
	1659	void UnicodeSetTest::TestExhaustive() {
	1660	// exhaustive tests. Simulate UnicodeSets with integers.
	1661	// That gives us very solid tests (except for large memory tests).
	1662
	1663	int32_t limit = 128;
	1664
	1665	UnicodeSet x, y, z, aa;
	1666
	1667	for (int32_t i = 0; i < limit; ++i) {
	1668	bitsToSet(i, x);
	1669	logln((UnicodeString)"Testing " + i + ", " + x);
	1670	_testComplement(i, x, y);
	1671
	1672	// AS LONG AS WE ARE HERE, check roundtrip
	1673	checkRoundTrip(bitsToSet(i, aa));
	1674
	1675	for (int32_t j = 0; j < limit; ++j) {
	1676	_testAdd(i,j, x,y,z);
	1677	_testXor(i,j, x,y,z);
	1678	_testRetain(i,j, x,y,z);
	1679	_testRemove(i,j, x,y,z);
	1680	}
	1681	}
	1682	}
	1683
	1684	void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
	1685	bitsToSet(a, x);
	1686	z = x;
	1687	z.complement();
	1688	int32_t c = setToBits(z);
	1689	if (c != (~a)) {
	1690	errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
	1691	errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
	1692	}
	1693	checkCanonicalRep(z, (UnicodeString)"complement " + a);
	1694	}
	1695
	1696	void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1697	bitsToSet(a, x);
	1698	bitsToSet(b, y);
	1699	z = x;
	1700	z.addAll(y);
	1701	int32_t c = setToBits(z);
	1702	if (c != (a \| b)) {
	1703	errln((UnicodeString)"FAILED: add: " + x + " \| " + y + " != " + z);
	1704	errln((UnicodeString)"FAILED: add: " + a + " \| " + b + " != " + c);
	1705	}
	1706	checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
	1707	}
	1708
	1709	void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1710	bitsToSet(a, x);
	1711	bitsToSet(b, y);
	1712	z = x;
	1713	z.retainAll(y);
	1714	int32_t c = setToBits(z);
	1715	if (c != (a & b)) {
	1716	errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
	1717	errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
	1718	}
	1719	checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
	1720	}
	1721
	1722	void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1723	bitsToSet(a, x);
	1724	bitsToSet(b, y);
	1725	z = x;
	1726	z.removeAll(y);
	1727	int32_t c = setToBits(z);
	1728	if (c != (a &~ b)) {
	1729	errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
	1730	errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
	1731	}
	1732	checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
	1733	}
	1734
	1735	void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
	1736	bitsToSet(a, x);
	1737	bitsToSet(b, y);
	1738	z = x;
	1739	z.complementAll(y);
	1740	int32_t c = setToBits(z);
	1741	if (c != (a ^ b)) {
	1742	errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
	1743	errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
	1744	}
	1745	checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
	1746	}
	1747
	1748	/**
	1749	* Check that ranges are monotonically increasing and non-
	1750	* overlapping.
	1751	*/
	1752	void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
	1753	int32_t n = set.getRangeCount();
	1754	if (n < 0) {
	1755	errln((UnicodeString)"FAIL result of " + msg +
	1756	": range count should be >= 0 but is " +
	1757	n /+ " for " + set.toPattern())/);
	1758	return;
	1759	}
	1760	UChar32 last = 0;
	1761	for (int32_t i=0; i<n; ++i) {
	1762	UChar32 start = set.getRangeStart(i);
	1763	UChar32 end = set.getRangeEnd(i);
	1764	if (start > end) {
	1765	errln((UnicodeString)"FAIL result of " + msg +
	1766	": range " + (i+1) +
	1767	" start > end: " + (int)start + ", " + (int)end +
	1768	" for " + set);
	1769	}
	1770	if (i > 0 && start <= last) {
	1771	errln((UnicodeString)"FAIL result of " + msg +
	1772	": range " + (i+1) +
	1773	" overlaps previous range: " + (int)start + ", " + (int)end +
	1774	" for " + set);
	1775	}
	1776	last = end;
	1777	}
	1778	}
	1779
	1780	/**
	1781	* Convert a bitmask to a UnicodeSet.
	1782	*/
	1783	UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
	1784	result.clear();
	1785	for (UChar32 i = 0; i < 32; ++i) {
	1786	if ((a & (1<<i)) != 0) {
	1787	result.add(i);
	1788	}
	1789	}
	1790	return result;
	1791	}
	1792
	1793	/**
	1794	* Convert a UnicodeSet to a bitmask. Only the characters
	1795	* U+0000 to U+0020 are represented in the bitmask.
	1796	*/
	1797	int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
	1798	int32_t result = 0;
	1799	for (int32_t i = 0; i < 32; ++i) {
	1800	if (x.contains((UChar32)i)) {
	1801	result \|= (1<<i);
	1802	}
	1803	}
	1804	return result;
	1805	}
	1806
	1807	/**
	1808	* Return the representation of an inversion list based UnicodeSet
	1809	* as a pairs list. Ranges are listed in ascending Unicode order.
	1810	* For example, the set [a-zA-M3] is represented as "33AMaz".
	1811	*/
	1812	UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
	1813	UnicodeString pairs;
	1814	for (int32_t i=0; i<set.getRangeCount(); ++i) {
	1815	UChar32 start = set.getRangeStart(i);
	1816	UChar32 end = set.getRangeEnd(i);
	1817	if (end > 0xFFFF) {
	1818	end = 0xFFFF;
	1819	i = set.getRangeCount(); // Should be unnecessary
	1820	}
	1821	pairs.append((UChar)start).append((UChar)end);
	1822	}
	1823	return pairs;
	1824	}
	1825
	1826	/**
	1827	* Basic consistency check for a few items.
	1828	* That the iterator works, and that we can create a pattern and
	1829	* get the same thing back
	1830	*/
	1831	void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
	1832	UErrorCode ec = U_ZERO_ERROR;
	1833
	1834	UnicodeSet t(s);
	1835	checkEqual(s, t, "copy ct");
	1836
	1837	t = s;
	1838	checkEqual(s, t, "operator=");
	1839
	1840	copyWithIterator(t, s, FALSE);
	1841	checkEqual(s, t, "iterator roundtrip");
	1842
	1843	copyWithIterator(t, s, TRUE); // try range
	1844	checkEqual(s, t, "iterator roundtrip");
	1845
	1846	UnicodeString pat; s.toPattern(pat, FALSE);
	1847	t.applyPattern(pat, ec);
	1848	if (U_FAILURE(ec)) {
	1849	errln("FAIL: applyPattern");
	1850	return;
	1851	} else {
	1852	checkEqual(s, t, "toPattern(false)");
	1853	}
	1854
	1855	s.toPattern(pat, TRUE);
	1856	t.applyPattern(pat, ec);
	1857	if (U_FAILURE(ec)) {
	1858	errln("FAIL: applyPattern");
	1859	return;
	1860	} else {
	1861	checkEqual(s, t, "toPattern(true)");
	1862	}
	1863	}
	1864
	1865	void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
	1866	t.clear();
	1867	UnicodeSetIterator it(s);
	1868	if (withRange) {
	1869	while (it.nextRange()) {
	1870	if (it.isString()) {
	1871	t.add(it.getString());
	1872	} else {
	1873	t.add(it.getCodepoint(), it.getCodepointEnd());
	1874	}
	1875	}
	1876	} else {
	1877	while (it.next()) {
	1878	if (it.isString()) {
	1879	t.add(it.getString());
	1880	} else {
	1881	t.add(it.getCodepoint());
	1882	}
	1883	}
	1884	}
	1885	}
	1886
	1887	UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
	1888	UnicodeString source; s.toPattern(source, TRUE);
	1889	UnicodeString result; t.toPattern(result, TRUE);
	1890	if (s != t) {
	1891	errln((UnicodeString)"FAIL: " + message
	1892	+ "; source = " + source
	1893	+ "; result = " + result
	1894	);
	1895	return FALSE;
	1896	} else {
	1897	logln((UnicodeString)"Ok: " + message
	1898	+ "; source = " + source
	1899	+ "; result = " + result
	1900	);
	1901	}
	1902	return TRUE;
	1903	}
	1904
	1905	void
	1906	UnicodeSetTest::expectContainment(const UnicodeString& pat,
	1907	const UnicodeString& charsIn,
	1908	const UnicodeString& charsOut) {
	1909	UErrorCode ec = U_ZERO_ERROR;
	1910	UnicodeSet set(pat, ec);
	1911	if (U_FAILURE(ec)) {
	1912	errln((UnicodeString)"FAIL: pattern \"" +
	1913	pat + "\" => " + u_errorName(ec));
	1914	return;
	1915	}
	1916	expectContainment(set, pat, charsIn, charsOut);
	1917	}
	1918
	1919	void
	1920	UnicodeSetTest::expectContainment(const UnicodeSet& set,
	1921	const UnicodeString& charsIn,
	1922	const UnicodeString& charsOut) {
	1923	UnicodeString pat;
	1924	set.toPattern(pat);
	1925	expectContainment(set, pat, charsIn, charsOut);
	1926	}
	1927
	1928	void
	1929	UnicodeSetTest::expectContainment(const UnicodeSet& set,
	1930	const UnicodeString& setName,
	1931	const UnicodeString& charsIn,
	1932	const UnicodeString& charsOut) {
	1933	UnicodeString bad;
	1934	UChar32 c;
	1935	int32_t i;
	1936
	1937	for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
	1938	c = charsIn.char32At(i);
	1939	if (!set.contains(c)) {
	1940	bad.append(c);
	1941	}
	1942	}
	1943	if (bad.length() > 0) {
	1944	errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
	1945	", expected containment of " + prettify(charsIn));
	1946	} else {
	1947	logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
	1948	}
	1949
	1950	bad.truncate(0);
	1951	for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
	1952	c = charsOut.char32At(i);
	1953	if (set.contains(c)) {
	1954	bad.append(c);
	1955	}
	1956	}
	1957	if (bad.length() > 0) {
	1958	errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
	1959	", expected non-containment of " + prettify(charsOut));
	1960	} else {
	1961	logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
	1962	}
	1963	}
	1964
	1965	void
	1966	UnicodeSetTest::expectPattern(UnicodeSet& set,
	1967	const UnicodeString& pattern,
	1968	const UnicodeString& expectedPairs){
	1969	UErrorCode status = U_ZERO_ERROR;
	1970	set.applyPattern(pattern, status);
	1971	if (U_FAILURE(status)) {
	1972	errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
	1973	"\") failed");
	1974	return;
	1975	} else {
	1976	if (getPairs(set) != expectedPairs ) {
	1977	errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
	1978	"\") => pairs \"" +
	1979	escape(getPairs(set)) + "\", expected \"" +
	1980	escape(expectedPairs) + "\"");
	1981	} else {
	1982	logln(UnicodeString("Ok: applyPattern(\"") + pattern +
	1983	"\") => pairs \"" +
	1984	escape(getPairs(set)) + "\"");
	1985	}
	1986	}
	1987	// the result of calling set.toPattern(), which is the string representation of
	1988	// this set(set), is passed to a UnicodeSet constructor, and tested that it
	1989	// will produce another set that is equal to this one.
	1990	UnicodeString temppattern;
	1991	set.toPattern(temppattern);
	1992	UnicodeSet *tempset=new UnicodeSet(temppattern, status);
	1993	if (U_FAILURE(status)) {
	1994	errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
	1995	return;
	1996	}
	1997	if(tempset != set \|\| getPairs(tempset) != getPairs(set)){
	1998	errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
	1999	escape(getPairs(set)) + "\""));
	2000	} else{
	2001	logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
	2002	}
	2003
	2004	delete tempset;
	2005
	2006	}
	2007
	2008	void
	2009	UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
	2010	if (getPairs(set) != expectedPairs) {
	2011	errln(UnicodeString("FAIL: Expected pair list \"") +
	2012	escape(expectedPairs) + "\", got \"" +
	2013	escape(getPairs(set)) + "\"");
	2014	}
	2015	}
	2016
	2017	void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
	2018	const UnicodeString& expPat,
	2019	const char** expStrings) {
	2020	UnicodeString pat;
	2021	set.toPattern(pat, TRUE);
	2022	if (pat == expPat) {
	2023	logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
	2024	} else {
	2025	errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
	2026	return;
	2027	}
	2028	if (expStrings == NULL) {
	2029	return;
	2030	}
	2031	UBool in = TRUE;
	2032	for (int32_t i=0; expStrings[i] != NULL; ++i) {
	2033	if (expStrings[i] == NOT) { // sic; pointer comparison
	2034	in = FALSE;
	2035	continue;
	2036	}
	2037	UnicodeString s = CharsToUnicodeString(expStrings[i]);
	2038	UBool contained = set.contains(s);
	2039	if (contained == in) {
	2040	logln((UnicodeString)"Ok: " + expPat +
	2041	(contained ? " contains {" : " does not contain {") +
	2042	escape(expStrings[i]) + "}");
	2043	} else {
	2044	errln((UnicodeString)"FAIL: " + expPat +
	2045	(contained ? " contains {" : " does not contain {") +
	2046	escape(expStrings[i]) + "}");
	2047	}
	2048	}
	2049	}
	2050
	2051	static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
	2052
	2053	void
	2054	UnicodeSetTest::doAssert(UBool condition, const char *message)
	2055	{
	2056	if (!condition) {
	2057	errln(UnicodeString("ERROR : ") + message);
	2058	}
	2059	}
	2060
	2061	UnicodeString
	2062	UnicodeSetTest::escape(const UnicodeString& s) {
	2063	UnicodeString buf;
	2064	for (int32_t i=0; i<s.length(); )
	2065	{
	2066	UChar32 c = s.char32At(i);
	2067	if (0x0020 <= c && c <= 0x007F) {
	2068	buf += c;
	2069	} else {
	2070	if (c <= 0xFFFF) {
	2071	buf += (UChar)0x5c; buf += (UChar)0x75;
	2072	} else {
	2073	buf += (UChar)0x5c; buf += (UChar)0x55;
	2074	buf += toHexString((c & 0xF0000000) >> 28);
	2075	buf += toHexString((c & 0x0F000000) >> 24);
	2076	buf += toHexString((c & 0x00F00000) >> 20);
	2077	buf += toHexString((c & 0x000F0000) >> 16);
	2078	}
	2079	buf += toHexString((c & 0xF000) >> 12);
	2080	buf += toHexString((c & 0x0F00) >> 8);
	2081	buf += toHexString((c & 0x00F0) >> 4);
	2082	buf += toHexString(c & 0x000F);
	2083	}
	2084	i += U16_LENGTH(c);
	2085	}
	2086	return buf;
	2087	}
	2088
	2089	void UnicodeSetTest::TestFreezable() {
	2090	UErrorCode errorCode=U_ZERO_ERROR;
	2091	UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
	2092	UnicodeSet idSet(idPattern, errorCode);
	2093	if(U_FAILURE(errorCode)) {
	2094	errln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
	2095	return;
	2096	}
	2097
	2098	UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
	2099	UnicodeSet wsSet(wsPattern, errorCode);
	2100	if(U_FAILURE(errorCode)) {
	2101	errln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
	2102	return;
	2103	}
	2104
	2105	idSet.add(idPattern);
	2106	UnicodeSet frozen(idSet);
	2107	frozen.freeze();
	2108
	2109	if(idSet.isFrozen() \|\| !frozen.isFrozen()) {
	2110	errln("FAIL: isFrozen() is wrong");
	2111	}
	2112	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2113	errln("FAIL: a copy-constructed frozen set differs from its original");
	2114	}
	2115
	2116	frozen=wsSet;
	2117	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2118	errln("FAIL: a frozen set was modified by operator=");
	2119	}
	2120
	2121	UnicodeSet frozen2(frozen);
	2122	if(frozen2!=frozen \|\| frozen2!=idSet) {
	2123	errln("FAIL: a copied frozen set differs from its frozen original");
	2124	}
	2125	if(!frozen2.isFrozen()) {
	2126	errln("FAIL: copy-constructing a frozen set results in a thawed one");
	2127	}
	2128	UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
	2129	if(frozen3.contains(0, 4) \|\| !frozen3.contains(5, 55) \|\| frozen3.contains(56, 0x10ffff)) {
	2130	errln("FAIL: UnicodeSet(5, 55) failed");
	2131	}
	2132	frozen3=frozen;
	2133	if(!frozen3.isFrozen()) {
	2134	errln("FAIL: copying a frozen set results in a thawed one");
	2135	}
	2136
	2137	UnicodeSet cloned=(UnicodeSet )frozen.clone();
	2138	if(!cloned->isFrozen() \|\| *cloned!=frozen \|\| cloned->containsSome(0xd802, 0xd805)) {
	2139	errln("FAIL: clone() failed");
	2140	}
	2141	cloned->add(0xd802, 0xd805);
	2142	if(cloned->containsSome(0xd802, 0xd805)) {
	2143	errln("FAIL: unable to modify clone");
	2144	}
	2145	delete cloned;
	2146
	2147	UnicodeSet thawed=(UnicodeSet )frozen.cloneAsThawed();
	2148	if(thawed->isFrozen() \|\| *thawed!=frozen \|\| thawed->containsSome(0xd802, 0xd805)) {
	2149	errln("FAIL: cloneAsThawed() failed");
	2150	}
	2151	thawed->add(0xd802, 0xd805);
	2152	if(!thawed->contains(0xd802, 0xd805)) {
	2153	errln("FAIL: unable to modify thawed clone");
	2154	}
	2155	delete thawed;
	2156
	2157	frozen.set(5, 55);
	2158	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2159	errln("FAIL: UnicodeSet::set() modified a frozen set");
	2160	}
	2161
	2162	frozen.clear();
	2163	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2164	errln("FAIL: UnicodeSet::clear() modified a frozen set");
	2165	}
	2166
	2167	frozen.closeOver(USET_CASE_INSENSITIVE);
	2168	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2169	errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
	2170	}
	2171
	2172	frozen.compact();
	2173	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2174	errln("FAIL: UnicodeSet::compact() modified a frozen set");
	2175	}
	2176
	2177	ParsePosition pos;
	2178	frozen.
	2179	applyPattern(wsPattern, errorCode).
	2180	applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
	2181	applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
	2182	applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
	2183	applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
	2184	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2185	errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
	2186	}
	2187
	2188	frozen.
	2189	add(0xd800).
	2190	add(0xd802, 0xd805).
	2191	add(wsPattern).
	2192	addAll(idPattern).
	2193	addAll(wsSet);
	2194	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2195	errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
	2196	}
	2197
	2198	frozen.
	2199	retain(0x62).
	2200	retain(0x64, 0x69).
	2201	retainAll(wsPattern).
	2202	retainAll(wsSet);
	2203	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2204	errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
	2205	}
	2206
	2207	frozen.
	2208	remove(0x62).
	2209	remove(0x64, 0x69).
	2210	remove(idPattern).
	2211	removeAll(idPattern).
	2212	removeAll(idSet);
	2213	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2214	errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
	2215	}
	2216
	2217	frozen.
	2218	complement().
	2219	complement(0x62).
	2220	complement(0x64, 0x69).
	2221	complement(idPattern).
	2222	complementAll(idPattern).
	2223	complementAll(idSet);
	2224	if(frozen!=idSet \|\| !(frozen==idSet)) {
	2225	errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
	2226	}
	2227	}
	2228
	2229	// Test span() etc. -------------------------------------------------------- ***
	2230
	2231	// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
	2232	static int32_t
	2233	appendUTF8(const UChar s, int32_t length, char t, int32_t capacity) {
	2234	UErrorCode errorCode=U_ZERO_ERROR;
	2235	int32_t length8=0;
	2236	u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
	2237	if(U_SUCCESS(errorCode)) {
	2238	return length8;
	2239	} else {
	2240	// The string contains an unpaired surrogate.
	2241	// Ignore this string.
	2242	return 0;
	2243	}
	2244	}
	2245
	2246	class UnicodeSetWithStringsIterator;
	2247
	2248	// Make the strings in a UnicodeSet easily accessible.
	2249	class UnicodeSetWithStrings {
	2250	public:
	2251	UnicodeSetWithStrings(const UnicodeSet &normalSet) :
	2252	set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
	2253	int32_t size=set.size();
	2254	if(size>0 && set.charAt(size-1)<0) {
	2255	// If a set's last element is not a code point, then it must contain strings.
	2256	// Iterate over the set, skip all code point ranges, and cache the strings.
	2257	// Convert them to UTF-8 for spanUTF8().
	2258	UnicodeSetIterator iter(set);
	2259	const UnicodeString *s;
	2260	char *s8=utf8;
	2261	int32_t length8, utf8Count=0;
	2262	while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
	2263	if(iter.isString()) {
	2264	// Store the pointer to the set's string element
	2265	// which we happen to know is a stable pointer.
	2266	strings[stringsLength]=s=&iter.getString();
	2267	utf8Count+=
	2268	utf8Lengths[stringsLength]=length8=
	2269	appendUTF8(s->getBuffer(), s->length(),
	2270	s8, (int32_t)(sizeof(utf8)-utf8Count));
	2271	if(length8==0) {
	2272	hasSurrogates=TRUE; // Contains unpaired surrogates.
	2273	}
	2274	s8+=length8;
	2275	++stringsLength;
	2276	}
	2277	}
	2278	}
	2279	}
	2280
	2281	const UnicodeSet &getSet() const {
	2282	return set;
	2283	}
	2284
	2285	UBool hasStrings() const {
	2286	return (UBool)(stringsLength>0);
	2287	}
	2288
	2289	UBool hasStringsWithSurrogates() const {
	2290	return hasSurrogates;
	2291	}
	2292
	2293	private:
	2294	friend class UnicodeSetWithStringsIterator;
	2295
	2296	const UnicodeSet &set;
	2297
	2298	const UnicodeString *strings[20];
	2299	int32_t stringsLength;
	2300	UBool hasSurrogates;
	2301
	2302	char utf8[1024];
	2303	int32_t utf8Lengths[20];
	2304
	2305	int32_t nextStringIndex;
	2306	int32_t nextUTF8Start;
	2307	};
	2308
	2309	class UnicodeSetWithStringsIterator {
	2310	public:
	2311	UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
	2312	fSet(set), nextStringIndex(0), nextUTF8Start(0) {
	2313	}
	2314
	2315	void reset() {
	2316	nextStringIndex=nextUTF8Start=0;
	2317	}
	2318
	2319	const UnicodeString *nextString() {
	2320	if(nextStringIndex<fSet.stringsLength) {
	2321	return fSet.strings[nextStringIndex++];
	2322	} else {
	2323	return NULL;
	2324	}
	2325	}
	2326
	2327	// Do not mix with calls to nextString().
	2328	const char *nextUTF8(int32_t &length) {
	2329	if(nextStringIndex<fSet.stringsLength) {
	2330	const char *s8=fSet.utf8+nextUTF8Start;
	2331	nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
	2332	return s8;
	2333	} else {
	2334	length=0;
	2335	return NULL;
	2336	}
	2337	}
	2338
	2339	private:
	2340	const UnicodeSetWithStrings &fSet;
	2341	int32_t nextStringIndex;
	2342	int32_t nextUTF8Start;
	2343	};
	2344
	2345	// Compare 16-bit Unicode strings (which may be malformed UTF-16)
	2346	// at code point boundaries.
	2347	// That is, each edge of a match must not be in the middle of a surrogate pair.
	2348	static inline UBool
	2349	matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
	2350	s+=start;
	2351	limit-=start;
	2352	int32_t length=t.length();
	2353	return 0==t.compare(s, length) &&
	2354	!(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
	2355	!(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
	2356	}
	2357
	2358	// Implement span() with contains() for comparison.
	// Forward span over UTF-16 text, built from contains() and explicit string
	// matching so that its result can be compared with UnicodeSet::span().
	// Returns the length of the span, measured from the start of s.
	// Three cases are handled: a set without strings, NOT_CONTAINED with
	// strings, and CONTAINED/SIMPLE with strings.
	2359	static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
	2360	USetSpanCondition spanCondition) {
	2361	const UnicodeSet &realSet(set.getSet());
	// Case 1: no strings cached; test one code point at a time.
	2362	if(!set.hasStrings()) {
	2363	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2364	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2365	}
	2366
	2367	UChar32 c;
	2368	int32_t start=0, prev;
	// prev trails start by one code point, so the break leaves prev at the
	// start of the first code point that ends the span.
	2369	while((prev=start)<length) {
	2370	U16_NEXT(s, start, length, c);
	2371	if(realSet.contains(c)!=spanCondition) {
	2372	break;
	2373	}
	2374	}
	2375	return prev;
	// Case 2: span text that is not in the set. Stop at the first contained
	// code point or at the first position where a set string matches.
	2376	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2377	UnicodeSetWithStringsIterator iter(set);
	2378	UChar32 c;
	2379	int32_t start, next;
	2380	for(start=next=0; start<length;) {
	2381	U16_NEXT(s, next, length, c);
	2382	if(realSet.contains(c)) {
	2383	break;
	2384	}
	2385	const UnicodeString *str;
	2386	iter.reset();
	2387	while((str=iter.nextString())!=NULL) {
	2388	if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
	2389	// spanNeedsStrings=TRUE;
	2390	return start;
	2391	}
	2392	}
	2393	start=next;
	2394	}
	2395	return start;
	// Case 3: span text made of contained code points and set strings.
	// maxSpanLimit records the furthest limit reached through recursion.
	2396	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2397	UnicodeSetWithStringsIterator iter(set);
	2398	UChar32 c;
	2399	int32_t start, next, maxSpanLimit=0;
	2400	for(start=next=0; start<length;) {
	2401	U16_NEXT(s, next, length, c);
	2402	if(!realSet.contains(c)) {
	2403	next=start; // Do not span this single, not-contained code point.
	2404	}
	2405	const UnicodeString *str;
	2406	iter.reset();
	2407	while((str=iter.nextString())!=NULL) {
	2408	if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
	2409	// spanNeedsStrings=TRUE;
	2410	int32_t matchLimit=start+str->length();
	2411	if(matchLimit==length) {
	2412	return length;
	2413	}
	2414	if(spanCondition==USET_SPAN_CONTAINED) {
	2415	// Iterate for the shortest match at each position.
	2416	// Recurse for each but the shortest match.
	2417	if(next==start) {
	2418	next=matchLimit; // First match from start.
	2419	} else {
	2420	if(matchLimit<next) {
	2421	// Remember shortest match from start for iteration.
	2422	int32_t temp=next;
	2423	next=matchLimit;
	2424	matchLimit=temp;
	2425	}
	2426	// Recurse for non-shortest match from start.
	2427	int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
	2428	USET_SPAN_CONTAINED);
	2429	if((matchLimit+spanLength)>maxSpanLimit) {
	2430	maxSpanLimit=matchLimit+spanLength;
	2431	if(maxSpanLimit==length) {
	2432	return length;
	2433	}
	2434	}
	2435	}
	2436	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2437	if(matchLimit>next) {
	2438	// Remember longest match from start.
	2439	next=matchLimit;
	2440	}
	2441	}
	2442	}
	2443	}
	2444	if(next==start) {
	2445	break; // No match from start.
	2446	}
	2447	start=next;
	2448	}
	// Return the larger of the iterated position and the best recursive result.
	2449	if(start>maxSpanLimit) {
	2450	return start;
	2451	} else {
	2452	return maxSpanLimit;
	2453	}
	2454	}
	2455	}
	2456
	// Backward span over UTF-16 text, built from contains() and explicit
	// string matching so that its result can be compared with
	// UnicodeSet::spanBack().
	// Returns the index in s where the span that ends at `length` begins.
	// Mirrors the three cases of the forward version.
	2457	static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
	2458	USetSpanCondition spanCondition) {
	2459	if(length==0) {
	2460	return 0;
	2461	}
	2462	const UnicodeSet &realSet(set.getSet());
	// Case 1: no strings cached; step back one code point at a time.
	2463	if(!set.hasStrings()) {
	2464	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2465	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2466	}
	2467
	2468	UChar32 c;
	2469	int32_t prev=length;
	// U16_PREV moves `length` back; prev keeps the position before that step.
	2470	do {
	2471	U16_PREV(s, 0, length, c);
	2472	if(realSet.contains(c)!=spanCondition) {
	2473	break;
	2474	}
	2475	} while((prev=length)>0);
	2476	return prev;
	// Case 2: span text that is not in the set. Stop at the first contained
	// code point or at the first position where a set string ends.
	// length0 keeps the original text length for the boundary check.
	2477	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2478	UnicodeSetWithStringsIterator iter(set);
	2479	UChar32 c;
	2480	int32_t prev=length, length0=length;
	2481	do {
	2482	U16_PREV(s, 0, length, c);
	2483	if(realSet.contains(c)) {
	2484	break;
	2485	}
	2486	const UnicodeString *str;
	2487	iter.reset();
	2488	while((str=iter.nextString())!=NULL) {
	2489	if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
	2490	// spanNeedsStrings=TRUE;
	2491	return prev;
	2492	}
	2493	}
	2494	} while((prev=length)>0);
	2495	return prev;
	// Case 3: span text made of contained code points and set strings.
	// minSpanStart records the earliest start reached through recursion.
	2496	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2497	UnicodeSetWithStringsIterator iter(set);
	2498	UChar32 c;
	2499	int32_t prev=length, minSpanStart=length, length0=length;
	2500	do {
	2501	U16_PREV(s, 0, length, c);
	2502	if(!realSet.contains(c)) {
	2503	length=prev; // Do not span this single, not-contained code point.
	2504	}
	2505	const UnicodeString *str;
	2506	iter.reset();
	2507	while((str=iter.nextString())!=NULL) {
	2508	if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
	2509	// spanNeedsStrings=TRUE;
	2510	int32_t matchStart=prev-str->length();
	2511	if(matchStart==0) {
	2512	return 0;
	2513	}
	2514	if(spanCondition==USET_SPAN_CONTAINED) {
	2515	// Iterate for the shortest match at each position.
	2516	// Recurse for each but the shortest match.
	2517	if(length==prev) {
	2518	length=matchStart; // First match from prev.
	2519	} else {
	2520	if(matchStart>length) {
	2521	// Remember shortest match from prev for iteration.
	2522	int32_t temp=length;
	2523	length=matchStart;
	2524	matchStart=temp;
	2525	}
	2526	// Recurse for non-shortest match from prev.
	2527	int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
	2528	USET_SPAN_CONTAINED);
	2529	if(spanStart<minSpanStart) {
	2530	minSpanStart=spanStart;
	2531	if(minSpanStart==0) {
	2532	return 0;
	2533	}
	2534	}
	2535	}
	2536	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2537	if(matchStart<length) {
	2538	// Remember longest match from prev.
	2539	length=matchStart;
	2540	}
	2541	}
	2542	}
	2543	}
	2544	if(length==prev) {
	2545	break; // No match from prev.
	2546	}
	2547	} while((prev=length)>0);
	// Return the smaller of the iterated position and the best recursive result.
	2548	if(prev<minSpanStart) {
	2549	return prev;
	2550	} else {
	2551	return minSpanStart;
	2552	}
	2553	}
	2554	}
	2555
	// Forward span over UTF-8 text, built from contains() and byte-wise string
	// matching so that its result can be compared with UnicodeSet::spanUTF8().
	// Returns the length in bytes of the span, measured from the start of s.
	// Whenever U8_NEXT yields a negative c, U+FFFD is looked up instead.
	// Cached strings whose stored UTF-8 length is 0 are never matched.
	2556	static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
	2557	USetSpanCondition spanCondition) {
	2558	const UnicodeSet &realSet(set.getSet());
	// Case 1: no strings cached; test one code point at a time.
	2559	if(!set.hasStrings()) {
	2560	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2561	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2562	}
	2563
	2564	UChar32 c;
	2565	int32_t start=0, prev;
	2566	while((prev=start)<length) {
	2567	U8_NEXT(s, start, length, c);
	2568	if(c<0) {
	2569	c=0xfffd;
	2570	}
	2571	if(realSet.contains(c)!=spanCondition) {
	2572	break;
	2573	}
	2574	}
	2575	return prev;
	// Case 2: span text that is not in the set. Stop at the first contained
	// code point or at the first position where a set string matches.
	2576	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2577	UnicodeSetWithStringsIterator iter(set);
	2578	UChar32 c;
	2579	int32_t start, next;
	2580	for(start=next=0; start<length;) {
	2581	U8_NEXT(s, next, length, c);
	2582	if(c<0) {
	2583	c=0xfffd;
	2584	}
	2585	if(realSet.contains(c)) {
	2586	break;
	2587	}
	2588	const char *s8;
	2589	int32_t length8;
	2590	iter.reset();
	2591	while((s8=iter.nextUTF8(length8))!=NULL) {
	2592	if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
	2593	// spanNeedsStrings=TRUE;
	2594	return start;
	2595	}
	2596	}
	2597	start=next;
	2598	}
	2599	return start;
	// Case 3: span text made of contained code points and set strings.
	// maxSpanLimit records the furthest limit reached through recursion.
	2600	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2601	UnicodeSetWithStringsIterator iter(set);
	2602	UChar32 c;
	2603	int32_t start, next, maxSpanLimit=0;
	2604	for(start=next=0; start<length;) {
	2605	U8_NEXT(s, next, length, c);
	2606	if(c<0) {
	2607	c=0xfffd;
	2608	}
	2609	if(!realSet.contains(c)) {
	2610	next=start; // Do not span this single, not-contained code point.
	2611	}
	2612	const char *s8;
	2613	int32_t length8;
	2614	iter.reset();
	2615	while((s8=iter.nextUTF8(length8))!=NULL) {
	2616	if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
	2617	// spanNeedsStrings=TRUE;
	2618	int32_t matchLimit=start+length8;
	2619	if(matchLimit==length) {
	2620	return length;
	2621	}
	2622	if(spanCondition==USET_SPAN_CONTAINED) {
	2623	// Iterate for the shortest match at each position.
	2624	// Recurse for each but the shortest match.
	2625	if(next==start) {
	2626	next=matchLimit; // First match from start.
	2627	} else {
	2628	if(matchLimit<next) {
	2629	// Remember shortest match from start for iteration.
	2630	int32_t temp=next;
	2631	next=matchLimit;
	2632	matchLimit=temp;
	2633	}
	2634	// Recurse for non-shortest match from start.
	2635	int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
	2636	USET_SPAN_CONTAINED);
	2637	if((matchLimit+spanLength)>maxSpanLimit) {
	2638	maxSpanLimit=matchLimit+spanLength;
	2639	if(maxSpanLimit==length) {
	2640	return length;
	2641	}
	2642	}
	2643	}
	2644	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2645	if(matchLimit>next) {
	2646	// Remember longest match from start.
	2647	next=matchLimit;
	2648	}
	2649	}
	2650	}
	2651	}
	2652	if(next==start) {
	2653	break; // No match from start.
	2654	}
	2655	start=next;
	2656	}
	// Return the larger of the iterated position and the best recursive result.
	2657	if(start>maxSpanLimit) {
	2658	return start;
	2659	} else {
	2660	return maxSpanLimit;
	2661	}
	2662	}
	2663	}
	2664
	// Backward span over UTF-8 text, built from contains() and byte-wise
	// string matching so that its result can be compared with
	// UnicodeSet::spanBackUTF8().
	// Returns the byte index in s where the span that ends at `length` begins.
	// Whenever U8_PREV yields a negative c, U+FFFD is looked up instead.
	// Cached strings whose stored UTF-8 length is 0 are never matched.
	2665	static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
	2666	USetSpanCondition spanCondition) {
	2667	if(length==0) {
	2668	return 0;
	2669	}
	2670	const UnicodeSet &realSet(set.getSet());
	// Case 1: no strings cached; step back one code point at a time.
	2671	if(!set.hasStrings()) {
	2672	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
	2673	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
	2674	}
	2675
	2676	UChar32 c;
	2677	int32_t prev=length;
	2678	do {
	2679	U8_PREV(s, 0, length, c);
	2680	if(c<0) {
	2681	c=0xfffd;
	2682	}
	2683	if(realSet.contains(c)!=spanCondition) {
	2684	break;
	2685	}
	2686	} while((prev=length)>0);
	2687	return prev;
	// Case 2: span text that is not in the set. Stop at the first contained
	// code point or at the first position where a set string ends.
	2688	} else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
	2689	UnicodeSetWithStringsIterator iter(set);
	2690	UChar32 c;
	2691	int32_t prev=length;
	2692	do {
	2693	U8_PREV(s, 0, length, c);
	2694	if(c<0) {
	2695	c=0xfffd;
	2696	}
	2697	if(realSet.contains(c)) {
	2698	break;
	2699	}
	2700	const char *s8;
	2701	int32_t length8;
	2702	iter.reset();
	2703	while((s8=iter.nextUTF8(length8))!=NULL) {
	2704	if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
	2705	// spanNeedsStrings=TRUE;
	2706	return prev;
	2707	}
	2708	}
	2709	} while((prev=length)>0);
	2710	return prev;
	// Case 3: span text made of contained code points and set strings.
	// minSpanStart records the earliest start reached through recursion.
	2711	} else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
	2712	UnicodeSetWithStringsIterator iter(set);
	2713	UChar32 c;
	2714	int32_t prev=length, minSpanStart=length;
	2715	do {
	2716	U8_PREV(s, 0, length, c);
	2717	if(c<0) {
	2718	c=0xfffd;
	2719	}
	2720	if(!realSet.contains(c)) {
	2721	length=prev; // Do not span this single, not-contained code point.
	2722	}
	2723	const char *s8;
	2724	int32_t length8;
	2725	iter.reset();
	2726	while((s8=iter.nextUTF8(length8))!=NULL) {
	2727	if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
	2728	// spanNeedsStrings=TRUE;
	2729	int32_t matchStart=prev-length8;
	2730	if(matchStart==0) {
	2731	return 0;
	2732	}
	2733	if(spanCondition==USET_SPAN_CONTAINED) {
	2734	// Iterate for the shortest match at each position.
	2735	// Recurse for each but the shortest match.
	2736	if(length==prev) {
	2737	length=matchStart; // First match from prev.
	2738	} else {
	2739	if(matchStart>length) {
	2740	// Remember shortest match from prev for iteration.
	2741	int32_t temp=length;
	2742	length=matchStart;
	2743	matchStart=temp;
	2744	}
	2745	// Recurse for non-shortest match from prev.
	2746	int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
	2747	USET_SPAN_CONTAINED);
	2748	if(spanStart<minSpanStart) {
	2749	minSpanStart=spanStart;
	2750	if(minSpanStart==0) {
	2751	return 0;
	2752	}
	2753	}
	2754	}
	2755	} else /* spanCondition==USET_SPAN_SIMPLE */ {
	2756	if(matchStart<length) {
	2757	// Remember longest match from prev.
	2758	length=matchStart;
	2759	}
	2760	}
	2761	}
	2762	}
	2763	if(length==prev) {
	2764	break; // No match from prev.
	2765	}
	2766	} while((prev=length)>0);
	// Return the smaller of the iterated position and the best recursive result.
	2767	if(prev<minSpanStart) {
	2768	return prev;
	2769	} else {
	2770	return minSpanStart;
	2771	}
	2772	}
	2773	}
	2774
	2775	// spans to be performed and compared
	// Bit flags selecting which spans are performed and compared.
	// Each group has individual bits followed by a mask covering the group.
	enum {
	    // Text encodings.
	    SPAN_UTF16      = 1 << 0,
	    SPAN_UTF8       = 1 << 1,
	    SPAN_UTFS       = SPAN_UTF16 | SPAN_UTF8,

	    // Original set and/or its complement.
	    SPAN_SET        = 1 << 2,
	    SPAN_COMPLEMENT = 1 << 3,
	    SPAN_POLARITY   = SPAN_SET | SPAN_COMPLEMENT,

	    // Direction.
	    SPAN_FWD        = 1 << 4,
	    SPAN_BACK       = 1 << 5,
	    SPAN_DIRS       = SPAN_FWD | SPAN_BACK,

	    // Span condition used for "contained" spans.
	    SPAN_CONTAINED  = 1 << 8,
	    SPAN_SIMPLE     = 1 << 9,
	    SPAN_CONDITION  = SPAN_CONTAINED | SPAN_SIMPLE,

	    // Everything.
	    SPAN_ALL        = SPAN_UTFS | SPAN_POLARITY | SPAN_DIRS | SPAN_CONDITION
	};
	2795
	2796	static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
	2797	return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
	2798	}
	2799
	2800	static inline int32_t slen(const void *s, UBool isUTF16) {
	2801	return isUTF16 ? u_strlen((const UChar )s) : strlen((const char )s);
	2802	}
	2803
	2804	/*
	2805	* Count spans on a string with the method according to type and set the span limits.
	2806	* The set may be the complement of the original.
	2807	* When using spanBack() and comparing with span(), use a span condition for the first spanBack()
	2808	* according to the expected number of spans.
	2809	* Sets typeName to an empty string if there is no such type.
	2810	* Returns -1 if the span option is filtered out.
	2811	*/
	// Parameters, as used by the code below:
	//   set, isComplement  the set to span with, and whether it is the complement
	//                      of the original (this inverts the first span condition)
	//   s, length, isUTF16 the text; length<0 means NUL-terminated
	//   whichSpans         SPAN_* flags used to filter by direction and condition
	//   type               0..7: 0-1 contains-based forward, 2-3 span()/spanUTF8(),
	//                      4-5 contains-based backward, 6-7 spanBack()/spanBackUTF8();
	//                      even types use USET_SPAN_CONTAINED, odd types USET_SPAN_SIMPLE
	//   typeName           receives a display name for the type
	//   limits, limitsCapacity  receive the span limits in ascending order
	//   expectCount        expected number of spans, used only to choose the
	//                      initial condition when spanning backward
	// Returns the number of spans; 0 with typeName="" for an out-of-range type.
	2812	static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
	2813	const void *s, int32_t length, UBool isUTF16,
	2814	uint32_t whichSpans,
	2815	int type, const char *&typeName,
	2816	int32_t limits[], int32_t limitsCapacity,
	2817	int32_t expectCount) {
	2818	const UnicodeSet &realSet(set.getSet());
	2819	int32_t start, count;
	2820	USetSpanCondition spanCondition, firstSpanCondition, contained;
	2821	UBool isForward;
	2822
	2823	if(type<0 \|\| 7<type) {
	2824	typeName="";
	2825	return 0;
	2826	}
	2827
	2828	static const char *const typeNames16[]={
	2829	"contains", "contains(LM)",
	2830	"span", "span(LM)",
	2831	"containsBack", "containsBack(LM)",
	2832	"spanBack", "spanBack(LM)"
	2833	};
	2834
	2835	static const char *const typeNames8[]={
	2836	"containsUTF8", "containsUTF8(LM)",
	2837	"spanUTF8", "spanUTF8(LM)",
	2838	"containsBackUTF8", "containsBackUTF8(LM)", // not implemented
	2839	"spanBackUTF8", "spanBackUTF8(LM)"
	2840	};
	2841
	2842	typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
	2843
	2844	// filter span options
	2845	if(type<=3) {
	2846	// span forward
	2847	if((whichSpans&SPAN_FWD)==0) {
	2848	return -1;
	2849	}
	2850	isForward=TRUE;
	2851	} else {
	2852	// span backward
	2853	if((whichSpans&SPAN_BACK)==0) {
	2854	return -1;
	2855	}
	2856	isForward=FALSE;
	2857	}
	2858	if((type&1)==0) {
	2859	// use USET_SPAN_CONTAINED
	2860	if((whichSpans&SPAN_CONTAINED)==0) {
	2861	return -1;
	2862	}
	2863	contained=USET_SPAN_CONTAINED;
	2864	} else {
	2865	// use USET_SPAN_SIMPLE
	2866	if((whichSpans&SPAN_SIMPLE)==0) {
	2867	return -1;
	2868	}
	2869	contained=USET_SPAN_SIMPLE;
	2870	}
	2871
	2872	// Default first span condition for going forward with an uncomplemented set.
	2873	spanCondition=USET_SPAN_NOT_CONTAINED;
	2874	if(isComplement) {
	2875	spanCondition=invertSpanCondition(spanCondition, contained);
	2876	}
	2877
	2878	// First span condition for span(), used to terminate the spanBack() iteration.
	2879	firstSpanCondition=spanCondition;
	2880
	2881	// spanBack(): Its initial span condition is span()'s last span condition,
	2882	// which is the opposite of span()'s first span condition
	2883	// if we expect an even number of spans.
	2884	// (The loop inverts spanCondition (expectCount-1) times
	2885	// before the expectCount'th span() call.)
	2886	// If we do not compare forward and backward directions, then we do not have an
	2887	// expectCount and just start with firstSpanCondition.
	2888	if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
	2889	spanCondition=invertSpanCondition(spanCondition, contained);
	2890	}
	2891
	2892	count=0;
	2893	switch(type) {
	// Types 0-1: forward, using the contains-based reference implementations.
	// These need an explicit length.
	2894	case 0:
	2895	case 1:
	2896	start=0;
	2897	if(length<0) {
	2898	length=slen(s, isUTF16);
	2899	}
	2900	for(;;) {
	2901	start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
	2902	containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
	2903	if(count<limitsCapacity) {
	2904	limits[count]=start;
	2905	}
	2906	++count;
	2907	if(start>=length) {
	2908	break;
	2909	}
	2910	spanCondition=invertSpanCondition(spanCondition, contained);
	2911	}
	2912	break;
	// Types 2-3: forward, using UnicodeSet::span()/spanUTF8(). A negative
	// length is passed through, and the loop then ends at the NUL terminator.
	2913	case 2:
	2914	case 3:
	2915	start=0;
	2916	for(;;) {
	2917	start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
	2918	realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
	2919	if(count<limitsCapacity) {
	2920	limits[count]=start;
	2921	}
	2922	++count;
	2923	if(length>=0 ? start>=length :
	2924	isUTF16 ? ((const UChar *)s)[start]==0 :
	2925	((const char *)s)[start]==0
	2926	) {
	2927	break;
	2928	}
	2929	spanCondition=invertSpanCondition(spanCondition, contained);
	2930	}
	2931	break;
	// Types 4-5: backward, using the contains-based reference implementations.
	// Limits are written from the end of limits[] toward the front and then
	// moved to the front, so the caller sees them in ascending order.
	2932	case 4:
	2933	case 5:
	2934	if(length<0) {
	2935	length=slen(s, isUTF16);
	2936	}
	2937	for(;;) {
	2938	++count;
	2939	if(count<=limitsCapacity) {
	2940	limits[limitsCapacity-count]=length;
	2941	}
	2942	length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
	2943	containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
	2944	if(length==0 && spanCondition==firstSpanCondition) {
	2945	break;
	2946	}
	2947	spanCondition=invertSpanCondition(spanCondition, contained);
	2948	}
	// NOTE(review): count*4 hard-codes the size of int32_t in bytes.
	2949	if(count<limitsCapacity) {
	2950	memmove(limits, limits+(limitsCapacity-count), count*4);
	2951	}
	2952	break;
	// Types 6-7: backward, using UnicodeSet::spanBack()/spanBackUTF8().
	// Same end-of-array fill and move as types 4-5.
	2953	case 6:
	2954	case 7:
	2955	for(;;) {
	2956	++count;
	2957	if(count<=limitsCapacity) {
	2958	limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
	2959	}
	2960	// Note: Length<0 is tested only for the first spanBack().
	2961	// If we wanted to keep length<0 for all spanBack()s, we would have to
	2962	// temporarily modify the string by placing a NUL where the previous spanBack() stopped.
	2963	length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
	2964	realSet.spanBackUTF8((const char *)s, length, spanCondition);
	2965	if(length==0 && spanCondition==firstSpanCondition) {
	2966	break;
	2967	}
	2968	spanCondition=invertSpanCondition(spanCondition, contained);
	2969	}
	2970	if(count<limitsCapacity) {
	2971	memmove(limits, limits+(limitsCapacity-count), count*4);
	2972	}
	2973	break;
	// Not reachable after the range check at the top; kept as a safeguard.
	2974	default:
	2975	typeName="";
	2976	return -1;
	2977	}
	2978
	2979	return count;
	2980	}
	2981
	2982	// sets to be tested; odd index=isComplement
	// Indexes into the sets[] array passed to testSpan(); an odd index means
	// the set is the complement of the one before it.
	enum {
	    SLOW      = 0,
	    SLOW_NOT  = 1,
	    FAST      = 2,
	    FAST_NOT  = 3,
	    SET_COUNT = 4
	};

	// Display names for the sets, indexed by the enum above.
	static const char *const setNames[SET_COUNT]={
	    "slow", "slow.not",
	    "fast", "fast.not"
	};
	2997
	2998	/*
	2999	* Verify that we get the same results whether we look at text with contains(),
	3000	* span() or spanBack(), using unfrozen or frozen versions of the set,
	3001	* and using the set or its complement (switching the spanConditions accordingly).
	3002	* The latter verifies that
	3003	* set.span(spanCondition) == set.complement().span(!spanCondition).
	3004	*
	3005	* The expectLimits[] are either provided by the caller (with expectCount>=0)
	3006	* or returned to the caller (with an input expectCount<0).
	3007	*/
	3008	void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
	3009	const void *s, int32_t length, UBool isUTF16,
	3010	uint32_t whichSpans,
	3011	int32_t expectLimits[], int32_t &expectCount,
	3012	const char *testName, int32_t index) {
	3013	int32_t limits[500];
	3014	int32_t limitsCount;
	3015	int i, j;
	3016
	3017	const char *typeName;
	3018	int type;
	3019
	3020	for(i=0; i<SET_COUNT; ++i) {
	3021	if((i&1)==0) {
	3022	// Even-numbered sets are original, uncomplemented sets.
	3023	if((whichSpans&SPAN_SET)==0) {
	3024	continue;
	3025	}
	3026	} else {
	3027	// Odd-numbered sets are complemented.
	3028	if((whichSpans&SPAN_COMPLEMENT)==0) {
	3029	continue;
	3030	}
	3031	}
	3032	for(type=0;; ++type) {
	3033	limitsCount=getSpans(*sets[i], (UBool)(i&1),
	3034	s, length, isUTF16,
	3035	whichSpans,
	3036	type, typeName,
	3037	limits, LENGTHOF(limits), expectCount);
	3038	if(typeName[0]==0) {
	3039	break; // All types tried.
	3040	}
	3041	if(limitsCount<0) {
	3042	continue; // Span option filtered out.
	3043	}
	3044	if(expectCount<0) {
	3045	expectCount=limitsCount;
	3046	if(limitsCount>LENGTHOF(limits)) {
	3047	errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
	3048	testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
	3049	return;
	3050	}
	3051	memcpy(expectLimits, limits, limitsCount*4);
	3052	} else if(limitsCount!=expectCount) {
	3053	errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
	3054	testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
	3055	} else {
	3056	for(j=0; j<limitsCount; ++j) {
	3057	if(limits[j]!=expectLimits[j]) {
	3058	errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
	3059	testName, (long)index, setNames[i], typeName, (long)limitsCount,
	3060	j, (long)limits[j], (long)expectLimits[j]);
	3061	break;
	3062	}
	3063	}
	3064	}
	3065	}
	3066	}
	3067
	3068	// Compare span() with containsAll()/containsNone(),
	3069	// but only if we have expectLimits[] from the uncomplemented set.
	3070	if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
	3071	const UChar s16=(const UChar )s;
	3072	UnicodeString string;
	3073	int32_t prev=0, limit, length;
	3074	for(i=0; i<expectCount; ++i) {
	3075	limit=expectLimits[i];
	3076	length=limit-prev;
	3077	if(length>0) {
	3078	string.setTo(FALSE, s16+prev, length); // read-only alias
	3079	if(i&1) {
	3080	if(!sets[SLOW]->getSet().containsAll(string)) {
	3081	errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
	3082	testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
	3083	return;
	3084	}
	3085	if(!sets[FAST]->getSet().containsAll(string)) {
	3086	errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
	3087	testName, (long)index, setNames[FAST], (long)prev, (long)limit);
	3088	return;
	3089	}
	3090	} else {
	3091	if(!sets[SLOW]->getSet().containsNone(string)) {
	3092	errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
	3093	testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
	3094	return;
	3095	}
	3096	if(!sets[FAST]->getSet().containsNone(string)) {
	3097	errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
	3098	testName, (long)index, setNames[FAST], (long)prev, (long)limit);
	3099	return;
	3100	}
	3101	}
	3102	}
	3103	prev=limit;
	3104	}
	3105	}
	3106	}
	3107
	3108	// Specifically test either UTF-16 or UTF-8.
	3109	void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
	3110	const void *s, int32_t length, UBool isUTF16,
	3111	uint32_t whichSpans,
	3112	const char *testName, int32_t index) {
	3113	int32_t expectLimits[500];
	3114	int32_t expectCount=-1;
	3115	testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
	3116	}
	3117
	3118	UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
	3119	UChar c, c2;
	3120
	3121	if(length>=0) {
	3122	while(length>0) {
	3123	c=*s++;
	3124	--length;
	3125	if(0xd800<=c && c<0xe000) {
	3126	if(c>=0xdc00 \|\| length==0 \|\| !U16_IS_TRAIL(c2=*s++)) {
	3127	return TRUE;
	3128	}
	3129	--length;
	3130	}
	3131	}
	3132	} else {
	3133	while((c=*s++)!=0) {
	3134	if(0xd800<=c && c<0xe000) {
	3135	if(c>=0xdc00 \|\| !U16_IS_TRAIL(c2=*s++)) {
	3136	return TRUE;
	3137	}
	3138	}
	3139	}
	3140	}
	3141	return FALSE;
	3142	}
	3143
	3144	// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
	3145	// unless either UTF is turned off in whichSpans.
	3146	// Testing UTF-16 and UTF-8 together requires that surrogate code points
	3147	// have the same contains(c) value as U+FFFD.
	3148	void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
	3149	const UChar *s16, int32_t length16,
	3150	uint32_t whichSpans,
	3151	const char *testName, int32_t index) {
	3152	int32_t expectLimits[500];
	3153	int32_t expectCount;
	3154
	3155	expectCount=-1; // Get expectLimits[] from testSpan().
	3156
	3157	if((whichSpans&SPAN_UTF16)!=0) {
	3158	testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
	3159	}
	3160	if((whichSpans&SPAN_UTF8)==0) {
	3161	return;
	3162	}
	3163
	3164	// Convert s16[] and expectLimits[] to UTF-8.
	3165	uint8_t s8[3000];
	3166	int32_t offsets[3000];
	3167
	3168	const UChar *s16Limit=s16+length16;
	3169	char t=(char )s8;
	3170	char *tLimit=t+sizeof(s8);
	3171	int32_t *o=offsets;
	3172	UErrorCode errorCode=U_ZERO_ERROR;
	3173
	3174	// Convert with substitution: Turn unpaired surrogates into U+FFFD.
	3175	ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
	3176	if(U_FAILURE(errorCode)) {
	3177	errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
	3178	testName, (long)index, u_errorName(errorCode));
	3179	ucnv_resetFromUnicode(utf8Cnv);
	3180	return;
	3181	}
	3182	int32_t length8=(int32_t)(t-(char *)s8);
	3183
	3184	// Convert expectLimits[].
	3185	int32_t i, j, expect;
	3186	for(i=j=0; i<expectCount; ++i) {
	3187	expect=expectLimits[i];
	3188	if(expect==length16) {
	3189	expectLimits[i]=length8;
	3190	} else {
	3191	while(offsets[j]<expect) {
	3192	++j;
	3193	}
	3194	expectLimits[i]=j;
	3195	}
	3196	}
	3197
	3198	testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
	3199	}
	3200
	3201	static UChar32 nextCodePoint(UChar32 c) {
	3202	// Skip some large and boring ranges.
	3203	switch(c) {
	3204	case 0x3441:
	3205	return 0x4d7f;
	3206	case 0x5100:
	3207	return 0x9f00;
	3208	case 0xb040:
	3209	return 0xd780;
	3210	case 0xe041:
	3211	return 0xf8fe;
	3212	case 0x10100:
	3213	return 0x20000;
	3214	case 0x20041:
	3215	return 0xe0000;
	3216	case 0xe0101:
	3217	return 0x10fffd;
	3218	default:
	3219	return c+1;
	3220	}
	3221	}
	3222
	3223	// Verify that all implementations represent the same set.
	3224	void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3225	// contains(U+FFFD) is inconsistent with contains(some surrogates),
	3226	// or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
	3227	// Skip the UTF-8 part of the test - if the string contains surrogates -
	3228	// because it is likely to produce a different result.
	3229	UBool inconsistentSurrogates=
	3230	(!(sets[0]->getSet().contains(0xfffd) ?
	3231	sets[0]->getSet().contains(0xd800, 0xdfff) :
	3232	sets[0]->getSet().containsNone(0xd800, 0xdfff)) \|\|
	3233	sets[0]->hasStringsWithSurrogates());
	3234
	3235	UChar s[1000];
	3236	int32_t length=0;
	3237	uint32_t localWhichSpans;
	3238
	3239	UChar32 c, first;
	3240	for(first=c=0;; c=nextCodePoint(c)) {
	3241	if(c>0x10ffff \|\| length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
	3242	localWhichSpans=whichSpans;
	3243	if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
	3244	localWhichSpans&=~SPAN_UTF8;
	3245	}
	3246	testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
	3247	if(c>0x10ffff) {
	3248	break;
	3249	}
	3250	length=0;
	3251	first=c;
	3252	}
	3253	U16_APPEND_UNSAFE(s, length, c);
	3254	}
	3255	}
	3256
	3257	// Test with a particular, interesting string.
	3258	// Specify length and try NUL-termination.
	3259	void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3260	static const UChar s[]={
	3261	0x61, 0x62, 0x20, // Latin, space
	3262	0x3b1, 0x3b2, 0x3b3, // Greek
	3263	0xd900, // lead surrogate
	3264	0x3000, 0x30ab, 0x30ad, // wide space, Katakana
	3265	0xdc05, // trail surrogate
	3266	0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
	3267	0xd900, 0xdc05, // unassigned supplementary
	3268	0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
	3269	0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
	3270	0 // NUL
	3271	};
	3272
	3273	if((whichSpans&SPAN_UTF16)==0) {
	3274	return;
	3275	}
	3276	testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
	3277	testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
	3278	}
	3279
	3280	void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings sets[4], uint32_t whichSpans, const char testName) {
	3281	static const char s[]={
	3282	"abc" // Latin
	3283
	3284	/* trail byte in lead position */
	3285	"\x80"
	3286
	3287	" " // space
	3288
	3289	/* truncated multi-byte sequences */
	3290	"\xd0"
	3291	"\xe0"
	3292	"\xe1"
	3293	"\xed"
	3294	"\xee"
	3295	"\xf0"
	3296	"\xf1"
	3297	"\xf4"
	3298	"\xf8"
	3299	"\xfc"
	3300
	3301	"\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
	3302
	3303	/* trail byte in lead position */
	3304	"\x80"
	3305
	3306	"\xe0\x80"
	3307	"\xe0\xa0"
	3308	"\xe1\x80"
	3309	"\xed\x80"
	3310	"\xed\xa0"
	3311	"\xee\x80"
	3312	"\xf0\x80"
	3313	"\xf0\x90"
	3314	"\xf1\x80"
	3315	"\xf4\x80"
	3316	"\xf4\x90"
	3317	"\xf8\x80"
	3318	"\xfc\x80"
	3319
	3320	"\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
	3321
	3322	/* trail byte in lead position */
	3323	"\x80"
	3324
	3325	"\xf0\x80\x80"
	3326	"\xf0\x90\x80"
	3327	"\xf1\x80\x80"
	3328	"\xf4\x80\x80"
	3329	"\xf4\x90\x80"
	3330	"\xf8\x80\x80"
	3331	"\xfc\x80\x80"
	3332
	3333	"\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
	3334
	3335	/* trail byte in lead position */
	3336	"\x80"
	3337
	3338	"\xf8\x80\x80\x80"
	3339	"\xfc\x80\x80\x80"
	3340
	3341	"\xF1\x90\x80\x85" // unassigned supplementary
	3342
	3343	/* trail byte in lead position */
	3344	"\x80"
	3345
	3346	"\xfc\x80\x80\x80\x80"
	3347
	3348	"\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
	3349
	3350	/* trail byte in lead position */
	3351	"\x80"
	3352
	3353	/* complete sequences but non-shortest forms or out of range etc. */
	3354	"\xc0\x80"
	3355	"\xe0\x80\x80"
	3356	"\xed\xa0\x80"
	3357	"\xf0\x80\x80\x80"
	3358	"\xf4\x90\x80\x80"
	3359	"\xf8\x80\x80\x80\x80"
	3360	"\xfc\x80\x80\x80\x80\x80"
	3361	"\xfe"
	3362	"\xff"
	3363
	3364	/* trail byte in lead position */
	3365	"\x80"
	3366
	3367	"\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
	3368	};
	3369
	3370	if((whichSpans&SPAN_UTF8)==0) {
	3371	return;
	3372	}
	3373	testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
	3374	testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
	3375	}
	3376
	// Take a set of span options and multiply them so that
	// each portion only has one of the options a, b and c.
	// If b==0, then the set of options is just modified with mask and a.
	// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
	//
	// whichSpans[] holds whichSpansCount option words on input and must have
	// room for up to three times as many. Each word is ANDed with mask and
	// then ORed with a in place; copies ORed with b (and c) are stored in the
	// second (and third) group of whichSpansCount entries.
	// Returns the new number of option words.
	static int32_t
	addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
	               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
	    uint32_t s;
	    int32_t i;

	    for(i=0; i<whichSpansCount; ++i) {
	        s=whichSpans[i]&mask;
	        whichSpans[i]=s|a;
	        if(b!=0) {
	            whichSpans[whichSpansCount+i]=s|b;
	            if(c!=0) {
	                whichSpans[2*whichSpansCount+i]=s|c;
	            }
	        }
	    }
	    return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
	}
	3399
	// Runs of a single repeated letter as string literals; going by the macro
	// names, 63 or 64 repetitions of 'a' or 'b' (TODO confirm the counts).
	// TestSpan() concatenates them to build set strings and test strings
	// with very long single-character runs.
	3400	#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
	3401	#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
	3402	#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
	3403	#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
	3404
	3405	void UnicodeSetTest::TestSpan() {
	3406	// "[...]" is a UnicodeSet pattern.
	3407	// "*" performs tests on all Unicode code points and on a selection of
	3408	// malformed UTF-8/16 strings.
	3409	// "-options" limits the scope of testing for the current set.
	3410	// By default, the test verifies that equivalent boundaries are found
	3411	// for UTF-16 and UTF-8, going forward and backward,
	3412	// alternating USET_SPAN_NOT_CONTAINED with
	3413	// either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
	3414	// Single-character options:
	3415	// 8 -- UTF-16 and UTF-8 boundaries may differ.
	3416	// Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
	3417	// or the set contains strings with unpaired surrogates
	3418	// which do not translate to valid UTF-8.
	3419	// c -- set.span() and set.complement().span() boundaries may differ.
	3420	// Cause: Set strings are not complemented.
	3421	// b -- span() and spanBack() boundaries may differ.
	3422	// Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
	3423	// and spanBack(USET_SPAN_SIMPLE) are defined to
	3424	// match with non-overlapping substrings.
	3425	// For example, with a set containing "ab" and "ba",
	3426	// span() of "aba" yields boundaries { 0, 2, 3 }
	3427	// because the initial "ab" matches from 0 to 2,
	3428	// while spanBack() yields boundaries { 0, 1, 3 }
	3429	// because the final "ba" matches from 1 to 3.
	3430	// l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
	3431	// Cause: Strings in the set overlap, and a longer match may
	3432	// require a sequence including non-longest substrings.
	3433	// For example, with a set containing "ab", "abc" and "cd",
	3434	// span(contained) of "abcd" spans the entire string
	3435	// but span(longest match) only spans the first 3 characters.
	3436	// Each "-options" first resets all options and then applies the specified options.
	3437	// A "-" without options resets the options.
	3438	// The options are also reset for each new set.
	3439	// Other strings will be spanned.
	3440	static const char *const testdata[]={
	3441	"[:ID_Continue:]",
	3442	"*",
	3443	"[:White_Space:]",
	3444	"*",
	3445	"[]",
	3446	"*",
	3447	"[\\u0000-\\U0010FFFF]",
	3448	"*",
	3449	"[\\u0000\\u0080\\u0800\\U00010000]",
	3450	"*",
	3451	"[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
	3452	"*",
	3453	"[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
	3454	"-c",
	3455	"*",
	3456	"[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
	3457	"-c",
	3458	"*",
	3459
	3460	// Overlapping strings cause overlapping attempts to match.
	3461	"[x{xy}{xya}{axy}{ax}]",
	3462	"-cl",
	3463
	3464	// More repetitions of "xya" would take too long with the recursive
	3465	// reference implementation.
	3466	// containsAll()=FALSE
	3467	// test_string 0x14
	3468	"xx"
	3469	"xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
	3470	"xx" // set.complement().span(contained) will stop between the two 'x'es.
	3471	"xyaxyaxyaxya"
	3472	"xx"
	3473	"xyaxyaxyaxya" // span() ends here.
	3474	"aaa",
	3475
	3476	// containsAll()=TRUE
	3477	// test_string 0x15
	3478	"xx"
	3479	"xyaxyaxyaxya"
	3480	"xx"
	3481	"xyaxyaxyaxya"
	3482	"xx"
	3483	"xyaxyaxyaxy",
	3484
	3485	"-bc",
	3486	// test_string 0x17
	3487	"byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
	3488	"-c",
	3489	"byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
	3490	"byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
	3491	"-",
	3492	"byaya", // span() -> { 5 }
	3493	"byay", // span() -> { 4 }
	3494	"bya", // span() -> { 3 }
	3495
	3496	// span(longest match) will not span the whole string.
	3497	"[a{ab}{bc}]",
	3498	"-cl",
	3499	// test_string 0x21
	3500	"abc",
	3501
	3502	"[a{ab}{abc}{cd}]",
	3503	"-cl",
	3504	"acdabcdabccd",
	3505
	3506	// spanBack(longest match) will not span the whole string.
	3507	"[c{ab}{bc}]",
	3508	"-cl",
	3509	"abc",
	3510
	3511	"[d{cd}{bcd}{ab}]",
	3512	"-cl",
	3513	"abbcdabcdabd",
	3514
	3515	// Test with non-ASCII set strings - test proper handling of surrogate pairs
	3516	// and UTF-8 trail bytes.
	3517	// Copies of above test sets and strings, but transliterated to have
	3518	// different code points with similar trail units.
	3519	// Previous: a b c d
	3520	// Unicode: 042B 30AB 200AB 204AB
	3521	// UTF-16: 042B 30AB D840 DCAB D841 DCAB
	3522	// UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
	3523	"[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
	3524	"-cl",
	3525	"\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
	3526
	3527	"[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
	3528	"-cl",
	3529	"\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
	3530
	3531	// Stress bookkeeping and recursion.
	3532	// The following strings are barely doable with the recursive
	3533	// reference implementation.
	3534	// The not-contained character at the end prevents an early exit from the span().
	3535	"[b{bb}]",
	3536	"-c",
	3537	// test_string 0x33
	3538	"bbbbbbbbbbbbbbbbbbbbbbbb-",
	3539	// On complement sets, span() and spanBack() get different results
	3540	// because b is not in the complement set and there is an odd number of b's
	3541	// in the test string.
	3542	"-bc",
	3543	"bbbbbbbbbbbbbbbbbbbbbbbbb-",
	3544
	3545	// Test with set strings with an initial or final code point span
	3546	// longer than 254.
	3547	"[a{" _64_a _64_a _64_a _64_a "b}"
	3548	"{a" _64_b _64_b _64_b _64_b "}]",
	3549	"-c",
	3550	_64_a _64_a _64_a _63_a "b",
	3551	_64_a _64_a _64_a _64_a "b",
	3552	_64_a _64_a _64_a _64_a "aaaabbbb",
	3553	"a" _64_b _64_b _64_b _63_b,
	3554	"a" _64_b _64_b _64_b _64_b,
	3555	"aaaabbbb" _64_b _64_b _64_b _64_b,
	3556
	3557	// Test with strings containing unpaired surrogates.
	3558	// They are not representable in UTF-8, and a leading trail surrogate
	3559	// and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
	3560	// U+20001 == \\uD840\\uDC01
	3561	// U+20400 == \\uD841\\uDC00
	3562	"[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
	3563	"-8cl",
	3564	"aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
	3565	};
	3566	uint32_t whichSpans[96]={ SPAN_ALL };
	3567	int32_t whichSpansCount=1;
	3568
	3569	UnicodeSet *sets[SET_COUNT]={ NULL };
	3570	const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
	3571
	3572	char testName[1024];
	3573	char *testNameLimit=testName;
	3574
	3575	int32_t i, j;
	3576	for(i=0; i<LENGTHOF(testdata); ++i) {
	3577	const char *s=testdata[i];
	3578	if(s[0]=='[') {
	3579	// Create new test sets from this pattern.
	3580	for(j=0; j<SET_COUNT; ++j) {
	3581	delete sets_with_str[j];
	3582	delete sets[j];
	3583	}
	3584	UErrorCode errorCode=U_ZERO_ERROR;
	3585	sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
	3586	if(U_FAILURE(errorCode)) {
	3587	errln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
	3588	break;
	3589	}
	3590	sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
	3591	sets[SLOW_NOT]->complement();
	3592	// Intermediate set: Test cloning of a frozen set.
	3593	UnicodeSet fast=new UnicodeSet(sets[SLOW]);
	3594	fast->freeze();
	3595	sets[FAST]=(UnicodeSet *)fast->clone();
	3596	delete fast;
	3597	UnicodeSet fastNot=new UnicodeSet(sets[SLOW_NOT]);
	3598	fastNot->freeze();
	3599	sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
	3600	delete fastNot;
	3601
	3602	for(j=0; j<SET_COUNT; ++j) {
	3603	sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
	3604	}
	3605
	3606	strcpy(testName, s);
	3607	testNameLimit=strchr(testName, 0);
	3608	*testNameLimit++=':';
	3609	*testNameLimit=0;
	3610
	3611	whichSpans[0]=SPAN_ALL;
	3612	whichSpansCount=1;
	3613	} else if(s[0]=='-') {
	3614	whichSpans[0]=SPAN_ALL;
	3615	whichSpansCount=1;
	3616
	3617	while(*++s!=0) {
	3618	switch(*s) {
	3619	case 'c':
	3620	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3621	~SPAN_POLARITY,
	3622	SPAN_SET,
	3623	SPAN_COMPLEMENT,
	3624	0);
	3625	break;
	3626	case 'b':
	3627	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3628	~SPAN_DIRS,
	3629	SPAN_FWD,
	3630	SPAN_BACK,
	3631	0);
	3632	break;
	3633	case 'l':
	3634	// test USET_SPAN_CONTAINED FWD & BACK, and separately
	3635	// USET_SPAN_SIMPLE only FWD, and separately
	3636	// USET_SPAN_SIMPLE only BACK
	3637	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3638	~(SPAN_DIRS\|SPAN_CONDITION),
	3639	SPAN_DIRS\|SPAN_CONTAINED,
	3640	SPAN_FWD\|SPAN_SIMPLE,
	3641	SPAN_BACK\|SPAN_SIMPLE);
	3642	break;
	3643	case '8':
	3644	whichSpansCount=addAlternative(whichSpans, whichSpansCount,
	3645	~SPAN_UTFS,
	3646	SPAN_UTF16,
	3647	SPAN_UTF8,
	3648	0);
	3649	break;
	3650	default:
	3651	errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
	3652	break;
	3653	}
	3654	}
	3655	} else if(0==strcmp(s, "*")) {
	3656	strcpy(testNameLimit, "bad_string");
	3657	for(j=0; j<whichSpansCount; ++j) {
	3658	if(whichSpansCount>1) {
	3659	sprintf(testNameLimit+10 /* strlen("bad_string") */,
	3660	"%%0x%3x",
	3661	whichSpans[j]);
	3662	}
	3663	testSpanUTF16String(sets_with_str, whichSpans[j], testName);
	3664	testSpanUTF8String(sets_with_str, whichSpans[j], testName);
	3665	}
	3666
	3667	strcpy(testNameLimit, "contents");
	3668	for(j=0; j<whichSpansCount; ++j) {
	3669	if(whichSpansCount>1) {
	3670	sprintf(testNameLimit+8 /* strlen("contents") */,
	3671	"%%0x%3x",
	3672	whichSpans[j]);
	3673	}
	3674	testSpanContents(sets_with_str, whichSpans[j], testName);
	3675	}
	3676	} else {
	3677	UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
	3678	strcpy(testNameLimit, "test_string");
	3679	for(j=0; j<whichSpansCount; ++j) {
	3680	if(whichSpansCount>1) {
	3681	sprintf(testNameLimit+11 /* strlen("test_string") */,
	3682	"%%0x%3x",
	3683	whichSpans[j]);
	3684	}
	3685	testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
	3686	}
	3687	}
	3688	}
	3689	for(j=0; j<SET_COUNT; ++j) {
	3690	delete sets_with_str[j];
	3691	delete sets[j];
	3692	}
	3693	}
	3694
	3695	// Test select patterns and strings, and test USET_SPAN_SIMPLE.
	3696	void UnicodeSetTest::TestStringSpan() {
	3697	static const char *pattern="[x{xy}{xya}{axy}{ax}]";
	3698	static const char *const string=
	3699	"xx"
	3700	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
	3701	"xx"
	3702	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
	3703	"xx"
	3704	"xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
	3705	"aaaa";
	3706
	3707	UErrorCode errorCode=U_ZERO_ERROR;
	3708	UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
	3709	UnicodeSet set(pattern16, errorCode);
	3710	if(U_FAILURE(errorCode)) {
	3711	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3712	return;
	3713	}
	3714
	3715	UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
	3716
	3717	if(set.containsAll(string16)) {
	3718	errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
	3719	}
	3720
	3721	// Remove trailing "aaaa".
	3722	string16.truncate(string16.length()-4);
	3723	if(!set.containsAll(string16)) {
	3724	errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
	3725	}
	3726
	3727	string16=UNICODE_STRING_SIMPLE("byayaxya");
	3728	const UChar *s16=string16.getBuffer();
	3729	int32_t length16=string16.length();
	3730	if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3731	set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3732	set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3733	set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 \|\|
	3734	set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 \|\|
	3735	set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
	3736	) {
	3737	errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
	3738	}
	3739
	3740	pattern="[a{ab}{abc}{cd}]";
	3741	pattern16=UnicodeString(pattern, -1, US_INV);
	3742	set.applyPattern(pattern16, errorCode);
	3743	if(U_FAILURE(errorCode)) {
	3744	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3745	return;
	3746	}
	3747	string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
	3748	s16=string16.getBuffer();
	3749	length16=string16.length();
	3750	if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 \|\|
	3751	set.span(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|
	3752	set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
	3753	) {
	3754	errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
	3755	}
	3756
	3757	pattern="[d{cd}{bcd}{ab}]";
	3758	pattern16=UnicodeString(pattern, -1, US_INV);
	3759	set.applyPattern(pattern16, errorCode).freeze();
	3760	if(U_FAILURE(errorCode)) {
	3761	errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
	3762	return;
	3763	}
	3764	string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
	3765	s16=string16.getBuffer();
	3766	length16=string16.length();
	3767	if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 \|\|
	3768	set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|
	3769	set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
	3770	) {
	3771	errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
	3772	}
	3773	}