]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
374ca955 | 3 | * Copyright (C) 1999-2004 Alan Liu ,International Business Machines Corporation and |
b75a7d8f A |
4 | * others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 10/20/99 alan Creation. | |
8 | * 03/22/2000 Madhu Added additional tests | |
9 | ********************************************************************** | |
10 | */ | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | #include "usettest.h" | |
14 | #include "unicode/uniset.h" | |
15 | #include "unicode/uchar.h" | |
16 | #include "unicode/usetiter.h" | |
17 | #include "unicode/ustring.h" | |
374ca955 A |
18 | #include "unicode/parsepos.h" |
19 | #include "unicode/symtable.h" | |
20 | #include "hash.h" | |
b75a7d8f A |
21 | |
22 | UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) { | |
23 | UnicodeString pat; | |
24 | set.toPattern(pat); | |
25 | return left + UnicodeSetTest::escape(pat); | |
26 | } | |
27 | ||
// Expands to one arm of the switch in runIndexedTest(): stores the test
// method's name in `name` and, when `exec` is set, logs a header line plus an
// empty line and then invokes the method. The terminating semicolon is
// supplied at the point of use.
#define CASE(num, method)                 \
    case num:                             \
        name = #method;                   \
        if (exec) {                       \
            logln(#method "---");         \
            logln((UnicodeString)"");     \
            method();                     \
        }                                 \
        break
36 | ||
37 | void | |
38 | UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, | |
39 | const char* &name, char* /*par*/) { | |
40 | // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest"); | |
41 | switch (index) { | |
42 | CASE(0,TestPatterns); | |
43 | CASE(1,TestAddRemove); | |
44 | CASE(2,TestCategories); | |
45 | CASE(3,TestCloneEqualHash); | |
46 | CASE(4,TestMinimalRep); | |
47 | CASE(5,TestAPI); | |
48 | CASE(6,TestScriptSet); | |
49 | CASE(7,TestPropertySet); | |
50 | CASE(8,TestClone); | |
51 | CASE(9,TestExhaustive); | |
52 | CASE(10,TestToPattern); | |
53 | CASE(11,TestIndexOf); | |
54 | CASE(12,TestStrings); | |
374ca955 A |
55 | CASE(13,Testj2268); |
56 | CASE(14,TestCloseOver); | |
57 | CASE(15,TestEscapePattern); | |
58 | CASE(16,TestInvalidCodePoint); | |
59 | CASE(17,TestSymbolTable); | |
60 | CASE(18,TestSurrogate); | |
b75a7d8f A |
61 | default: name = ""; break; |
62 | } | |
63 | } | |
64 | ||
374ca955 A |
// Marker string placed inside the expected-string arrays that TestToPattern
// hands to expectToPattern().
// NOTE(review): the arrays suggest that entries before NOT must be contained
// in the set and entries after it must not be -- confirm against
// expectToPattern(), which is not visible here.
65 | static const char NOT[] = "%%%%";
66 | ||
b75a7d8f A |
67 | /** |
68 | * UVector was improperly copying contents | |
69 | * This code will crash this is still true | |
70 | */ | |
71 | void UnicodeSetTest::Testj2268() { | |
72 | UnicodeSet t; | |
73 | t.add(UnicodeString("abc")); | |
74 | UnicodeSet test(t); | |
75 | UnicodeString ustrPat; | |
76 | test.toPattern(ustrPat, TRUE); | |
77 | } | |
78 | ||
79 | /** | |
374ca955 | 80 | * Test toPattern(). |
b75a7d8f A |
81 | */ |
82 | void UnicodeSetTest::TestToPattern() { | |
374ca955 | 83 | UErrorCode ec = U_ZERO_ERROR; |
b75a7d8f | 84 | |
374ca955 A |
85 | // Test that toPattern() round trips with syntax characters and |
86 | // whitespace. | |
87 | { | |
88 | static const char* OTHER_TOPATTERN_TESTS[] = { | |
89 | "[[:latin:]&[:greek:]]", | |
90 | "[[:latin:]-[:greek:]]", | |
91 | "[:nonspacing mark:]", | |
92 | NULL | |
93 | }; | |
94 | ||
95 | for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) { | |
96 | ec = U_ZERO_ERROR; | |
97 | UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec); | |
98 | if (U_FAILURE(ec)) { | |
99 | errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]); | |
b75a7d8f A |
100 | continue; |
101 | } | |
374ca955 A |
102 | checkPat(OTHER_TOPATTERN_TESTS[j], s); |
103 | } | |
104 | ||
105 | for (UChar32 i = 0; i <= 0x10FFFF; ++i) { | |
106 | if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) { | |
107 | ||
108 | // check various combinations to make sure they all work. | |
109 | if (i != 0 && !toPatternAux(i, i)){ | |
110 | continue; | |
111 | } | |
112 | if (!toPatternAux(0, i)){ | |
113 | continue; | |
114 | } | |
115 | if (!toPatternAux(i, 0xFFFF)){ | |
116 | continue; | |
117 | } | |
b75a7d8f A |
118 | } |
119 | } | |
120 | } | |
374ca955 A |
121 | |
122 | // Test pattern behavior of multicharacter strings. | |
123 | { | |
124 | ec = U_ZERO_ERROR; | |
125 | UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec); | |
126 | ||
127 | // This loop isn't a loop. It's here to make the compiler happy. | |
128 | // If you're curious, try removing it and changing the 'break' | |
129 | // statements (except for the last) to goto's. | |
130 | for (;;) { | |
131 | if (U_FAILURE(ec)) break; | |
132 | const char* exp1[] = {"aa", "ab", NOT, "ac", NULL}; | |
133 | expectToPattern(*s, "[a-z{aa}{ab}]", exp1); | |
134 | ||
135 | s->add("ac"); | |
136 | const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL}; | |
137 | expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2); | |
138 | ||
139 | s->applyPattern("[a-z {\\{l} {r\\}}]", ec); | |
140 | if (U_FAILURE(ec)) break; | |
141 | const char* exp3[] = {"{l", "r}", NOT, "xy", NULL}; | |
142 | expectToPattern(*s, "[a-z{r\\}}{\\{l}]", exp3); | |
143 | ||
144 | s->add("[]"); | |
145 | const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL}; | |
146 | expectToPattern(*s, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4); | |
147 | ||
148 | s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec); | |
149 | if (U_FAILURE(ec)) break; | |
150 | const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL}; | |
151 | expectToPattern(*s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5); | |
152 | ||
153 | // j2189 | |
154 | s->clear(); | |
155 | s->add(UnicodeString("abc", "")); | |
156 | s->add(UnicodeString("abc", "")); | |
157 | const char* exp6[] = {"abc", NOT, "ab", NULL}; | |
158 | expectToPattern(*s, "[{abc}]", exp6); | |
159 | ||
160 | break; | |
161 | } | |
162 | ||
163 | if (U_FAILURE(ec)) errln("FAIL: pattern parse error"); | |
164 | delete s; | |
165 | } | |
166 | ||
167 | // JB#3400: For 2 character ranges prefer [ab] to [a-b] | |
168 | UnicodeSet s; | |
169 | s.add((UChar)97, (UChar)98); // 'a', 'b' | |
170 | expectToPattern(s, "[ab]", NULL); | |
b75a7d8f A |
171 | } |
172 | ||
173 | UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) { | |
174 | ||
175 | // use Integer.toString because Utility.hex doesn't handle ints | |
176 | UnicodeString pat = ""; | |
177 | // TODO do these in hex | |
178 | //String source = "0x" + Integer.toString(start,16).toUpperCase(); | |
179 | //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); | |
180 | UnicodeString source; | |
181 | source = source + (uint32_t)start; | |
182 | if (start != end) | |
183 | source = source + ".." + (uint32_t)end; | |
184 | UnicodeSet testSet; | |
185 | testSet.add(start, end); | |
186 | return checkPat(source, testSet); | |
187 | } | |
188 | ||
189 | UBool UnicodeSetTest::checkPat(const UnicodeString& source, | |
190 | const UnicodeSet& testSet) { | |
191 | // What we want to make sure of is that a pattern generated | |
192 | // by toPattern(), with or without escaped unprintables, can | |
193 | // be passed back into the UnicodeSet constructor. | |
194 | UnicodeString pat0; | |
195 | ||
196 | testSet.toPattern(pat0, TRUE); | |
197 | ||
198 | if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE; | |
199 | ||
200 | //String pat1 = unescapeLeniently(pat0); | |
201 | //if (!checkPat(source + " (in code)", testSet, pat1)) return false; | |
202 | ||
203 | UnicodeString pat2; | |
204 | testSet.toPattern(pat2, FALSE); | |
205 | if (!checkPat(source, testSet, pat2)) return FALSE; | |
206 | ||
207 | //String pat3 = unescapeLeniently(pat2); | |
208 | // if (!checkPat(source + " (in code)", testSet, pat3)) return false; | |
209 | ||
210 | //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); | |
211 | logln((UnicodeString)source + " => " + pat0 + ", " + pat2); | |
212 | return TRUE; | |
213 | } | |
214 | ||
215 | UBool UnicodeSetTest::checkPat(const UnicodeString& source, | |
216 | const UnicodeSet& testSet, | |
217 | const UnicodeString& pat) { | |
218 | UErrorCode ec = U_ZERO_ERROR; | |
219 | UnicodeSet testSet2(pat, ec); | |
220 | if (testSet2 != testSet) { | |
221 | errln((UnicodeString)"Fail toPattern: " + source + " => " + pat); | |
222 | return FALSE; | |
223 | } | |
224 | return TRUE; | |
225 | } | |
226 | ||
227 | void | |
228 | UnicodeSetTest::TestPatterns(void) { | |
229 | UnicodeSet set; | |
230 | expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km"); | |
231 | expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz"); | |
232 | expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz"); | |
233 | expectPattern(set, UnicodeString("[-az]", ""), "--aazz"); | |
234 | expectPattern(set, UnicodeString("[az-]", ""), "--aazz"); | |
235 | expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz"); | |
236 | ||
237 | // Throw in a test of complement | |
238 | set.complement(); | |
239 | UnicodeString exp; | |
240 | exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF); | |
241 | expectPairs(set, exp); | |
242 | } | |
243 | ||
244 | void | |
245 | UnicodeSetTest::TestCategories(void) { | |
246 | UErrorCode status = U_ZERO_ERROR; | |
247 | const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:] | |
248 | UnicodeSet set(pat, status); | |
249 | if (U_FAILURE(status)) { | |
250 | errln((UnicodeString)"Fail: Can't construct set with " + pat); | |
251 | } else { | |
252 | expectContainment(set, pat, "ABC", "abc"); | |
253 | } | |
254 | ||
255 | UChar32 i; | |
256 | int32_t failures = 0; | |
257 | // Make sure generation of L doesn't pollute cached Lu set | |
258 | // First generate L, then Lu | |
259 | set.applyPattern("[:L:]", status); | |
260 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
261 | for (i=0; i<0x200; ++i) { | |
262 | UBool l = u_isalpha((UChar)i); | |
263 | if (l != set.contains(i)) { | |
264 | errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " + | |
265 | set.contains(i)); | |
266 | if (++failures == 10) break; | |
267 | } | |
268 | } | |
269 | ||
270 | set.applyPattern("[:Lu:]", status); | |
271 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
272 | for (i=0; i<0x200; ++i) { | |
273 | UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER); | |
274 | if (lu != set.contains(i)) { | |
275 | errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " + | |
276 | set.contains(i)); | |
277 | if (++failures == 20) break; | |
278 | } | |
279 | } | |
280 | } | |
281 | void | |
282 | UnicodeSetTest::TestCloneEqualHash(void) { | |
283 | UErrorCode status = U_ZERO_ERROR; | |
284 | // set1 and set2 used to be built with the obsolete constructor taking | |
285 | // UCharCategory values; replaced with pattern constructors | |
286 | // markus 20030502 | |
287 | UnicodeSet *set1=new UnicodeSet("\\p{Lowercase Letter}", status); // :Ll: Letter, lowercase | |
288 | UnicodeSet *set1a=new UnicodeSet("[:Ll:]", status); // Letter, lowercase | |
289 | if (U_FAILURE(status)){ | |
290 | errln((UnicodeString)"FAIL: Can't construst set with category->Ll"); | |
291 | return; | |
292 | } | |
293 | UnicodeSet *set2=new UnicodeSet("\\p{Decimal Number}", status); //Number, Decimal digit | |
294 | UnicodeSet *set2a=new UnicodeSet("[:Nd:]", status); //Number, Decimal digit | |
295 | if (U_FAILURE(status)){ | |
296 | errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); | |
297 | return; | |
298 | } | |
299 | ||
300 | if (*set1 != *set1a) { | |
301 | errln("FAIL: category constructor for Ll broken"); | |
302 | } | |
303 | if (*set2 != *set2a) { | |
304 | errln("FAIL: category constructor for Nd broken"); | |
305 | } | |
306 | delete set1a; | |
307 | delete set2a; | |
308 | ||
309 | logln("Testing copy construction"); | |
310 | UnicodeSet *set1copy=new UnicodeSet(*set1); | |
311 | if(*set1 != *set1copy || *set1 == *set2 || | |
312 | getPairs(*set1) != getPairs(*set1copy) || | |
313 | set1->hashCode() != set1copy->hashCode()){ | |
314 | errln("FAIL : Error in copy construction"); | |
315 | return; | |
316 | } | |
317 | ||
318 | logln("Testing =operator"); | |
319 | UnicodeSet set1equal=*set1; | |
320 | UnicodeSet set2equal=*set2; | |
321 | if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 || | |
322 | set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){ | |
323 | errln("FAIL: Error in =operator"); | |
324 | } | |
325 | ||
326 | logln("Testing clone()"); | |
327 | UnicodeSet *set1clone=(UnicodeSet*)set1->clone(); | |
328 | UnicodeSet *set2clone=(UnicodeSet*)set2->clone(); | |
329 | if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal || | |
330 | *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal || | |
331 | *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){ | |
332 | errln("FAIL: Error in clone"); | |
333 | } | |
334 | ||
335 | logln("Testing hashcode"); | |
336 | if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() || | |
337 | set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() || | |
338 | set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() || | |
339 | set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() || | |
340 | set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){ | |
341 | errln("FAIL: Error in hashCode()"); | |
342 | } | |
343 | ||
344 | delete set1; | |
345 | delete set1copy; | |
346 | delete set2; | |
347 | delete set1clone; | |
348 | delete set2clone; | |
349 | ||
350 | ||
351 | } | |
352 | void | |
353 | UnicodeSetTest::TestAddRemove(void) { | |
354 | UnicodeSet set; // Construct empty set | |
355 | doAssert(set.isEmpty() == TRUE, "set should be empty"); | |
356 | doAssert(set.size() == 0, "size should be 0"); | |
374ca955 A |
357 | set.complement(); |
358 | doAssert(set.size() == 0x110000, "size should be 0x110000"); | |
359 | set.clear(); | |
b75a7d8f A |
360 | set.add(0x0061, 0x007a); |
361 | expectPairs(set, "az"); | |
362 | doAssert(set.isEmpty() == FALSE, "set should not be empty"); | |
363 | doAssert(set.size() != 0, "size should not be equal to 0"); | |
364 | doAssert(set.size() == 26, "size should be equal to 26"); | |
365 | set.remove(0x006d, 0x0070); | |
366 | expectPairs(set, "alqz"); | |
367 | doAssert(set.size() == 22, "size should be equal to 22"); | |
368 | set.remove(0x0065, 0x0067); | |
369 | expectPairs(set, "adhlqz"); | |
370 | doAssert(set.size() == 19, "size should be equal to 19"); | |
371 | set.remove(0x0064, 0x0069); | |
372 | expectPairs(set, "acjlqz"); | |
373 | doAssert(set.size() == 16, "size should be equal to 16"); | |
374 | set.remove(0x0063, 0x0072); | |
375 | expectPairs(set, "absz"); | |
376 | doAssert(set.size() == 10, "size should be equal to 10"); | |
377 | set.add(0x0066, 0x0071); | |
378 | expectPairs(set, "abfqsz"); | |
379 | doAssert(set.size() == 22, "size should be equal to 22"); | |
380 | set.remove(0x0061, 0x0067); | |
381 | expectPairs(set, "hqsz"); | |
382 | set.remove(0x0061, 0x007a); | |
383 | expectPairs(set, ""); | |
384 | doAssert(set.isEmpty() == TRUE, "set should be empty"); | |
385 | doAssert(set.size() == 0, "size should be 0"); | |
386 | set.add(0x0061); | |
387 | doAssert(set.isEmpty() == FALSE, "set should not be empty"); | |
388 | doAssert(set.size() == 1, "size should not be equal to 1"); | |
389 | set.add(0x0062); | |
390 | set.add(0x0063); | |
391 | expectPairs(set, "ac"); | |
392 | doAssert(set.size() == 3, "size should not be equal to 3"); | |
393 | set.add(0x0070); | |
394 | set.add(0x0071); | |
395 | expectPairs(set, "acpq"); | |
396 | doAssert(set.size() == 5, "size should not be equal to 5"); | |
397 | set.clear(); | |
398 | expectPairs(set, ""); | |
399 | doAssert(set.isEmpty() == TRUE, "set should be empty"); | |
400 | doAssert(set.size() == 0, "size should be 0"); | |
401 | ||
402 | // Try removing an entire set from another set | |
403 | expectPattern(set, "[c-x]", "cx"); | |
404 | UnicodeSet set2; | |
405 | expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); | |
406 | set.removeAll(set2); | |
407 | expectPairs(set, "deluxx"); | |
408 | ||
409 | // Try adding an entire set to another set | |
410 | expectPattern(set, "[jackiemclean]", "aacceein"); | |
411 | expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); | |
412 | set.addAll(set2); | |
413 | expectPairs(set, "aacehort"); | |
414 | doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); | |
415 | ||
416 | // Try retaining an set of elements contained in another set (intersection) | |
417 | UnicodeSet set3; | |
418 | expectPattern(set3, "[a-c]", "ac"); | |
419 | doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3"); | |
420 | set3.remove(0x0062); | |
421 | expectPairs(set3, "aacc"); | |
422 | doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); | |
423 | set.retainAll(set3); | |
424 | expectPairs(set, "aacc"); | |
425 | doAssert(set.size() == set3.size(), "set.size() should be set3.size()"); | |
426 | doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); | |
427 | set.clear(); | |
428 | doAssert(set.size() != set3.size(), "set.size() != set3.size()"); | |
429 | ||
430 | // Test commutativity | |
431 | expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); | |
432 | expectPattern(set2, "[jackiemclean]", "aacceein"); | |
433 | set.addAll(set2); | |
434 | expectPairs(set, "aacehort"); | |
435 | doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); | |
436 | ||
437 | ||
438 | ||
439 | ||
440 | } | |
441 | ||
442 | /** | |
443 | * Make sure minimal representation is maintained. | |
444 | */ | |
445 | void UnicodeSetTest::TestMinimalRep() { | |
446 | UErrorCode status = U_ZERO_ERROR; | |
447 | // This is pretty thoroughly tested by checkCanonicalRep() | |
448 | // run against the exhaustive operation results. Use the code | |
449 | // here for debugging specific spot problems. | |
450 | ||
451 | // 1 overlap against 2 | |
452 | UnicodeSet set("[h-km-q]", status); | |
453 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
454 | UnicodeSet set2("[i-o]", status); | |
455 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
456 | set.addAll(set2); | |
457 | expectPairs(set, "hq"); | |
458 | // right | |
459 | set.applyPattern("[a-m]", status); | |
460 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
461 | set2.applyPattern("[e-o]", status); | |
462 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
463 | set.addAll(set2); | |
464 | expectPairs(set, "ao"); | |
465 | // left | |
466 | set.applyPattern("[e-o]", status); | |
467 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
468 | set2.applyPattern("[a-m]", status); | |
469 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
470 | set.addAll(set2); | |
471 | expectPairs(set, "ao"); | |
472 | // 1 overlap against 3 | |
473 | set.applyPattern("[a-eg-mo-w]", status); | |
474 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
475 | set2.applyPattern("[d-q]", status); | |
476 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
477 | set.addAll(set2); | |
478 | expectPairs(set, "aw"); | |
479 | } | |
480 | ||
481 | void UnicodeSetTest::TestAPI() { | |
482 | UErrorCode status = U_ZERO_ERROR; | |
483 | // default ct | |
484 | UnicodeSet set; | |
485 | if (!set.isEmpty() || set.getRangeCount() != 0) { | |
486 | errln((UnicodeString)"FAIL, set should be empty but isn't: " + | |
487 | set); | |
488 | } | |
489 | ||
490 | // clear(), isEmpty() | |
491 | set.add(0x0061); | |
492 | if (set.isEmpty()) { | |
493 | errln((UnicodeString)"FAIL, set shouldn't be empty but is: " + | |
494 | set); | |
495 | } | |
496 | set.clear(); | |
497 | if (!set.isEmpty()) { | |
498 | errln((UnicodeString)"FAIL, set should be empty but isn't: " + | |
499 | set); | |
500 | } | |
501 | ||
502 | // size() | |
503 | set.clear(); | |
504 | if (set.size() != 0) { | |
505 | errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() + | |
506 | ": " + set); | |
507 | } | |
508 | set.add(0x0061); | |
509 | if (set.size() != 1) { | |
510 | errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() + | |
511 | ": " + set); | |
512 | } | |
513 | set.add(0x0031, 0x0039); | |
514 | if (set.size() != 10) { | |
515 | errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() + | |
516 | ": " + set); | |
517 | } | |
518 | ||
519 | // contains(first, last) | |
520 | set.clear(); | |
521 | set.applyPattern("[A-Y 1-8 b-d l-y]", status); | |
522 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
523 | for (int32_t i = 0; i<set.getRangeCount(); ++i) { | |
524 | UChar32 a = set.getRangeStart(i); | |
525 | UChar32 b = set.getRangeEnd(i); | |
526 | if (!set.contains(a, b)) { | |
527 | errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b + | |
528 | " but doesn't: " + set); | |
529 | } | |
530 | if (set.contains((UChar32)(a-1), b)) { | |
531 | errln((UnicodeString)"FAIL, shouldn't contain " + | |
532 | (unsigned short)(a-1) + '-' + (unsigned short)b + | |
533 | " but does: " + set); | |
534 | } | |
535 | if (set.contains(a, (UChar32)(b+1))) { | |
536 | errln((UnicodeString)"FAIL, shouldn't contain " + | |
537 | (unsigned short)a + '-' + (unsigned short)(b+1) + | |
538 | " but does: " + set); | |
539 | } | |
540 | } | |
541 | ||
542 | // Ported InversionList test. | |
543 | UnicodeSet a((UChar32)3,(UChar32)10); | |
544 | UnicodeSet b((UChar32)7,(UChar32)15); | |
545 | UnicodeSet c; | |
546 | ||
547 | logln((UnicodeString)"a [3-10]: " + a); | |
548 | logln((UnicodeString)"b [7-15]: " + b); | |
374ca955 A |
549 | c = a; |
550 | c.addAll(b); | |
b75a7d8f A |
551 | UnicodeSet exp((UChar32)3,(UChar32)15); |
552 | if (c == exp) { | |
553 | logln((UnicodeString)"c.set(a).add(b): " + c); | |
554 | } else { | |
555 | errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp); | |
556 | } | |
557 | c.complement(); | |
558 | exp.set((UChar32)0, (UChar32)2); | |
559 | exp.add((UChar32)16, UnicodeSet::MAX_VALUE); | |
560 | if (c == exp) { | |
561 | logln((UnicodeString)"c.complement(): " + c); | |
562 | } else { | |
563 | errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); | |
564 | } | |
565 | c.complement(); | |
566 | exp.set((UChar32)3, (UChar32)15); | |
567 | if (c == exp) { | |
568 | logln((UnicodeString)"c.complement(): " + c); | |
569 | } else { | |
570 | errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); | |
571 | } | |
374ca955 A |
572 | c = a; |
573 | c.complementAll(b); | |
b75a7d8f A |
574 | exp.set((UChar32)3,(UChar32)6); |
575 | exp.add((UChar32)11,(UChar32) 15); | |
576 | if (c == exp) { | |
577 | logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c); | |
578 | } else { | |
579 | errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp); | |
580 | } | |
581 | ||
582 | exp = c; | |
583 | bitsToSet(setToBits(c), c); | |
584 | if (c == exp) { | |
585 | logln((UnicodeString)"bitsToSet(setToBits(c)): " + c); | |
586 | } else { | |
587 | errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); | |
588 | } | |
589 | ||
590 | // Additional tests for coverage JB#2118 | |
591 | //UnicodeSet::complement(class UnicodeString const &) | |
592 | //UnicodeSet::complementAll(class UnicodeString const &) | |
593 | //UnicodeSet::containsNone(class UnicodeSet const &) | |
594 | //UnicodeSet::containsNone(long,long) | |
595 | //UnicodeSet::containsSome(class UnicodeSet const &) | |
596 | //UnicodeSet::containsSome(long,long) | |
597 | //UnicodeSet::removeAll(class UnicodeString const &) | |
598 | //UnicodeSet::retain(long) | |
599 | //UnicodeSet::retainAll(class UnicodeString const &) | |
600 | //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) | |
601 | //UnicodeSetIterator::getString(void) | |
602 | set.clear(); | |
603 | set.complement("ab"); | |
604 | exp.applyPattern("[{ab}]", status); | |
605 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
606 | if (set != exp) { errln("FAIL: complement(\"ab\")"); return; } | |
607 | ||
608 | UnicodeSetIterator iset(set); | |
609 | if (!iset.next() || !iset.isString()) { | |
610 | errln("FAIL: UnicodeSetIterator::next/isString"); | |
611 | } else if (iset.getString() != "ab") { | |
612 | errln("FAIL: UnicodeSetIterator::getString"); | |
613 | } | |
614 | ||
615 | set.add((UChar32)0x61, (UChar32)0x7A); | |
616 | set.complementAll("alan"); | |
617 | exp.applyPattern("[{ab}b-kmo-z]", status); | |
618 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
619 | if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; } | |
620 | ||
621 | exp.applyPattern("[a-z]", status); | |
622 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
623 | if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } | |
624 | if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } | |
625 | exp.applyPattern("[aln]", status); | |
626 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
627 | if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } | |
628 | if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } | |
629 | ||
630 | if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) { | |
631 | errln("FAIL: containsNone(UChar32, UChar32)"); | |
632 | } | |
633 | if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) { | |
634 | errln("FAIL: containsSome(UChar32, UChar32)"); | |
635 | } | |
636 | if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) { | |
637 | errln("FAIL: containsNone(UChar32, UChar32)"); | |
638 | } | |
639 | if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) { | |
640 | errln("FAIL: containsSome(UChar32, UChar32)"); | |
641 | } | |
642 | ||
643 | set.removeAll("liu"); | |
644 | exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status); | |
645 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
646 | if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; } | |
647 | ||
648 | set.retainAll("star"); | |
649 | exp.applyPattern("[rst]", status); | |
650 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
651 | if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; } | |
652 | ||
653 | set.retain((UChar32)0x73); | |
654 | exp.applyPattern("[s]", status); | |
655 | if (U_FAILURE(status)) { errln("FAIL"); return; } | |
656 | if (set != exp) { errln("FAIL: retain('s')"); return; } | |
657 | ||
658 | uint16_t buf[32]; | |
659 | int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status); | |
660 | if (U_FAILURE(status)) { errln("FAIL: serialize"); return; } | |
661 | if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) { | |
662 | errln("FAIL: serialize"); | |
663 | return; | |
664 | } | |
665 | } | |
666 | ||
667 | void UnicodeSetTest::TestStrings() { | |
668 | UErrorCode ec = U_ZERO_ERROR; | |
669 | ||
670 | UnicodeSet* testList[] = { | |
671 | UnicodeSet::createFromAll("abc"), | |
672 | new UnicodeSet("[a-c]", ec), | |
673 | ||
674 | &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")), | |
675 | new UnicodeSet("[{ll}{ch}a-z]", ec), | |
676 | ||
677 | UnicodeSet::createFrom("ab}c"), | |
678 | new UnicodeSet("[{ab\\}c}]", ec), | |
679 | ||
680 | &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')), | |
681 | new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec), | |
682 | ||
683 | NULL | |
684 | }; | |
685 | ||
686 | if (U_FAILURE(ec)) { | |
687 | errln("FAIL: couldn't construct test sets"); | |
688 | } | |
689 | ||
690 | for (int32_t i = 0; testList[i] != NULL; i+=2) { | |
691 | if (U_SUCCESS(ec)) { | |
692 | UnicodeString pat0, pat1; | |
693 | testList[i]->toPattern(pat0, TRUE); | |
694 | testList[i+1]->toPattern(pat1, TRUE); | |
695 | if (*testList[i] == *testList[i+1]) { | |
696 | logln((UnicodeString)"Ok: " + pat0 + " == " + pat1); | |
697 | } else { | |
698 | logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1); | |
699 | } | |
700 | } | |
701 | delete testList[i]; | |
702 | delete testList[i+1]; | |
703 | } | |
704 | } | |
705 | ||
b75a7d8f A |
706 | /** |
707 | * Test the [:Latin:] syntax. | |
708 | */ | |
709 | void UnicodeSetTest::TestScriptSet() { | |
710 | expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1")); | |
711 | ||
712 | expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA"); | |
713 | ||
714 | /* Jitterbug 1423 */ | |
715 | expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); | |
716 | ||
717 | } | |
718 | ||
719 | /** | |
720 | * Test the [:Latin:] syntax. | |
721 | */ | |
722 | void UnicodeSetTest::TestPropertySet() { | |
723 | static const char* DATA[] = { | |
724 | // Pattern, Chars IN, Chars NOT in | |
725 | ||
726 | "[:Latin:]", | |
727 | "aA", | |
728 | "\\u0391\\u03B1", | |
729 | ||
730 | "[\\p{Greek}]", | |
731 | "\\u0391\\u03B1", | |
732 | "aA", | |
733 | ||
734 | "\\P{ GENERAL Category = upper case letter }", | |
735 | "abc", | |
736 | "ABC", | |
737 | ||
738 | // Combining class: @since ICU 2.2 | |
739 | // Check both symbolic and numeric | |
740 | "\\p{ccc=Nukta}", | |
741 | "\\u0ABC", | |
742 | "abc", | |
743 | ||
744 | "\\p{Canonical Combining Class = 11}", | |
745 | "\\u05B1", | |
746 | "\\u05B2", | |
747 | ||
748 | "[:c c c = iota subscript :]", | |
749 | "\\u0345", | |
750 | "xyz", | |
751 | ||
752 | // Bidi class: @since ICU 2.2 | |
753 | "\\p{bidiclass=lefttoright}", | |
754 | "abc", | |
755 | "\\u0671\\u0672", | |
756 | ||
757 | // Binary properties: @since ICU 2.2 | |
758 | "\\p{ideographic}", | |
759 | "\\u4E0A", | |
760 | "x", | |
761 | ||
762 | "[:math=false:]", | |
374ca955 A |
763 | "q)*(", |
764 | // weiv: )(and * were removed from math in Unicode 4.0.1 | |
765 | //"(*+)", | |
766 | "+<>^", | |
b75a7d8f A |
767 | |
768 | // JB#1767 \N{}, \p{ASCII} | |
769 | "[:Ascii:]", | |
770 | "abc\\u0000\\u007F", | |
771 | "\\u0080\\u4E00", | |
772 | ||
773 | "[\\N{ latin small letter a }[:name= latin small letter z:]]", | |
774 | "az", | |
775 | "qrs", | |
776 | ||
777 | // JB#2015 | |
778 | "[:any:]", | |
779 | "a\\U0010FFFF", | |
780 | "", | |
781 | ||
782 | "[:nv=0.5:]", | |
783 | "\\u00BD\\u0F2A", | |
784 | "\\u00BC", | |
785 | ||
786 | // JB#2653: Age | |
787 | "[:Age=1.1:]", | |
788 | "\\u03D6", // 1.1 | |
789 | "\\u03D8\\u03D9", // 3.2 | |
790 | ||
791 | "[:Age=3.1:]", | |
792 | "\\u1800\\u3400\\U0002f800", | |
793 | "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", | |
794 | ||
795 | // JB#2350: Case_Sensitive | |
796 | "[:Case Sensitive:]", | |
797 | "A\\u1FFC\\U00010410", | |
798 | ";\\u00B4\\U00010500", | |
799 | ||
800 | // JB#2832: C99-compatibility props | |
801 | "[:blank:]", | |
802 | " \\u0009", | |
803 | "1-9A-Z", | |
804 | ||
805 | "[:graph:]", | |
806 | "19AZ", | |
807 | " \\u0003\\u0007\\u0009\\u000A\\u000D", | |
808 | ||
809 | "[:punct:]", | |
810 | "!@#%&*()[]{}-_\\/;:,.?'\"", | |
811 | "09azAZ", | |
812 | ||
813 | "[:xdigit:]", | |
814 | "09afAF", | |
815 | "gG!", | |
816 | ||
817 | // Regex compatibility test | |
818 | "[-b]", // leading '-' is literal | |
819 | "-b", | |
820 | "ac", | |
821 | ||
822 | "[^-b]", // leading '-' is literal | |
823 | "ac", | |
824 | "-b", | |
825 | ||
826 | "[b-]", // trailing '-' is literal | |
827 | "-b", | |
828 | "ac", | |
829 | ||
830 | "[^b-]", // trailing '-' is literal | |
831 | "ac", | |
374ca955 A |
832 | "-b", |
833 | ||
834 | "[a-b-]", // trailing '-' is literal | |
835 | "ab-", | |
836 | "c=", | |
837 | ||
838 | "[[a-q]&[p-z]-]", // trailing '-' is literal | |
839 | "pq-", | |
840 | "or=", | |
841 | ||
842 | "[\\s|\\)|:|$|\\>]", // from regex tests | |
843 | "s|):$>", | |
844 | "abc", | |
845 | ||
846 | "[\\uDC00cd]", // JB#2906: isolated trail at start | |
847 | "cd\\uDC00", | |
848 | "ab\\uD800\\U00010000", | |
849 | ||
850 | "[ab\\uD800]", // JB#2906: isolated trail at start | |
851 | "ab\\uD800", | |
852 | "cd\\uDC00\\U00010000", | |
853 | ||
854 | "[ab\\uD800cd]", // JB#2906: isolated lead in middle | |
855 | "abcd\\uD800", | |
856 | "ef\\uDC00\\U00010000", | |
857 | ||
858 | "[ab\\uDC00cd]", // JB#2906: isolated trail in middle | |
859 | "abcd\\uDC00", | |
860 | "ef\\uD800\\U00010000", | |
861 | ||
862 | "[:^lccc=0:]", // Lead canonical class | |
863 | "\\u0300\\u0301", | |
864 | "abcd\\u00c0\\u00c5", | |
865 | ||
866 | "[:^tccc=0:]", // Trail canonical class | |
867 | "\\u0300\\u0301\\u00c0\\u00c5", | |
868 | "abcd", | |
869 | ||
870 | "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class | |
871 | "\\u0300\\u0301\\u00c0\\u00c5", | |
872 | "abcd", | |
873 | ||
874 | "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) | |
875 | "", | |
876 | "abcd\\u0300\\u0301\\u00c0\\u00c5", | |
877 | ||
878 | "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not | |
879 | "\\u0F73\\u0F75\\u0F81", | |
880 | "abcd\\u0300\\u0301\\u00c0\\u00c5", | |
881 | ||
b75a7d8f A |
882 | }; |
883 | ||
884 | static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]); | |
885 | ||
886 | for (int32_t i=0; i<DATA_LEN; i+=3) { | |
887 | expectContainment(DATA[i], CharsToUnicodeString(DATA[i+1]), | |
888 | CharsToUnicodeString(DATA[i+2])); | |
889 | } | |
890 | } | |
891 | ||
892 | /** | |
893 | * Test cloning of UnicodeSet. For C++, we test the copy constructor. | |
894 | */ | |
895 | void UnicodeSetTest::TestClone() { | |
896 | UErrorCode ec = U_ZERO_ERROR; | |
897 | UnicodeSet s("[abcxyz]", ec); | |
898 | UnicodeSet t(s); | |
899 | expectContainment(t, "abc", "def"); | |
900 | } | |
901 | ||
902 | /** | |
903 | * Test the indexOf() and charAt() methods. | |
904 | */ | |
905 | void UnicodeSetTest::TestIndexOf() { | |
906 | UErrorCode ec = U_ZERO_ERROR; | |
907 | UnicodeSet set("[a-cx-y3578]", ec); | |
908 | if (U_FAILURE(ec)) { | |
909 | errln("FAIL: UnicodeSet constructor"); | |
910 | return; | |
911 | } | |
912 | for (int32_t i=0; i<set.size(); ++i) { | |
913 | UChar32 c = set.charAt(i); | |
914 | if (set.indexOf(c) != i) { | |
915 | errln("FAIL: charAt(%d) = %X => indexOf() => %d", | |
916 | i, c, set.indexOf(c)); | |
917 | } | |
918 | } | |
919 | UChar32 c = set.charAt(set.size()); | |
920 | if (c != -1) { | |
921 | errln("FAIL: charAt(<out of range>) = %X", c); | |
922 | } | |
923 | int32_t j = set.indexOf((UChar32)0x71/*'q'*/); | |
924 | if (j != -1) { | |
925 | errln((UnicodeString)"FAIL: indexOf('q') = " + j); | |
926 | } | |
927 | } | |
928 | ||
929 | /** | |
930 | * Test closure API. | |
931 | */ | |
932 | void UnicodeSetTest::TestCloseOver() { | |
933 | UErrorCode ec = U_ZERO_ERROR; | |
934 | ||
935 | char CASE[] = {(char)USET_CASE}; | |
374ca955 | 936 | char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; |
b75a7d8f A |
937 | const char* DATA[] = { |
938 | // selector, input, output | |
939 | CASE, | |
940 | "[aq\\u00DF{Bc}{bC}{Fi}]", | |
941 | "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]", | |
942 | ||
943 | CASE, | |
944 | "[\\u01F1]", // 'DZ' | |
945 | "[\\u01F1\\u01F2\\u01F3]", | |
946 | ||
947 | CASE, | |
948 | "[\\u1FB4]", | |
949 | "[\\u1FB4{\\u03AC\\u03B9}]", | |
950 | ||
951 | CASE, | |
952 | "[{F\\uFB01}]", | |
953 | "[\\uFB03{ffi}]", | |
954 | ||
955 | CASE, // make sure binary search finds limits | |
956 | "[a\\uFF3A]", | |
957 | "[aA\\uFF3A\\uFF5A]", | |
958 | ||
959 | CASE, | |
960 | "[a-z]","[A-Za-z\\u017F\\u212A]", | |
961 | CASE, | |
962 | "[abc]","[A-Ca-c]", | |
963 | CASE, | |
964 | "[ABC]","[A-Ca-c]", | |
965 | ||
374ca955 A |
966 | CASE_MAPPINGS, |
967 | "[aq\\u00DF{Bc}{bC}{Fi}]", | |
968 | "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]", | |
969 | ||
970 | CASE_MAPPINGS, | |
971 | "[\\u01F1]", // 'DZ' | |
972 | "[\\u01F1\\u01F2\\u01F3]", | |
973 | ||
974 | CASE_MAPPINGS, | |
975 | "[a-z]", | |
976 | "[A-Za-z]", | |
977 | ||
b75a7d8f A |
978 | NULL |
979 | }; | |
980 | ||
981 | UnicodeSet s; | |
982 | UnicodeSet t; | |
983 | for (int32_t i=0; DATA[i]!=NULL; i+=3) { | |
984 | int32_t selector = DATA[i][0]; | |
985 | UnicodeString pat(DATA[i+1]); | |
986 | UnicodeString exp(DATA[i+2]); | |
987 | s.applyPattern(pat, ec); | |
988 | s.closeOver(selector); | |
989 | t.applyPattern(exp, ec); | |
990 | if (U_FAILURE(ec)) { | |
991 | errln("FAIL: applyPattern failed"); | |
992 | continue; | |
993 | } | |
994 | if (s == t) { | |
995 | logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp); | |
996 | } else { | |
997 | UnicodeString buf; | |
998 | errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " + | |
999 | s.toPattern(buf, TRUE) + ", expected " + exp); | |
1000 | } | |
1001 | } | |
1002 | ||
1003 | // Test the pattern API | |
374ca955 | 1004 | s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec); |
b75a7d8f A |
1005 | if (U_FAILURE(ec)) { |
1006 | errln("FAIL: applyPattern failed"); | |
1007 | } else { | |
1008 | expectContainment(s, "abcABC", "defDEF"); | |
1009 | } | |
374ca955 | 1010 | UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec); |
b75a7d8f A |
1011 | if (U_FAILURE(ec)) { |
1012 | errln("FAIL: constructor failed"); | |
1013 | } else { | |
1014 | expectContainment(v, "defDEF", "abcABC"); | |
1015 | } | |
374ca955 A |
1016 | UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec); |
1017 | if (U_FAILURE(ec)) { | |
1018 | errln("FAIL: construct w/case mappings failed"); | |
1019 | } else { | |
1020 | expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")); | |
1021 | } | |
b75a7d8f A |
1022 | } |
1023 | ||
1024 | void UnicodeSetTest::TestEscapePattern() { | |
1025 | const char pattern[] = | |
374ca955 | 1026 | "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; |
b75a7d8f | 1027 | const char exp[] = |
374ca955 | 1028 | "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; |
b75a7d8f | 1029 | // We test this with two passes; in the second pass we |
374ca955 A |
1030 | // pre-unescape the pattern. Since U+200E is rule whitespace, |
1031 | // this fails -- which is what we expect. | |
b75a7d8f A |
1032 | for (int32_t pass=1; pass<=2; ++pass) { |
1033 | UErrorCode ec = U_ZERO_ERROR; | |
1034 | UnicodeString pat(pattern); | |
1035 | if (pass==2) { | |
1036 | pat = pat.unescape(); | |
1037 | } | |
1038 | // Pattern is only good for pass 1 | |
1039 | UBool isPatternValid = (pass==1); | |
1040 | ||
1041 | UnicodeSet set(pat, ec); | |
1042 | if (U_SUCCESS(ec) != isPatternValid){ | |
1043 | errln((UnicodeString)"FAIL: applyPattern(" + | |
1044 | escape(pat) + ") => " + | |
1045 | u_errorName(ec)); | |
1046 | continue; | |
1047 | } | |
1048 | if (U_FAILURE(ec)) { | |
1049 | continue; | |
1050 | } | |
1051 | if (set.contains((UChar)0x0644)){ | |
1052 | errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)"); | |
1053 | } | |
1054 | ||
1055 | UnicodeString newpat; | |
1056 | set.toPattern(newpat, TRUE); | |
1057 | if (newpat == exp) { | |
1058 | logln(escape(pat) + " => " + newpat); | |
1059 | } else { | |
1060 | errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat); | |
1061 | } | |
1062 | ||
1063 | for (int32_t i=0; i<set.getRangeCount(); ++i) { | |
1064 | UnicodeString str("Range "); | |
1065 | str.append((UChar)(0x30 + i)) | |
1066 | .append(": ") | |
1067 | .append((UChar32)set.getRangeStart(i)) | |
1068 | .append(" - ") | |
1069 | .append((UChar32)set.getRangeEnd(i)); | |
1070 | str = str + " (" + set.getRangeStart(i) + " - " + | |
1071 | set.getRangeEnd(i) + ")"; | |
1072 | if (set.getRangeStart(i) < 0) { | |
1073 | errln((UnicodeString)"FAIL: " + escape(str)); | |
1074 | } else { | |
1075 | logln(escape(str)); | |
1076 | } | |
1077 | } | |
1078 | } | |
1079 | } | |
1080 | ||
1081 | void UnicodeSetTest::expectRange(const UnicodeString& label, | |
1082 | const UnicodeSet& set, | |
1083 | UChar32 start, UChar32 end) { | |
1084 | UnicodeSet exp(start, end); | |
1085 | UnicodeString pat; | |
1086 | if (set == exp) { | |
1087 | logln(label + " => " + set.toPattern(pat, TRUE)); | |
1088 | } else { | |
1089 | UnicodeString xpat; | |
1090 | errln((UnicodeString)"FAIL: " + label + " => " + | |
1091 | set.toPattern(pat, TRUE) + | |
1092 | ", expected " + exp.toPattern(xpat, TRUE)); | |
1093 | } | |
1094 | } | |
1095 | ||
1096 | void UnicodeSetTest::TestInvalidCodePoint() { | |
1097 | ||
1098 | const UChar32 DATA[] = { | |
1099 | // Test range Expected range | |
1100 | 0, 0x10FFFF, 0, 0x10FFFF, | |
1101 | (UChar32)-1, 8, 0, 8, | |
1102 | 8, 0x110000, 8, 0x10FFFF | |
1103 | }; | |
1104 | const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]); | |
1105 | ||
1106 | UnicodeString pat; | |
1107 | int32_t i; | |
1108 | ||
1109 | for (i=0; i<DATA_LENGTH; i+=4) { | |
1110 | UChar32 start = DATA[i]; | |
1111 | UChar32 end = DATA[i+1]; | |
1112 | UChar32 xstart = DATA[i+2]; | |
1113 | UChar32 xend = DATA[i+3]; | |
1114 | ||
1115 | // Try various API using the test code points | |
1116 | ||
1117 | UnicodeSet set(start, end); | |
1118 | expectRange((UnicodeString)"ct(" + start + "," + end + ")", | |
1119 | set, xstart, xend); | |
1120 | ||
1121 | set.clear(); | |
1122 | set.set(start, end); | |
1123 | expectRange((UnicodeString)"set(" + start + "," + end + ")", | |
1124 | set, xstart, xend); | |
1125 | ||
1126 | UBool b = set.contains(start); | |
1127 | b = set.contains(start, end); | |
1128 | b = set.containsNone(start, end); | |
1129 | b = set.containsSome(start, end); | |
1130 | ||
374ca955 | 1131 | /*int32_t index = set.indexOf(start);*/ |
b75a7d8f A |
1132 | |
1133 | set.clear(); | |
1134 | set.add(start); | |
1135 | set.add(start, end); | |
1136 | expectRange((UnicodeString)"add(" + start + "," + end + ")", | |
1137 | set, xstart, xend); | |
1138 | ||
1139 | set.set(0, 0x10FFFF); | |
1140 | set.retain(start, end); | |
1141 | expectRange((UnicodeString)"retain(" + start + "," + end + ")", | |
1142 | set, xstart, xend); | |
1143 | set.retain(start); | |
1144 | ||
1145 | set.set(0, 0x10FFFF); | |
1146 | set.remove(start); | |
1147 | set.remove(start, end); | |
1148 | set.complement(); | |
1149 | expectRange((UnicodeString)"!remove(" + start + "," + end + ")", | |
1150 | set, xstart, xend); | |
1151 | ||
1152 | set.set(0, 0x10FFFF); | |
1153 | set.complement(start, end); | |
1154 | set.complement(); | |
1155 | expectRange((UnicodeString)"!complement(" + start + "," + end + ")", | |
1156 | set, xstart, xend); | |
1157 | set.complement(start); | |
1158 | } | |
1159 | ||
1160 | const UChar32 DATA2[] = { | |
1161 | 0, | |
1162 | 0x10FFFF, | |
1163 | (UChar32)-1, | |
1164 | 0x110000 | |
1165 | }; | |
1166 | const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]); | |
1167 | ||
1168 | for (i=0; i<DATA2_LENGTH; ++i) { | |
1169 | UChar32 c = DATA2[i], end = 0x10FFFF; | |
1170 | UBool valid = (c >= 0 && c <= 0x10FFFF); | |
1171 | ||
1172 | UnicodeSet set(0, 0x10FFFF); | |
1173 | ||
1174 | // For single-codepoint contains, invalid codepoints are NOT contained | |
1175 | UBool b = set.contains(c); | |
1176 | if (b == valid) { | |
1177 | logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c + | |
1178 | ") = " + b); | |
1179 | } else { | |
1180 | errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c + | |
1181 | ") = " + b); | |
1182 | } | |
1183 | ||
1184 | // For codepoint range contains, containsNone, and containsSome, | |
1185 | // invalid or empty (start > end) ranges have UNDEFINED behavior. | |
1186 | b = set.contains(c, end); | |
1187 | logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c + | |
1188 | "," + end + ") = " + b); | |
1189 | ||
1190 | b = set.containsNone(c, end); | |
1191 | logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c + | |
1192 | "," + end + ") = " + b); | |
1193 | ||
1194 | b = set.containsSome(c, end); | |
1195 | logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c + | |
1196 | "," + end + ") = " + b); | |
1197 | ||
1198 | int32_t index = set.indexOf(c); | |
1199 | if ((index >= 0) == valid) { | |
1200 | logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c + | |
1201 | ") = " + index); | |
1202 | } else { | |
1203 | errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c + | |
1204 | ") = " + index); | |
1205 | } | |
1206 | } | |
1207 | } | |
1208 | ||
374ca955 A |
1209 | // Used by TestSymbolTable |
1210 | class TokenSymbolTable : public SymbolTable { | |
1211 | public: | |
1212 | Hashtable contents; | |
1213 | ||
1214 | TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) { | |
1215 | contents.setValueDeleter(uhash_deleteUnicodeString); | |
1216 | } | |
1217 | ||
1218 | ~TokenSymbolTable() {} | |
1219 | ||
1220 | /** | |
1221 | * (Non-SymbolTable API) Add the given variable and value to | |
1222 | * the table. Variable should NOT contain leading '$'. | |
1223 | */ | |
1224 | void add(const UnicodeString& var, const UnicodeString& value, | |
1225 | UErrorCode& ec) { | |
1226 | if (U_SUCCESS(ec)) { | |
1227 | contents.put(var, new UnicodeString(value), ec); | |
1228 | } | |
1229 | } | |
1230 | ||
1231 | /** | |
1232 | * SymbolTable API | |
1233 | */ | |
1234 | virtual const UnicodeString* lookup(const UnicodeString& s) const { | |
1235 | return (const UnicodeString*) contents.get(s); | |
1236 | } | |
1237 | ||
1238 | /** | |
1239 | * SymbolTable API | |
1240 | */ | |
1241 | virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const { | |
1242 | return NULL; | |
1243 | } | |
1244 | ||
1245 | /** | |
1246 | * SymbolTable API | |
1247 | */ | |
1248 | virtual UnicodeString parseReference(const UnicodeString& text, | |
1249 | ParsePosition& pos, int32_t limit) const { | |
1250 | int32_t start = pos.getIndex(); | |
1251 | int32_t i = start; | |
1252 | UnicodeString result; | |
1253 | while (i < limit) { | |
1254 | UChar c = text.charAt(i); | |
1255 | if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { | |
1256 | break; | |
1257 | } | |
1258 | ++i; | |
1259 | } | |
1260 | if (i == start) { // No valid name chars | |
1261 | return result; // Indicate failure with empty string | |
1262 | } | |
1263 | pos.setIndex(i); | |
1264 | text.extractBetween(start, i, result); | |
1265 | return result; | |
1266 | } | |
1267 | }; | |
1268 | ||
1269 | void UnicodeSetTest::TestSymbolTable() { | |
1270 | // Multiple test cases can be set up here. Each test case | |
1271 | // is terminated by null: | |
1272 | // var, value, var, value,..., input pat., exp. output pat., null | |
1273 | const char* DATA[] = { | |
1274 | "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL, | |
1275 | "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL, | |
1276 | "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL, | |
1277 | NULL | |
1278 | }; | |
1279 | ||
1280 | for (int32_t i=0; DATA[i]!=NULL; ++i) { | |
1281 | UErrorCode ec = U_ZERO_ERROR; | |
1282 | TokenSymbolTable sym(ec); | |
1283 | if (U_FAILURE(ec)) { | |
1284 | errln("FAIL: couldn't construct TokenSymbolTable"); | |
1285 | continue; | |
1286 | } | |
1287 | ||
1288 | // Set up variables | |
1289 | while (DATA[i+2] != NULL) { | |
1290 | sym.add(DATA[i], DATA[i+1], ec); | |
1291 | if (U_FAILURE(ec)) { | |
1292 | errln("FAIL: couldn't add to TokenSymbolTable"); | |
1293 | continue; | |
1294 | } | |
1295 | i += 2; | |
1296 | } | |
1297 | ||
1298 | // Input pattern and expected output pattern | |
1299 | UnicodeString inpat = DATA[i], exppat = DATA[i+1]; | |
1300 | i += 2; | |
1301 | ||
1302 | ParsePosition pos(0); | |
1303 | UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec); | |
1304 | if (U_FAILURE(ec)) { | |
1305 | errln("FAIL: couldn't construct UnicodeSet"); | |
1306 | continue; | |
1307 | } | |
1308 | ||
1309 | // results | |
1310 | if (pos.getIndex() != inpat.length()) { | |
1311 | errln((UnicodeString)"Failed to read to end of string \"" | |
1312 | + inpat + "\": read to " | |
1313 | + pos.getIndex() + ", length is " | |
1314 | + inpat.length()); | |
1315 | } | |
1316 | ||
1317 | UnicodeSet us2(exppat, ec); | |
1318 | if (U_FAILURE(ec)) { | |
1319 | errln("FAIL: couldn't construct expected UnicodeSet"); | |
1320 | continue; | |
1321 | } | |
1322 | ||
1323 | UnicodeString a, b; | |
1324 | if (us != us2) { | |
1325 | errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) + | |
1326 | ", expected " + us2.toPattern(b, TRUE)); | |
1327 | } else { | |
1328 | logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE)); | |
1329 | } | |
1330 | } | |
1331 | } | |
1332 | ||
1333 | void UnicodeSetTest::TestSurrogate() { | |
1334 | const char* DATA[] = { | |
1335 | // These should all behave identically | |
1336 | "[abc\\uD800\\uDC00]", | |
1337 | // "[abc\uD800\uDC00]", // Can't do this on C -- only Java | |
1338 | "[abc\\U00010000]", | |
1339 | 0 | |
1340 | }; | |
1341 | for (int i=0; DATA[i] != 0; ++i) { | |
1342 | UErrorCode ec = U_ZERO_ERROR; | |
1343 | logln((UnicodeString)"Test pattern " + i + " :" + DATA[i]); | |
1344 | UnicodeSet set(DATA[i], ec); | |
1345 | if (U_FAILURE(ec)) { | |
1346 | errln("FAIL: UnicodeSet constructor"); | |
1347 | continue; | |
1348 | } | |
1349 | expectContainment(set, | |
1350 | CharsToUnicodeString("abc\\U00010000"), | |
1351 | CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair | |
1352 | if (set.size() != 4) { | |
1353 | errln((UnicodeString)"FAIL: " + DATA[i] + ".size() == " + | |
1354 | set.size() + ", expected 4"); | |
1355 | } | |
1356 | } | |
1357 | } | |
1358 | ||
b75a7d8f A |
1359 | void UnicodeSetTest::TestExhaustive() { |
1360 | // exhaustive tests. Simulate UnicodeSets with integers. | |
1361 | // That gives us very solid tests (except for large memory tests). | |
1362 | ||
1363 | int32_t limit = 128; | |
1364 | ||
1365 | UnicodeSet x, y, z, aa; | |
1366 | ||
1367 | for (int32_t i = 0; i < limit; ++i) { | |
1368 | bitsToSet(i, x); | |
1369 | logln((UnicodeString)"Testing " + i + ", " + x); | |
1370 | _testComplement(i, x, y); | |
1371 | ||
1372 | // AS LONG AS WE ARE HERE, check roundtrip | |
1373 | checkRoundTrip(bitsToSet(i, aa)); | |
1374 | ||
1375 | for (int32_t j = 0; j < limit; ++j) { | |
1376 | _testAdd(i,j, x,y,z); | |
1377 | _testXor(i,j, x,y,z); | |
1378 | _testRetain(i,j, x,y,z); | |
1379 | _testRemove(i,j, x,y,z); | |
1380 | } | |
1381 | } | |
1382 | } | |
1383 | ||
1384 | void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) { | |
1385 | bitsToSet(a, x); | |
1386 | z = x; | |
1387 | z.complement(); | |
1388 | int32_t c = setToBits(z); | |
1389 | if (c != (~a)) { | |
1390 | errln((UnicodeString)"FAILED: add: ~" + x + " != " + z); | |
1391 | errln((UnicodeString)"FAILED: add: ~" + a + " != " + c); | |
1392 | } | |
1393 | checkCanonicalRep(z, (UnicodeString)"complement " + a); | |
1394 | } | |
1395 | ||
1396 | void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { | |
1397 | bitsToSet(a, x); | |
1398 | bitsToSet(b, y); | |
1399 | z = x; | |
1400 | z.addAll(y); | |
1401 | int32_t c = setToBits(z); | |
1402 | if (c != (a | b)) { | |
1403 | errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z); | |
1404 | errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c); | |
1405 | } | |
1406 | checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b); | |
1407 | } | |
1408 | ||
1409 | void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { | |
1410 | bitsToSet(a, x); | |
1411 | bitsToSet(b, y); | |
1412 | z = x; | |
1413 | z.retainAll(y); | |
1414 | int32_t c = setToBits(z); | |
1415 | if (c != (a & b)) { | |
1416 | errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z); | |
1417 | errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c); | |
1418 | } | |
1419 | checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b); | |
1420 | } | |
1421 | ||
1422 | void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { | |
1423 | bitsToSet(a, x); | |
1424 | bitsToSet(b, y); | |
1425 | z = x; | |
1426 | z.removeAll(y); | |
1427 | int32_t c = setToBits(z); | |
1428 | if (c != (a &~ b)) { | |
1429 | errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z); | |
1430 | errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c); | |
1431 | } | |
1432 | checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b); | |
1433 | } | |
1434 | ||
1435 | void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { | |
1436 | bitsToSet(a, x); | |
1437 | bitsToSet(b, y); | |
1438 | z = x; | |
1439 | z.complementAll(y); | |
1440 | int32_t c = setToBits(z); | |
1441 | if (c != (a ^ b)) { | |
1442 | errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z); | |
1443 | errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c); | |
1444 | } | |
1445 | checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b); | |
1446 | } | |
1447 | ||
1448 | /** | |
1449 | * Check that ranges are monotonically increasing and non- | |
1450 | * overlapping. | |
1451 | */ | |
1452 | void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) { | |
1453 | int32_t n = set.getRangeCount(); | |
1454 | if (n < 0) { | |
1455 | errln((UnicodeString)"FAIL result of " + msg + | |
1456 | ": range count should be >= 0 but is " + | |
1457 | n /*+ " for " + set.toPattern())*/); | |
1458 | return; | |
1459 | } | |
1460 | UChar32 last = 0; | |
1461 | for (int32_t i=0; i<n; ++i) { | |
1462 | UChar32 start = set.getRangeStart(i); | |
1463 | UChar32 end = set.getRangeEnd(i); | |
1464 | if (start > end) { | |
1465 | errln((UnicodeString)"FAIL result of " + msg + | |
1466 | ": range " + (i+1) + | |
1467 | " start > end: " + (int)start + ", " + (int)end + | |
1468 | " for " + set); | |
1469 | } | |
1470 | if (i > 0 && start <= last) { | |
1471 | errln((UnicodeString)"FAIL result of " + msg + | |
1472 | ": range " + (i+1) + | |
1473 | " overlaps previous range: " + (int)start + ", " + (int)end + | |
1474 | " for " + set); | |
1475 | } | |
1476 | last = end; | |
1477 | } | |
1478 | } | |
1479 | ||
1480 | /** | |
1481 | * Convert a bitmask to a UnicodeSet. | |
1482 | */ | |
1483 | UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) { | |
1484 | result.clear(); | |
1485 | for (UChar32 i = 0; i < 32; ++i) { | |
1486 | if ((a & (1<<i)) != 0) { | |
1487 | result.add(i); | |
1488 | } | |
1489 | } | |
1490 | return result; | |
1491 | } | |
1492 | ||
1493 | /** | |
1494 | * Convert a UnicodeSet to a bitmask. Only the characters | |
1495 | * U+0000 to U+0020 are represented in the bitmask. | |
1496 | */ | |
1497 | int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) { | |
1498 | int32_t result = 0; | |
1499 | for (int32_t i = 0; i < 32; ++i) { | |
1500 | if (x.contains((UChar32)i)) { | |
1501 | result |= (1<<i); | |
1502 | } | |
1503 | } | |
1504 | return result; | |
1505 | } | |
1506 | ||
1507 | /** | |
1508 | * Return the representation of an inversion list based UnicodeSet | |
1509 | * as a pairs list. Ranges are listed in ascending Unicode order. | |
1510 | * For example, the set [a-zA-M3] is represented as "33AMaz". | |
1511 | */ | |
1512 | UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { | |
1513 | UnicodeString pairs; | |
1514 | for (int32_t i=0; i<set.getRangeCount(); ++i) { | |
1515 | UChar32 start = set.getRangeStart(i); | |
1516 | UChar32 end = set.getRangeEnd(i); | |
1517 | if (end > 0xFFFF) { | |
1518 | end = 0xFFFF; | |
1519 | i = set.getRangeCount(); // Should be unnecessary | |
1520 | } | |
1521 | pairs.append((UChar)start).append((UChar)end); | |
1522 | } | |
1523 | return pairs; | |
1524 | } | |
1525 | ||
1526 | /** | |
1527 | * Basic consistency check for a few items. | |
1528 | * That the iterator works, and that we can create a pattern and | |
1529 | * get the same thing back | |
1530 | */ | |
1531 | void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { | |
1532 | UErrorCode ec = U_ZERO_ERROR; | |
1533 | ||
1534 | UnicodeSet t(s); | |
1535 | checkEqual(s, t, "copy ct"); | |
1536 | ||
1537 | t = s; | |
1538 | checkEqual(s, t, "operator="); | |
1539 | ||
1540 | copyWithIterator(t, s, FALSE); | |
1541 | checkEqual(s, t, "iterator roundtrip"); | |
1542 | ||
1543 | copyWithIterator(t, s, TRUE); // try range | |
1544 | checkEqual(s, t, "iterator roundtrip"); | |
1545 | ||
1546 | UnicodeString pat; s.toPattern(pat, FALSE); | |
1547 | t.applyPattern(pat, ec); | |
1548 | if (U_FAILURE(ec)) { | |
1549 | errln("FAIL: applyPattern"); | |
1550 | return; | |
1551 | } else { | |
1552 | checkEqual(s, t, "toPattern(false)"); | |
1553 | } | |
1554 | ||
1555 | s.toPattern(pat, TRUE); | |
1556 | t.applyPattern(pat, ec); | |
1557 | if (U_FAILURE(ec)) { | |
1558 | errln("FAIL: applyPattern"); | |
1559 | return; | |
1560 | } else { | |
1561 | checkEqual(s, t, "toPattern(true)"); | |
1562 | } | |
1563 | } | |
1564 | ||
1565 | void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) { | |
1566 | t.clear(); | |
1567 | UnicodeSetIterator it(s); | |
1568 | if (withRange) { | |
1569 | while (it.nextRange()) { | |
1570 | if (it.isString()) { | |
1571 | t.add(it.getString()); | |
1572 | } else { | |
1573 | t.add(it.getCodepoint(), it.getCodepointEnd()); | |
1574 | } | |
1575 | } | |
1576 | } else { | |
1577 | while (it.next()) { | |
1578 | if (it.isString()) { | |
1579 | t.add(it.getString()); | |
1580 | } else { | |
1581 | t.add(it.getCodepoint()); | |
1582 | } | |
1583 | } | |
1584 | } | |
1585 | } | |
1586 | ||
1587 | UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) { | |
1588 | UnicodeString source; s.toPattern(source, TRUE); | |
1589 | UnicodeString result; t.toPattern(result, TRUE); | |
1590 | if (s != t) { | |
1591 | errln((UnicodeString)"FAIL: " + message | |
1592 | + "; source = " + source | |
1593 | + "; result = " + result | |
1594 | ); | |
1595 | return FALSE; | |
1596 | } else { | |
1597 | logln((UnicodeString)"Ok: " + message | |
1598 | + "; source = " + source | |
1599 | + "; result = " + result | |
1600 | ); | |
1601 | } | |
1602 | return TRUE; | |
1603 | } | |
1604 | ||
1605 | void | |
1606 | UnicodeSetTest::expectContainment(const UnicodeString& pat, | |
1607 | const UnicodeString& charsIn, | |
1608 | const UnicodeString& charsOut) { | |
1609 | UErrorCode ec = U_ZERO_ERROR; | |
1610 | UnicodeSet set(pat, ec); | |
1611 | if (U_FAILURE(ec)) { | |
1612 | errln((UnicodeString)"FAIL: pattern \"" + | |
1613 | pat + "\" => " + u_errorName(ec)); | |
1614 | return; | |
1615 | } | |
1616 | expectContainment(set, pat, charsIn, charsOut); | |
1617 | } | |
1618 | ||
1619 | void | |
1620 | UnicodeSetTest::expectContainment(const UnicodeSet& set, | |
1621 | const UnicodeString& charsIn, | |
1622 | const UnicodeString& charsOut) { | |
1623 | UnicodeString pat; | |
1624 | set.toPattern(pat); | |
1625 | expectContainment(set, pat, charsIn, charsOut); | |
1626 | } | |
1627 | ||
1628 | void | |
1629 | UnicodeSetTest::expectContainment(const UnicodeSet& set, | |
1630 | const UnicodeString& setName, | |
1631 | const UnicodeString& charsIn, | |
1632 | const UnicodeString& charsOut) { | |
1633 | UnicodeString bad; | |
1634 | UChar32 c; | |
1635 | int32_t i; | |
1636 | ||
1637 | for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) { | |
1638 | c = charsIn.char32At(i); | |
1639 | if (!set.contains(c)) { | |
1640 | bad.append(c); | |
1641 | } | |
1642 | } | |
1643 | if (bad.length() > 0) { | |
1644 | errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) + | |
1645 | ", expected containment of " + prettify(charsIn)); | |
1646 | } else { | |
1647 | logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn)); | |
1648 | } | |
1649 | ||
1650 | bad.truncate(0); | |
1651 | for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) { | |
1652 | c = charsOut.char32At(i); | |
1653 | if (set.contains(c)) { | |
1654 | bad.append(c); | |
1655 | } | |
1656 | } | |
1657 | if (bad.length() > 0) { | |
1658 | errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) + | |
1659 | ", expected non-containment of " + prettify(charsOut)); | |
1660 | } else { | |
1661 | logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut)); | |
1662 | } | |
1663 | } | |
1664 | ||
1665 | void | |
1666 | UnicodeSetTest::expectPattern(UnicodeSet& set, | |
1667 | const UnicodeString& pattern, | |
1668 | const UnicodeString& expectedPairs){ | |
1669 | UErrorCode status = U_ZERO_ERROR; | |
1670 | set.applyPattern(pattern, status); | |
1671 | if (U_FAILURE(status)) { | |
1672 | errln(UnicodeString("FAIL: applyPattern(\"") + pattern + | |
1673 | "\") failed"); | |
1674 | return; | |
1675 | } else { | |
1676 | if (getPairs(set) != expectedPairs ) { | |
1677 | errln(UnicodeString("FAIL: applyPattern(\"") + pattern + | |
1678 | "\") => pairs \"" + | |
1679 | escape(getPairs(set)) + "\", expected \"" + | |
1680 | escape(expectedPairs) + "\""); | |
1681 | } else { | |
1682 | logln(UnicodeString("Ok: applyPattern(\"") + pattern + | |
1683 | "\") => pairs \"" + | |
1684 | escape(getPairs(set)) + "\""); | |
1685 | } | |
1686 | } | |
1687 | // the result of calling set.toPattern(), which is the string representation of | |
1688 | // this set(set), is passed to a UnicodeSet constructor, and tested that it | |
1689 | // will produce another set that is equal to this one. | |
1690 | UnicodeString temppattern; | |
1691 | set.toPattern(temppattern); | |
1692 | UnicodeSet *tempset=new UnicodeSet(temppattern, status); | |
1693 | if (U_FAILURE(status)) { | |
1694 | errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern")); | |
1695 | return; | |
1696 | } | |
1697 | if(*tempset != set || getPairs(*tempset) != getPairs(set)){ | |
1698 | errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" + | |
1699 | escape(getPairs(set)) + "\"")); | |
1700 | } else{ | |
1701 | logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\"")); | |
1702 | } | |
1703 | ||
1704 | delete tempset; | |
1705 | ||
1706 | } | |
1707 | ||
1708 | void | |
1709 | UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) { | |
1710 | if (getPairs(set) != expectedPairs) { | |
1711 | errln(UnicodeString("FAIL: Expected pair list \"") + | |
1712 | escape(expectedPairs) + "\", got \"" + | |
1713 | escape(getPairs(set)) + "\""); | |
1714 | } | |
1715 | } | |
1716 | ||
1717 | void UnicodeSetTest::expectToPattern(const UnicodeSet& set, | |
1718 | const UnicodeString& expPat, | |
1719 | const char** expStrings) { | |
1720 | UnicodeString pat; | |
1721 | set.toPattern(pat, TRUE); | |
1722 | if (pat == expPat) { | |
1723 | logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\""); | |
1724 | } else { | |
1725 | errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); | |
1726 | return; | |
1727 | } | |
374ca955 A |
1728 | if (expStrings == NULL) { |
1729 | return; | |
1730 | } | |
b75a7d8f A |
1731 | UBool in = TRUE; |
1732 | for (int32_t i=0; expStrings[i] != NULL; ++i) { | |
1733 | if (expStrings[i] == NOT) { // sic; pointer comparison | |
1734 | in = FALSE; | |
1735 | continue; | |
1736 | } | |
1737 | UnicodeString s = CharsToUnicodeString(expStrings[i]); | |
1738 | UBool contained = set.contains(s); | |
1739 | if (contained == in) { | |
1740 | logln((UnicodeString)"Ok: " + expPat + | |
1741 | (contained ? " contains {" : " does not contain {") + | |
1742 | escape(expStrings[i]) + "}"); | |
1743 | } else { | |
1744 | errln((UnicodeString)"FAIL: " + expPat + | |
1745 | (contained ? " contains {" : " does not contain {") + | |
1746 | escape(expStrings[i]) + "}"); | |
1747 | } | |
1748 | } | |
1749 | } | |
1750 | ||
1751 | static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); } | |
1752 | ||
1753 | void | |
1754 | UnicodeSetTest::doAssert(UBool condition, const char *message) | |
1755 | { | |
1756 | if (!condition) { | |
1757 | errln(UnicodeString("ERROR : ") + message); | |
1758 | } | |
1759 | } | |
1760 | ||
1761 | UnicodeString | |
1762 | UnicodeSetTest::escape(const UnicodeString& s) { | |
1763 | UnicodeString buf; | |
1764 | for (int32_t i=0; i<s.length(); ) | |
1765 | { | |
1766 | UChar32 c = s.char32At(i); | |
1767 | if (0x0020 <= c && c <= 0x007F) { | |
1768 | buf += c; | |
1769 | } else { | |
1770 | if (c <= 0xFFFF) { | |
1771 | buf += (UChar)0x5c; buf += (UChar)0x75; | |
1772 | } else { | |
1773 | buf += (UChar)0x5c; buf += (UChar)0x55; | |
1774 | buf += toHexString((c & 0xF0000000) >> 28); | |
1775 | buf += toHexString((c & 0x0F000000) >> 24); | |
1776 | buf += toHexString((c & 0x00F00000) >> 20); | |
1777 | buf += toHexString((c & 0x000F0000) >> 16); | |
1778 | } | |
1779 | buf += toHexString((c & 0xF000) >> 12); | |
1780 | buf += toHexString((c & 0x0F00) >> 8); | |
1781 | buf += toHexString((c & 0x00F0) >> 4); | |
1782 | buf += toHexString(c & 0x000F); | |
1783 | } | |
1784 | i += U16_LENGTH(c); | |
1785 | } | |
1786 | return buf; | |
1787 | } |