]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /******************************************************************** |
2 | * COPYRIGHT: | |
2ca993e8 | 3 | * Copyright (c) 2002-2016, International Business Machines Corporation and |
b75a7d8f A |
4 | * others. All Rights Reserved. |
5 | ******************************************************************** | |
6 | * | |
7 | * @author Mark E. Davis | |
8 | * @author Vladimir Weinstein | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_NORMALIZATION | |
14 | ||
15 | #include "intltest.h" | |
2ca993e8 | 16 | #include "cmemory.h" |
b75a7d8f A |
17 | #include "cstring.h" |
18 | #include "canittst.h" | |
19 | #include "unicode/caniter.h" | |
374ca955 | 20 | #include "unicode/normlzr.h" |
b75a7d8f | 21 | #include "unicode/uchar.h" |
374ca955 | 22 | #include "hash.h" |
b75a7d8f | 23 | |
b75a7d8f A |
/*
 * Expands to one `case` of the switch in runIndexedTest(): stores the test's
 * name in `name` and, when `exec` is set, logs a banner and calls the test.
 * No trailing semicolon: the use site supplies it after the final `break`.
 */
#define CASE(id,test)                     \
    case id:                              \
        name = #test;                     \
        if (exec) {                       \
            logln(#test "---");           \
            logln((UnicodeString)"");     \
            test();                       \
        }                                 \
        break
32 | ||
33 | void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec, | |
34 | const char* &name, char* /*par*/) { | |
35 | switch (index) { | |
36 | CASE(0, TestBasic); | |
37 | CASE(1, TestExhaustive); | |
38 | CASE(2, TestAPI); | |
39 | default: name = ""; break; | |
40 | } | |
41 | } | |
42 | ||
43 | /** | |
44 | * Convert Java-style strings with \u Unicode escapes into UnicodeString objects | |
45 | static UnicodeString str(const char *input) | |
46 | { | |
47 | UnicodeString str(input, ""); // Invariant conversion | |
48 | return str.unescape(); | |
49 | } | |
50 | */ | |
51 | ||
52 | ||
53 | CanonicalIteratorTest::CanonicalIteratorTest() : | |
54 | nameTrans(NULL), hexTrans(NULL) | |
55 | { | |
56 | } | |
57 | ||
58 | CanonicalIteratorTest::~CanonicalIteratorTest() | |
59 | { | |
60 | #if !UCONFIG_NO_TRANSLITERATION | |
61 | if(nameTrans != NULL) { | |
62 | delete(nameTrans); | |
63 | } | |
64 | if(hexTrans != NULL) { | |
65 | delete(hexTrans); | |
66 | } | |
67 | #endif | |
68 | } | |
69 | ||
70 | void CanonicalIteratorTest::TestExhaustive() { | |
71 | UErrorCode status = U_ZERO_ERROR; | |
72 | CanonicalIterator it("", status); | |
729e4ab9 A |
73 | if (U_FAILURE(status)) { |
74 | dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); | |
75 | return; | |
76 | } | |
b75a7d8f | 77 | UChar32 i = 0; |
73c04bcf | 78 | UnicodeString s; |
374ca955 | 79 | // Test static and dynamic class IDs |
b75a7d8f | 80 | if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ |
374ca955 A |
81 | errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID"); |
82 | } | |
b75a7d8f A |
83 | for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) { |
84 | //for (i = 0xae00; i < 0xaf00; ++i) { | |
85 | ||
86 | if ((i % 0x100) == 0) { | |
87 | logln("Testing U+%06X", i); | |
88 | } | |
89 | ||
90 | // skip characters we know don't have decomps | |
91 | int8_t type = u_charType(i); | |
92 | if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR | |
93 | || type == U_SURROGATE) continue; | |
94 | ||
95 | s = i; | |
73c04bcf A |
96 | characterTest(s, i, it); |
97 | ||
b75a7d8f | 98 | s += (UChar32)0x0345; //"\\u0345"; |
73c04bcf | 99 | characterTest(s, i, it); |
b75a7d8f A |
100 | } |
101 | } | |
102 | ||
103 | void CanonicalIteratorTest::TestBasic() { | |
104 | ||
105 | UErrorCode status = U_ZERO_ERROR; | |
106 | ||
107 | static const char * const testArray[][2] = { | |
108 | {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " | |
109 | "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " | |
110 | "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " | |
111 | "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, | |
112 | {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, | |
113 | {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, | |
114 | }; | |
115 | ||
116 | #if 0 | |
117 | // This is not interesting for C/C++ as the data is already built beforehand | |
118 | // check build | |
119 | UnicodeSet ss = CanonicalIterator.getSafeStart(); | |
120 | logln("Safe Start: " + ss.toPattern(true)); | |
121 | ss = CanonicalIterator.getStarts('a'); | |
122 | expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), | |
123 | new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" | |
124 | + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") | |
125 | ); | |
126 | #endif | |
127 | ||
128 | // check permute | |
129 | // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! | |
130 | ||
131 | Hashtable *permutations = new Hashtable(FALSE, status); | |
4388f060 | 132 | permutations->setValueDeleter(uprv_deleteUObject); |
b75a7d8f A |
133 | UnicodeString toPermute("ABC"); |
134 | ||
135 | CanonicalIterator::permute(toPermute, FALSE, permutations, status); | |
136 | ||
137 | logln("testing permutation"); | |
138 | ||
139 | expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); | |
140 | ||
141 | delete permutations; | |
142 | ||
143 | // try samples | |
144 | logln("testing samples"); | |
145 | Hashtable *set = new Hashtable(FALSE, status); | |
4388f060 | 146 | set->setValueDeleter(uprv_deleteUObject); |
b75a7d8f A |
147 | int32_t i = 0; |
148 | CanonicalIterator it("", status); | |
149 | if(U_SUCCESS(status)) { | |
2ca993e8 | 150 | for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) { |
b75a7d8f A |
151 | //logln("Results for: " + name.transliterate(testArray[i])); |
152 | UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); | |
153 | it.setSource(testStr, status); | |
154 | set->removeAll(); | |
46f4442e | 155 | for (;;) { |
b75a7d8f A |
156 | //UnicodeString *result = new UnicodeString(it.next()); |
157 | UnicodeString result(it.next()); | |
158 | if (result.isBogus()) { | |
159 | break; | |
160 | } | |
161 | set->put(result, new UnicodeString(result), status); // Add result to the table | |
162 | //logln(++counter + ": " + hex.transliterate(result)); | |
163 | //logln(" = " + name.transliterate(result)); | |
164 | } | |
51004dcb | 165 | expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); |
b75a7d8f A |
166 | |
167 | } | |
168 | } else { | |
729e4ab9 | 169 | dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); |
b75a7d8f A |
170 | } |
171 | delete set; | |
172 | } | |
173 | ||
73c04bcf A |
174 | void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it) |
175 | { | |
176 | UErrorCode status = U_ZERO_ERROR; | |
177 | UnicodeString decomp, comp; | |
178 | UBool gotDecomp = FALSE; | |
179 | UBool gotComp = FALSE; | |
180 | UBool gotSource = FALSE; | |
181 | ||
182 | Normalizer::decompose(s, FALSE, 0, decomp, status); | |
183 | Normalizer::compose(s, FALSE, 0, comp, status); | |
184 | ||
185 | // skip characters that don't have either decomp. | |
186 | // need quick test for this! | |
187 | if (s == decomp && s == comp) { | |
188 | return; | |
189 | } | |
190 | ||
191 | it.setSource(s, status); | |
192 | ||
46f4442e | 193 | for (;;) { |
73c04bcf A |
194 | UnicodeString item = it.next(); |
195 | if (item.isBogus()) break; | |
196 | if (item == s) gotSource = TRUE; | |
197 | if (item == decomp) gotDecomp = TRUE; | |
198 | if (item == comp) gotComp = TRUE; | |
199 | } | |
200 | ||
201 | if (!gotSource || !gotDecomp || !gotComp) { | |
202 | errln("FAIL CanonicalIterator: " + s + (int)ch); | |
203 | } | |
204 | } | |
205 | ||
b75a7d8f A |
206 | void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) { |
207 | if (!(a==b)) { | |
208 | errln("FAIL: " + message + getReadable(item)); | |
209 | errln("\t" + getReadable(a)); | |
210 | errln("\t" + getReadable(b)); | |
211 | } else { | |
212 | logln("Checked: " + message + getReadable(item)); | |
213 | logln("\t" + getReadable(a)); | |
214 | logln("\t" + getReadable(b)); | |
215 | } | |
216 | } | |
217 | ||
218 | UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) { | |
219 | UErrorCode status = U_ZERO_ERROR; | |
220 | UnicodeString result = "["; | |
221 | if (s.length() == 0) return ""; | |
222 | // set up for readable display | |
223 | #if !UCONFIG_NO_TRANSLITERATION | |
224 | if(verbose) { | |
225 | if (nameTrans == NULL) | |
226 | nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status); | |
227 | UnicodeString sName = s; | |
228 | nameTrans->transliterate(sName); | |
229 | result += sName; | |
230 | result += ";"; | |
231 | } | |
232 | if (hexTrans == NULL) | |
233 | hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status); | |
234 | #endif | |
235 | UnicodeString sHex = s; | |
236 | #if !UCONFIG_NO_TRANSLITERATION | |
237 | if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated | |
238 | hexTrans->transliterate(sHex); | |
239 | } | |
240 | #endif | |
241 | result += sHex; | |
242 | result += "]"; | |
243 | return result; | |
244 | //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]"; | |
245 | } | |
246 | ||
374ca955 A |
247 | U_CFUNC int U_CALLCONV |
248 | compareUnicodeStrings(const void *s1, const void *s2) { | |
b75a7d8f A |
249 | UnicodeString **st1 = (UnicodeString **)s1; |
250 | UnicodeString **st2 = (UnicodeString **)s2; | |
251 | ||
252 | return (*st1)->compare(**st2); | |
253 | } | |
254 | ||
255 | ||
256 | UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) { | |
257 | UnicodeString result; | |
258 | ||
259 | // Iterate over the Hashtable, then qsort. | |
260 | ||
261 | UnicodeString **resArray = new UnicodeString*[col->count()]; | |
262 | int32_t i = 0; | |
263 | ||
264 | const UHashElement *ne = NULL; | |
b331163b | 265 | int32_t el = UHASH_FIRST; |
b75a7d8f A |
266 | //Iterator it = basic.iterator(); |
267 | ne = col->nextElement(el); | |
268 | //while (it.hasNext()) | |
269 | while (ne != NULL) { | |
270 | //String item = (String) it.next(); | |
271 | UnicodeString *item = (UnicodeString *)(ne->value.pointer); | |
272 | resArray[i++] = item; | |
273 | ne = col->nextElement(el); | |
274 | } | |
275 | ||
276 | for(i = 0; i<col->count(); ++i) { | |
277 | logln(*resArray[i]); | |
278 | } | |
279 | ||
280 | qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings); | |
281 | ||
282 | result = *resArray[0]; | |
283 | ||
284 | for(i = 1; i<col->count(); ++i) { | |
285 | result += ", "; | |
286 | result += *resArray[i]; | |
287 | } | |
288 | ||
289 | /* | |
290 | Iterator it = col.iterator(); | |
291 | while (it.hasNext()) { | |
292 | if (result.length() != 0) result.append(", "); | |
293 | result.append(it.next().toString()); | |
294 | } | |
295 | */ | |
296 | ||
297 | delete [] resArray; | |
298 | ||
299 | return result; | |
300 | } | |
301 | ||
302 | void CanonicalIteratorTest::TestAPI() { | |
303 | UErrorCode status = U_ZERO_ERROR; | |
304 | // Test reset and getSource | |
305 | UnicodeString start("ljubav"); | |
306 | logln("Testing CanonicalIterator::getSource"); | |
307 | logln("Instantiating canonical iterator with string "+start); | |
308 | CanonicalIterator can(start, status); | |
729e4ab9 A |
309 | if (U_FAILURE(status)) { |
310 | dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); | |
311 | return; | |
312 | } | |
b75a7d8f A |
313 | UnicodeString source = can.getSource(); |
314 | logln("CanonicalIterator::getSource returned "+source); | |
315 | if(start != source) { | |
316 | errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source); | |
317 | } | |
318 | logln("Testing CanonicalIterator::reset"); | |
319 | UnicodeString next = can.next(); | |
320 | logln("CanonicalIterator::next returned "+next); | |
321 | ||
322 | can.reset(); | |
323 | ||
324 | UnicodeString afterReset = can.next(); | |
325 | logln("After reset, CanonicalIterator::next returned "+afterReset); | |
326 | ||
327 | if(next != afterReset) { | |
328 | errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+")."); | |
329 | } | |
330 | ||
331 | logln("Testing getStaticClassID and getDynamicClassID"); | |
332 | if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ | |
333 | errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID"); | |
334 | } | |
335 | } | |
336 | ||
337 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |