]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /******************************************************************** |
2 | * COPYRIGHT: | |
51004dcb | 3 | * Copyright (c) 2002-2012, International Business Machines Corporation and |
b75a7d8f A |
4 | * others. All Rights Reserved. |
5 | ******************************************************************** | |
6 | * | |
7 | * @author Mark E. Davis | |
8 | * @author Vladimir Weinstein | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_NORMALIZATION | |
14 | ||
15 | #include "intltest.h" | |
16 | #include "cstring.h" | |
17 | #include "canittst.h" | |
18 | #include "unicode/caniter.h" | |
374ca955 | 19 | #include "unicode/normlzr.h" |
b75a7d8f | 20 | #include "unicode/uchar.h" |
374ca955 | 21 | #include "hash.h" |
b75a7d8f A |
22 | |
// Number of elements in a true array (not a pointer), as an int32_t.
// The argument is parenthesized everywhere it is used so that any array-valued
// expression can be passed safely.
#define ARRAY_LENGTH(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))
24 | ||
// Expands to one `case` label for the switch in runIndexedTest(): it always
// publishes the stringified test name through `name` and, only when `exec` is
// set, logs a banner and invokes the test method. Relies on `name`, `exec`
// and `logln` being in scope at the expansion site.
#define CASE(caseIndex, testMethod) case caseIndex: \
    name = #testMethod; \
    if (exec) { \
        logln(#testMethod "---"); \
        logln((UnicodeString)""); \
        testMethod(); \
    } \
    break
33 | ||
34 | void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec, | |
35 | const char* &name, char* /*par*/) { | |
36 | switch (index) { | |
37 | CASE(0, TestBasic); | |
38 | CASE(1, TestExhaustive); | |
39 | CASE(2, TestAPI); | |
40 | default: name = ""; break; | |
41 | } | |
42 | } | |
43 | ||
44 | /** | |
45 | * Convert Java-style strings with \u Unicode escapes into UnicodeString objects | |
46 | static UnicodeString str(const char *input) | |
47 | { | |
48 | UnicodeString str(input, ""); // Invariant conversion | |
49 | return str.unescape(); | |
50 | } | |
51 | */ | |
52 | ||
53 | ||
54 | CanonicalIteratorTest::CanonicalIteratorTest() : | |
55 | nameTrans(NULL), hexTrans(NULL) | |
56 | { | |
57 | } | |
58 | ||
59 | CanonicalIteratorTest::~CanonicalIteratorTest() | |
60 | { | |
61 | #if !UCONFIG_NO_TRANSLITERATION | |
62 | if(nameTrans != NULL) { | |
63 | delete(nameTrans); | |
64 | } | |
65 | if(hexTrans != NULL) { | |
66 | delete(hexTrans); | |
67 | } | |
68 | #endif | |
69 | } | |
70 | ||
71 | void CanonicalIteratorTest::TestExhaustive() { | |
72 | UErrorCode status = U_ZERO_ERROR; | |
73 | CanonicalIterator it("", status); | |
729e4ab9 A |
74 | if (U_FAILURE(status)) { |
75 | dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); | |
76 | return; | |
77 | } | |
b75a7d8f | 78 | UChar32 i = 0; |
73c04bcf | 79 | UnicodeString s; |
374ca955 | 80 | // Test static and dynamic class IDs |
b75a7d8f | 81 | if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ |
374ca955 A |
82 | errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID"); |
83 | } | |
b75a7d8f A |
84 | for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) { |
85 | //for (i = 0xae00; i < 0xaf00; ++i) { | |
86 | ||
87 | if ((i % 0x100) == 0) { | |
88 | logln("Testing U+%06X", i); | |
89 | } | |
90 | ||
91 | // skip characters we know don't have decomps | |
92 | int8_t type = u_charType(i); | |
93 | if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR | |
94 | || type == U_SURROGATE) continue; | |
95 | ||
96 | s = i; | |
73c04bcf A |
97 | characterTest(s, i, it); |
98 | ||
b75a7d8f | 99 | s += (UChar32)0x0345; //"\\u0345"; |
73c04bcf | 100 | characterTest(s, i, it); |
b75a7d8f A |
101 | } |
102 | } | |
103 | ||
104 | void CanonicalIteratorTest::TestBasic() { | |
105 | ||
106 | UErrorCode status = U_ZERO_ERROR; | |
107 | ||
108 | static const char * const testArray[][2] = { | |
109 | {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " | |
110 | "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " | |
111 | "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " | |
112 | "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, | |
113 | {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, | |
114 | {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, | |
115 | }; | |
116 | ||
117 | #if 0 | |
118 | // This is not interesting for C/C++ as the data is already built beforehand | |
119 | // check build | |
120 | UnicodeSet ss = CanonicalIterator.getSafeStart(); | |
121 | logln("Safe Start: " + ss.toPattern(true)); | |
122 | ss = CanonicalIterator.getStarts('a'); | |
123 | expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), | |
124 | new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" | |
125 | + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") | |
126 | ); | |
127 | #endif | |
128 | ||
129 | // check permute | |
130 | // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! | |
131 | ||
132 | Hashtable *permutations = new Hashtable(FALSE, status); | |
4388f060 | 133 | permutations->setValueDeleter(uprv_deleteUObject); |
b75a7d8f A |
134 | UnicodeString toPermute("ABC"); |
135 | ||
136 | CanonicalIterator::permute(toPermute, FALSE, permutations, status); | |
137 | ||
138 | logln("testing permutation"); | |
139 | ||
140 | expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); | |
141 | ||
142 | delete permutations; | |
143 | ||
144 | // try samples | |
145 | logln("testing samples"); | |
146 | Hashtable *set = new Hashtable(FALSE, status); | |
4388f060 | 147 | set->setValueDeleter(uprv_deleteUObject); |
b75a7d8f A |
148 | int32_t i = 0; |
149 | CanonicalIterator it("", status); | |
150 | if(U_SUCCESS(status)) { | |
151 | for (i = 0; i < ARRAY_LENGTH(testArray); ++i) { | |
152 | //logln("Results for: " + name.transliterate(testArray[i])); | |
153 | UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); | |
154 | it.setSource(testStr, status); | |
155 | set->removeAll(); | |
46f4442e | 156 | for (;;) { |
b75a7d8f A |
157 | //UnicodeString *result = new UnicodeString(it.next()); |
158 | UnicodeString result(it.next()); | |
159 | if (result.isBogus()) { | |
160 | break; | |
161 | } | |
162 | set->put(result, new UnicodeString(result), status); // Add result to the table | |
163 | //logln(++counter + ": " + hex.transliterate(result)); | |
164 | //logln(" = " + name.transliterate(result)); | |
165 | } | |
51004dcb | 166 | expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); |
b75a7d8f A |
167 | |
168 | } | |
169 | } else { | |
729e4ab9 | 170 | dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); |
b75a7d8f A |
171 | } |
172 | delete set; | |
173 | } | |
174 | ||
73c04bcf A |
175 | void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it) |
176 | { | |
177 | UErrorCode status = U_ZERO_ERROR; | |
178 | UnicodeString decomp, comp; | |
179 | UBool gotDecomp = FALSE; | |
180 | UBool gotComp = FALSE; | |
181 | UBool gotSource = FALSE; | |
182 | ||
183 | Normalizer::decompose(s, FALSE, 0, decomp, status); | |
184 | Normalizer::compose(s, FALSE, 0, comp, status); | |
185 | ||
186 | // skip characters that don't have either decomp. | |
187 | // need quick test for this! | |
188 | if (s == decomp && s == comp) { | |
189 | return; | |
190 | } | |
191 | ||
192 | it.setSource(s, status); | |
193 | ||
46f4442e | 194 | for (;;) { |
73c04bcf A |
195 | UnicodeString item = it.next(); |
196 | if (item.isBogus()) break; | |
197 | if (item == s) gotSource = TRUE; | |
198 | if (item == decomp) gotDecomp = TRUE; | |
199 | if (item == comp) gotComp = TRUE; | |
200 | } | |
201 | ||
202 | if (!gotSource || !gotDecomp || !gotComp) { | |
203 | errln("FAIL CanonicalIterator: " + s + (int)ch); | |
204 | } | |
205 | } | |
206 | ||
b75a7d8f A |
207 | void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) { |
208 | if (!(a==b)) { | |
209 | errln("FAIL: " + message + getReadable(item)); | |
210 | errln("\t" + getReadable(a)); | |
211 | errln("\t" + getReadable(b)); | |
212 | } else { | |
213 | logln("Checked: " + message + getReadable(item)); | |
214 | logln("\t" + getReadable(a)); | |
215 | logln("\t" + getReadable(b)); | |
216 | } | |
217 | } | |
218 | ||
219 | UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) { | |
220 | UErrorCode status = U_ZERO_ERROR; | |
221 | UnicodeString result = "["; | |
222 | if (s.length() == 0) return ""; | |
223 | // set up for readable display | |
224 | #if !UCONFIG_NO_TRANSLITERATION | |
225 | if(verbose) { | |
226 | if (nameTrans == NULL) | |
227 | nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status); | |
228 | UnicodeString sName = s; | |
229 | nameTrans->transliterate(sName); | |
230 | result += sName; | |
231 | result += ";"; | |
232 | } | |
233 | if (hexTrans == NULL) | |
234 | hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status); | |
235 | #endif | |
236 | UnicodeString sHex = s; | |
237 | #if !UCONFIG_NO_TRANSLITERATION | |
238 | if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated | |
239 | hexTrans->transliterate(sHex); | |
240 | } | |
241 | #endif | |
242 | result += sHex; | |
243 | result += "]"; | |
244 | return result; | |
245 | //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]"; | |
246 | } | |
247 | ||
374ca955 A |
248 | U_CFUNC int U_CALLCONV |
249 | compareUnicodeStrings(const void *s1, const void *s2) { | |
b75a7d8f A |
250 | UnicodeString **st1 = (UnicodeString **)s1; |
251 | UnicodeString **st2 = (UnicodeString **)s2; | |
252 | ||
253 | return (*st1)->compare(**st2); | |
254 | } | |
255 | ||
256 | ||
257 | UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) { | |
258 | UnicodeString result; | |
259 | ||
260 | // Iterate over the Hashtable, then qsort. | |
261 | ||
262 | UnicodeString **resArray = new UnicodeString*[col->count()]; | |
263 | int32_t i = 0; | |
264 | ||
265 | const UHashElement *ne = NULL; | |
266 | int32_t el = -1; | |
267 | //Iterator it = basic.iterator(); | |
268 | ne = col->nextElement(el); | |
269 | //while (it.hasNext()) | |
270 | while (ne != NULL) { | |
271 | //String item = (String) it.next(); | |
272 | UnicodeString *item = (UnicodeString *)(ne->value.pointer); | |
273 | resArray[i++] = item; | |
274 | ne = col->nextElement(el); | |
275 | } | |
276 | ||
277 | for(i = 0; i<col->count(); ++i) { | |
278 | logln(*resArray[i]); | |
279 | } | |
280 | ||
281 | qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings); | |
282 | ||
283 | result = *resArray[0]; | |
284 | ||
285 | for(i = 1; i<col->count(); ++i) { | |
286 | result += ", "; | |
287 | result += *resArray[i]; | |
288 | } | |
289 | ||
290 | /* | |
291 | Iterator it = col.iterator(); | |
292 | while (it.hasNext()) { | |
293 | if (result.length() != 0) result.append(", "); | |
294 | result.append(it.next().toString()); | |
295 | } | |
296 | */ | |
297 | ||
298 | delete [] resArray; | |
299 | ||
300 | return result; | |
301 | } | |
302 | ||
303 | void CanonicalIteratorTest::TestAPI() { | |
304 | UErrorCode status = U_ZERO_ERROR; | |
305 | // Test reset and getSource | |
306 | UnicodeString start("ljubav"); | |
307 | logln("Testing CanonicalIterator::getSource"); | |
308 | logln("Instantiating canonical iterator with string "+start); | |
309 | CanonicalIterator can(start, status); | |
729e4ab9 A |
310 | if (U_FAILURE(status)) { |
311 | dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); | |
312 | return; | |
313 | } | |
b75a7d8f A |
314 | UnicodeString source = can.getSource(); |
315 | logln("CanonicalIterator::getSource returned "+source); | |
316 | if(start != source) { | |
317 | errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source); | |
318 | } | |
319 | logln("Testing CanonicalIterator::reset"); | |
320 | UnicodeString next = can.next(); | |
321 | logln("CanonicalIterator::next returned "+next); | |
322 | ||
323 | can.reset(); | |
324 | ||
325 | UnicodeString afterReset = can.next(); | |
326 | logln("After reset, CanonicalIterator::next returned "+afterReset); | |
327 | ||
328 | if(next != afterReset) { | |
329 | errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+")."); | |
330 | } | |
331 | ||
332 | logln("Testing getStaticClassID and getDynamicClassID"); | |
333 | if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ | |
334 | errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID"); | |
335 | } | |
336 | } | |
337 | ||
338 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |