]>
Commit | Line | Data |
---|---|---|
1 | /******************************************************************** | |
2 | * COPYRIGHT: | |
3 | * Copyright (c) 1997-2011, International Business Machines Corporation and | |
4 | * others. All Rights Reserved. | |
5 | ********************************************************************/ | |
6 | ||
7 | #include "unicode/ustring.h" | |
8 | #include "unicode/uchar.h" | |
9 | #include "unicode/uniset.h" | |
10 | #include "unicode/putil.h" | |
11 | #include "cstring.h" | |
12 | #include "hash.h" | |
13 | #include "patternprops.h" | |
14 | #include "normalizer2impl.h" | |
15 | #include "uparse.h" | |
16 | #include "ucdtest.h" | |
17 | ||
18 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(array[0])) | |
19 | ||
/*
 * Property names that the UnicodeTest constructor pre-registers in
 * unknownPropertyNames, so that derivedPropsLineFn() does not report them
 * as unknown when it meets them in the parsed data files.
 * Both the strings and the pointer table are read-only, matching
 * derivedPropsNames[].
 */
static const char *const ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "NFKC_CF"
};
32 | ||
33 | UnicodeTest::UnicodeTest() | |
34 | { | |
35 | UErrorCode errorCode=U_ZERO_ERROR; | |
36 | unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode); | |
37 | if(U_FAILURE(errorCode)) { | |
38 | delete unknownPropertyNames; | |
39 | unknownPropertyNames=NULL; | |
40 | } | |
41 | // Ignore some property names altogether. | |
42 | for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) { | |
43 | unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode); | |
44 | } | |
45 | } | |
46 | ||
47 | UnicodeTest::~UnicodeTest() | |
48 | { | |
49 | delete unknownPropertyNames; | |
50 | } | |
51 | ||
52 | void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) | |
53 | { | |
54 | if(exec) { | |
55 | logln("TestSuite UnicodeTest: "); | |
56 | } | |
57 | TESTCASE_AUTO_BEGIN; | |
58 | TESTCASE_AUTO(TestAdditionalProperties); | |
59 | TESTCASE_AUTO(TestBinaryValues); | |
60 | TESTCASE_AUTO(TestConsistency); | |
61 | TESTCASE_AUTO(TestPatternProperties); | |
62 | TESTCASE_AUTO_END; | |
63 | } | |
64 | ||
65 | //==================================================== | |
66 | // private data used by the tests | |
67 | //==================================================== | |
68 | ||
69 | // test DerivedCoreProperties.txt ------------------------------------------- | |
70 | ||
71 | // copied from genprops.c | |
72 | static int32_t | |
73 | getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { | |
74 | const char *t, *z; | |
75 | int32_t i, j; | |
76 | ||
77 | s=u_skipWhitespace(s); | |
78 | for(i=0; i<countTokens; ++i) { | |
79 | t=tokens[i]; | |
80 | if(t!=NULL) { | |
81 | for(j=0;; ++j) { | |
82 | if(t[j]!=0) { | |
83 | if(s[j]!=t[j]) { | |
84 | break; | |
85 | } | |
86 | } else { | |
87 | z=u_skipWhitespace(s+j); | |
88 | if(*z==';' || *z==0) { | |
89 | return i; | |
90 | } else { | |
91 | break; | |
92 | } | |
93 | } | |
94 | } | |
95 | } | |
96 | } | |
97 | return -1; | |
98 | } | |
99 | ||
/*
 * Names of the binary properties that derivedPropsLineFn() collects from the
 * parsed data files. Entry i is tested against derivedPropsIndex[i], so the
 * two tables must stay in the same order and have the same length.
 */
static const char *const derivedPropsNames[]={
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    "Grapheme_Link",  /* Unicode 5 moves this property here from PropList.txt */
    "Grapheme_Base",
    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
124 | ||
125 | static const UProperty | |
126 | derivedPropsIndex[]={ | |
127 | UCHAR_MATH, | |
128 | UCHAR_ALPHABETIC, | |
129 | UCHAR_LOWERCASE, | |
130 | UCHAR_UPPERCASE, | |
131 | UCHAR_ID_START, | |
132 | UCHAR_ID_CONTINUE, | |
133 | UCHAR_XID_START, | |
134 | UCHAR_XID_CONTINUE, | |
135 | UCHAR_DEFAULT_IGNORABLE_CODE_POINT, | |
136 | UCHAR_FULL_COMPOSITION_EXCLUSION, | |
137 | UCHAR_GRAPHEME_EXTEND, | |
138 | UCHAR_GRAPHEME_LINK, | |
139 | UCHAR_GRAPHEME_BASE, | |
140 | UCHAR_CASED, | |
141 | UCHAR_CASE_IGNORABLE, | |
142 | UCHAR_CHANGES_WHEN_LOWERCASED, | |
143 | UCHAR_CHANGES_WHEN_UPPERCASED, | |
144 | UCHAR_CHANGES_WHEN_TITLECASED, | |
145 | UCHAR_CHANGES_WHEN_CASEFOLDED, | |
146 | UCHAR_CHANGES_WHEN_CASEMAPPED, | |
147 | UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED | |
148 | }; | |
149 | ||
150 | static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 }; | |
151 | ||
152 | enum { MAX_ERRORS=50 }; | |
153 | ||
154 | U_CFUNC void U_CALLCONV | |
155 | derivedPropsLineFn(void *context, | |
156 | char *fields[][2], int32_t /* fieldCount */, | |
157 | UErrorCode *pErrorCode) | |
158 | { | |
159 | UnicodeTest *me=(UnicodeTest *)context; | |
160 | uint32_t start, end; | |
161 | int32_t i; | |
162 | ||
163 | u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); | |
164 | if(U_FAILURE(*pErrorCode)) { | |
165 | me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]); | |
166 | return; | |
167 | } | |
168 | ||
169 | /* parse derived binary property name, ignore unknown names */ | |
170 | i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]); | |
171 | if(i<0) { | |
172 | UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0])); | |
173 | propName.trim(); | |
174 | if(me->unknownPropertyNames->find(propName)==NULL) { | |
175 | UErrorCode errorCode=U_ZERO_ERROR; | |
176 | me->unknownPropertyNames->puti(propName, 1, errorCode); | |
177 | me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]); | |
178 | } | |
179 | return; | |
180 | } | |
181 | ||
182 | me->derivedProps[i].add(start, end); | |
183 | } | |
184 | ||
185 | void UnicodeTest::TestAdditionalProperties() { | |
186 | #if !UCONFIG_NO_NORMALIZATION | |
187 | // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt | |
188 | if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) { | |
189 | errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n", | |
190 | LENGTHOF(derivedPropsNames)); | |
191 | return; | |
192 | } | |
193 | if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) { | |
194 | errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n"); | |
195 | return; | |
196 | } | |
197 | ||
198 | char newPath[256]; | |
199 | char backupPath[256]; | |
200 | char *fields[2][2]; | |
201 | UErrorCode errorCode=U_ZERO_ERROR; | |
202 | ||
203 | /* Look inside ICU_DATA first */ | |
204 | strcpy(newPath, pathToDataDirectory()); | |
205 | strcat(newPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt"); | |
206 | ||
207 | // As a fallback, try to guess where the source data was located | |
208 | // at the time ICU was built, and look there. | |
209 | # ifdef U_TOPSRCDIR | |
210 | strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); | |
211 | # else | |
212 | strcpy(backupPath, loadTestData(errorCode)); | |
213 | strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); | |
214 | # endif | |
215 | strcat(backupPath, U_FILE_SEP_STRING); | |
216 | strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt"); | |
217 | ||
218 | char *path=newPath; | |
219 | u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode); | |
220 | ||
221 | if(errorCode==U_FILE_ACCESS_ERROR) { | |
222 | errorCode=U_ZERO_ERROR; | |
223 | path=backupPath; | |
224 | u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode); | |
225 | } | |
226 | if(U_FAILURE(errorCode)) { | |
227 | errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode)); | |
228 | return; | |
229 | } | |
230 | char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt"); | |
231 | strcpy(basename, "DerivedNormalizationProps.txt"); | |
232 | u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); | |
233 | if(U_FAILURE(errorCode)) { | |
234 | errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode)); | |
235 | return; | |
236 | } | |
237 | ||
238 | // now we have all derived core properties in the UnicodeSets | |
239 | // run them all through the API | |
240 | int32_t rangeCount, range; | |
241 | uint32_t i; | |
242 | UChar32 start, end; | |
243 | ||
244 | // test all TRUE properties | |
245 | for(i=0; i<LENGTHOF(derivedPropsNames); ++i) { | |
246 | rangeCount=derivedProps[i].getRangeCount(); | |
247 | for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { | |
248 | start=derivedProps[i].getRangeStart(range); | |
249 | end=derivedProps[i].getRangeEnd(range); | |
250 | for(; start<=end; ++start) { | |
251 | if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) { | |
252 | dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]); | |
253 | if(++numErrors[i]>=MAX_ERRORS) { | |
254 | dataerrln("Too many errors, moving to the next test"); | |
255 | break; | |
256 | } | |
257 | } | |
258 | } | |
259 | } | |
260 | } | |
261 | ||
262 | // invert all properties | |
263 | for(i=0; i<LENGTHOF(derivedPropsNames); ++i) { | |
264 | derivedProps[i].complement(); | |
265 | } | |
266 | ||
267 | // test all FALSE properties | |
268 | for(i=0; i<LENGTHOF(derivedPropsNames); ++i) { | |
269 | rangeCount=derivedProps[i].getRangeCount(); | |
270 | for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { | |
271 | start=derivedProps[i].getRangeStart(range); | |
272 | end=derivedProps[i].getRangeEnd(range); | |
273 | for(; start<=end; ++start) { | |
274 | if(u_hasBinaryProperty(start, derivedPropsIndex[i])) { | |
275 | errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]); | |
276 | if(++numErrors[i]>=MAX_ERRORS) { | |
277 | errln("Too many errors, moving to the next test"); | |
278 | break; | |
279 | } | |
280 | } | |
281 | } | |
282 | } | |
283 | } | |
284 | #endif /* !UCONFIG_NO_NORMALIZATION */ | |
285 | } | |
286 | ||
287 | void UnicodeTest::TestBinaryValues() { | |
288 | /* | |
289 | * Unicode 5.1 explicitly defines binary property value aliases. | |
290 | * Verify that they are all recognized. | |
291 | */ | |
292 | UErrorCode errorCode=U_ZERO_ERROR; | |
293 | UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode); | |
294 | if(U_FAILURE(errorCode)) { | |
295 | dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode)); | |
296 | return; | |
297 | } | |
298 | ||
299 | static const char *const falseValues[]={ "N", "No", "F", "False" }; | |
300 | static const char *const trueValues[]={ "Y", "Yes", "T", "True" }; | |
301 | int32_t i; | |
302 | for(i=0; i<LENGTHOF(falseValues); ++i) { | |
303 | UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); | |
304 | pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV)); | |
305 | errorCode=U_ZERO_ERROR; | |
306 | UnicodeSet set(pattern, errorCode); | |
307 | if(U_FAILURE(errorCode)) { | |
308 | errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode)); | |
309 | continue; | |
310 | } | |
311 | set.complement(); | |
312 | if(set!=alpha) { | |
313 | errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]); | |
314 | } | |
315 | } | |
316 | for(i=0; i<LENGTHOF(trueValues); ++i) { | |
317 | UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); | |
318 | pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV)); | |
319 | errorCode=U_ZERO_ERROR; | |
320 | UnicodeSet set(pattern, errorCode); | |
321 | if(U_FAILURE(errorCode)) { | |
322 | errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode)); | |
323 | continue; | |
324 | } | |
325 | if(set!=alpha) { | |
326 | errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]); | |
327 | } | |
328 | } | |
329 | } | |
330 | ||
331 | void UnicodeTest::TestConsistency() { | |
332 | #if !UCONFIG_NO_NORMALIZATION | |
333 | /* | |
334 | * Test for an example that getCanonStartSet() delivers | |
335 | * all characters that compose from the input one, | |
336 | * even in multiple steps. | |
337 | * For example, the set for "I" (0049) should contain both | |
338 | * I-diaeresis (00CF) and I-diaeresis-acute (1E2E). | |
339 | * In general, the set for the middle such character should be a subset | |
340 | * of the set for the first. | |
341 | */ | |
342 | IcuTestErrorCode errorCode(*this, "TestConsistency"); | |
343 | const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode); | |
344 | const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); | |
345 | if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) { | |
346 | dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n", | |
347 | errorCode.errorName()); | |
348 | errorCode.reset(); | |
349 | return; | |
350 | } | |
351 | ||
352 | UnicodeSet set1, set2; | |
353 | if (nfcImpl->getCanonStartSet(0x49, set1)) { | |
354 | /* enumerate all characters that are plausible to be latin letters */ | |
355 | for(UChar start=0xa0; start<0x2000; ++start) { | |
356 | UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode); | |
357 | if(decomp.length()>1 && decomp[0]==0x49) { | |
358 | set2.add(start); | |
359 | } | |
360 | } | |
361 | ||
362 | if (set1!=set2) { | |
363 | errln("[canon start set of 0049] != [all c with canon decomp with 0049]"); | |
364 | } | |
365 | // This was available in cucdtst.c but the test had to move to intltest | |
366 | // because the new internal normalization functions are in C++. | |
367 | //compareUSets(set1, set2, | |
368 | // "[canon start set of 0049]", "[all c with canon decomp with 0049]", | |
369 | // TRUE); | |
370 | } else { | |
371 | errln("NFC.getCanonStartSet() returned FALSE"); | |
372 | } | |
373 | #endif | |
374 | } | |
375 | ||
376 | /** | |
377 | * Test various implementations of Pattern_Syntax & Pattern_White_Space. | |
378 | */ | |
379 | void UnicodeTest::TestPatternProperties() { | |
380 | IcuTestErrorCode errorCode(*this, "TestPatternProperties()"); | |
381 | UnicodeSet syn_pp; | |
382 | UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode); | |
383 | UnicodeSet syn_list( | |
384 | "[!-/\\:-@\\[-\\^`\\{-~" | |
385 | "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7" | |
386 | "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775" | |
387 | "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode); | |
388 | UnicodeSet ws_pp; | |
389 | UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode); | |
390 | UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode); | |
391 | UnicodeSet syn_ws_pp; | |
392 | UnicodeSet syn_ws_prop(syn_prop); | |
393 | syn_ws_prop.addAll(ws_prop); | |
394 | for(UChar32 c=0; c<=0xffff; ++c) { | |
395 | if(PatternProps::isSyntax(c)) { | |
396 | syn_pp.add(c); | |
397 | } | |
398 | if(PatternProps::isWhiteSpace(c)) { | |
399 | ws_pp.add(c); | |
400 | } | |
401 | if(PatternProps::isSyntaxOrWhiteSpace(c)) { | |
402 | syn_ws_pp.add(c); | |
403 | } | |
404 | } | |
405 | compareUSets(syn_pp, syn_prop, | |
406 | "PatternProps.isSyntax()", "[:Pattern_Syntax:]", TRUE); | |
407 | compareUSets(syn_pp, syn_list, | |
408 | "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", TRUE); | |
409 | compareUSets(ws_pp, ws_prop, | |
410 | "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", TRUE); | |
411 | compareUSets(ws_pp, ws_list, | |
412 | "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", TRUE); | |
413 | compareUSets(syn_ws_pp, syn_ws_prop, | |
414 | "PatternProps.isSyntaxOrWhiteSpace()", | |
415 | "[[:Pattern_Syntax:][:Pattern_White_Space:]]", TRUE); | |
416 | } | |
417 | ||
418 | // So far only minimal port of Java & cucdtst.c compareUSets(). | |
419 | UBool | |
420 | UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b, | |
421 | const char *a_name, const char *b_name, | |
422 | UBool diffIsError) { | |
423 | UBool same= a==b; | |
424 | if(!same && diffIsError) { | |
425 | errln("Sets are different: %s vs. %s\n", a_name, b_name); | |
426 | } | |
427 | return same; | |
428 | } |