]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/ucdtest.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / test / intltest / ucdtest.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1997-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 #include "unicode/ustring.h"
8 #include "unicode/uchar.h"
9 #include "unicode/uniset.h"
10 #include "unicode/putil.h"
11 #include "cstring.h"
12 #include "hash.h"
13 #include "normalizer2impl.h"
14 #include "uparse.h"
15 #include "ucdtest.h"
16
/**
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The argument and the whole expansion are parenthesized so that the macro
 * behaves like a single operand inside larger expressions.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
18
/*
 * Property names that the UnicodeTest constructor registers up front in
 * unknownPropertyNames so that they are ignored altogether.
 */
static const char *ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC", "NFC_QC", "NFKD_QC", "NFKC_QC",
    "Expands_On_NFD", "Expands_On_NFC", "Expands_On_NFKD", "Expands_On_NFKC",
    "NFKC_CF"
};
31
32 UnicodeTest::UnicodeTest()
33 {
34 UErrorCode errorCode=U_ZERO_ERROR;
35 unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
36 if(U_FAILURE(errorCode)) {
37 delete unknownPropertyNames;
38 unknownPropertyNames=NULL;
39 }
40 // Ignore some property names altogether.
41 for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) {
42 unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
43 }
44 }
45
46 UnicodeTest::~UnicodeTest()
47 {
48 delete unknownPropertyNames;
49 }
50
51 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
52 {
53 if (exec) logln("TestSuite UnicodeTest: ");
54 switch (index) {
55 case 0: name = "TestAdditionalProperties"; if(exec) TestAdditionalProperties(); break;
56 case 1: name = "TestBinaryValues"; if(exec) TestBinaryValues(); break;
57 case 2: name = "TestConsistency"; if(exec) TestConsistency(); break;
58 default: name = ""; break; //needed to end loop
59 }
60 }
61
62 //====================================================
63 // private data used by the tests
64 //====================================================
65
66 // test DerivedCoreProperties.txt -------------------------------------------
67
68 // copied from genprops.c
69 static int32_t
70 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
71 const char *t, *z;
72 int32_t i, j;
73
74 s=u_skipWhitespace(s);
75 for(i=0; i<countTokens; ++i) {
76 t=tokens[i];
77 if(t!=NULL) {
78 for(j=0;; ++j) {
79 if(t[j]!=0) {
80 if(s[j]!=t[j]) {
81 break;
82 }
83 } else {
84 z=u_skipWhitespace(s+j);
85 if(*z==';' || *z==0) {
86 return i;
87 } else {
88 break;
89 }
90 }
91 }
92 }
93 }
94 return -1;
95 }
96
/*
 * Names of the derived binary properties that are read from the data files.
 * The position of a name in this array selects the entry with the same
 * position in derivedPropsIndex[], so the two arrays must stay in step.
 */
static const char *const
derivedPropsNames[]={
    "Math", "Alphabetic", "Lowercase", "Uppercase",
    "ID_Start", "ID_Continue", "XID_Start", "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    /* Unicode 5 moves Grapheme_Link here from PropList.txt */
    "Grapheme_Link",
    "Grapheme_Base",
    "Cased", "Case_Ignorable",
    "Changes_When_Lowercased", "Changes_When_Uppercased",
    "Changes_When_Titlecased", "Changes_When_Casefolded",
    "Changes_When_Casemapped", "Changes_When_NFKC_Casefolded"
};
121
122 static const UProperty
123 derivedPropsIndex[]={
124 UCHAR_MATH,
125 UCHAR_ALPHABETIC,
126 UCHAR_LOWERCASE,
127 UCHAR_UPPERCASE,
128 UCHAR_ID_START,
129 UCHAR_ID_CONTINUE,
130 UCHAR_XID_START,
131 UCHAR_XID_CONTINUE,
132 UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
133 UCHAR_FULL_COMPOSITION_EXCLUSION,
134 UCHAR_GRAPHEME_EXTEND,
135 UCHAR_GRAPHEME_LINK,
136 UCHAR_GRAPHEME_BASE,
137 UCHAR_CASED,
138 UCHAR_CASE_IGNORABLE,
139 UCHAR_CHANGES_WHEN_LOWERCASED,
140 UCHAR_CHANGES_WHEN_UPPERCASED,
141 UCHAR_CHANGES_WHEN_TITLECASED,
142 UCHAR_CHANGES_WHEN_CASEFOLDED,
143 UCHAR_CHANGES_WHEN_CASEMAPPED,
144 UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
145 };
146
147 static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 };
148
149 enum { MAX_ERRORS=50 };
150
151 U_CFUNC void U_CALLCONV
152 derivedPropsLineFn(void *context,
153 char *fields[][2], int32_t /* fieldCount */,
154 UErrorCode *pErrorCode)
155 {
156 UnicodeTest *me=(UnicodeTest *)context;
157 uint32_t start, end;
158 int32_t i;
159
160 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
161 if(U_FAILURE(*pErrorCode)) {
162 me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
163 return;
164 }
165
166 /* parse derived binary property name, ignore unknown names */
167 i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]);
168 if(i<0) {
169 UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
170 propName.trim();
171 if(me->unknownPropertyNames->find(propName)==NULL) {
172 UErrorCode errorCode=U_ZERO_ERROR;
173 me->unknownPropertyNames->puti(propName, 1, errorCode);
174 me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
175 }
176 return;
177 }
178
179 me->derivedProps[i].add(start, end);
180 }
181
182 void UnicodeTest::TestAdditionalProperties() {
183 #if !UCONFIG_NO_NORMALIZATION
184 // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
185 if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) {
186 errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
187 LENGTHOF(derivedPropsNames));
188 return;
189 }
190 if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) {
191 errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n");
192 return;
193 }
194
195 char newPath[256];
196 char backupPath[256];
197 char *fields[2][2];
198 UErrorCode errorCode=U_ZERO_ERROR;
199
200 /* Look inside ICU_DATA first */
201 strcpy(newPath, pathToDataDirectory());
202 strcat(newPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
203
204 // As a fallback, try to guess where the source data was located
205 // at the time ICU was built, and look there.
206 # ifdef U_TOPSRCDIR
207 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data");
208 # else
209 strcpy(backupPath, loadTestData(errorCode));
210 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
211 # endif
212 strcat(backupPath, U_FILE_SEP_STRING);
213 strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
214
215 char *path=newPath;
216 u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
217
218 if(errorCode==U_FILE_ACCESS_ERROR) {
219 errorCode=U_ZERO_ERROR;
220 path=backupPath;
221 u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
222 }
223 if(U_FAILURE(errorCode)) {
224 errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
225 return;
226 }
227 char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt");
228 strcpy(basename, "DerivedNormalizationProps.txt");
229 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
230 if(U_FAILURE(errorCode)) {
231 errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
232 return;
233 }
234
235 // now we have all derived core properties in the UnicodeSets
236 // run them all through the API
237 int32_t rangeCount, range;
238 uint32_t i;
239 UChar32 start, end;
240
241 // test all TRUE properties
242 for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
243 rangeCount=derivedProps[i].getRangeCount();
244 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
245 start=derivedProps[i].getRangeStart(range);
246 end=derivedProps[i].getRangeEnd(range);
247 for(; start<=end; ++start) {
248 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
249 dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]);
250 if(++numErrors[i]>=MAX_ERRORS) {
251 dataerrln("Too many errors, moving to the next test");
252 break;
253 }
254 }
255 }
256 }
257 }
258
259 // invert all properties
260 for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
261 derivedProps[i].complement();
262 }
263
264 // test all FALSE properties
265 for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
266 rangeCount=derivedProps[i].getRangeCount();
267 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
268 start=derivedProps[i].getRangeStart(range);
269 end=derivedProps[i].getRangeEnd(range);
270 for(; start<=end; ++start) {
271 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
272 errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]);
273 if(++numErrors[i]>=MAX_ERRORS) {
274 errln("Too many errors, moving to the next test");
275 break;
276 }
277 }
278 }
279 }
280 }
281 #endif /* !UCONFIG_NO_NORMALIZATION */
282 }
283
284 void UnicodeTest::TestBinaryValues() {
285 /*
286 * Unicode 5.1 explicitly defines binary property value aliases.
287 * Verify that they are all recognized.
288 */
289 UErrorCode errorCode=U_ZERO_ERROR;
290 UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
291 if(U_FAILURE(errorCode)) {
292 dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
293 return;
294 }
295
296 static const char *const falseValues[]={ "N", "No", "F", "False" };
297 static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
298 int32_t i;
299 for(i=0; i<LENGTHOF(falseValues); ++i) {
300 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
301 pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
302 errorCode=U_ZERO_ERROR;
303 UnicodeSet set(pattern, errorCode);
304 if(U_FAILURE(errorCode)) {
305 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
306 continue;
307 }
308 set.complement();
309 if(set!=alpha) {
310 errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
311 }
312 }
313 for(i=0; i<LENGTHOF(trueValues); ++i) {
314 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
315 pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
316 errorCode=U_ZERO_ERROR;
317 UnicodeSet set(pattern, errorCode);
318 if(U_FAILURE(errorCode)) {
319 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
320 continue;
321 }
322 if(set!=alpha) {
323 errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
324 }
325 }
326 }
327
328 void UnicodeTest::TestConsistency() {
329 #if !UCONFIG_NO_NORMALIZATION
330 /*
331 * Test for an example that getCanonStartSet() delivers
332 * all characters that compose from the input one,
333 * even in multiple steps.
334 * For example, the set for "I" (0049) should contain both
335 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
336 * In general, the set for the middle such character should be a subset
337 * of the set for the first.
338 */
339 IcuTestErrorCode errorCode(*this, "TestConsistency");
340 const Normalizer2 *nfd=Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
341 const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
342 if(errorCode.isFailure()) {
343 dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
344 errorCode.errorName());
345 errorCode.reset();
346 return;
347 }
348
349 UnicodeSet set1, set2;
350 if (nfcImpl->getCanonStartSet(0x49, set1)) {
351 /* enumerate all characters that are plausible to be latin letters */
352 for(UChar start=0xa0; start<0x2000; ++start) {
353 UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
354 if(decomp.length()>1 && decomp[0]==0x49) {
355 set2.add(start);
356 }
357 }
358
359 if (set1!=set2) {
360 errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
361 }
362 // This was available in cucdtst.c but the test had to move to intltest
363 // because the new internal normalization functions are in C++.
364 //compareUSets(set1, set2,
365 // "[canon start set of 0049]", "[all c with canon decomp with 0049]",
366 // TRUE);
367 } else {
368 errln("NFC.getCanonStartSet() returned FALSE");
369 }
370 #endif
371 }