]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f | 3 | /******************************************************************** |
2ca993e8 | 4 | * Copyright (c) 1997-2016, International Business Machines Corporation and |
b75a7d8f A |
5 | * others. All Rights Reserved. |
6 | ********************************************************************/ | |
7 | ||
8 | #include "unicode/ustring.h" | |
9 | #include "unicode/uchar.h" | |
10 | #include "unicode/uniset.h" | |
374ca955 | 11 | #include "unicode/putil.h" |
51004dcb | 12 | #include "unicode/uscript.h" |
b75a7d8f | 13 | #include "cstring.h" |
729e4ab9 | 14 | #include "hash.h" |
4388f060 | 15 | #include "patternprops.h" |
729e4ab9 | 16 | #include "normalizer2impl.h" |
b75a7d8f A |
17 | #include "uparse.h" |
18 | #include "ucdtest.h" | |
19 | ||
729e4ab9 A |
// Property names to ignore altogether: the UnicodeTest constructor enters each
// of them into unknownPropertyNames up front, so derivedPropsLineFn() finds
// them there and does not report them as unknown.
static const char *ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC", "NFC_QC", "NFKD_QC", "NFKC_QC",
    "Expands_On_NFD", "Expands_On_NFC", "Expands_On_NFKD", "Expands_On_NFKC",
    "NFKC_CF"
};
32 | ||
b75a7d8f A |
33 | UnicodeTest::UnicodeTest() |
34 | { | |
729e4ab9 A |
35 | UErrorCode errorCode=U_ZERO_ERROR; |
36 | unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode); | |
37 | if(U_FAILURE(errorCode)) { | |
38 | delete unknownPropertyNames; | |
39 | unknownPropertyNames=NULL; | |
40 | } | |
41 | // Ignore some property names altogether. | |
b331163b | 42 | for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) { |
729e4ab9 A |
43 | unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode); |
44 | } | |
b75a7d8f A |
45 | } |
46 | ||
47 | UnicodeTest::~UnicodeTest() | |
48 | { | |
729e4ab9 | 49 | delete unknownPropertyNames; |
b75a7d8f A |
50 | } |
51 | ||
52 | void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) | |
53 | { | |
4388f060 A |
54 | if(exec) { |
55 | logln("TestSuite UnicodeTest: "); | |
b75a7d8f | 56 | } |
4388f060 A |
57 | TESTCASE_AUTO_BEGIN; |
58 | TESTCASE_AUTO(TestAdditionalProperties); | |
59 | TESTCASE_AUTO(TestBinaryValues); | |
60 | TESTCASE_AUTO(TestConsistency); | |
61 | TESTCASE_AUTO(TestPatternProperties); | |
51004dcb | 62 | TESTCASE_AUTO(TestScriptMetadata); |
57a6839d | 63 | TESTCASE_AUTO(TestBidiPairedBracketType); |
2ca993e8 | 64 | TESTCASE_AUTO(TestEmojiProperties); |
0f5d89e8 A |
65 | TESTCASE_AUTO(TestDefaultScriptExtensions); |
66 | TESTCASE_AUTO(TestInvalidCodePointFolding); | |
4388f060 | 67 | TESTCASE_AUTO_END; |
b75a7d8f A |
68 | } |
69 | ||
70 | //==================================================== | |
71 | // private data used by the tests | |
72 | //==================================================== | |
73 | ||
74 | // test DerivedCoreProperties.txt ------------------------------------------- | |
75 | ||
76 | // copied from genprops.c | |
77 | static int32_t | |
78 | getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { | |
79 | const char *t, *z; | |
80 | int32_t i, j; | |
81 | ||
82 | s=u_skipWhitespace(s); | |
83 | for(i=0; i<countTokens; ++i) { | |
84 | t=tokens[i]; | |
85 | if(t!=NULL) { | |
86 | for(j=0;; ++j) { | |
87 | if(t[j]!=0) { | |
88 | if(s[j]!=t[j]) { | |
89 | break; | |
90 | } | |
91 | } else { | |
92 | z=u_skipWhitespace(s+j); | |
93 | if(*z==';' || *z==0) { | |
94 | return i; | |
95 | } else { | |
96 | break; | |
97 | } | |
98 | } | |
99 | } | |
100 | } | |
101 | } | |
102 | return -1; | |
103 | } | |
104 | ||
// Names of the binary properties recognized by derivedPropsLineFn().
// The index of a name here is also the index of the matching UProperty in
// derivedPropsIndex[] and of the matching set in UnicodeTest::derivedProps[].
static const char *const derivedPropsNames[]={
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    "Grapheme_Link",  /* Unicode 5 moves this property here from PropList.txt */
    "Grapheme_Base",
    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
129 | ||
130 | static const UProperty | |
729e4ab9 | 131 | derivedPropsIndex[]={ |
b75a7d8f A |
132 | UCHAR_MATH, |
133 | UCHAR_ALPHABETIC, | |
134 | UCHAR_LOWERCASE, | |
135 | UCHAR_UPPERCASE, | |
136 | UCHAR_ID_START, | |
137 | UCHAR_ID_CONTINUE, | |
138 | UCHAR_XID_START, | |
139 | UCHAR_XID_CONTINUE, | |
140 | UCHAR_DEFAULT_IGNORABLE_CODE_POINT, | |
729e4ab9 | 141 | UCHAR_FULL_COMPOSITION_EXCLUSION, |
b75a7d8f | 142 | UCHAR_GRAPHEME_EXTEND, |
73c04bcf | 143 | UCHAR_GRAPHEME_LINK, |
729e4ab9 A |
144 | UCHAR_GRAPHEME_BASE, |
145 | UCHAR_CASED, | |
146 | UCHAR_CASE_IGNORABLE, | |
147 | UCHAR_CHANGES_WHEN_LOWERCASED, | |
148 | UCHAR_CHANGES_WHEN_UPPERCASED, | |
149 | UCHAR_CHANGES_WHEN_TITLECASED, | |
150 | UCHAR_CHANGES_WHEN_CASEFOLDED, | |
151 | UCHAR_CHANGES_WHEN_CASEMAPPED, | |
152 | UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED | |
b75a7d8f A |
153 | }; |
154 | ||
b331163b | 155 | static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 }; |
729e4ab9 A |
156 | |
157 | enum { MAX_ERRORS=50 }; | |
158 | ||
374ca955 | 159 | U_CFUNC void U_CALLCONV |
729e4ab9 A |
160 | derivedPropsLineFn(void *context, |
161 | char *fields[][2], int32_t /* fieldCount */, | |
162 | UErrorCode *pErrorCode) | |
b75a7d8f A |
163 | { |
164 | UnicodeTest *me=(UnicodeTest *)context; | |
165 | uint32_t start, end; | |
166 | int32_t i; | |
167 | ||
168 | u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); | |
169 | if(U_FAILURE(*pErrorCode)) { | |
729e4ab9 | 170 | me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]); |
b75a7d8f A |
171 | return; |
172 | } | |
173 | ||
174 | /* parse derived binary property name, ignore unknown names */ | |
b331163b | 175 | i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]); |
b75a7d8f | 176 | if(i<0) { |
729e4ab9 A |
177 | UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0])); |
178 | propName.trim(); | |
179 | if(me->unknownPropertyNames->find(propName)==NULL) { | |
180 | UErrorCode errorCode=U_ZERO_ERROR; | |
181 | me->unknownPropertyNames->puti(propName, 1, errorCode); | |
182 | me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]); | |
183 | } | |
b75a7d8f A |
184 | return; |
185 | } | |
186 | ||
729e4ab9 | 187 | me->derivedProps[i].add(start, end); |
b75a7d8f A |
188 | } |
189 | ||
190 | void UnicodeTest::TestAdditionalProperties() { | |
729e4ab9 A |
191 | #if !UCONFIG_NO_NORMALIZATION |
192 | // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt | |
b331163b | 193 | if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) { |
729e4ab9 | 194 | errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n", |
b331163b A |
195 | UPRV_LENGTHOF(derivedPropsNames)); |
196 | return; | |
197 | } | |
198 | if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) { | |
199 | errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n"); | |
b75a7d8f A |
200 | return; |
201 | } | |
b331163b A |
202 | |
203 | char path[500]; | |
204 | if(getUnidataPath(path) == NULL) { | |
205 | errln("unable to find path to source/data/unidata/"); | |
b75a7d8f A |
206 | return; |
207 | } | |
b331163b A |
208 | char *basename=strchr(path, 0); |
209 | strcpy(basename, "DerivedCoreProperties.txt"); | |
b75a7d8f | 210 | |
b75a7d8f | 211 | char *fields[2][2]; |
b75a7d8f | 212 | UErrorCode errorCode=U_ZERO_ERROR; |
b331163b | 213 | u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); |
b75a7d8f A |
214 | if(U_FAILURE(errorCode)) { |
215 | errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode)); | |
216 | return; | |
217 | } | |
b331163b | 218 | |
729e4ab9 A |
219 | strcpy(basename, "DerivedNormalizationProps.txt"); |
220 | u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); | |
221 | if(U_FAILURE(errorCode)) { | |
222 | errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode)); | |
223 | return; | |
224 | } | |
b75a7d8f A |
225 | |
226 | // now we have all derived core properties in the UnicodeSets | |
227 | // run them all through the API | |
228 | int32_t rangeCount, range; | |
229 | uint32_t i; | |
230 | UChar32 start, end; | |
b75a7d8f A |
231 | |
232 | // test all TRUE properties | |
b331163b | 233 | for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { |
729e4ab9 A |
234 | rangeCount=derivedProps[i].getRangeCount(); |
235 | for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { | |
236 | start=derivedProps[i].getRangeStart(range); | |
237 | end=derivedProps[i].getRangeEnd(range); | |
b75a7d8f | 238 | for(; start<=end; ++start) { |
729e4ab9 A |
239 | if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) { |
240 | dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]); | |
241 | if(++numErrors[i]>=MAX_ERRORS) { | |
242 | dataerrln("Too many errors, moving to the next test"); | |
b75a7d8f A |
243 | break; |
244 | } | |
245 | } | |
246 | } | |
247 | } | |
248 | } | |
249 | ||
b75a7d8f | 250 | // invert all properties |
b331163b | 251 | for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { |
729e4ab9 | 252 | derivedProps[i].complement(); |
b75a7d8f A |
253 | } |
254 | ||
255 | // test all FALSE properties | |
b331163b | 256 | for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { |
729e4ab9 A |
257 | rangeCount=derivedProps[i].getRangeCount(); |
258 | for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { | |
259 | start=derivedProps[i].getRangeStart(range); | |
260 | end=derivedProps[i].getRangeEnd(range); | |
b75a7d8f | 261 | for(; start<=end; ++start) { |
729e4ab9 A |
262 | if(u_hasBinaryProperty(start, derivedPropsIndex[i])) { |
263 | errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]); | |
264 | if(++numErrors[i]>=MAX_ERRORS) { | |
b75a7d8f A |
265 | errln("Too many errors, moving to the next test"); |
266 | break; | |
267 | } | |
268 | } | |
269 | } | |
270 | } | |
271 | } | |
729e4ab9 | 272 | #endif /* !UCONFIG_NO_NORMALIZATION */ |
b75a7d8f | 273 | } |
46f4442e A |
274 | |
275 | void UnicodeTest::TestBinaryValues() { | |
276 | /* | |
277 | * Unicode 5.1 explicitly defines binary property value aliases. | |
278 | * Verify that they are all recognized. | |
279 | */ | |
280 | UErrorCode errorCode=U_ZERO_ERROR; | |
281 | UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode); | |
282 | if(U_FAILURE(errorCode)) { | |
729e4ab9 | 283 | dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode)); |
46f4442e A |
284 | return; |
285 | } | |
286 | ||
287 | static const char *const falseValues[]={ "N", "No", "F", "False" }; | |
288 | static const char *const trueValues[]={ "Y", "Yes", "T", "True" }; | |
289 | int32_t i; | |
b331163b | 290 | for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) { |
46f4442e A |
291 | UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); |
292 | pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV)); | |
293 | errorCode=U_ZERO_ERROR; | |
294 | UnicodeSet set(pattern, errorCode); | |
295 | if(U_FAILURE(errorCode)) { | |
296 | errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode)); | |
297 | continue; | |
298 | } | |
299 | set.complement(); | |
300 | if(set!=alpha) { | |
301 | errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]); | |
302 | } | |
303 | } | |
b331163b | 304 | for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) { |
46f4442e A |
305 | UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); |
306 | pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV)); | |
307 | errorCode=U_ZERO_ERROR; | |
308 | UnicodeSet set(pattern, errorCode); | |
309 | if(U_FAILURE(errorCode)) { | |
310 | errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode)); | |
311 | continue; | |
312 | } | |
313 | if(set!=alpha) { | |
314 | errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]); | |
315 | } | |
316 | } | |
317 | } | |
729e4ab9 A |
318 | |
319 | void UnicodeTest::TestConsistency() { | |
320 | #if !UCONFIG_NO_NORMALIZATION | |
321 | /* | |
322 | * Test for an example that getCanonStartSet() delivers | |
323 | * all characters that compose from the input one, | |
324 | * even in multiple steps. | |
325 | * For example, the set for "I" (0049) should contain both | |
326 | * I-diaeresis (00CF) and I-diaeresis-acute (1E2E). | |
327 | * In general, the set for the middle such character should be a subset | |
328 | * of the set for the first. | |
329 | */ | |
330 | IcuTestErrorCode errorCode(*this, "TestConsistency"); | |
4388f060 | 331 | const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode); |
729e4ab9 | 332 | const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); |
4388f060 | 333 | if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) { |
729e4ab9 A |
334 | dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n", |
335 | errorCode.errorName()); | |
336 | errorCode.reset(); | |
337 | return; | |
338 | } | |
339 | ||
340 | UnicodeSet set1, set2; | |
341 | if (nfcImpl->getCanonStartSet(0x49, set1)) { | |
342 | /* enumerate all characters that are plausible to be latin letters */ | |
343 | for(UChar start=0xa0; start<0x2000; ++start) { | |
344 | UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode); | |
345 | if(decomp.length()>1 && decomp[0]==0x49) { | |
346 | set2.add(start); | |
347 | } | |
348 | } | |
349 | ||
350 | if (set1!=set2) { | |
351 | errln("[canon start set of 0049] != [all c with canon decomp with 0049]"); | |
352 | } | |
353 | // This was available in cucdtst.c but the test had to move to intltest | |
354 | // because the new internal normalization functions are in C++. | |
355 | //compareUSets(set1, set2, | |
356 | // "[canon start set of 0049]", "[all c with canon decomp with 0049]", | |
357 | // TRUE); | |
358 | } else { | |
359 | errln("NFC.getCanonStartSet() returned FALSE"); | |
360 | } | |
361 | #endif | |
362 | } | |
4388f060 A |
363 | |
364 | /** | |
365 | * Test various implementations of Pattern_Syntax & Pattern_White_Space. | |
366 | */ | |
367 | void UnicodeTest::TestPatternProperties() { | |
368 | IcuTestErrorCode errorCode(*this, "TestPatternProperties()"); | |
369 | UnicodeSet syn_pp; | |
370 | UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode); | |
371 | UnicodeSet syn_list( | |
372 | "[!-/\\:-@\\[-\\^`\\{-~" | |
373 | "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7" | |
374 | "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775" | |
375 | "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode); | |
376 | UnicodeSet ws_pp; | |
377 | UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode); | |
378 | UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode); | |
379 | UnicodeSet syn_ws_pp; | |
380 | UnicodeSet syn_ws_prop(syn_prop); | |
381 | syn_ws_prop.addAll(ws_prop); | |
382 | for(UChar32 c=0; c<=0xffff; ++c) { | |
383 | if(PatternProps::isSyntax(c)) { | |
384 | syn_pp.add(c); | |
385 | } | |
386 | if(PatternProps::isWhiteSpace(c)) { | |
387 | ws_pp.add(c); | |
388 | } | |
389 | if(PatternProps::isSyntaxOrWhiteSpace(c)) { | |
390 | syn_ws_pp.add(c); | |
391 | } | |
392 | } | |
393 | compareUSets(syn_pp, syn_prop, | |
394 | "PatternProps.isSyntax()", "[:Pattern_Syntax:]", TRUE); | |
395 | compareUSets(syn_pp, syn_list, | |
396 | "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", TRUE); | |
397 | compareUSets(ws_pp, ws_prop, | |
398 | "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", TRUE); | |
399 | compareUSets(ws_pp, ws_list, | |
400 | "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", TRUE); | |
401 | compareUSets(syn_ws_pp, syn_ws_prop, | |
402 | "PatternProps.isSyntaxOrWhiteSpace()", | |
403 | "[[:Pattern_Syntax:][:Pattern_White_Space:]]", TRUE); | |
404 | } | |
405 | ||
406 | // So far only minimal port of Java & cucdtst.c compareUSets(). | |
407 | UBool | |
408 | UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b, | |
409 | const char *a_name, const char *b_name, | |
410 | UBool diffIsError) { | |
411 | UBool same= a==b; | |
412 | if(!same && diffIsError) { | |
413 | errln("Sets are different: %s vs. %s\n", a_name, b_name); | |
414 | } | |
415 | return same; | |
416 | } | |
51004dcb A |
417 | |
418 | namespace { | |
419 | ||
420 | /** | |
421 | * Maps a special script code to the most common script of its encoded characters. | |
422 | */ | |
423 | UScriptCode getCharScript(UScriptCode script) { | |
424 | switch(script) { | |
f3c0d7a5 | 425 | case USCRIPT_HAN_WITH_BOPOMOFO: |
51004dcb A |
426 | case USCRIPT_SIMPLIFIED_HAN: |
427 | case USCRIPT_TRADITIONAL_HAN: | |
428 | return USCRIPT_HAN; | |
429 | case USCRIPT_JAPANESE: | |
430 | return USCRIPT_HIRAGANA; | |
f3c0d7a5 | 431 | case USCRIPT_JAMO: |
51004dcb A |
432 | case USCRIPT_KOREAN: |
433 | return USCRIPT_HANGUL; | |
f3c0d7a5 A |
434 | case USCRIPT_SYMBOLS_EMOJI: |
435 | return USCRIPT_SYMBOLS; | |
51004dcb A |
436 | default: |
437 | return script; | |
438 | } | |
439 | } | |
440 | ||
441 | } // namespace | |
442 | ||
443 | void UnicodeTest::TestScriptMetadata() { | |
444 | IcuTestErrorCode errorCode(*this, "TestScriptMetadata()"); | |
445 | UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode); | |
446 | // So far, sample characters are uppercase. | |
447 | // Georgian is special. | |
448 | UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode); | |
449 | for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) { | |
450 | UScriptCode sc = (UScriptCode)sci; | |
451 | // Run the test with -v to see which script has failures: | |
f3c0d7a5 | 452 | // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL |
51004dcb A |
453 | logln(uscript_getShortName(sc)); |
454 | UScriptUsage usage = uscript_getUsage(sc); | |
455 | UnicodeString sample = uscript_getSampleUnicodeString(sc); | |
456 | UnicodeSet scriptSet; | |
457 | scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode); | |
458 | if(usage == USCRIPT_USAGE_NOT_ENCODED) { | |
459 | assertTrue("not encoded, no sample", sample.isEmpty()); | |
460 | assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc)); | |
461 | assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc)); | |
462 | assertFalse("not encoded, not cased", uscript_isCased(sc)); | |
463 | assertTrue("not encoded, no characters", scriptSet.isEmpty()); | |
464 | } else { | |
465 | assertFalse("encoded, has a sample character", sample.isEmpty()); | |
466 | UChar32 firstChar = sample.char32At(0); | |
467 | UScriptCode charScript = getCharScript(sc); | |
468 | assertEquals("script(sample(script))", | |
57a6839d A |
469 | (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode)); |
470 | assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc)); | |
471 | assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc)); | |
472 | assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty())); | |
51004dcb A |
473 | if(uscript_isRightToLeft(sc)) { |
474 | rtl.removeAll(scriptSet); | |
475 | } | |
476 | if(uscript_isCased(sc)) { | |
477 | cased.removeAll(scriptSet); | |
478 | } | |
479 | } | |
480 | } | |
481 | UnicodeString pattern; | |
482 | assertEquals("no remaining RTL characters", | |
483 | UnicodeString("[]"), rtl.toPattern(pattern)); | |
484 | assertEquals("no remaining cased characters", | |
485 | UnicodeString("[]"), cased.toPattern(pattern)); | |
486 | ||
487 | assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN)); | |
488 | assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI)); | |
489 | assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN)); | |
490 | } | |
57a6839d A |
491 | |
492 | void UnicodeTest::TestBidiPairedBracketType() { | |
493 | // BidiBrackets-6.3.0.txt says: | |
494 | // | |
495 | // The set of code points listed in this file was originally derived | |
496 | // using the character properties General_Category (gc), Bidi_Class (bc), | |
497 | // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows: | |
498 | // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe, | |
499 | // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket | |
500 | // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type | |
501 | // property values are Open and Close, respectively. | |
502 | IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()"); | |
503 | UnicodeSet bpt("[:^bpt=n:]", errorCode); | |
504 | assertTrue("bpt!=None is not empty", !bpt.isEmpty()); | |
505 | // The following should always be true. | |
506 | UnicodeSet mirrored("[:Bidi_M:]", errorCode); | |
507 | UnicodeSet other_neutral("[:bc=ON:]", errorCode); | |
508 | assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt)); | |
509 | assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt)); | |
510 | // The following are true at least initially in Unicode 6.3. | |
511 | UnicodeSet bpt_open("[:bpt=o:]", errorCode); | |
512 | UnicodeSet bpt_close("[:bpt=c:]", errorCode); | |
513 | UnicodeSet ps("[:Ps:]", errorCode); | |
514 | UnicodeSet pe("[:Pe:]", errorCode); | |
515 | assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open)); | |
516 | assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close)); | |
517 | } | |
2ca993e8 A |
518 | |
519 | void UnicodeTest::TestEmojiProperties() { | |
520 | assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI)); | |
521 | assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI)); | |
522 | IcuTestErrorCode errorCode(*this, "TestEmojiProperties()"); | |
523 | UnicodeSet emoji("[:Emoji:]", errorCode); | |
524 | assertTrue("lots of Emoji", emoji.size() > 700); | |
525 | ||
526 | assertTrue("shooting star is Emoji_Presentation", | |
527 | u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION)); | |
528 | assertTrue("Fitzpatrick 6 is Emoji_Modifier", | |
529 | u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER)); | |
530 | assertTrue("happy person is Emoji_Modifier_Base", | |
531 | u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE)); | |
6be67b06 A |
532 | assertTrue("asterisk is Emoji_Component", |
533 | u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT)); | |
0f5d89e8 A |
534 | assertTrue("copyright is Extended_Pictographic", |
535 | u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC)); | |
536 | #if U_PLATFORM_IS_DARWIN_BASED | |
6be67b06 A |
537 | assertTrue("TAG char is Emoji_Component", |
538 | u_hasBinaryProperty(0xE0061, UCHAR_EMOJI_COMPONENT)); // Apple addition | |
539 | assertTrue("ZWJ char is Emoji_Component", | |
540 | u_hasBinaryProperty(0x200D, UCHAR_EMOJI_COMPONENT)); // Apple addition | |
0f5d89e8 A |
541 | #endif |
542 | } | |
543 | ||
544 | void UnicodeTest::TestDefaultScriptExtensions() { | |
545 | // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii | |
546 | // but some of its characters revert to scx=<script> which is usually Common. | |
547 | IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()"); | |
548 | UScriptCode scx[20]; | |
549 | scx[0] = USCRIPT_INVALID_CODE; | |
550 | assertEquals("U+3000 num scx", 1, // IDEOGRAPHIC SPACE | |
551 | uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode)); | |
552 | assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]); | |
553 | scx[0] = USCRIPT_INVALID_CODE; | |
554 | assertEquals("U+3012 num scx", 1, // POSTAL MARK | |
555 | uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode)); | |
556 | assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]); | |
557 | } | |
558 | ||
559 | void UnicodeTest::TestInvalidCodePointFolding(void) { | |
560 | // Test behavior when an invalid code point is passed to u_foldCase | |
561 | static const UChar32 invalidCodePoints[] = { | |
562 | 0xD800, // lead surrogate | |
563 | 0xDFFF, // trail surrogate | |
564 | 0xFDD0, // noncharacter | |
565 | 0xFFFF, // noncharacter | |
566 | 0x110000, // out of range | |
567 | -1 // negative | |
568 | }; | |
569 | for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) { | |
570 | UChar32 cp = invalidCodePoints[i]; | |
571 | assertEquals("Invalid code points should be echoed back", | |
572 | cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT)); | |
573 | assertEquals("Invalid code points should be echoed back", | |
574 | cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I)); | |
575 | } | |
2ca993e8 | 576 | } |