1 /********************************************************************
3 * Copyright (c) 1997-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_NORMALIZATION
11 #include "unicode/uchar.h"
12 #include "unicode/errorcode.h"
13 #include "unicode/normlzr.h"
14 #include "unicode/uniset.h"
15 #include "unicode/usetiter.h"
16 #include "unicode/schriter.h"
17 #include "unicode/utf16.h"
19 #include "normalizer2impl.h"
// Number of elements in a fixed-size array, cast to int32_t.
22 #define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
// Alias for LENGTHOF(); both spellings are used in this file.
23 #define ARRAY_LENGTH(array) LENGTHOF(array)
// Emits a `case id:` label for the test dispatch in runIndexedTest().
25 #define CASE(id,test) case id: \
29 logln((UnicodeString)""); \
34 static UErrorCode status
= U_ZERO_ERROR
; // file-scope error code handed to Normalizer calls by helpers such as staticTest() and backAndForth()
// Test dispatcher: each CASE(n, method) entry associates a test index with one
// test method of this class. An index without an entry reaches `default`,
// which sets `name` to the empty string.
36 void BasicNormalizerTest::runIndexedTest(int32_t index
, UBool exec
,
37 const char* &name
, char* /*par*/) {
40 CASE(1,TestCompatDecomp
);
41 CASE(2,TestCanonCompose
);
42 CASE(3,TestCompatCompose
);
44 CASE(5,TestHangulDecomp
);
45 CASE(6,TestHangulCompose
);
47 CASE(8,TestCompositionExclusion
);
48 CASE(9,TestZeroIndex
);
49 CASE(10,TestVerisign
);
50 CASE(11,TestPreviousNext
);
51 CASE(12,TestNormalizerAPI
);
52 CASE(13,TestConcatenate
);
53 CASE(14,FindFoldFCDExceptions
);
55 CASE(16,TestSkippable
);
// The custom-data tests that follow are guarded on file I/O and legacy conversion support.
56 #if !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
57 CASE(17,TestCustomComp
);
58 CASE(18,TestCustomFCC
);
60 CASE(19,TestFilteredNormalizer2Coverage
);
61 default: name
= ""; break;
66 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
// Builds a UnicodeString from `input` using the invariant-character
// constructor (empty codepage name), then returns the result of unescape()
// so that escape sequences such as \uXXXX in the literal become code units.
68 static UnicodeString
str(const char *input
)
70 UnicodeString
str(input
, ""); // Invariant conversion
71 return str
.unescape();
// Constructor: fills the canonTests, compatTests and hangulCanon tables.
// Every row holds three strings: [0] the input, [1] its decomposed form and
// [2] its composed form. The tests pick the expected column by index
// (1 for decomposition, 2 for composition) when calling staticTest()/iterateTest().
75 BasicNormalizerTest::BasicNormalizerTest()
78 // Input Decomposed Composed
80 canonTests
[0][0] = str("cat"); canonTests
[0][1] = str("cat"); canonTests
[0][2] = str("cat");
82 canonTests
[1][0] = str("\\u00e0ardvark"); canonTests
[1][1] = str("a\\u0300ardvark"); canonTests
[1][2] = str("\\u00e0ardvark");
84 canonTests
[2][0] = str("\\u1e0a"); canonTests
[2][1] = str("D\\u0307"); canonTests
[2][2] = str("\\u1e0a"); // D-dot_above
86 canonTests
[3][0] = str("D\\u0307"); canonTests
[3][1] = str("D\\u0307"); canonTests
[3][2] = str("\\u1e0a"); // D dot_above
88 canonTests
[4][0] = str("\\u1e0c\\u0307"); canonTests
[4][1] = str("D\\u0323\\u0307"); canonTests
[4][2] = str("\\u1e0c\\u0307"); // D-dot_below dot_above
90 canonTests
[5][0] = str("\\u1e0a\\u0323"); canonTests
[5][1] = str("D\\u0323\\u0307"); canonTests
[5][2] = str("\\u1e0c\\u0307"); // D-dot_above dot_below
92 canonTests
[6][0] = str("D\\u0307\\u0323"); canonTests
[6][1] = str("D\\u0323\\u0307"); canonTests
[6][2] = str("\\u1e0c\\u0307"); // D dot_below dot_above
94 canonTests
[7][0] = str("\\u1e10\\u0307\\u0323"); canonTests
[7][1] = str("D\\u0327\\u0323\\u0307"); canonTests
[7][2] = str("\\u1e10\\u0323\\u0307"); // D dot_below cedilla dot_above
96 canonTests
[8][0] = str("D\\u0307\\u0328\\u0323"); canonTests
[8][1] = str("D\\u0328\\u0323\\u0307"); canonTests
[8][2] = str("\\u1e0c\\u0328\\u0307"); // D dot_above ogonek dot_below
98 canonTests
[9][0] = str("\\u1E14"); canonTests
[9][1] = str("E\\u0304\\u0300"); canonTests
[9][2] = str("\\u1E14"); // E-macron-grave
100 canonTests
[10][0] = str("\\u0112\\u0300"); canonTests
[10][1] = str("E\\u0304\\u0300"); canonTests
[10][2] = str("\\u1E14"); // E-macron + grave
102 canonTests
[11][0] = str("\\u00c8\\u0304"); canonTests
[11][1] = str("E\\u0300\\u0304"); canonTests
[11][2] = str("\\u00c8\\u0304"); // E-grave + macron
104 canonTests
[12][0] = str("\\u212b"); canonTests
[12][1] = str("A\\u030a"); canonTests
[12][2] = str("\\u00c5"); // angstrom_sign
106 canonTests
[13][0] = str("\\u00c5"); canonTests
[13][1] = str("A\\u030a"); canonTests
[13][2] = str("\\u00c5"); // A-ring
108 canonTests
[14][0] = str("\\u00C4ffin"); canonTests
[14][1] = str("A\\u0308ffin"); canonTests
[14][2] = str("\\u00C4ffin");
110 canonTests
[15][0] = str("\\u00C4\\uFB03n"); canonTests
[15][1] = str("A\\u0308\\uFB03n"); canonTests
[15][2] = str("\\u00C4\\uFB03n");
112 canonTests
[16][0] = str("Henry IV"); canonTests
[16][1] = str("Henry IV"); canonTests
[16][2] = str("Henry IV");
114 canonTests
[17][0] = str("Henry \\u2163"); canonTests
[17][1] = str("Henry \\u2163"); canonTests
[17][2] = str("Henry \\u2163");
116 canonTests
[18][0] = str("\\u30AC"); canonTests
[18][1] = str("\\u30AB\\u3099"); canonTests
[18][2] = str("\\u30AC"); // ga (Katakana)
118 canonTests
[19][0] = str("\\u30AB\\u3099"); canonTests
[19][1] = str("\\u30AB\\u3099"); canonTests
[19][2] = str("\\u30AC"); // ka + ten
120 canonTests
[20][0] = str("\\uFF76\\uFF9E"); canonTests
[20][1] = str("\\uFF76\\uFF9E"); canonTests
[20][2] = str("\\uFF76\\uFF9E"); // hw_ka + hw_ten
122 canonTests
[21][0] = str("\\u30AB\\uFF9E"); canonTests
[21][1] = str("\\u30AB\\uFF9E"); canonTests
[21][2] = str("\\u30AB\\uFF9E"); // ka + hw_ten
124 canonTests
[22][0] = str("\\uFF76\\u3099"); canonTests
[22][1] = str("\\uFF76\\u3099"); canonTests
[22][2] = str("\\uFF76\\u3099"); // hw_ka + ten
126 canonTests
[23][0] = str("A\\u0300\\u0316"); canonTests
[23][1] = str("A\\u0316\\u0300"); canonTests
[23][2] = str("\\u00C0\\u0316");
// Compatibility cases: same three-column layout, used with the NFKD/NFKC modes.
129 // Input Decomposed Composed
130 compatTests
[0][0] = str("cat"); compatTests
[0][1] = str("cat"); compatTests
[0][2] = str("cat") ;
132 compatTests
[1][0] = str("\\uFB4f"); compatTests
[1][1] = str("\\u05D0\\u05DC"); compatTests
[1][2] = str("\\u05D0\\u05DC"); // Alef-Lamed vs. Alef, Lamed
134 compatTests
[2][0] = str("\\u00C4ffin"); compatTests
[2][1] = str("A\\u0308ffin"); compatTests
[2][2] = str("\\u00C4ffin") ;
136 compatTests
[3][0] = str("\\u00C4\\uFB03n"); compatTests
[3][1] = str("A\\u0308ffin"); compatTests
[3][2] = str("\\u00C4ffin") ; // ffi ligature -> f + f + i
138 compatTests
[4][0] = str("Henry IV"); compatTests
[4][1] = str("Henry IV"); compatTests
[4][2] = str("Henry IV") ;
140 compatTests
[5][0] = str("Henry \\u2163"); compatTests
[5][1] = str("Henry IV"); compatTests
[5][2] = str("Henry IV") ;
142 compatTests
[6][0] = str("\\u30AC"); compatTests
[6][1] = str("\\u30AB\\u3099"); compatTests
[6][2] = str("\\u30AC") ; // ga (Katakana)
144 compatTests
[7][0] = str("\\u30AB\\u3099"); compatTests
[7][1] = str("\\u30AB\\u3099"); compatTests
[7][2] = str("\\u30AC") ; // ka + ten
146 compatTests
[8][0] = str("\\uFF76\\u3099"); compatTests
[8][1] = str("\\u30AB\\u3099"); compatTests
[8][2] = str("\\u30AC") ; // hw_ka + ten
148 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later */
149 compatTests
[9][0] = str("\\uFF76\\uFF9E"); compatTests
[9][1] = str("\\u30AB\\u3099"); compatTests
[9][2] = str("\\u30AC") ; // hw_ka + hw_ten
151 compatTests
[10][0] = str("\\u30AB\\uFF9E"); compatTests
[10][1] = str("\\u30AB\\u3099"); compatTests
[10][2] = str("\\u30AC") ; // ka + hw_ten
153 /* Hangul Canonical */
154 // Input Decomposed Composed
155 hangulCanon
[0][0] = str("\\ud4db"); hangulCanon
[0][1] = str("\\u1111\\u1171\\u11b6"); hangulCanon
[0][2] = str("\\ud4db") ;
// NOTE(review): the three assignments below are joined with the comma operator
// rather than ';' as in every other row; the effect is the same.
157 hangulCanon
[1][0] = str("\\u1111\\u1171\\u11b6"), hangulCanon
[1][1] = str("\\u1111\\u1171\\u11b6"), hangulCanon
[1][2] = str("\\ud4db");
// Destructor of the test class.
160 BasicNormalizerTest::~BasicNormalizerTest()
// Runs backAndForth() (forward vs. backward iteration) over the input column
// of every canonTests row: first with a Normalizer in NFD mode, then with the
// same Normalizer object switched to NFC.
164 void BasicNormalizerTest::TestPrevious()
166 Normalizer
* norm
= new Normalizer("", UNORM_NFD
);
168 logln("testing decomp...");
170 for (i
= 0; i
< ARRAY_LENGTH(canonTests
); i
++) {
171 backAndForth(norm
, canonTests
[i
][0]);
174 logln("testing compose...");
// Reuse the same iterator object, now composing instead of decomposing.
175 norm
->setMode(UNORM_NFC
);
176 for (i
= 0; i
< ARRAY_LENGTH(canonTests
); i
++) {
177 backAndForth(norm
, canonTests
[i
][0]);
// Canonical decomposition (NFD): checks canonTests against column 1 both
// through an iterating Normalizer (iterateTest) and through the static
// Normalizer::normalize() path (staticTest).
183 void BasicNormalizerTest::TestDecomp()
185 Normalizer
* norm
= new Normalizer("", UNORM_NFD
);
186 iterateTest(norm
, canonTests
, ARRAY_LENGTH(canonTests
), 1);
187 staticTest(UNORM_NFD
, 0, canonTests
, ARRAY_LENGTH(canonTests
), 1);
// Compatibility decomposition (NFKD): checks compatTests against column 1
// through both the iterator path and the static normalize() path.
191 void BasicNormalizerTest::TestCompatDecomp()
193 Normalizer
* norm
= new Normalizer("", UNORM_NFKD
);
194 iterateTest(norm
, compatTests
, ARRAY_LENGTH(compatTests
), 1);
196 staticTest(UNORM_NFKD
, 0,
197 compatTests
, ARRAY_LENGTH(compatTests
), 1);
// Canonical composition (NFC): checks canonTests against column 2 through
// both the iterator path and the static normalize() path.
201 void BasicNormalizerTest::TestCanonCompose()
203 Normalizer
* norm
= new Normalizer("", UNORM_NFC
);
204 iterateTest(norm
, canonTests
, ARRAY_LENGTH(canonTests
), 2);
206 staticTest(UNORM_NFC
, 0, canonTests
,
207 ARRAY_LENGTH(canonTests
), 2);
// Compatibility composition (NFKC): checks compatTests against column 2
// through both the iterator path and the static normalize() path.
211 void BasicNormalizerTest::TestCompatCompose()
213 Normalizer
* norm
= new Normalizer("", UNORM_NFKC
);
214 iterateTest(norm
, compatTests
, ARRAY_LENGTH(compatTests
), 2);
216 staticTest(UNORM_NFKC
, 0,
217 compatTests
, ARRAY_LENGTH(compatTests
), 2);
222 //-------------------------------------------------------------------------------
// Hangul composition: checks hangulCanon against column 2 with the static
// NFC path and with an iterating Normalizer, then verifies forward/backward
// iteration agreement in NFC mode.
224 void BasicNormalizerTest::TestHangulCompose()
226 // Make sure that the static composition methods work
227 logln("Canonical composition...");
228 staticTest(UNORM_NFC
, 0, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 2);
229 logln("Compatibility composition...");
231 // Now try iterative composition....
// NOTE(review): the message below says "Static" although this section runs the
// iterator-based check; TestHangulDecomp logs "Iterative decomposition..." here.
232 logln("Static composition...");
233 Normalizer
* norm
= new Normalizer("", UNORM_NFC
);
234 iterateTest(norm
, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 2);
235 norm
->setMode(UNORM_NFKC
);
237 // And finally, make sure you can do it in reverse too
238 logln("Reverse iteration...");
239 norm
->setMode(UNORM_NFC
);
// NOTE(review): `i` is uint32_t while ARRAY_LENGTH() yields int32_t, so this
// comparison mixes signed and unsigned operands.
240 for (uint32_t i
= 0; i
< ARRAY_LENGTH(hangulCanon
); i
++) {
241 backAndForth(norm
, hangulCanon
[i
][0]);
// Hangul decomposition: checks hangulCanon against column 1 with the static
// NFD path and with an iterating Normalizer, then verifies forward/backward
// iteration agreement in NFD mode.
246 void BasicNormalizerTest::TestHangulDecomp()
248 // Make sure that the static decomposition methods work
249 logln("Canonical decomposition...");
250 staticTest(UNORM_NFD
, 0, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 1);
251 logln("Compatibility decomposition...");
253 // Now the iterative decomposition methods...
254 logln("Iterative decomposition...");
255 Normalizer
* norm
= new Normalizer("", UNORM_NFD
);
256 iterateTest(norm
, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 1);
257 norm
->setMode(UNORM_NFKD
);
259 // And finally, make sure you can do it in reverse too
260 logln("Reverse iteration...");
261 norm
->setMode(UNORM_NFD
);
// NOTE(review): `i` is uint32_t while ARRAY_LENGTH() yields int32_t, so this
// comparison mixes signed and unsigned operands.
262 for (uint32_t i
= 0; i
< ARRAY_LENGTH(hangulCanon
); i
++) {
263 backAndForth(norm
, hangulCanon
[i
][0]);
269 * The Tibetan vowel sign AA, 0f71, was messed up prior to Unicode version 2.1.9.
// Checks normalization of U+0F77 and of the sequence U+0FB2 U+0F71 U+0F80
// using two one-row tables: `decomp` for NFD/NFKD and `compose` for NFC/NFKC.
// The canonical modes compare against column 1, the compatibility modes
// against column 2.
271 void BasicNormalizerTest::TestTibetan(void) {
272 UnicodeString decomp
[1][3];
273 decomp
[0][0] = str("\\u0f77");
274 decomp
[0][1] = str("\\u0f77");
275 decomp
[0][2] = str("\\u0fb2\\u0f71\\u0f80");
277 UnicodeString compose
[1][3];
278 compose
[0][0] = str("\\u0fb2\\u0f71\\u0f80");
279 compose
[0][1] = str("\\u0fb2\\u0f71\\u0f80");
280 compose
[0][2] = str("\\u0fb2\\u0f71\\u0f80");
282 staticTest(UNORM_NFD
, 0, decomp
, ARRAY_LENGTH(decomp
), 1);
283 staticTest(UNORM_NFKD
, 0, decomp
, ARRAY_LENGTH(decomp
), 2);
284 staticTest(UNORM_NFC
, 0, compose
, ARRAY_LENGTH(compose
), 1);
285 staticTest(UNORM_NFKC
, 0, compose
, ARRAY_LENGTH(compose
), 2);
289 * Make sure characters in the CompositionExclusion.txt list do not get
// For each UTF-16 code unit in EXCLUDED: builds a one-character string `a`,
// normalizes it with NFKD into `b`, then normalizes `b` with NFC into `c`,
// and reports the outcome (FAIL through errln, or Ok through logln when
// `verbose` is set). Both normalize() calls use the file-scope `status`.
292 void BasicNormalizerTest::TestCompositionExclusion(void) {
293 // This list is generated from CompositionExclusion.txt.
294 // Update whenever the normalizer tables are updated. Note
295 // that we test all characters listed, even those that can be
296 // derived from the Unicode DB and are therefore commented
298 // ### TODO read composition exclusion from source/data/unidata file
299 // and test against that
300 UnicodeString EXCLUDED
= str(
301 "\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958"
302 "\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC"
303 "\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E"
304 "\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69"
305 "\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2"
306 "\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79"
307 "\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u1FDB"
308 "\\u1FE3\\u1FEB\\u1FEE\\u1FEF\\u1FF9\\u1FFB\\u1FFD\\u2000"
309 "\\u2001\\u2126\\u212A\\u212B\\u2329\\u232A\\uF900\\uFA10"
310 "\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A\\uFB1F"
311 "\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E\\uFB2F\\uFB30\\uFB31"
312 "\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36\\uFB38\\uFB39\\uFB3A"
313 "\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
314 "\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B\\uFB4C\\uFB4D\\uFB4E"
316 for (int32_t i
=0; i
<EXCLUDED
.length(); ++i
) {
317 UnicodeString
a(EXCLUDED
.charAt(i
));
// Decompose (compatibility) first, then try to recompose canonically.
320 Normalizer::normalize(a
, UNORM_NFKD
, 0, b
, status
);
321 Normalizer::normalize(b
, UNORM_NFC
, 0, c
, status
);
323 errln("FAIL: " + hex(a
) + " x DECOMP_COMPAT => " +
324 hex(b
) + " x COMPOSE => " +
326 } else if (verbose
) {
327 logln("Ok: " + hex(a
) + " x DECOMP_COMPAT => " +
328 hex(b
) + " x COMPOSE => " +
335 * Test for a problem that showed up just before ICU 1.6 release
336 * having to do with combining characters with an index of zero.
337 * Such characters do not participate in any canonical
338 * decompositions. However, having an index of zero means that
339 * they all share one typeMask[] entry, that is, they all have to
340 * map to the same canonical class, which is not the case, in
// Walks DATA three entries at a time. For each triple: normalizes the first
// string with NFKC and reports it against the second string, then normalizes
// that NFKC result with NFD and reports it against the third string.
// A normalize() failure is reported through dataerrln.
343 void BasicNormalizerTest::TestZeroIndex(void) {
344 const char* DATA
[] = {
345 // Expect col1 x COMPOSE_COMPAT => col2
346 // Expect col2 x DECOMP => col3
347 "A\\u0316\\u0300", "\\u00C0\\u0316", "A\\u0316\\u0300",
348 "A\\u0300\\u0316", "\\u00C0\\u0316", "A\\u0316\\u0300",
349 "A\\u0327\\u0300", "\\u00C0\\u0327", "A\\u0327\\u0300",
350 "c\\u0321\\u0327", "c\\u0321\\u0327", "c\\u0321\\u0327",
351 "c\\u0327\\u0321", "\\u00E7\\u0321", "c\\u0327\\u0321",
353 int32_t DATA_length
= (int32_t)(sizeof(DATA
) / sizeof(DATA
[0]));
// Step by 3: DATA is a flat list of (input, composed, decomposed) triples.
355 for (int32_t i
=0; i
<DATA_length
; i
+=3) {
// Local error code; it shadows the file-scope `status` inside this loop.
356 UErrorCode status
= U_ZERO_ERROR
;
357 UnicodeString
a(DATA
[i
], "");
360 Normalizer::normalize(a
, UNORM_NFKC
, 0, b
, status
);
361 if (U_FAILURE(status
)) {
362 dataerrln("Error calling normalize UNORM_NFKC: %s", u_errorName(status
));
364 UnicodeString
exp(DATA
[i
+1], "");
365 exp
= exp
.unescape();
367 logln((UnicodeString
)"Ok: " + hex(a
) + " x COMPOSE_COMPAT => " + hex(b
));
369 errln((UnicodeString
)"FAIL: " + hex(a
) + " x COMPOSE_COMPAT => " + hex(b
) +
370 ", expect " + hex(exp
));
// Second half: decompose the composed result back into `a`.
373 Normalizer::normalize(b
, UNORM_NFD
, 0, a
, status
);
374 if (U_FAILURE(status
)) {
375 dataerrln("Error calling normalize UNORM_NFD: %s", u_errorName(status
));
377 UnicodeString exp
= UnicodeString(DATA
[i
+2], "").unescape();
379 logln((UnicodeString
)"Ok: " + hex(b
) + " x DECOMP => " + hex(a
));
381 errln((UnicodeString
)"FAIL: " + hex(b
) + " x DECOMP => " + hex(a
) +
382 ", expect " + hex(exp
));
389 * Run a few specific cases that are failing for Verisign.
// Regression cases for two Hebrew mark sequences. `data` holds, per row, the
// input in column 0 and the expected reordered sequence in column 1; column 2
// is an empty string. Both the NFD and the NFC static checks compare against
// column 1.
391 void BasicNormalizerTest::TestVerisign(void) {
394 > 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F
395 > Their output (supposedly from ICU):
396 > 05B8 05B1 05B9 0591 05C3 05B0 05AC 059F
397 > My output from charlint:
398 > 05B1 05B8 05B9 0591 05C3 05B0 05AC 059F
400 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F => 05B1 05B8 05B9 0591 05C3 05B0
403 U+05B8 18 E HEBREW POINT QAMATS
404 U+05B9 19 F HEBREW POINT HOLAM
405 U+05B1 11 HEBREW POINT HATAF SEGOL
406 U+0591 220 HEBREW ACCENT ETNAHTA
407 U+05C3 0 HEBREW PUNCTUATION SOF PASUQ
408 U+05B0 10 HEBREW POINT SHEVA
409 U+05AC 230 HEBREW ACCENT ILUY
410 U+059F 230 HEBREW ACCENT QARNEY PARA
412 U+05B1 11 HEBREW POINT HATAF SEGOL
413 U+05B8 18 HEBREW POINT QAMATS
414 U+05B9 19 HEBREW POINT HOLAM
415 U+0591 220 HEBREW ACCENT ETNAHTA
416 U+05C3 0 HEBREW PUNCTUATION SOF PASUQ
417 U+05B0 10 HEBREW POINT SHEVA
418 U+05AC 230 HEBREW ACCENT ILUY
419 U+059F 230 HEBREW ACCENT QARNEY PARA
422 U+05B8 18 HEBREW POINT QAMATS
423 U+05B1 11 HEBREW POINT HATAF SEGOL
424 U+05B9 19 HEBREW POINT HOLAM
425 U+0591 220 HEBREW ACCENT ETNAHTA
426 U+05C3 0 HEBREW PUNCTUATION SOF PASUQ
427 U+05B0 10 HEBREW POINT SHEVA
428 U+05AC 230 HEBREW ACCENT ILUY
429 U+059F 230 HEBREW ACCENT QARNEY PARA
433 >0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD
434 >Their output (supposedly from ICU):
435 >0592 05B0 05B7 05BC 05A5 05C0 05AD 05C4
436 >My output from charlint:
437 >05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4
439 0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD => 05B0 05B7 05BC 05A5 0592 05C0
442 U+0592 230 HEBREW ACCENT SEGOL
443 U+05B7 17 HEBREW POINT PATAH
444 U+05BC 21 HEBREW POINT DAGESH OR MAPIQ
445 U+05A5 220 HEBREW ACCENT MERKHA
446 U+05B0 10 HEBREW POINT SHEVA
447 U+05C0 0 HEBREW PUNCTUATION PASEQ
448 U+05C4 230 HEBREW MARK UPPER DOT
449 U+05AD 222 HEBREW ACCENT DEHI
451 U+05B0 10 HEBREW POINT SHEVA
452 U+05B7 17 HEBREW POINT PATAH
453 U+05BC 21 HEBREW POINT DAGESH OR MAPIQ
454 U+05A5 220 HEBREW ACCENT MERKHA
455 U+0592 230 HEBREW ACCENT SEGOL
456 U+05C0 0 HEBREW PUNCTUATION PASEQ
457 U+05AD 222 HEBREW ACCENT DEHI
458 U+05C4 230 HEBREW MARK UPPER DOT
461 U+0592 230 HEBREW ACCENT SEGOL
462 U+05B0 10 HEBREW POINT SHEVA
463 U+05B7 17 HEBREW POINT PATAH
464 U+05BC 21 HEBREW POINT DAGESH OR MAPIQ
465 U+05A5 220 HEBREW ACCENT MERKHA
466 U+05C0 0 HEBREW PUNCTUATION PASEQ
467 U+05AD 222 HEBREW ACCENT DEHI
468 U+05C4 230 HEBREW MARK UPPER DOT
// Row layout: [0] input, [1] expected output, [2] unused (empty).
470 UnicodeString data
[2][3];
471 data
[0][0] = str("\\u05B8\\u05B9\\u05B1\\u0591\\u05C3\\u05B0\\u05AC\\u059F");
472 data
[0][1] = str("\\u05B1\\u05B8\\u05B9\\u0591\\u05C3\\u05B0\\u05AC\\u059F");
473 data
[0][2] = str("");
474 data
[1][0] = str("\\u0592\\u05B7\\u05BC\\u05A5\\u05B0\\u05C0\\u05C4\\u05AD");
475 data
[1][1] = str("\\u05B0\\u05B7\\u05BC\\u05A5\\u0592\\u05C0\\u05AD\\u05C4");
476 data
[1][2] = str("");
478 staticTest(UNORM_NFD
, 0, data
, ARRAY_LENGTH(data
), 1);
479 staticTest(UNORM_NFC
, 0, data
, ARRAY_LENGTH(data
), 1);
482 //------------------------------------------------------------------------
483 // Internal utilities
// Formats a single UTF-16 code unit as hex by calling appendHex(ch, 4, result)
// on an empty string and returning what appendHex returns.
486 UnicodeString
BasicNormalizerTest::hex(UChar ch
) {
487 UnicodeString result
;
488 return appendHex(ch
, 4, result
);
// Formats every UTF-16 code unit of `s` with appendHex(..., 4, result),
// inserting a comma (U+002C) before each unit except the first.
491 UnicodeString
BasicNormalizerTest::hex(const UnicodeString
& s
) {
492 UnicodeString result
;
493 for (int i
= 0; i
< s
.length(); ++i
) {
494 if (i
!= 0) result
+= (UChar
)0x2c/*,*/;
495 appendHex(s
[i
], 4, result
);
// Inserts code point `ch` into `dest` at offset `pos`, implemented as a
// replace() of a zero-length range so nothing already in `dest` is removed.
501 inline static void insert(UnicodeString
& dest
, int pos
, UChar32 ch
)
503 dest
.replace(pos
, 0, ch
);
// Sets `input` as the text of `iter`, walks the iterator from first() with
// next() until DONE, then from last() with previous() until DONE (prepending
// each code point to `reverse`), and reports an error when the forward and
// backward strings differ. setText() uses the file-scope `status`.
506 void BasicNormalizerTest::backAndForth(Normalizer
* iter
, const UnicodeString
& input
)
509 iter
->setText(input
, status
);
511 // Run through the iterator forwards and stick it into a StringBuffer
512 UnicodeString forward
;
513 for (ch
= iter
->first(); ch
!= iter
->DONE
; ch
= iter
->next()) {
517 // Now do it backwards
518 UnicodeString reverse
;
519 for (ch
= iter
->last(); ch
!= iter
->DONE
; ch
= iter
->previous()) {
// Prepend, so that `reverse` ends up in logical (forward) order.
520 insert(reverse
, 0, ch
);
523 if (forward
!= reverse
) {
524 errln("Forward/reverse mismatch for input " + hex(input
)
525 + ", forward: " + hex(forward
) + ", backward: " + hex(reverse
));
// Table-driven check of the static Normalizer::normalize() function.
// For each of the `length` rows of `tests`, normalizes column 0 with the given
// `mode` and `options` and compares the output with column `outCol`;
// a mismatch is reported through dataerrln with the row index.
// The normalize() call uses the file-scope `status`.
529 void BasicNormalizerTest::staticTest(UNormalizationMode mode
, int options
,
530 UnicodeString tests
[][3], int length
,
533 for (int i
= 0; i
< length
; i
++)
535 UnicodeString
& input
= tests
[i
][0];
536 UnicodeString
& expect
= tests
[i
][outCol
];
538 logln("Normalizing '" + input
+ "' (" + hex(input
) + ")" );
540 UnicodeString output
;
541 Normalizer::normalize(input
, mode
, options
, output
, status
);
543 if (output
!= expect
) {
544 dataerrln(UnicodeString("ERROR: case ") + i
+ " normalized " + hex(input
) + "\n"
545 + " expected " + hex(expect
) + "\n"
546 + " static got " + hex(output
) );
// Table-driven check of the iterating Normalizer API.
// For each of the `length` rows of `tests`, sets column 0 as the text of
// `iter` and delegates to assertEqual() to compare the iterated output with
// column `outCol`. setText() uses the file-scope `status`.
551 void BasicNormalizerTest::iterateTest(Normalizer
* iter
,
552 UnicodeString tests
[][3], int length
,
555 for (int i
= 0; i
< length
; i
++)
557 UnicodeString
& input
= tests
[i
][0];
558 UnicodeString
& expect
= tests
[i
][outCol
];
560 logln("Normalizing '" + input
+ "' (" + hex(input
) + ")" );
562 iter
->setText(input
, status
);
563 assertEqual(input
, expect
, iter
, UnicodeString("ERROR: case ") + i
+ " ");
// Iterates `iter` from first() with next() until DONE and compares the
// accumulated `result` with `expected`. On mismatch, reports through
// dataerrln a message that starts with `errPrefix` and shows the input, the
// expected string and the iterated string in hex.
567 void BasicNormalizerTest::assertEqual(const UnicodeString
& input
,
568 const UnicodeString
& expected
,
570 const UnicodeString
& errPrefix
)
572 UnicodeString result
;
574 for (UChar32 ch
= iter
->first(); ch
!= iter
->DONE
; ch
= iter
->next()) {
577 if (result
!= expected
) {
578 dataerrln(errPrefix
+ "normalized " + hex(input
) + "\n"
579 + " expected " + hex(expected
) + "\n"
580 + " iterate got " + hex(result
) );
584 // helper class for TestPreviousNext()
585 // simple UTF-32 character iterator
// Reference iterator over an array of UChar32 code points, used by
// TestPreviousNext() as the expected-value side of the comparison.
586 class UChar32Iterator
{
// Stores the text pointer, its length and the starting index.
588 UChar32Iterator(const UChar32
*text
, int32_t len
, int32_t index
) :
589 s(text
), length(len
), i(index
) {}
// Drives a Normalizer over `src` in the given `mode` and, in lockstep, a
// UChar32Iterator over `expect`. Both start at their "middle" positions
// (srcMiddle / expectMiddle). The `moves` string is consumed one character at
// a time; after each move the code points from both iterators are compared,
// and the Normalizer's index is compared with
// expectIndex[<reference iterator index>]. `name` labels the error output.
624 BasicNormalizerTest::TestPreviousNext(const UChar
*src
, int32_t srcLength
,
625 const UChar32
*expect
, int32_t expectLength
,
626 const int32_t *expectIndex
, // its length=expectLength+1
627 int32_t srcMiddle
, int32_t expectMiddle
,
629 UNormalizationMode mode
,
632 Normalizer
iter(src
, srcLength
, mode
);
634 // test getStaticClassID and getDynamicClassID
635 if(iter
.getDynamicClassID() != Normalizer::getStaticClassID()) {
636 errln("getStaticClassID != getDynamicClassID for Normalizer.");
639 UChar32Iterator
iter32(expect
, expectLength
, expectMiddle
);
644 // initially set the indexes into the middle of the strings
645 iter
.setIndexOnly(srcMiddle
);
647 // move around and compare the iteration code points with
649 const char *move
=moves
;
650 while((m
=*move
++)!=0) {
653 c2
=iter32
.previous();
657 } else /* m=='+' */ {
664 // copy the moves until the current (m) move, and terminate
666 uprv_strcpy(history
, moves
);
667 history
[move
-moves
]=0;
// NOTE(review): %04lx expects an unsigned long argument; confirm the types of
// c1/c2 match (a 32-bit code point type would need a cast on LP64 platforms).
668 dataerrln("error: mismatch in Normalizer iteration (%s) at %s: "
669 "got c1=U+%04lx != expected c2=U+%04lx",
670 name
, history
, c1
, c2
);
675 if(iter
.getIndex()!=expectIndex
[iter32
.getIndex()]) {
676 // copy the moves until the current (m) move, and terminate
678 uprv_strcpy(history
, moves
);
679 history
[move
-moves
]=0;
// NOTE(review): %ld expects long; expectIndex[] holds int32_t values, so a cast
// may be needed where long is wider than 32 bits. Confirm.
680 errln("error: index mismatch in Normalizer iteration (%s) at %s: "
681 "Normalizer index %ld expected %ld\n",
682 name
, history
, iter
.getIndex(), expectIndex
[iter32
.getIndex()]);
// Runs the parameterized TestPreviousNext() four times: the basic data set in
// NFD and the j2911 regression data set (unpaired surrogates) in NFKC, each
// from two different pairs of starting indexes (SRC_MIDDLE/EXPECT_MIDDLE and
// SRC_MIDDLE_2/EXPECT_MIDDLE_2), all with the same `moves` script.
689 BasicNormalizerTest::TestPreviousNext() {
690 // src and expect strings
691 static const UChar src
[]={
692 U16_LEAD(0x2f999), U16_TRAIL(0x2f999),
693 U16_LEAD(0x1d15f), U16_TRAIL(0x1d15f),
697 static const UChar32 expect
[]={
704 // expected src indexes corresponding to expect indexes
705 static const int32_t expectIndex
[]={
710 6 // behind last character
713 // src and expect strings for regression test for j2911
714 static const UChar src_j2911
[]={
715 U16_LEAD(0x2f999), U16_TRAIL(0x2f999),
716 0xdd00, 0xd900, // unpaired surrogates - regression test for j2911
720 static const UChar32 expect_j2911
[]={
722 0xdd00, 0xd900, // unpaired surrogates - regression test for j2911
727 // expected src indexes corresponding to expect indexes
728 static const int32_t expectIndex_j2911
[]={
733 8 // behind last character
736 // initial indexes into the src and expect strings
737 // for both sets of test data
746 // - for previous(), 0 for current(), + for next()
747 // for both sets of test data
748 static const char *const moves
="0+0+0--0-0-+++0--+++++++0--------";
750 TestPreviousNext(src
, LENGTHOF(src
),
751 expect
, LENGTHOF(expect
),
753 SRC_MIDDLE
, EXPECT_MIDDLE
,
754 moves
, UNORM_NFD
, "basic");
756 TestPreviousNext(src_j2911
, LENGTHOF(src_j2911
),
757 expect_j2911
, LENGTHOF(expect_j2911
),
759 SRC_MIDDLE
, EXPECT_MIDDLE
,
760 moves
, UNORM_NFKC
, "j2911");
762 // try again from different "middle" indexes
763 TestPreviousNext(src
, LENGTHOF(src
),
764 expect
, LENGTHOF(expect
),
766 SRC_MIDDLE_2
, EXPECT_MIDDLE_2
,
767 moves
, UNORM_NFD
, "basic_2");
769 TestPreviousNext(src_j2911
, LENGTHOF(src_j2911
),
770 expect_j2911
, LENGTHOF(expect_j2911
),
772 SRC_MIDDLE_2
, EXPECT_MIDDLE_2
,
773 moves
, UNORM_NFKC
, "j2911_2");
// Tests Normalizer::concatenate().
// Part 1: for every row of `cases` (mode letter, left, right, expected),
// picks the normalization mode from the first character of the mode string
// ('C' NFC, 'D' NFD, 'c' NFKC, 'd' NFKD, anything else UNORM_NONE), unescapes
// the three strings, concatenates left and right, and reports a data error if
// the call failed or the result differs from the expected string.
// Part 2: error handling. A pre-set failure code must be left untouched and
// must yield a bogus result; a second call is expected to return
// U_ILLEGAL_ARGUMENT_ERROR with a bogus result.
776 void BasicNormalizerTest::TestConcatenate() {
777 static const char *const
779 /* mode, left, right, result */
792 /* ### TODO: add more interesting cases */
796 "\\u0C4D\\U000110BA\\U0001D169",
797 "\\u03B1\\U0001D169\\U000110BA\\u0C4D\\u0345"
801 UnicodeString left
, right
, expect
, result
, r
;
802 UErrorCode errorCode
;
803 UNormalizationMode mode
;
806 /* test concatenation */
807 for(i
=0; i
<(int32_t)(sizeof(cases
)/sizeof(cases
[0])); ++i
) {
// First character of the mode string selects the normalization mode.
808 switch(*cases
[i
][0]) {
809 case 'C': mode
=UNORM_NFC
; break;
810 case 'D': mode
=UNORM_NFD
; break;
811 case 'c': mode
=UNORM_NFKC
; break;
812 case 'd': mode
=UNORM_NFKD
; break;
813 default: mode
=UNORM_NONE
; break;
816 left
=UnicodeString(cases
[i
][1], "").unescape();
817 right
=UnicodeString(cases
[i
][2], "").unescape();
818 expect
=UnicodeString(cases
[i
][3], "").unescape();
820 //result=r=UnicodeString();
821 errorCode
=U_ZERO_ERROR
;
823 r
=Normalizer::concatenate(left
, right
, result
, mode
, 0, errorCode
);
824 if(U_FAILURE(errorCode
) || /*result!=r ||*/ result
!=expect
) {
825 dataerrln("error in Normalizer::concatenate(), cases[] fails with "+
826 UnicodeString(u_errorName(errorCode
))+", result==expect: expected: "+
827 hex(expect
)+" =========> got: " + hex(result
));
831 /* test error cases */
833 /* left.getBuffer()==result.getBuffer() */
834 result
=r
=expect
=UnicodeString("zz", "");
// Enter with a failure code already set: the call must not overwrite it.
835 errorCode
=U_UNEXPECTED_TOKEN
;
836 r
=Normalizer::concatenate(left
, right
, result
, mode
, 0, errorCode
);
837 if(errorCode
!=U_UNEXPECTED_TOKEN
|| result
!=r
|| !result
.isBogus()) {
838 errln("error in Normalizer::concatenate(), violates UErrorCode protocol");
842 errorCode
=U_ZERO_ERROR
;
843 r
=Normalizer::concatenate(left
, right
, result
, mode
, 0, errorCode
);
844 if(errorCode
!=U_ILLEGAL_ARGUMENT_ERROR
|| result
!=r
|| !result
.isBogus()) {
845 errln("error in Normalizer::concatenate(), does not detect left.isBogus()");
849 // reference implementation of Normalizer::compare
// Reference comparison built from simpler pieces.
// The normalization options are taken from the bits of `options` above
// UNORM_COMPARE_NORM_OPTIONS_SHIFT. When U_COMPARE_IGNORE_CASE is set, both
// inputs are canonically decomposed and case-folded. The intermediate strings
// are then canonically decomposed (again) into t1/t2, which are compared in
// code point order if U_COMPARE_CODE_POINT_ORDER is set, otherwise with
// UnicodeString::compare().
851 ref_norm_compare(const UnicodeString
&s1
, const UnicodeString
&s2
, uint32_t options
, UErrorCode
&errorCode
) {
852 UnicodeString r1
, r2
, t1
, t2
;
853 int32_t normOptions
=(int32_t)(options
>>UNORM_COMPARE_NORM_OPTIONS_SHIFT
);
855 if(options
&U_COMPARE_IGNORE_CASE
) {
856 Normalizer::decompose(s1
, FALSE
, normOptions
, r1
, errorCode
);
857 Normalizer::decompose(s2
, FALSE
, normOptions
, r2
, errorCode
);
859 r1
.foldCase(options
);
860 r2
.foldCase(options
);
// Decompose after folding as well, since case folding can change the
// normalization form of a string.
866 Normalizer::decompose(r1
, FALSE
, normOptions
, t1
, errorCode
);
867 Normalizer::decompose(r2
, FALSE
, normOptions
, t2
, errorCode
);
869 if(options
&U_COMPARE_CODE_POINT_ORDER
) {
870 return t1
.compareCodePointOrder(t2
);
872 return t1
.compare(t2
);
876 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately
// Wrapper around Normalizer::compare(): if quickCheck() reports UNORM_YES for
// the FCD form of both inputs (using the normalization options encoded in
// `options`), the UNORM_INPUT_IS_FCD flag is added before calling compare().
878 _norm_compare(const UnicodeString
&s1
, const UnicodeString
&s2
, uint32_t options
, UErrorCode
&errorCode
) {
879 int32_t normOptions
=(int32_t)(options
>>UNORM_COMPARE_NORM_OPTIONS_SHIFT
);
881 if( UNORM_YES
==Normalizer::quickCheck(s1
, UNORM_FCD
, normOptions
, errorCode
) &&
882 UNORM_YES
==Normalizer::quickCheck(s2
, UNORM_FCD
, normOptions
, errorCode
)) {
883 options
|=UNORM_INPUT_IS_FCD
;
886 return Normalizer::compare(s1
, s2
, options
, errorCode
);
889 // reference implementation of UnicodeString::caseCompare
// Reference comparison for UnicodeString::caseCompare(): case-folds two
// working strings t1/t2 with `options`, then compares them in code point
// order if U_COMPARE_CODE_POINT_ORDER is set, otherwise with
// UnicodeString::compare().
891 ref_case_compare(const UnicodeString
&s1
, const UnicodeString
&s2
, uint32_t options
) {
892 UnicodeString t1
, t2
;
897 t1
.foldCase(options
);
898 t2
.foldCase(options
);
900 if(options
&U_COMPARE_CODE_POINT_ORDER
) {
901 return t1
.compareCodePointOrder(t2
);
903 return t1
.compare(t2
);
907 // reduce an integer to -1/0/1
// Sign helper used when comparing results of two compare functions.
908 static inline int32_t
909 _sign(int32_t value
) {
// (value>>31) is 0 for non-negative values; OR-ing with 1 then gives 1.
// NOTE(review): for negative values this yields -1 only if >> is an arithmetic
// shift, which is implementation-defined for signed types before C++20.
913 return (value
>>31)|1;
// Chooses a result by the sign of `value`, with separate branches for
// negative, zero and positive values; used in TestCompare()'s error messages.
918 _signString(int32_t value
) {
921 } else if(value
==0) {
923 } else /* value>0 */ {
929 BasicNormalizerTest::TestCompare() {
930 // test Normalizer::compare and unorm_compare (thinly wrapped by the former)
931 // by comparing it with its semantic equivalent
932 // since we trust the pieces, this is sufficient
934 // test each string with itself and each other
935 // each time with all options
936 static const char *const
938 // some cases from NormalizationTest.txt
940 "D\\u031B\\u0307\\u0323",
941 "\\u1E0C\\u031B\\u0307",
942 "D\\u031B\\u0323\\u0307",
943 "d\\u031B\\u0323\\u0307",
950 // Angstrom sign = A ring
958 "a\\u059A\\u0316\\u302A\\u032Fb",
959 "a\\u302A\\u0316\\u032F\\u059Ab",
960 "a\\u302A\\u0316\\u032F\\u059Ab",
961 "A\\u059A\\u0316\\u302A\\u032Fb",
963 // from ICU case folding tests
965 "A\\u00df\\u00b5\\ufb03\\U0001040c\\u0131",
966 "ass\\u03bcffi\\U00010434i",
967 "\\u0061\\u0042\\u0131\\u03a3\\u00df\\ufb03\\ud93f\\udfff",
968 "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udfff",
969 "\\u0041\\u0062\\u0131\\u03c3\\u0053\\u0073\\u0066\\u0046\\u0069\\ud93f\\udfff",
970 "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udffd",
972 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold
973 // vs. U+10000 at bottom - code point order
975 "\\ud800\\ud800\\udc01",
978 // other code point order tests from ustrtest.cpp
981 "\\u20ac\\ud800\\udc00",
986 "\\uff61\\ud800\\udc02",
990 // long strings, see cnormtst.c/TestNormCoverage()
991 // equivalent if case-insensitive
993 "\\uAD8B\\uAD8B\\uAD8B\\uAD8B"
994 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
995 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
996 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
997 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
998 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
999 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
1000 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
1001 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"
1002 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
1003 "\\uAD8B\\uAD8B\\uAD8B\\uAD8B"
1004 "d\\u031B\\u0307\\u0323",
1006 "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa"
1007 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1008 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1009 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1010 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1011 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1012 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
1013 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
1014 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"
1015 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
1016 "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa"
1017 "\\u1E0C\\u031B\\u0307",
1019 // some strings that may make a difference whether the compare function
1020 // case-folds or decomposes first
1022 "\\u0360\\u0345\\u0334",
1023 "\\u0360\\u03b9\\u0334",
1025 "\\u0360\\u1f80\\u0334",
1026 "\\u0360\\u03b1\\u0313\\u03b9\\u0334",
1028 "\\u0360\\u1ffc\\u0334",
1029 "\\u0360\\u03c9\\u03b9\\u0334",
1031 "a\\u0360\\u0345\\u0360\\u0345b",
1032 "a\\u0345\\u0360\\u0345\\u0360b",
1034 // interesting cases for canonical caseless match with turkic i handling
1039 // strings with post-Unicode 3.2 normalization or normalization corrections
1041 "\\u00e4\\u193b\\U0002f868",
1042 "\\u0061\\u193b\\u0308\\u36fc",
1049 UnicodeString s
[100]; // at least as many items as in strings[] !
1051 // all combinations of options
1052 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions
1053 // set UNORM_UNICODE_3_2 in one additional combination
1054 static const struct {
1059 { U_COMPARE_CODE_POINT_ORDER
, "c.p. order" },
1060 { U_COMPARE_IGNORE_CASE
, "ignore case" },
1061 { U_COMPARE_CODE_POINT_ORDER
|U_COMPARE_IGNORE_CASE
, "c.p. order & ignore case" },
1062 { U_COMPARE_IGNORE_CASE
|U_FOLD_CASE_EXCLUDE_SPECIAL_I
, "ignore case & special i" },
1063 { U_COMPARE_CODE_POINT_ORDER
|U_COMPARE_IGNORE_CASE
|U_FOLD_CASE_EXCLUDE_SPECIAL_I
, "c.p. order & ignore case & special i" },
1064 { UNORM_UNICODE_3_2
<<UNORM_COMPARE_NORM_OPTIONS_SHIFT
, "Unicode 3.2" }
1067 int32_t i
, j
, k
, count
=LENGTHOF(strings
);
1068 int32_t result
, refResult
;
1070 UErrorCode errorCode
;
1072 // create the UnicodeStrings
1073 for(i
=0; i
<count
; ++i
) {
1074 s
[i
]=UnicodeString(strings
[i
], "").unescape();
1077 // test them each with each other
1078 for(i
=0; i
<count
; ++i
) {
1079 for(j
=i
; j
<count
; ++j
) {
1080 for(k
=0; k
<LENGTHOF(opt
); ++k
) {
1081 // test Normalizer::compare
1082 errorCode
=U_ZERO_ERROR
;
1083 result
=_norm_compare(s
[i
], s
[j
], opt
[k
].options
, errorCode
);
1084 refResult
=ref_norm_compare(s
[i
], s
[j
], opt
[k
].options
, errorCode
);
1085 if(_sign(result
)!=_sign(refResult
)) {
1086 errln("Normalizer::compare(%d, %d, %s)%s should be %s %s",
1087 i
, j
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1088 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1091 // test UnicodeString::caseCompare - same internal implementation function
1092 if(opt
[k
].options
&U_COMPARE_IGNORE_CASE
) {
1093 errorCode
=U_ZERO_ERROR
;
1094 result
=s
[i
].caseCompare(s
[j
], opt
[k
].options
);
1095 refResult
=ref_case_compare(s
[i
], s
[j
], opt
[k
].options
);
1096 if(_sign(result
)!=_sign(refResult
)) {
1097 errln("UniStr::caseCompare(%d, %d, %s)%s should be %s %s",
1098 i
, j
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1099 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1106 // test cases with i and I to make sure Turkic works
1107 static const UChar iI
[]={ 0x49, 0x69, 0x130, 0x131 };
1108 UnicodeSet iSet
, set
;
1110 UnicodeString s1
, s2
;
1112 const Normalizer2Impl
*nfcImpl
=Normalizer2Factory::getNFCImpl(errorCode
);
1113 if(U_FAILURE(errorCode
) || !nfcImpl
->ensureCanonIterData(errorCode
)) {
1114 dataerrln("Normalizer2Factory::getNFCImpl().ensureCanonIterData() failed: %s",
1115 u_errorName(errorCode
));
1119 // collect all sets into one for contiguous output
1120 for(i
=0; i
<LENGTHOF(iI
); ++i
) {
1121 if(nfcImpl
->getCanonStartSet(iI
[i
], iSet
)) {
1126 // test all of these precomposed characters
1127 const Normalizer2
*nfcNorm2
=Normalizer2Factory::getNFCInstance(errorCode
);
1128 UnicodeSetIterator
it(set
);
1129 while(it
.next() && !it
.isString()) {
1130 UChar32 c
=it
.getCodepoint();
1131 if(!nfcNorm2
->getDecomposition(c
, s2
)) {
1132 dataerrln("NFC.getDecomposition(i-composite U+%04lx) failed", (long)c
);
1137 for(k
=0; k
<LENGTHOF(opt
); ++k
) {
1138 // test Normalizer::compare
1139 errorCode
=U_ZERO_ERROR
;
1140 result
=_norm_compare(s1
, s2
, opt
[k
].options
, errorCode
);
1141 refResult
=ref_norm_compare(s1
, s2
, opt
[k
].options
, errorCode
);
1142 if(_sign(result
)!=_sign(refResult
)) {
1143 errln("Normalizer::compare(U+%04x with its NFD, %s)%s should be %s %s",
1144 c
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1145 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1148 // test UnicodeString::caseCompare - same internal implementation function
1149 if(opt
[k
].options
&U_COMPARE_IGNORE_CASE
) {
1150 errorCode
=U_ZERO_ERROR
;
1151 result
=s1
.caseCompare(s2
, opt
[k
].options
);
1152 refResult
=ref_case_compare(s1
, s2
, opt
[k
].options
);
1153 if(_sign(result
)!=_sign(refResult
)) {
1154 errln("UniStr::caseCompare(U+%04x with its NFD, %s)%s should be %s %s",
1155 c
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1156 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1162 // test getDecomposition() for some characters that do not decompose
1163 if( nfcNorm2
->getDecomposition(0x20, s2
) ||
1164 nfcNorm2
->getDecomposition(0x4e00, s2
) ||
1165 nfcNorm2
->getDecomposition(0x20002, s2
)
1167 errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions");
1170 // test getRawDecomposition() for some characters that do not decompose
1171 if( nfcNorm2
->getRawDecomposition(0x20, s2
) ||
1172 nfcNorm2
->getRawDecomposition(0x4e00, s2
) ||
1173 nfcNorm2
->getRawDecomposition(0x20002, s2
)
1175 errln("NFC.getRawDecomposition() returns TRUE for characters which do not have decompositions");
1178 // test composePair() for some pairs of characters that do not compose
1179 if( nfcNorm2
->composePair(0x20, 0x301)>=0 ||
1180 nfcNorm2
->composePair(0x61, 0x305)>=0 ||
1181 nfcNorm2
->composePair(0x1100, 0x1160)>=0 ||
1182 nfcNorm2
->composePair(0xac00, 0x11a7)>=0
1184 errln("NFC.composePair() incorrectly composes some pairs of characters");
1187 // test FilteredNormalizer2::getDecomposition()
1188 UnicodeSet
filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode
);
1189 FilteredNormalizer2
fn2(*nfcNorm2
, filter
);
1190 if( fn2
.getDecomposition(0xe4, s1
) || !fn2
.getDecomposition(0x100, s2
) ||
1191 s2
.length()!=2 || s2
[0]!=0x41 || s2
[1]!=0x304
1193 errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed");
1196 // test FilteredNormalizer2::getRawDecomposition()
1197 if( fn2
.getRawDecomposition(0xe4, s1
) || !fn2
.getRawDecomposition(0x100, s2
) ||
1198 s2
.length()!=2 || s2
[0]!=0x41 || s2
[1]!=0x304
1200 errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
1203 // test FilteredNormalizer2::composePair()
1204 if( 0x100!=fn2
.composePair(0x41, 0x304) ||
1205 fn2
.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
1207 errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
1211 // verify that case-folding does not un-FCD strings
// Scans all code points and reports those whose case folding (using
// foldingOptions) may turn an FCD string into a non-FCD one, or whose folding
// is no longer NFD although the code point itself is.
// NOTE(review): this view of the function omits some lines (return type, a few
// local declarations, the range-skip conditions, the counter updates and the
// closing braces); the comments below describe only the statements visible here.
1213 BasicNormalizerTest::countFoldFCDExceptions(uint32_t foldingOptions
) {
1214 UnicodeString s
, fold
, d
;
1217 uint8_t cc
, trailCC
, foldCC
, foldTrailCC
;
1218 UNormalizationCheckResult qcResult
;
1221 UErrorCode errorCode
;
1223 logln("Test if case folding may un-FCD a string (folding options %04lx)", foldingOptions
);
// Walk the entire Unicode code point range.
1226 for(c
=0; c
<=0x10ffff; ++c
) {
// Reset per code point so one failure does not poison later iterations.
1227 errorCode
= U_ZERO_ERROR
;
1228 category
=u_charType(c
);
1229 if(category
==U_UNASSIGNED
) {
1230 continue; // skip unassigned code points
1233 c
=0xd7a3; // skip Hangul - no case folding there
1236 // skip Han blocks - no case folding there either
1252 // get leading and trailing cc for c
// Canonical decomposition (compat=FALSE, options=0) of s into d.
1253 Normalizer::decompose(s
, FALSE
, 0, d
, errorCode
);
1255 cc
=u_getCombiningClass(d
.char32At(0));
1256 trailCC
=u_getCombiningClass(d
.char32At(d
.length()-1));
1258 // get leading and trailing cc for the case-folding of c
1259 s
.foldCase(foldingOptions
);
1260 Normalizer::decompose(s
, FALSE
, 0, d
, errorCode
);
1261 foldCC
=u_getCombiningClass(d
.char32At(0));
1262 foldTrailCC
=u_getCombiningClass(d
.char32At(d
.length()-1));
// FCD quick check of the folded string itself.
1264 qcResult
=Normalizer::quickCheck(s
, UNORM_FCD
, errorCode
);
1266 if (U_FAILURE(errorCode
)) {
// The format has two conversions (%04lx and %s), so the code point must be
// passed in addition to the error name; previously only the error name was
// supplied, which left %s without a matching argument.
1268 dataerrln("U+%04lx: Failed with error %s", (long)c, u_errorName(errorCode
));
1272 // - character maps to empty string: adjacent characters may then need reordering
1273 // - folding has different leading/trailing cc's, and they don't become just 0
1274 // - folding itself is not FCD
1275 if( qcResult
!=UNORM_YES
||
1277 (cc
!=foldCC
&& foldCC
!=0) || (trailCC
!=foldTrailCC
&& foldTrailCC
!=0)
1280 dataerrln("U+%04lx: case-folding may un-FCD a string (folding options %04lx)", c
, foldingOptions
);
1281 dataerrln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc
, trailCC
, d
.char32At(0), foldCC
, d
.char32At(d
.length()-1), foldTrailCC
, qcResult
);
1286 // if a code point is in NFD but its case folding is not, then
1287 // unorm_compare will also fail
1288 if(isNFD
&& UNORM_YES
!=Normalizer::quickCheck(s
, UNORM_NFD
, errorCode
)) {
1290 errln("U+%04lx: case-folding un-NFDs this character (folding options %04lx)", c
, foldingOptions
);
// Summary line; `count` is presumably incremented on each reported exception
// (the increments are not visible in this view).
1294 logln("There are %ld code points for which case-folding may un-FCD a string (folding options %04lx)", count
, foldingOptions
);
// Test entry point: runs countFoldFCDExceptions() once with default folding
// (options 0) and once with U_FOLD_CASE_EXCLUDE_SPECIAL_I, adds up the two
// results, and reports a data error that includes the total.
// NOTE(review): the declaration of `count`, the condition guarding the final
// dataerrln(), and the delimiters of the block comment are not visible in this
// view - confirm against the full file.
1299 BasicNormalizerTest::FindFoldFCDExceptions() {
1302 count
=countFoldFCDExceptions(0);
1303 count
+=countFoldFCDExceptions(U_FOLD_CASE_EXCLUDE_SPECIAL_I
);
1306 * If case-folding un-FCDs any strings, then unorm_compare() must be
1308 * It currently assumes that one can check for FCD then case-fold
1309 * and then still have FCD strings for raw decomposition without reordering.
1311 dataerrln("error: There are %ld code points for which case-folding may un-FCD a string for all folding options.\n"
1312 "See comment in BasicNormalizerTest::FindFoldFCDExceptions()!", count
);
// Builds the expected "skippable" sets for NFD, NFC, NFKD and NFKC into
// skipSets[], indexed by UNormalizationMode.
// Step 1: seed each set from a UnicodeSet property pattern
//         (quick-check=Yes and ccc=0; the composing forms also exclude HST=LV).
// Step 2: remove from the NFC and NFKC sets every character c for which
//         c followed by some NFC_QC=Maybe character is not NFC-normalized.
// errorCode is passed through to every ICU call; it is not checked in the
// statements visible here.
// NOTE(review): some lines (return type, the iterator advance in the first
// loop, construction of `s` and the append of c2, closing braces) are not
// visible in this view.
1317 initExpectedSkippables(UnicodeSet skipSets
[UNORM_MODE_COUNT
], UErrorCode
&errorCode
) {
1318 skipSets
[UNORM_NFD
].applyPattern(
1319 UNICODE_STRING_SIMPLE("[[:NFD_QC=Yes:]&[:ccc=0:]]"), errorCode
);
1320 skipSets
[UNORM_NFC
].applyPattern(
1321 UNICODE_STRING_SIMPLE("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode
);
1322 skipSets
[UNORM_NFKD
].applyPattern(
1323 UNICODE_STRING_SIMPLE("[[:NFKD_QC=Yes:]&[:ccc=0:]]"), errorCode
);
1324 skipSets
[UNORM_NFKC
].applyPattern(
1325 UNICODE_STRING_SIMPLE("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode
);
1327 // Remove from the NFC and NFKC sets all those characters that change
1328 // when a back-combining character is added.
1329 // First, get all of the back-combining characters and their combining classes.
1330 UnicodeSet
combineBack("[:NFC_QC=Maybe:]", errorCode
);
1331 int32_t numCombineBack
=combineBack
.size();
// Flat array of pairs: [2*i] holds the code point, [2*i+1] its combining class.
// Released by the delete[] at the end of the function.
1332 int32_t *combineBackCharsAndCc
=new int32_t[numCombineBack
*2];
1333 UnicodeSetIterator
iter(combineBack
);
1334 for(int32_t i
=0; i
<numCombineBack
; ++i
) {
1336 UChar32 c
=iter
.getCodepoint();
1337 combineBackCharsAndCc
[2*i
]=c
;
1338 combineBackCharsAndCc
[2*i
+1]=u_getCombiningClass(c
);
1341 // We need not look at control codes, Han characters nor Hangul LVT syllables because they
1342 // do not combine forward. LV syllables are already removed.
1343 UnicodeSet
notInteresting("[[:C:][:Unified_Ideograph:][:HST=LVT:]]", errorCode
);
// `unsure` owns a clone of the NFC set with the uninteresting characters
// removed; the LocalPointer deletes the clone when it goes out of scope.
1344 LocalPointer
<UnicodeSet
> unsure(&((UnicodeSet
*)(skipSets
[UNORM_NFC
].clone()))->removeAll(notInteresting
));
1345 // System.out.format("unsure.size()=%d\n", unsure.size());
1347 // For each character about which we are unsure, see if it changes when we add
1348 // one of the back-combining characters.
1349 const Normalizer2
*norm2
=Normalizer2::getNFCInstance(errorCode
);
1351 iter
.reset(*unsure
);
1352 while(iter
.next()) {
1353 UChar32 c
=iter
.getCodepoint();
// Remember the length of s before any back-combining character is appended,
// so that s can be truncated back to it after each trial.
1355 int32_t cLength
=s
.length();
1356 int32_t tccc
=u_getIntPropertyValue(c
, UCHAR_TRAIL_CANONICAL_COMBINING_CLASS
);
1357 for(int32_t i
=0; i
<numCombineBack
; ++i
) {
1358 // If c's decomposition ends with a character with non-zero combining class, then
1359 // c can only change if it combines with a character with a non-zero combining class.
1360 int32_t cc2
=combineBackCharsAndCc
[2*i
+1];
1361 if(tccc
==0 || cc2
!=0) {
1362 UChar32 c2
=combineBackCharsAndCc
[2*i
];
// Not normalized means adding the back-combining character changed c,
// so c is dropped from both composing sets.
1364 if(!norm2
->isNormalized(s
, errorCode
)) {
1365 // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2);
1366 skipSets
[UNORM_NFC
].remove(c
);
1367 skipSets
[UNORM_NFKC
].remove(c
);
// Restore s to its length before the trial.
1370 s
.truncate(cLength
);
1374 delete [] combineBackCharsAndCc
;
// Compares the runtime NF*_Inert property sets (skipSets) with the sets built
// by initExpectedSkippables() (expectSets), one normalization mode at a time.
// On a mismatch it logs an error and builds a message in `s` containing both
// set differences as patterns.
// NOTE(review): some lines (return type, early return after the data failure,
// appending `pattern` to `s`, the final errln of `s`, closing braces) are not
// visible in this view.
1378 BasicNormalizerTest::TestSkippable() {
1379 UnicodeSet diff
, skipSets
[UNORM_MODE_COUNT
], expectSets
[UNORM_MODE_COUNT
];
1380 UnicodeString s
, pattern
;
1382 /* build NF*Skippable sets from runtime data */
1383 IcuTestErrorCode
errorCode(*this, "TestSkippable");
1384 skipSets
[UNORM_NFD
].applyPattern(UNICODE_STRING_SIMPLE("[:NFD_Inert:]"), errorCode
);
1385 skipSets
[UNORM_NFKD
].applyPattern(UNICODE_STRING_SIMPLE("[:NFKD_Inert:]"), errorCode
);
1386 skipSets
[UNORM_NFC
].applyPattern(UNICODE_STRING_SIMPLE("[:NFC_Inert:]"), errorCode
);
1387 skipSets
[UNORM_NFKC
].applyPattern(UNICODE_STRING_SIMPLE("[:NFKC_Inert:]"), errorCode
);
// A failure here is logged as a data error and the error code is reset.
1388 if(errorCode
.logDataIfFailureAndReset("UnicodeSet(NF..._Inert) failed")) {
1392 /* get expected sets from hardcoded patterns */
1393 initExpectedSkippables(expectSets
, errorCode
);
1394 errorCode
.assertSuccess();
// Iterates from UNORM_NONE up to, but not including, UNORM_MODE_COUNT. Only the
// NFD/NFKD/NFC/NFKC entries are filled in above.
1396 for(int32_t i
=UNORM_NONE
; i
<UNORM_MODE_COUNT
; ++i
) {
1397 if(skipSets
[i
]!=expectSets
[i
]) {
1398 errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n", i
, i
);
1399 // Note: This used to depend on hardcoded UnicodeSet patterns generated by
1400 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
1401 // running com.ibm.text.UCD.Main with the option NFSkippable.
1402 // Since ICU 4.6/Unicode 6, we are generating the
1403 // expectSets ourselves in initExpectedSkippables().
1405 s
=UNICODE_STRING_SIMPLE("skip-expect=");
// diff = characters in the runtime set but not in the expected set.
1406 (diff
=skipSets
[i
]).removeAll(expectSets
[i
]).toPattern(pattern
, TRUE
);
1410 s
.append(UNICODE_STRING_SIMPLE("\n\nexpect-skip="));
// diff = characters in the expected set but not in the runtime set.
1411 (diff
=expectSets
[i
]).removeAll(skipSets
[i
]).toPattern(pattern
, TRUE
);
1413 s
.append(UNICODE_STRING_SIMPLE("\n\n"));
// One test case for the custom-normalizer tests below: `input` is the string
// to normalize and `expected` is the required result. Both are char* strings
// with backslash-u escapes; the tests unescape them before use.
1420 struct StringPair
{ const char *input
, *expected
; };
// Loads the "testnorm" data from the test data bundle as a UNORM2_COMPOSE
// Normalizer2 and checks that each input in pairs[] normalizes to its
// expected string.
// NOTE(review): some lines (return type, closing of the pairs[] initializer,
// early return after the load failure, closing braces) are not visible in
// this view.
1423 BasicNormalizerTest::TestCustomComp() {
1424 static const StringPair pairs
[]={
1425 { "\\uD801\\uE000\\uDFFE", "" },
1426 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
1427 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
1428 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" },
1429 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
1430 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
1431 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
1432 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
1434 IcuTestErrorCode
errorCode(*this, "BasicNormalizerTest/TestCustomComp");
1435 const Normalizer2
*customNorm2
=
1436 Normalizer2::getInstance(loadTestData(errorCode
), "testnorm",
1437 UNORM2_COMPOSE
, errorCode
);
// Missing test data is logged as a data error and the error code is reset.
1438 if(errorCode
.logDataIfFailureAndReset("unable to load testdata/testnorm.nrm")) {
1441 for(int32_t i
=0; i
<LENGTHOF(pairs
); ++i
) {
1442 const StringPair
&pair
=pairs
[i
];
// Convert the invariant-character literals and resolve their escapes.
1443 UnicodeString input
=UnicodeString(pair
.input
, -1, US_INV
).unescape();
1444 UnicodeString expected
=UnicodeString(pair
.expected
, -1, US_INV
).unescape();
1445 UnicodeString result
=customNorm2
->normalize(input
, errorCode
);
1446 if(result
!=expected
) {
1447 errln("custom compose Normalizer2 did not normalize input %d as expected", i
);
// Same check as TestCustomComp, but the "testnorm" data is loaded in
// UNORM2_COMPOSE_CONTIGUOUS mode; the fourth pair has a different expected
// result for that reason.
// NOTE(review): some lines (return type, closing of the pairs[] initializer,
// early return after the load failure, closing braces) are not visible in
// this view.
1453 BasicNormalizerTest::TestCustomFCC() {
1454 static const StringPair pairs
[]={
1455 { "\\uD801\\uE000\\uDFFE", "" },
1456 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
1457 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
1458 // The following expected result is different from CustomComp
1459 // because of only-contiguous composition.
1460 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" },
1461 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
1462 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
1463 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
1464 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
1466 IcuTestErrorCode
errorCode(*this, "BasicNormalizerTest/TestCustomFCC");
1467 const Normalizer2
*customNorm2
=
1468 Normalizer2::getInstance(loadTestData(errorCode
), "testnorm",
1469 UNORM2_COMPOSE_CONTIGUOUS
, errorCode
);
// Missing test data is logged as a data error and the error code is reset.
1470 if(errorCode
.logDataIfFailureAndReset("unable to load testdata/testnorm.nrm")) {
1473 for(int32_t i
=0; i
<LENGTHOF(pairs
); ++i
) {
1474 const StringPair
&pair
=pairs
[i
];
// Convert the invariant-character literals and resolve their escapes.
1475 UnicodeString input
=UnicodeString(pair
.input
, -1, US_INV
).unescape();
1476 UnicodeString expected
=UnicodeString(pair
.expected
, -1, US_INV
).unescape();
1477 UnicodeString result
=customNorm2
->normalize(input
, errorCode
);
1478 if(result
!=expected
) {
1479 errln("custom FCC Normalizer2 did not normalize input %d as expected", i
);
1484 /* Improve code coverage of Normalizer2 */
// Exercises FilteredNormalizer2 wrapped around the NFC instance with a filter
// that excludes U+00A0..U+00FF and U+0310..U+031F: isInert(),
// hasBoundaryAfter(), getCombiningClass() for U+0000..U+03FF, and append().
// NOTE(review): some lines (return type, early return after the load failure,
// declaration of `c`, the errln( call head and hex(c) operand, closing braces)
// are not visible in this view.
1486 BasicNormalizerTest::TestFilteredNormalizer2Coverage() {
1487 UErrorCode errorCode
= U_ZERO_ERROR
;
1488 const Normalizer2
*nfcNorm2
=Normalizer2Factory::getNFCInstance(errorCode
);
1489 if (U_FAILURE(errorCode
)) {
// Report the local errorCode that was just tested; the file-level `status`
// variable used here before is unrelated to this call.
1490 dataerrln("Normalizer2Factory::getNFCInstance() call failed - %s", u_errorName(errorCode
));
1493 UnicodeSet
filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff\\u0310-\\u031f]"), errorCode
);
1494 FilteredNormalizer2
fn2(*nfcNorm2
, filter
);
1496 UChar32 char32
= 0x0054;
// U+0054 is inside the filter; the test treats isInert()==TRUE as a failure.
1498 if (fn2
.isInert(char32
)) {
1499 errln("FilteredNormalizer2.isInert() failed.");
// Likewise hasBoundaryAfter()==TRUE is treated as a failure.
1502 if (fn2
.hasBoundaryAfter(char32
)) {
1503 errln("FilteredNormalizer2.hasBoundaryAfter() failed.");
1507 for(c
=0; c
<=0x3ff; ++c
) {
// Expected: the NFC combining class when c is in the filter, otherwise 0.
1508 uint8_t expectedCC
= filter
.contains(c
) ? nfcNorm2
->getCombiningClass(c
) : 0;
1509 uint8_t cc
=fn2
.getCombiningClass(c
);
1510 if(cc
!=expectedCC
) {
1512 UnicodeString("FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+")+
1514 ")==filtered NFC.getCC()");
// The two strings are passed to append() as plain text; only the resulting
// error code is checked.
1518 UnicodeString newString1
= UNICODE_STRING_SIMPLE("[^\\u0100-\\u01ff]");
1519 UnicodeString newString2
= UNICODE_STRING_SIMPLE("[^\\u0200-\\u02ff]");
1520 fn2
.append(newString1
, newString2
, errorCode
);
1521 if (U_FAILURE(errorCode
)) {
1522 errln("FilteredNormalizer2.append() failed.");
1526 #endif /* #if !UCONFIG_NO_NORMALIZATION */