1 /********************************************************************
3 * Copyright (c) 1997-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_NORMALIZATION
11 #include "unicode/uchar.h"
12 #include "unicode/errorcode.h"
13 #include "unicode/normlzr.h"
14 #include "unicode/uniset.h"
15 #include "unicode/usetiter.h"
16 #include "unicode/schriter.h"
17 #include "unicode/utf16.h"
19 #include "normalizer2impl.h"
22 #define ARRAY_LENGTH(array) UPRV_LENGTHOF(array)
24 #define CASE(id,test) case id: \
28 logln((UnicodeString)""); \
33 static UErrorCode status
= U_ZERO_ERROR
;
35 void BasicNormalizerTest::runIndexedTest(int32_t index
, UBool exec
,
36 const char* &name
, char* /*par*/) {
39 CASE(1,TestCompatDecomp
);
40 CASE(2,TestCanonCompose
);
41 CASE(3,TestCompatCompose
);
43 CASE(5,TestHangulDecomp
);
44 CASE(6,TestHangulCompose
);
46 CASE(8,TestCompositionExclusion
);
47 CASE(9,TestZeroIndex
);
48 CASE(10,TestVerisign
);
49 CASE(11,TestPreviousNext
);
50 CASE(12,TestNormalizerAPI
);
51 CASE(13,TestConcatenate
);
52 CASE(14,FindFoldFCDExceptions
);
54 CASE(16,TestSkippable
);
55 #if !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
56 CASE(17,TestCustomComp
);
57 CASE(18,TestCustomFCC
);
59 CASE(19,TestFilteredNormalizer2Coverage
);
60 default: name
= ""; break;
65 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
67 static UnicodeString
str(const char *input
)
69 UnicodeString
str(input
, ""); // Invariant conversion
70 return str
.unescape();
74 BasicNormalizerTest::BasicNormalizerTest()
77 // Input Decomposed Composed
79 canonTests
[0][0] = str("cat"); canonTests
[0][1] = str("cat"); canonTests
[0][2] = str("cat");
81 canonTests
[1][0] = str("\\u00e0ardvark"); canonTests
[1][1] = str("a\\u0300ardvark"); canonTests
[1][2] = str("\\u00e0ardvark");
83 canonTests
[2][0] = str("\\u1e0a"); canonTests
[2][1] = str("D\\u0307"); canonTests
[2][2] = str("\\u1e0a"); // D-dot_above
85 canonTests
[3][0] = str("D\\u0307"); canonTests
[3][1] = str("D\\u0307"); canonTests
[3][2] = str("\\u1e0a"); // D dot_above
87 canonTests
[4][0] = str("\\u1e0c\\u0307"); canonTests
[4][1] = str("D\\u0323\\u0307"); canonTests
[4][2] = str("\\u1e0c\\u0307"); // D-dot_below dot_above
89 canonTests
[5][0] = str("\\u1e0a\\u0323"); canonTests
[5][1] = str("D\\u0323\\u0307"); canonTests
[5][2] = str("\\u1e0c\\u0307"); // D-dot_above dot_below
91 canonTests
[6][0] = str("D\\u0307\\u0323"); canonTests
[6][1] = str("D\\u0323\\u0307"); canonTests
[6][2] = str("\\u1e0c\\u0307"); // D dot_below dot_above
93 canonTests
[7][0] = str("\\u1e10\\u0307\\u0323"); canonTests
[7][1] = str("D\\u0327\\u0323\\u0307"); canonTests
[7][2] = str("\\u1e10\\u0323\\u0307"); // D dot_below cedilla dot_above
95 canonTests
[8][0] = str("D\\u0307\\u0328\\u0323"); canonTests
[8][1] = str("D\\u0328\\u0323\\u0307"); canonTests
[8][2] = str("\\u1e0c\\u0328\\u0307"); // D dot_above ogonek dot_below
97 canonTests
[9][0] = str("\\u1E14"); canonTests
[9][1] = str("E\\u0304\\u0300"); canonTests
[9][2] = str("\\u1E14"); // E-macron-grave
99 canonTests
[10][0] = str("\\u0112\\u0300"); canonTests
[10][1] = str("E\\u0304\\u0300"); canonTests
[10][2] = str("\\u1E14"); // E-macron + grave
101 canonTests
[11][0] = str("\\u00c8\\u0304"); canonTests
[11][1] = str("E\\u0300\\u0304"); canonTests
[11][2] = str("\\u00c8\\u0304"); // E-grave + macron
103 canonTests
[12][0] = str("\\u212b"); canonTests
[12][1] = str("A\\u030a"); canonTests
[12][2] = str("\\u00c5"); // angstrom_sign
105 canonTests
[13][0] = str("\\u00c5"); canonTests
[13][1] = str("A\\u030a"); canonTests
[13][2] = str("\\u00c5"); // A-ring
107 canonTests
[14][0] = str("\\u00C4ffin"); canonTests
[14][1] = str("A\\u0308ffin"); canonTests
[14][2] = str("\\u00C4ffin");
109 canonTests
[15][0] = str("\\u00C4\\uFB03n"); canonTests
[15][1] = str("A\\u0308\\uFB03n"); canonTests
[15][2] = str("\\u00C4\\uFB03n");
111 canonTests
[16][0] = str("Henry IV"); canonTests
[16][1] = str("Henry IV"); canonTests
[16][2] = str("Henry IV");
113 canonTests
[17][0] = str("Henry \\u2163"); canonTests
[17][1] = str("Henry \\u2163"); canonTests
[17][2] = str("Henry \\u2163");
115 canonTests
[18][0] = str("\\u30AC"); canonTests
[18][1] = str("\\u30AB\\u3099"); canonTests
[18][2] = str("\\u30AC"); // ga (Katakana)
117 canonTests
[19][0] = str("\\u30AB\\u3099"); canonTests
[19][1] = str("\\u30AB\\u3099"); canonTests
[19][2] = str("\\u30AC"); // ka + ten
119 canonTests
[20][0] = str("\\uFF76\\uFF9E"); canonTests
[20][1] = str("\\uFF76\\uFF9E"); canonTests
[20][2] = str("\\uFF76\\uFF9E"); // hw_ka + hw_ten
121 canonTests
[21][0] = str("\\u30AB\\uFF9E"); canonTests
[21][1] = str("\\u30AB\\uFF9E"); canonTests
[21][2] = str("\\u30AB\\uFF9E"); // ka + hw_ten
123 canonTests
[22][0] = str("\\uFF76\\u3099"); canonTests
[22][1] = str("\\uFF76\\u3099"); canonTests
[22][2] = str("\\uFF76\\u3099"); // hw_ka + ten
125 canonTests
[23][0] = str("A\\u0300\\u0316"); canonTests
[23][1] = str("A\\u0316\\u0300"); canonTests
[23][2] = str("\\u00C0\\u0316");
128 // Input Decomposed Composed
129 compatTests
[0][0] = str("cat"); compatTests
[0][1] = str("cat"); compatTests
[0][2] = str("cat") ;
131 compatTests
[1][0] = str("\\uFB4f"); compatTests
[1][1] = str("\\u05D0\\u05DC"); compatTests
[1][2] = str("\\u05D0\\u05DC"); // Alef-Lamed vs. Alef, Lamed
133 compatTests
[2][0] = str("\\u00C4ffin"); compatTests
[2][1] = str("A\\u0308ffin"); compatTests
[2][2] = str("\\u00C4ffin") ;
135 compatTests
[3][0] = str("\\u00C4\\uFB03n"); compatTests
[3][1] = str("A\\u0308ffin"); compatTests
[3][2] = str("\\u00C4ffin") ; // ffi ligature -> f + f + i
137 compatTests
[4][0] = str("Henry IV"); compatTests
[4][1] = str("Henry IV"); compatTests
[4][2] = str("Henry IV") ;
139 compatTests
[5][0] = str("Henry \\u2163"); compatTests
[5][1] = str("Henry IV"); compatTests
[5][2] = str("Henry IV") ;
141 compatTests
[6][0] = str("\\u30AC"); compatTests
[6][1] = str("\\u30AB\\u3099"); compatTests
[6][2] = str("\\u30AC") ; // ga (Katakana)
143 compatTests
[7][0] = str("\\u30AB\\u3099"); compatTests
[7][1] = str("\\u30AB\\u3099"); compatTests
[7][2] = str("\\u30AC") ; // ka + ten
145 compatTests
[8][0] = str("\\uFF76\\u3099"); compatTests
[8][1] = str("\\u30AB\\u3099"); compatTests
[8][2] = str("\\u30AC") ; // hw_ka + ten
147 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later */
148 compatTests
[9][0] = str("\\uFF76\\uFF9E"); compatTests
[9][1] = str("\\u30AB\\u3099"); compatTests
[9][2] = str("\\u30AC") ; // hw_ka + hw_ten
150 compatTests
[10][0] = str("\\u30AB\\uFF9E"); compatTests
[10][1] = str("\\u30AB\\u3099"); compatTests
[10][2] = str("\\u30AC") ; // ka + hw_ten
152 /* Hangul Canonical */
153 // Input Decomposed Composed
154 hangulCanon
[0][0] = str("\\ud4db"); hangulCanon
[0][1] = str("\\u1111\\u1171\\u11b6"); hangulCanon
[0][2] = str("\\ud4db") ;
156 hangulCanon
[1][0] = str("\\u1111\\u1171\\u11b6"), hangulCanon
[1][1] = str("\\u1111\\u1171\\u11b6"), hangulCanon
[1][2] = str("\\ud4db");
159 BasicNormalizerTest::~BasicNormalizerTest()
163 void BasicNormalizerTest::TestPrevious()
165 Normalizer
* norm
= new Normalizer("", UNORM_NFD
);
167 logln("testing decomp...");
169 for (i
= 0; i
< ARRAY_LENGTH(canonTests
); i
++) {
170 backAndForth(norm
, canonTests
[i
][0]);
173 logln("testing compose...");
174 norm
->setMode(UNORM_NFC
);
175 for (i
= 0; i
< ARRAY_LENGTH(canonTests
); i
++) {
176 backAndForth(norm
, canonTests
[i
][0]);
182 void BasicNormalizerTest::TestDecomp()
184 Normalizer
* norm
= new Normalizer("", UNORM_NFD
);
185 iterateTest(norm
, canonTests
, ARRAY_LENGTH(canonTests
), 1);
186 staticTest(UNORM_NFD
, 0, canonTests
, ARRAY_LENGTH(canonTests
), 1);
190 void BasicNormalizerTest::TestCompatDecomp()
192 Normalizer
* norm
= new Normalizer("", UNORM_NFKD
);
193 iterateTest(norm
, compatTests
, ARRAY_LENGTH(compatTests
), 1);
195 staticTest(UNORM_NFKD
, 0,
196 compatTests
, ARRAY_LENGTH(compatTests
), 1);
200 void BasicNormalizerTest::TestCanonCompose()
202 Normalizer
* norm
= new Normalizer("", UNORM_NFC
);
203 iterateTest(norm
, canonTests
, ARRAY_LENGTH(canonTests
), 2);
205 staticTest(UNORM_NFC
, 0, canonTests
,
206 ARRAY_LENGTH(canonTests
), 2);
210 void BasicNormalizerTest::TestCompatCompose()
212 Normalizer
* norm
= new Normalizer("", UNORM_NFKC
);
213 iterateTest(norm
, compatTests
, ARRAY_LENGTH(compatTests
), 2);
215 staticTest(UNORM_NFKC
, 0,
216 compatTests
, ARRAY_LENGTH(compatTests
), 2);
221 //-------------------------------------------------------------------------------
223 void BasicNormalizerTest::TestHangulCompose()
225 // Make sure that the static composition methods work
226 logln("Canonical composition...");
227 staticTest(UNORM_NFC
, 0, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 2);
228 logln("Compatibility composition...");
230 // Now try iterative composition....
231 logln("Static composition...");
232 Normalizer
* norm
= new Normalizer("", UNORM_NFC
);
233 iterateTest(norm
, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 2);
234 norm
->setMode(UNORM_NFKC
);
236 // And finally, make sure you can do it in reverse too
237 logln("Reverse iteration...");
238 norm
->setMode(UNORM_NFC
);
239 for (uint32_t i
= 0; i
< ARRAY_LENGTH(hangulCanon
); i
++) {
240 backAndForth(norm
, hangulCanon
[i
][0]);
245 void BasicNormalizerTest::TestHangulDecomp()
247 // Make sure that the static decomposition methods work
248 logln("Canonical decomposition...");
249 staticTest(UNORM_NFD
, 0, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 1);
250 logln("Compatibility decomposition...");
252 // Now the iterative decomposition methods...
253 logln("Iterative decomposition...");
254 Normalizer
* norm
= new Normalizer("", UNORM_NFD
);
255 iterateTest(norm
, hangulCanon
, ARRAY_LENGTH(hangulCanon
), 1);
256 norm
->setMode(UNORM_NFKD
);
258 // And finally, make sure you can do it in reverse too
259 logln("Reverse iteration...");
260 norm
->setMode(UNORM_NFD
);
261 for (uint32_t i
= 0; i
< ARRAY_LENGTH(hangulCanon
); i
++) {
262 backAndForth(norm
, hangulCanon
[i
][0]);
268 * The Tibetan vowel sign AA, 0f71, was messed up prior to Unicode version 2.1.9.
270 void BasicNormalizerTest::TestTibetan(void) {
271 UnicodeString decomp
[1][3];
272 decomp
[0][0] = str("\\u0f77");
273 decomp
[0][1] = str("\\u0f77");
274 decomp
[0][2] = str("\\u0fb2\\u0f71\\u0f80");
276 UnicodeString compose
[1][3];
277 compose
[0][0] = str("\\u0fb2\\u0f71\\u0f80");
278 compose
[0][1] = str("\\u0fb2\\u0f71\\u0f80");
279 compose
[0][2] = str("\\u0fb2\\u0f71\\u0f80");
281 staticTest(UNORM_NFD
, 0, decomp
, ARRAY_LENGTH(decomp
), 1);
282 staticTest(UNORM_NFKD
, 0, decomp
, ARRAY_LENGTH(decomp
), 2);
283 staticTest(UNORM_NFC
, 0, compose
, ARRAY_LENGTH(compose
), 1);
284 staticTest(UNORM_NFKC
, 0, compose
, ARRAY_LENGTH(compose
), 2);
288 * Make sure characters in the CompositionExclusion.txt list do not get
291 void BasicNormalizerTest::TestCompositionExclusion(void) {
292 // This list is generated from CompositionExclusion.txt.
293 // Update whenever the normalizer tables are updated. Note
294 // that we test all characters listed, even those that can be
295 // derived from the Unicode DB and are therefore commented
297 // ### TODO read composition exclusion from source/data/unidata file
298 // and test against that
299 UnicodeString EXCLUDED
= str(
300 "\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958"
301 "\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC"
302 "\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E"
303 "\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69"
304 "\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2"
305 "\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79"
306 "\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u1FDB"
307 "\\u1FE3\\u1FEB\\u1FEE\\u1FEF\\u1FF9\\u1FFB\\u1FFD\\u2000"
308 "\\u2001\\u2126\\u212A\\u212B\\u2329\\u232A\\uF900\\uFA10"
309 "\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A\\uFB1F"
310 "\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E\\uFB2F\\uFB30\\uFB31"
311 "\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36\\uFB38\\uFB39\\uFB3A"
312 "\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
313 "\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B\\uFB4C\\uFB4D\\uFB4E"
315 for (int32_t i
=0; i
<EXCLUDED
.length(); ++i
) {
316 UnicodeString
a(EXCLUDED
.charAt(i
));
319 Normalizer::normalize(a
, UNORM_NFKD
, 0, b
, status
);
320 Normalizer::normalize(b
, UNORM_NFC
, 0, c
, status
);
322 errln("FAIL: " + hex(a
) + " x DECOMP_COMPAT => " +
323 hex(b
) + " x COMPOSE => " +
325 } else if (verbose
) {
326 logln("Ok: " + hex(a
) + " x DECOMP_COMPAT => " +
327 hex(b
) + " x COMPOSE => " +
334 * Test for a problem that showed up just before ICU 1.6 release
335 * having to do with combining characters with an index of zero.
336 * Such characters do not participate in any canonical
337 * decompositions. However, having an index of zero means that
338 * they all share one typeMask[] entry, that is, they all have to
339 * map to the same canonical class, which is not the case, in
342 void BasicNormalizerTest::TestZeroIndex(void) {
343 const char* DATA
[] = {
344 // Expect col1 x COMPOSE_COMPAT => col2
345 // Expect col2 x DECOMP => col3
346 "A\\u0316\\u0300", "\\u00C0\\u0316", "A\\u0316\\u0300",
347 "A\\u0300\\u0316", "\\u00C0\\u0316", "A\\u0316\\u0300",
348 "A\\u0327\\u0300", "\\u00C0\\u0327", "A\\u0327\\u0300",
349 "c\\u0321\\u0327", "c\\u0321\\u0327", "c\\u0321\\u0327",
350 "c\\u0327\\u0321", "\\u00E7\\u0321", "c\\u0327\\u0321",
352 int32_t DATA_length
= (int32_t)(sizeof(DATA
) / sizeof(DATA
[0]));
354 for (int32_t i
=0; i
<DATA_length
; i
+=3) {
355 UErrorCode status
= U_ZERO_ERROR
;
356 UnicodeString
a(DATA
[i
], "");
359 Normalizer::normalize(a
, UNORM_NFKC
, 0, b
, status
);
360 if (U_FAILURE(status
)) {
361 dataerrln("Error calling normalize UNORM_NFKC: %s", u_errorName(status
));
363 UnicodeString
exp(DATA
[i
+1], "");
364 exp
= exp
.unescape();
366 logln((UnicodeString
)"Ok: " + hex(a
) + " x COMPOSE_COMPAT => " + hex(b
));
368 errln((UnicodeString
)"FAIL: " + hex(a
) + " x COMPOSE_COMPAT => " + hex(b
) +
369 ", expect " + hex(exp
));
372 Normalizer::normalize(b
, UNORM_NFD
, 0, a
, status
);
373 if (U_FAILURE(status
)) {
374 dataerrln("Error calling normalize UNORM_NFD: %s", u_errorName(status
));
376 UnicodeString exp
= UnicodeString(DATA
[i
+2], "").unescape();
378 logln((UnicodeString
)"Ok: " + hex(b
) + " x DECOMP => " + hex(a
));
380 errln((UnicodeString
)"FAIL: " + hex(b
) + " x DECOMP => " + hex(a
) +
381 ", expect " + hex(exp
));
388 * Run a few specific cases that are failing for Verisign.
390 void BasicNormalizerTest::TestVerisign(void) {
393 > 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F
394 > Their output (supposedly from ICU):
395 > 05B8 05B1 05B9 0591 05C3 05B0 05AC 059F
396 > My output from charlint:
397 > 05B1 05B8 05B9 0591 05C3 05B0 05AC 059F
399 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F => 05B1 05B8 05B9 0591 05C3 05B0
402 U+05B8 18 E HEBREW POINT QAMATS
403 U+05B9 19 F HEBREW POINT HOLAM
404 U+05B1 11 HEBREW POINT HATAF SEGOL
405 U+0591 220 HEBREW ACCENT ETNAHTA
406 U+05C3 0 HEBREW PUNCTUATION SOF PASUQ
407 U+05B0 10 HEBREW POINT SHEVA
408 U+05AC 230 HEBREW ACCENT ILUY
409 U+059F 230 HEBREW ACCENT QARNEY PARA
411 U+05B1 11 HEBREW POINT HATAF SEGOL
412 U+05B8 18 HEBREW POINT QAMATS
413 U+05B9 19 HEBREW POINT HOLAM
414 U+0591 220 HEBREW ACCENT ETNAHTA
415 U+05C3 0 HEBREW PUNCTUATION SOF PASUQ
416 U+05B0 10 HEBREW POINT SHEVA
417 U+05AC 230 HEBREW ACCENT ILUY
418 U+059F 230 HEBREW ACCENT QARNEY PARA
421 U+05B8 18 HEBREW POINT QAMATS
422 U+05B1 11 HEBREW POINT HATAF SEGOL
423 U+05B9 19 HEBREW POINT HOLAM
424 U+0591 220 HEBREW ACCENT ETNAHTA
425 U+05C3 0 HEBREW PUNCTUATION SOF PASUQ
426 U+05B0 10 HEBREW POINT SHEVA
427 U+05AC 230 HEBREW ACCENT ILUY
428 U+059F 230 HEBREW ACCENT QARNEY PARA
432 >0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD
433 >Their output (supposedly from ICU):
434 >0592 05B0 05B7 05BC 05A5 05C0 05AD 05C4
435 >My output from charlint:
436 >05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4
438 0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD => 05B0 05B7 05BC 05A5 0592 05C0
441 U+0592 230 HEBREW ACCENT SEGOL
442 U+05B7 17 HEBREW POINT PATAH
443 U+05BC 21 HEBREW POINT DAGESH OR MAPIQ
444 U+05A5 220 HEBREW ACCENT MERKHA
445 U+05B0 10 HEBREW POINT SHEVA
446 U+05C0 0 HEBREW PUNCTUATION PASEQ
447 U+05C4 230 HEBREW MARK UPPER DOT
448 U+05AD 222 HEBREW ACCENT DEHI
450 U+05B0 10 HEBREW POINT SHEVA
451 U+05B7 17 HEBREW POINT PATAH
452 U+05BC 21 HEBREW POINT DAGESH OR MAPIQ
453 U+05A5 220 HEBREW ACCENT MERKHA
454 U+0592 230 HEBREW ACCENT SEGOL
455 U+05C0 0 HEBREW PUNCTUATION PASEQ
456 U+05AD 222 HEBREW ACCENT DEHI
457 U+05C4 230 HEBREW MARK UPPER DOT
460 U+0592 230 HEBREW ACCENT SEGOL
461 U+05B0 10 HEBREW POINT SHEVA
462 U+05B7 17 HEBREW POINT PATAH
463 U+05BC 21 HEBREW POINT DAGESH OR MAPIQ
464 U+05A5 220 HEBREW ACCENT MERKHA
465 U+05C0 0 HEBREW PUNCTUATION PASEQ
466 U+05AD 222 HEBREW ACCENT DEHI
467 U+05C4 230 HEBREW MARK UPPER DOT
469 UnicodeString data
[2][3];
470 data
[0][0] = str("\\u05B8\\u05B9\\u05B1\\u0591\\u05C3\\u05B0\\u05AC\\u059F");
471 data
[0][1] = str("\\u05B1\\u05B8\\u05B9\\u0591\\u05C3\\u05B0\\u05AC\\u059F");
472 data
[0][2] = str("");
473 data
[1][0] = str("\\u0592\\u05B7\\u05BC\\u05A5\\u05B0\\u05C0\\u05C4\\u05AD");
474 data
[1][1] = str("\\u05B0\\u05B7\\u05BC\\u05A5\\u0592\\u05C0\\u05AD\\u05C4");
475 data
[1][2] = str("");
477 staticTest(UNORM_NFD
, 0, data
, ARRAY_LENGTH(data
), 1);
478 staticTest(UNORM_NFC
, 0, data
, ARRAY_LENGTH(data
), 1);
481 //------------------------------------------------------------------------
482 // Internal utilities
485 UnicodeString
BasicNormalizerTest::hex(UChar ch
) {
486 UnicodeString result
;
487 return appendHex(ch
, 4, result
);
490 UnicodeString
BasicNormalizerTest::hex(const UnicodeString
& s
) {
491 UnicodeString result
;
492 for (int i
= 0; i
< s
.length(); ++i
) {
493 if (i
!= 0) result
+= (UChar
)0x2c/*,*/;
494 appendHex(s
[i
], 4, result
);
500 inline static void insert(UnicodeString
& dest
, int pos
, UChar32 ch
)
502 dest
.replace(pos
, 0, ch
);
505 void BasicNormalizerTest::backAndForth(Normalizer
* iter
, const UnicodeString
& input
)
508 iter
->setText(input
, status
);
510 // Run through the iterator forwards and stick it into a StringBuffer
511 UnicodeString forward
;
512 for (ch
= iter
->first(); ch
!= iter
->DONE
; ch
= iter
->next()) {
516 // Now do it backwards
517 UnicodeString reverse
;
518 for (ch
= iter
->last(); ch
!= iter
->DONE
; ch
= iter
->previous()) {
519 insert(reverse
, 0, ch
);
522 if (forward
!= reverse
) {
523 errln("Forward/reverse mismatch for input " + hex(input
)
524 + ", forward: " + hex(forward
) + ", backward: " + hex(reverse
));
528 void BasicNormalizerTest::staticTest(UNormalizationMode mode
, int options
,
529 UnicodeString tests
[][3], int length
,
532 for (int i
= 0; i
< length
; i
++)
534 UnicodeString
& input
= tests
[i
][0];
535 UnicodeString
& expect
= tests
[i
][outCol
];
537 logln("Normalizing '" + input
+ "' (" + hex(input
) + ")" );
539 UnicodeString output
;
540 Normalizer::normalize(input
, mode
, options
, output
, status
);
542 if (output
!= expect
) {
543 dataerrln(UnicodeString("ERROR: case ") + i
+ " normalized " + hex(input
) + "\n"
544 + " expected " + hex(expect
) + "\n"
545 + " static got " + hex(output
) );
550 void BasicNormalizerTest::iterateTest(Normalizer
* iter
,
551 UnicodeString tests
[][3], int length
,
554 for (int i
= 0; i
< length
; i
++)
556 UnicodeString
& input
= tests
[i
][0];
557 UnicodeString
& expect
= tests
[i
][outCol
];
559 logln("Normalizing '" + input
+ "' (" + hex(input
) + ")" );
561 iter
->setText(input
, status
);
562 assertEqual(input
, expect
, iter
, UnicodeString("ERROR: case ") + i
+ " ");
566 void BasicNormalizerTest::assertEqual(const UnicodeString
& input
,
567 const UnicodeString
& expected
,
569 const UnicodeString
& errPrefix
)
571 UnicodeString result
;
573 for (UChar32 ch
= iter
->first(); ch
!= iter
->DONE
; ch
= iter
->next()) {
576 if (result
!= expected
) {
577 dataerrln(errPrefix
+ "normalized " + hex(input
) + "\n"
578 + " expected " + hex(expected
) + "\n"
579 + " iterate got " + hex(result
) );
583 // helper class for TestPreviousNext()
584 // simple UTF-32 character iterator
585 class UChar32Iterator
{
587 UChar32Iterator(const UChar32
*text
, int32_t len
, int32_t index
) :
588 s(text
), length(len
), i(index
) {}
623 BasicNormalizerTest::TestPreviousNext(const UChar
*src
, int32_t srcLength
,
624 const UChar32
*expect
, int32_t expectLength
,
625 const int32_t *expectIndex
, // its length=expectLength+1
626 int32_t srcMiddle
, int32_t expectMiddle
,
628 UNormalizationMode mode
,
631 Normalizer
iter(src
, srcLength
, mode
);
633 // test getStaticClassID and getDynamicClassID
634 if(iter
.getDynamicClassID() != Normalizer::getStaticClassID()) {
635 errln("getStaticClassID != getDynamicClassID for Normalizer.");
638 UChar32Iterator
iter32(expect
, expectLength
, expectMiddle
);
643 // initially set the indexes into the middle of the strings
644 iter
.setIndexOnly(srcMiddle
);
646 // move around and compare the iteration code points with
648 const char *move
=moves
;
649 while((m
=*move
++)!=0) {
652 c2
=iter32
.previous();
656 } else /* m=='+' */ {
663 // copy the moves until the current (m) move, and terminate
665 uprv_strcpy(history
, moves
);
666 history
[move
-moves
]=0;
667 dataerrln("error: mismatch in Normalizer iteration (%s) at %s: "
668 "got c1=U+%04lx != expected c2=U+%04lx",
669 name
, history
, c1
, c2
);
674 if(iter
.getIndex()!=expectIndex
[iter32
.getIndex()]) {
675 // copy the moves until the current (m) move, and terminate
677 uprv_strcpy(history
, moves
);
678 history
[move
-moves
]=0;
679 errln("error: index mismatch in Normalizer iteration (%s) at %s: "
680 "Normalizer index %ld expected %ld\n",
681 name
, history
, iter
.getIndex(), expectIndex
[iter32
.getIndex()]);
688 BasicNormalizerTest::TestPreviousNext() {
689 // src and expect strings
690 static const UChar src
[]={
691 U16_LEAD(0x2f999), U16_TRAIL(0x2f999),
692 U16_LEAD(0x1d15f), U16_TRAIL(0x1d15f),
696 static const UChar32 expect
[]={
703 // expected src indexes corresponding to expect indexes
704 static const int32_t expectIndex
[]={
709 6 // behind last character
712 // src and expect strings for regression test for j2911
713 static const UChar src_j2911
[]={
714 U16_LEAD(0x2f999), U16_TRAIL(0x2f999),
715 0xdd00, 0xd900, // unpaired surrogates - regression test for j2911
719 static const UChar32 expect_j2911
[]={
721 0xdd00, 0xd900, // unpaired surrogates - regression test for j2911
726 // expected src indexes corresponding to expect indexes
727 static const int32_t expectIndex_j2911
[]={
732 8 // behind last character
735 // initial indexes into the src and expect strings
736 // for both sets of test data
745 // - for previous(), 0 for current(), + for next()
746 // for both sets of test data
747 static const char *const moves
="0+0+0--0-0-+++0--+++++++0--------";
749 TestPreviousNext(src
, UPRV_LENGTHOF(src
),
750 expect
, UPRV_LENGTHOF(expect
),
752 SRC_MIDDLE
, EXPECT_MIDDLE
,
753 moves
, UNORM_NFD
, "basic");
755 TestPreviousNext(src_j2911
, UPRV_LENGTHOF(src_j2911
),
756 expect_j2911
, UPRV_LENGTHOF(expect_j2911
),
758 SRC_MIDDLE
, EXPECT_MIDDLE
,
759 moves
, UNORM_NFKC
, "j2911");
761 // try again from different "middle" indexes
762 TestPreviousNext(src
, UPRV_LENGTHOF(src
),
763 expect
, UPRV_LENGTHOF(expect
),
765 SRC_MIDDLE_2
, EXPECT_MIDDLE_2
,
766 moves
, UNORM_NFD
, "basic_2");
768 TestPreviousNext(src_j2911
, UPRV_LENGTHOF(src_j2911
),
769 expect_j2911
, UPRV_LENGTHOF(expect_j2911
),
771 SRC_MIDDLE_2
, EXPECT_MIDDLE_2
,
772 moves
, UNORM_NFKC
, "j2911_2");
775 void BasicNormalizerTest::TestConcatenate() {
776 static const char *const
778 /* mode, left, right, result */
791 /* ### TODO: add more interesting cases */
795 "\\u0C4D\\U000110BA\\U0001D169",
796 "\\u03B1\\U0001D169\\U000110BA\\u0C4D\\u0345"
800 UnicodeString left
, right
, expect
, result
, r
;
801 UErrorCode errorCode
;
802 UNormalizationMode mode
;
805 /* test concatenation */
806 for(i
=0; i
<(int32_t)(sizeof(cases
)/sizeof(cases
[0])); ++i
) {
807 switch(*cases
[i
][0]) {
808 case 'C': mode
=UNORM_NFC
; break;
809 case 'D': mode
=UNORM_NFD
; break;
810 case 'c': mode
=UNORM_NFKC
; break;
811 case 'd': mode
=UNORM_NFKD
; break;
812 default: mode
=UNORM_NONE
; break;
815 left
=UnicodeString(cases
[i
][1], "").unescape();
816 right
=UnicodeString(cases
[i
][2], "").unescape();
817 expect
=UnicodeString(cases
[i
][3], "").unescape();
819 //result=r=UnicodeString();
820 errorCode
=U_ZERO_ERROR
;
822 r
=Normalizer::concatenate(left
, right
, result
, mode
, 0, errorCode
);
823 if(U_FAILURE(errorCode
) || /*result!=r ||*/ result
!=expect
) {
824 dataerrln("error in Normalizer::concatenate(), cases[] fails with "+
825 UnicodeString(u_errorName(errorCode
))+", result==expect: expected: "+
826 hex(expect
)+" =========> got: " + hex(result
));
830 /* test error cases */
832 /* left.getBuffer()==result.getBuffer() */
833 result
=r
=expect
=UnicodeString("zz", "");
834 errorCode
=U_UNEXPECTED_TOKEN
;
835 r
=Normalizer::concatenate(left
, right
, result
, mode
, 0, errorCode
);
836 if(errorCode
!=U_UNEXPECTED_TOKEN
|| result
!=r
|| !result
.isBogus()) {
837 errln("error in Normalizer::concatenate(), violates UErrorCode protocol");
841 errorCode
=U_ZERO_ERROR
;
842 r
=Normalizer::concatenate(left
, right
, result
, mode
, 0, errorCode
);
843 if(errorCode
!=U_ILLEGAL_ARGUMENT_ERROR
|| result
!=r
|| !result
.isBogus()) {
844 errln("error in Normalizer::concatenate(), does not detect left.isBogus()");
848 // reference implementation of Normalizer::compare
850 ref_norm_compare(const UnicodeString
&s1
, const UnicodeString
&s2
, uint32_t options
, UErrorCode
&errorCode
) {
851 UnicodeString r1
, r2
, t1
, t2
;
852 int32_t normOptions
=(int32_t)(options
>>UNORM_COMPARE_NORM_OPTIONS_SHIFT
);
854 if(options
&U_COMPARE_IGNORE_CASE
) {
855 Normalizer::decompose(s1
, FALSE
, normOptions
, r1
, errorCode
);
856 Normalizer::decompose(s2
, FALSE
, normOptions
, r2
, errorCode
);
858 r1
.foldCase(options
);
859 r2
.foldCase(options
);
865 Normalizer::decompose(r1
, FALSE
, normOptions
, t1
, errorCode
);
866 Normalizer::decompose(r2
, FALSE
, normOptions
, t2
, errorCode
);
868 if(options
&U_COMPARE_CODE_POINT_ORDER
) {
869 return t1
.compareCodePointOrder(t2
);
871 return t1
.compare(t2
);
875 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately
877 _norm_compare(const UnicodeString
&s1
, const UnicodeString
&s2
, uint32_t options
, UErrorCode
&errorCode
) {
878 int32_t normOptions
=(int32_t)(options
>>UNORM_COMPARE_NORM_OPTIONS_SHIFT
);
880 if( UNORM_YES
==Normalizer::quickCheck(s1
, UNORM_FCD
, normOptions
, errorCode
) &&
881 UNORM_YES
==Normalizer::quickCheck(s2
, UNORM_FCD
, normOptions
, errorCode
)) {
882 options
|=UNORM_INPUT_IS_FCD
;
885 return Normalizer::compare(s1
, s2
, options
, errorCode
);
888 // reference implementation of UnicodeString::caseCompare
890 ref_case_compare(const UnicodeString
&s1
, const UnicodeString
&s2
, uint32_t options
) {
891 UnicodeString t1
, t2
;
896 t1
.foldCase(options
);
897 t2
.foldCase(options
);
899 if(options
&U_COMPARE_CODE_POINT_ORDER
) {
900 return t1
.compareCodePointOrder(t2
);
902 return t1
.compare(t2
);
906 // reduce an integer to -1/0/1
907 static inline int32_t
908 _sign(int32_t value
) {
912 return (value
>>31)|1;
917 _signString(int32_t value
) {
920 } else if(value
==0) {
922 } else /* value>0 */ {
928 BasicNormalizerTest::TestCompare() {
929 // test Normalizer::compare and unorm_compare (thinly wrapped by the former)
930 // by comparing it with its semantic equivalent
931 // since we trust the pieces, this is sufficient
933 // test each string with itself and each other
934 // each time with all options
935 static const char *const
937 // some cases from NormalizationTest.txt
939 "D\\u031B\\u0307\\u0323",
940 "\\u1E0C\\u031B\\u0307",
941 "D\\u031B\\u0323\\u0307",
942 "d\\u031B\\u0323\\u0307",
949 // Angstrom sign = A ring
957 "a\\u059A\\u0316\\u302A\\u032Fb",
958 "a\\u302A\\u0316\\u032F\\u059Ab",
959 "a\\u302A\\u0316\\u032F\\u059Ab",
960 "A\\u059A\\u0316\\u302A\\u032Fb",
962 // from ICU case folding tests
964 "A\\u00df\\u00b5\\ufb03\\U0001040c\\u0131",
965 "ass\\u03bcffi\\U00010434i",
966 "\\u0061\\u0042\\u0131\\u03a3\\u00df\\ufb03\\ud93f\\udfff",
967 "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udfff",
968 "\\u0041\\u0062\\u0131\\u03c3\\u0053\\u0073\\u0066\\u0046\\u0069\\ud93f\\udfff",
969 "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udffd",
971 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold
972 // vs. U+10000 at bottom - code point order
974 "\\ud800\\ud800\\udc01",
977 // other code point order tests from ustrtest.cpp
980 "\\u20ac\\ud800\\udc00",
985 "\\uff61\\ud800\\udc02",
989 // long strings, see cnormtst.c/TestNormCoverage()
990 // equivalent if case-insensitive
992 "\\uAD8B\\uAD8B\\uAD8B\\uAD8B"
993 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
994 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
995 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
996 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
997 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
998 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
999 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
1000 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"
1001 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
1002 "\\uAD8B\\uAD8B\\uAD8B\\uAD8B"
1003 "d\\u031B\\u0307\\u0323",
1005 "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa"
1006 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1007 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1008 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1009 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1010 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1011 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
1012 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
1013 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"
1014 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
1015 "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa"
1016 "\\u1E0C\\u031B\\u0307",
1018 // some strings that may make a difference whether the compare function
1019 // case-folds or decomposes first
1021 "\\u0360\\u0345\\u0334",
1022 "\\u0360\\u03b9\\u0334",
1024 "\\u0360\\u1f80\\u0334",
1025 "\\u0360\\u03b1\\u0313\\u03b9\\u0334",
1027 "\\u0360\\u1ffc\\u0334",
1028 "\\u0360\\u03c9\\u03b9\\u0334",
1030 "a\\u0360\\u0345\\u0360\\u0345b",
1031 "a\\u0345\\u0360\\u0345\\u0360b",
1033 // interesting cases for canonical caseless match with turkic i handling
1038 // strings with post-Unicode 3.2 normalization or normalization corrections
1040 "\\u00e4\\u193b\\U0002f868",
1041 "\\u0061\\u193b\\u0308\\u36fc",
1048 UnicodeString s
[100]; // at least as many items as in strings[] !
1050 // all combinations of options
1051 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions
1052 // set UNORM_UNICODE_3_2 in one additional combination
1053 static const struct {
1058 { U_COMPARE_CODE_POINT_ORDER
, "c.p. order" },
1059 { U_COMPARE_IGNORE_CASE
, "ignore case" },
1060 { U_COMPARE_CODE_POINT_ORDER
|U_COMPARE_IGNORE_CASE
, "c.p. order & ignore case" },
1061 { U_COMPARE_IGNORE_CASE
|U_FOLD_CASE_EXCLUDE_SPECIAL_I
, "ignore case & special i" },
1062 { U_COMPARE_CODE_POINT_ORDER
|U_COMPARE_IGNORE_CASE
|U_FOLD_CASE_EXCLUDE_SPECIAL_I
, "c.p. order & ignore case & special i" },
1063 { UNORM_UNICODE_3_2
<<UNORM_COMPARE_NORM_OPTIONS_SHIFT
, "Unicode 3.2" }
1066 int32_t i
, j
, k
, count
=UPRV_LENGTHOF(strings
);
1067 int32_t result
, refResult
;
1069 UErrorCode errorCode
;
1071 // create the UnicodeStrings
1072 for(i
=0; i
<count
; ++i
) {
1073 s
[i
]=UnicodeString(strings
[i
], "").unescape();
1076 // test them each with each other
1077 for(i
=0; i
<count
; ++i
) {
1078 for(j
=i
; j
<count
; ++j
) {
1079 for(k
=0; k
<UPRV_LENGTHOF(opt
); ++k
) {
1080 // test Normalizer::compare
1081 errorCode
=U_ZERO_ERROR
;
1082 result
=_norm_compare(s
[i
], s
[j
], opt
[k
].options
, errorCode
);
1083 refResult
=ref_norm_compare(s
[i
], s
[j
], opt
[k
].options
, errorCode
);
1084 if(_sign(result
)!=_sign(refResult
)) {
1085 errln("Normalizer::compare(%d, %d, %s)%s should be %s %s",
1086 i
, j
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1087 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1090 // test UnicodeString::caseCompare - same internal implementation function
1091 if(opt
[k
].options
&U_COMPARE_IGNORE_CASE
) {
1092 errorCode
=U_ZERO_ERROR
;
1093 result
=s
[i
].caseCompare(s
[j
], opt
[k
].options
);
1094 refResult
=ref_case_compare(s
[i
], s
[j
], opt
[k
].options
);
1095 if(_sign(result
)!=_sign(refResult
)) {
1096 errln("UniStr::caseCompare(%d, %d, %s)%s should be %s %s",
1097 i
, j
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1098 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1105 // test cases with i and I to make sure Turkic works
1106 static const UChar iI
[]={ 0x49, 0x69, 0x130, 0x131 };
1107 UnicodeSet iSet
, set
;
1109 UnicodeString s1
, s2
;
1111 const Normalizer2Impl
*nfcImpl
=Normalizer2Factory::getNFCImpl(errorCode
);
1112 if(U_FAILURE(errorCode
) || !nfcImpl
->ensureCanonIterData(errorCode
)) {
1113 dataerrln("Normalizer2Factory::getNFCImpl().ensureCanonIterData() failed: %s",
1114 u_errorName(errorCode
));
1118 // collect all sets into one for contiguous output
1119 for(i
=0; i
<UPRV_LENGTHOF(iI
); ++i
) {
1120 if(nfcImpl
->getCanonStartSet(iI
[i
], iSet
)) {
1125 // test all of these precomposed characters
1126 const Normalizer2
*nfcNorm2
=Normalizer2::getNFCInstance(errorCode
);
1127 UnicodeSetIterator
it(set
);
1128 while(it
.next() && !it
.isString()) {
1129 UChar32 c
=it
.getCodepoint();
1130 if(!nfcNorm2
->getDecomposition(c
, s2
)) {
1131 dataerrln("NFC.getDecomposition(i-composite U+%04lx) failed", (long)c
);
1136 for(k
=0; k
<UPRV_LENGTHOF(opt
); ++k
) {
1137 // test Normalizer::compare
1138 errorCode
=U_ZERO_ERROR
;
1139 result
=_norm_compare(s1
, s2
, opt
[k
].options
, errorCode
);
1140 refResult
=ref_norm_compare(s1
, s2
, opt
[k
].options
, errorCode
);
1141 if(_sign(result
)!=_sign(refResult
)) {
1142 errln("Normalizer::compare(U+%04x with its NFD, %s)%s should be %s %s",
1143 c
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1144 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1147 // test UnicodeString::caseCompare - same internal implementation function
1148 if(opt
[k
].options
&U_COMPARE_IGNORE_CASE
) {
1149 errorCode
=U_ZERO_ERROR
;
1150 result
=s1
.caseCompare(s2
, opt
[k
].options
);
1151 refResult
=ref_case_compare(s1
, s2
, opt
[k
].options
);
1152 if(_sign(result
)!=_sign(refResult
)) {
1153 errln("UniStr::caseCompare(U+%04x with its NFD, %s)%s should be %s %s",
1154 c
, opt
[k
].name
, _signString(result
), _signString(refResult
),
1155 U_SUCCESS(errorCode
) ? "" : u_errorName(errorCode
));
1161 // test getDecomposition() for some characters that do not decompose
1162 if( nfcNorm2
->getDecomposition(0x20, s2
) ||
1163 nfcNorm2
->getDecomposition(0x4e00, s2
) ||
1164 nfcNorm2
->getDecomposition(0x20002, s2
)
1166 errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions");
1169 // test getRawDecomposition() for some characters that do not decompose
1170 if( nfcNorm2
->getRawDecomposition(0x20, s2
) ||
1171 nfcNorm2
->getRawDecomposition(0x4e00, s2
) ||
1172 nfcNorm2
->getRawDecomposition(0x20002, s2
)
1174 errln("NFC.getRawDecomposition() returns TRUE for characters which do not have decompositions");
1177 // test composePair() for some pairs of characters that do not compose
1178 if( nfcNorm2
->composePair(0x20, 0x301)>=0 ||
1179 nfcNorm2
->composePair(0x61, 0x305)>=0 ||
1180 nfcNorm2
->composePair(0x1100, 0x1160)>=0 ||
1181 nfcNorm2
->composePair(0xac00, 0x11a7)>=0
1183 errln("NFC.composePair() incorrectly composes some pairs of characters");
1186 // test FilteredNormalizer2::getDecomposition()
1187 UnicodeSet
filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode
);
1188 FilteredNormalizer2
fn2(*nfcNorm2
, filter
);
1189 if( fn2
.getDecomposition(0xe4, s1
) || !fn2
.getDecomposition(0x100, s2
) ||
1190 s2
.length()!=2 || s2
[0]!=0x41 || s2
[1]!=0x304
1192 errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed");
1195 // test FilteredNormalizer2::getRawDecomposition()
1196 if( fn2
.getRawDecomposition(0xe4, s1
) || !fn2
.getRawDecomposition(0x100, s2
) ||
1197 s2
.length()!=2 || s2
[0]!=0x41 || s2
[1]!=0x304
1199 errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
1202 // test FilteredNormalizer2::composePair()
1203 if( 0x100!=fn2
.composePair(0x41, 0x304) ||
1204 fn2
.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
1206 errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
1210 // verify that case-folding does not un-FCD strings
1212 BasicNormalizerTest::countFoldFCDExceptions(uint32_t foldingOptions
) {
1213 UnicodeString s
, fold
, d
;
1216 uint8_t cc
, trailCC
, foldCC
, foldTrailCC
;
1217 UNormalizationCheckResult qcResult
;
1220 UErrorCode errorCode
;
1222 logln("Test if case folding may un-FCD a string (folding options %04lx)", foldingOptions
);
1225 for(c
=0; c
<=0x10ffff; ++c
) {
1226 errorCode
= U_ZERO_ERROR
;
1227 category
=u_charType(c
);
1228 if(category
==U_UNASSIGNED
) {
1229 continue; // skip unassigned code points
1232 c
=0xd7a3; // skip Hangul - no case folding there
1235 // skip Han blocks - no case folding there either
1251 // get leading and trailing cc for c
1252 Normalizer::decompose(s
, FALSE
, 0, d
, errorCode
);
1254 cc
=u_getCombiningClass(d
.char32At(0));
1255 trailCC
=u_getCombiningClass(d
.char32At(d
.length()-1));
1257 // get leading and trailing cc for the case-folding of c
1258 s
.foldCase(foldingOptions
);
1259 Normalizer::decompose(s
, FALSE
, 0, d
, errorCode
);
1260 foldCC
=u_getCombiningClass(d
.char32At(0));
1261 foldTrailCC
=u_getCombiningClass(d
.char32At(d
.length()-1));
1263 qcResult
=Normalizer::quickCheck(s
, UNORM_FCD
, errorCode
);
1265 if (U_FAILURE(errorCode
)) {
1267 dataerrln("U+%04lx: Failed with error %s", u_errorName(errorCode
));
1271 // - character maps to empty string: adjacent characters may then need reordering
1272 // - folding has different leading/trailing cc's, and they don't become just 0
1273 // - folding itself is not FCD
1274 if( qcResult
!=UNORM_YES
||
1276 (cc
!=foldCC
&& foldCC
!=0) || (trailCC
!=foldTrailCC
&& foldTrailCC
!=0)
1279 dataerrln("U+%04lx: case-folding may un-FCD a string (folding options %04lx)", c
, foldingOptions
);
1280 dataerrln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc
, trailCC
, d
.char32At(0), foldCC
, d
.char32At(d
.length()-1), foldTrailCC
, qcResult
);
1285 // if a code point is in NFD but its case folding is not, then
1286 // unorm_compare will also fail
1287 if(isNFD
&& UNORM_YES
!=Normalizer::quickCheck(s
, UNORM_NFD
, errorCode
)) {
1289 errln("U+%04lx: case-folding un-NFDs this character (folding options %04lx)", c
, foldingOptions
);
1293 logln("There are %ld code points for which case-folding may un-FCD a string (folding options %04lx)", count
, foldingOptions
);
1298 BasicNormalizerTest::FindFoldFCDExceptions() {
1301 count
=countFoldFCDExceptions(0);
1302 count
+=countFoldFCDExceptions(U_FOLD_CASE_EXCLUDE_SPECIAL_I
);
1305 * If case-folding un-FCDs any strings, then unorm_compare() must be
1307 * It currently assumes that one can check for FCD then case-fold
1308 * and then still have FCD strings for raw decomposition without reordering.
1310 dataerrln("error: There are %ld code points for which case-folding may un-FCD a string for all folding options.\n"
1311 "See comment in BasicNormalizerTest::FindFoldFCDExceptions()!", count
);
1316 initExpectedSkippables(UnicodeSet skipSets
[UNORM_MODE_COUNT
], UErrorCode
&errorCode
) {
1317 skipSets
[UNORM_NFD
].applyPattern(
1318 UNICODE_STRING_SIMPLE("[[:NFD_QC=Yes:]&[:ccc=0:]]"), errorCode
);
1319 skipSets
[UNORM_NFC
].applyPattern(
1320 UNICODE_STRING_SIMPLE("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode
);
1321 skipSets
[UNORM_NFKD
].applyPattern(
1322 UNICODE_STRING_SIMPLE("[[:NFKD_QC=Yes:]&[:ccc=0:]]"), errorCode
);
1323 skipSets
[UNORM_NFKC
].applyPattern(
1324 UNICODE_STRING_SIMPLE("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode
);
1326 // Remove from the NFC and NFKC sets all those characters that change
1327 // when a back-combining character is added.
1328 // First, get all of the back-combining characters and their combining classes.
1329 UnicodeSet
combineBack("[:NFC_QC=Maybe:]", errorCode
);
1330 int32_t numCombineBack
=combineBack
.size();
1331 int32_t *combineBackCharsAndCc
=new int32_t[numCombineBack
*2];
1332 UnicodeSetIterator
iter(combineBack
);
1333 for(int32_t i
=0; i
<numCombineBack
; ++i
) {
1335 UChar32 c
=iter
.getCodepoint();
1336 combineBackCharsAndCc
[2*i
]=c
;
1337 combineBackCharsAndCc
[2*i
+1]=u_getCombiningClass(c
);
1340 // We need not look at control codes, Han characters nor Hangul LVT syllables because they
1341 // do not combine forward. LV syllables are already removed.
1342 UnicodeSet
notInteresting("[[:C:][:Unified_Ideograph:][:HST=LVT:]]", errorCode
);
1343 LocalPointer
<UnicodeSet
> unsure(&((UnicodeSet
*)(skipSets
[UNORM_NFC
].clone()))->removeAll(notInteresting
));
1344 // System.out.format("unsure.size()=%d\n", unsure.size());
1346 // For each character about which we are unsure, see if it changes when we add
1347 // one of the back-combining characters.
1348 const Normalizer2
*norm2
=Normalizer2::getNFCInstance(errorCode
);
1350 iter
.reset(*unsure
);
1351 while(iter
.next()) {
1352 UChar32 c
=iter
.getCodepoint();
1354 int32_t cLength
=s
.length();
1355 int32_t tccc
=u_getIntPropertyValue(c
, UCHAR_TRAIL_CANONICAL_COMBINING_CLASS
);
1356 for(int32_t i
=0; i
<numCombineBack
; ++i
) {
1357 // If c's decomposition ends with a character with non-zero combining class, then
1358 // c can only change if it combines with a character with a non-zero combining class.
1359 int32_t cc2
=combineBackCharsAndCc
[2*i
+1];
1360 if(tccc
==0 || cc2
!=0) {
1361 UChar32 c2
=combineBackCharsAndCc
[2*i
];
1363 if(!norm2
->isNormalized(s
, errorCode
)) {
1364 // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2);
1365 skipSets
[UNORM_NFC
].remove(c
);
1366 skipSets
[UNORM_NFKC
].remove(c
);
1369 s
.truncate(cLength
);
1373 delete [] combineBackCharsAndCc
;
1377 BasicNormalizerTest::TestSkippable() {
1378 UnicodeSet diff
, skipSets
[UNORM_MODE_COUNT
], expectSets
[UNORM_MODE_COUNT
];
1379 UnicodeString s
, pattern
;
1381 /* build NF*Skippable sets from runtime data */
1382 IcuTestErrorCode
errorCode(*this, "TestSkippable");
1383 skipSets
[UNORM_NFD
].applyPattern(UNICODE_STRING_SIMPLE("[:NFD_Inert:]"), errorCode
);
1384 skipSets
[UNORM_NFKD
].applyPattern(UNICODE_STRING_SIMPLE("[:NFKD_Inert:]"), errorCode
);
1385 skipSets
[UNORM_NFC
].applyPattern(UNICODE_STRING_SIMPLE("[:NFC_Inert:]"), errorCode
);
1386 skipSets
[UNORM_NFKC
].applyPattern(UNICODE_STRING_SIMPLE("[:NFKC_Inert:]"), errorCode
);
1387 if(errorCode
.logDataIfFailureAndReset("UnicodeSet(NF..._Inert) failed")) {
1391 /* get expected sets from hardcoded patterns */
1392 initExpectedSkippables(expectSets
, errorCode
);
1393 errorCode
.assertSuccess();
1395 for(int32_t i
=UNORM_NONE
; i
<UNORM_MODE_COUNT
; ++i
) {
1396 if(skipSets
[i
]!=expectSets
[i
]) {
1397 errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n", i
, i
);
1398 // Note: This used to depend on hardcoded UnicodeSet patterns generated by
1399 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
1400 // running com.ibm.text.UCD.Main with the option NFSkippable.
1401 // Since ICU 4.6/Unicode 6, we are generating the
1402 // expectSets ourselves in initSkippables().
1404 s
=UNICODE_STRING_SIMPLE("skip-expect=");
1405 (diff
=skipSets
[i
]).removeAll(expectSets
[i
]).toPattern(pattern
, TRUE
);
1409 s
.append(UNICODE_STRING_SIMPLE("\n\nexpect-skip="));
1410 (diff
=expectSets
[i
]).removeAll(skipSets
[i
]).toPattern(pattern
, TRUE
);
1412 s
.append(UNICODE_STRING_SIMPLE("\n\n"));
/** One table entry: an escaped input string and the escaped result expected for it. */
struct StringPair {
    const char *input;     // source text, with \u / \U escapes
    const char *expected;  // expected result, with \u / \U escapes
};
1422 BasicNormalizerTest::TestCustomComp() {
1423 static const StringPair pairs
[]={
1424 { "\\uD801\\uE000\\uDFFE", "" },
1425 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
1426 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
1427 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" },
1428 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
1429 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
1430 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
1431 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
1433 IcuTestErrorCode
errorCode(*this, "BasicNormalizerTest/TestCustomComp");
1434 const Normalizer2
*customNorm2
=
1435 Normalizer2::getInstance(loadTestData(errorCode
), "testnorm",
1436 UNORM2_COMPOSE
, errorCode
);
1437 if(errorCode
.logDataIfFailureAndReset("unable to load testdata/testnorm.nrm")) {
1440 for(int32_t i
=0; i
<UPRV_LENGTHOF(pairs
); ++i
) {
1441 const StringPair
&pair
=pairs
[i
];
1442 UnicodeString input
=UnicodeString(pair
.input
, -1, US_INV
).unescape();
1443 UnicodeString expected
=UnicodeString(pair
.expected
, -1, US_INV
).unescape();
1444 UnicodeString result
=customNorm2
->normalize(input
, errorCode
);
1445 if(result
!=expected
) {
1446 errln("custom compose Normalizer2 did not normalize input %d as expected", i
);
1452 BasicNormalizerTest::TestCustomFCC() {
1453 static const StringPair pairs
[]={
1454 { "\\uD801\\uE000\\uDFFE", "" },
1455 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
1456 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
1457 // The following expected result is different from CustomComp
1458 // because of only-contiguous composition.
1459 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" },
1460 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
1461 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
1462 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
1463 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
1465 IcuTestErrorCode
errorCode(*this, "BasicNormalizerTest/TestCustomFCC");
1466 const Normalizer2
*customNorm2
=
1467 Normalizer2::getInstance(loadTestData(errorCode
), "testnorm",
1468 UNORM2_COMPOSE_CONTIGUOUS
, errorCode
);
1469 if(errorCode
.logDataIfFailureAndReset("unable to load testdata/testnorm.nrm")) {
1472 for(int32_t i
=0; i
<UPRV_LENGTHOF(pairs
); ++i
) {
1473 const StringPair
&pair
=pairs
[i
];
1474 UnicodeString input
=UnicodeString(pair
.input
, -1, US_INV
).unescape();
1475 UnicodeString expected
=UnicodeString(pair
.expected
, -1, US_INV
).unescape();
1476 UnicodeString result
=customNorm2
->normalize(input
, errorCode
);
1477 if(result
!=expected
) {
1478 errln("custom FCC Normalizer2 did not normalize input %d as expected", i
);
1483 /* Improve code coverage of Normalizer2 */
1485 BasicNormalizerTest::TestFilteredNormalizer2Coverage() {
1486 UErrorCode errorCode
= U_ZERO_ERROR
;
1487 const Normalizer2
*nfcNorm2
=Normalizer2::getNFCInstance(errorCode
);
1488 if (U_FAILURE(errorCode
)) {
1489 dataerrln("Normalizer2::getNFCInstance() call failed - %s", u_errorName(status
));
1492 UnicodeSet
filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff\\u0310-\\u031f]"), errorCode
);
1493 FilteredNormalizer2
fn2(*nfcNorm2
, filter
);
1495 UChar32 char32
= 0x0054;
1497 if (fn2
.isInert(char32
)) {
1498 errln("FilteredNormalizer2.isInert() failed.");
1501 if (fn2
.hasBoundaryAfter(char32
)) {
1502 errln("FilteredNormalizer2.hasBoundaryAfter() failed.");
1506 for(c
=0; c
<=0x3ff; ++c
) {
1507 uint8_t expectedCC
= filter
.contains(c
) ? nfcNorm2
->getCombiningClass(c
) : 0;
1508 uint8_t cc
=fn2
.getCombiningClass(c
);
1509 if(cc
!=expectedCC
) {
1511 UnicodeString("FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+")+
1513 ")==filtered NFC.getCC()");
1517 UnicodeString newString1
= UNICODE_STRING_SIMPLE("[^\\u0100-\\u01ff]");
1518 UnicodeString newString2
= UNICODE_STRING_SIMPLE("[^\\u0200-\\u02ff]");
1519 fn2
.append(newString1
, newString2
, errorCode
);
1520 if (U_FAILURE(errorCode
)) {
1521 errln("FilteredNormalizer2.append() failed.");
1525 #endif /* #if !UCONFIG_NO_NORMALIZATION */