1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
2 # License & terms of use: http://www.unicode.org/copyright.html
3 # Copyright (c) 2012-2015 International Business Machines
4 # Corporation and others. All Rights Reserved.
6 # This file should be in UTF-8 with a signature byte sequence ("BOM").
8 # collationtest.txt: Collation test data.
10 # created on: 2012apr13
11 # created by: Markus W. Scherer
13 # A line with "** test: description" is used for verbose and error output.
15 # A collator can be set with "@ root" or "@ locale language-tag",
16 # for example "@ locale de-u-co-phonebk".
17 # An old-style locale ID can also be used, for example "@ locale de@collation=phonebook".
19 # A collator can be built with "@ rules".
20 # An "@ rules" line is followed by one or more lines with the tailoring rules.
22 # A collator can be modified with "% attribute=value".
24 # "* compare" tests the order (= or <) of the following strings.
25 # The relation can be "=" or "<" (the level of the difference is not specified)
26 # or "<1", "<2", "<c", "<3", "<4" (indicating the level of the difference).
28 # Test sections ("* compare") are terminated by
29 # definitions of new collators, changing attributes, or new test sections.
31 ** test: simple CEs & expansions
32 # Many types of mappings are tested elsewhere, including via the UCA conformance tests.
33 # Here we mostly cover a few unusual mappings.
35 &\x01 # most control codes are ignorable
36 <<<\u0300 # tertiary CE
37 &9<\x00 # NUL not ignorable
38 &\uA00A\uA00B=\uA002 # two long-primary CEs
39 &\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits
53 <1 か # Hiragana Ka (U+304B)
54 <2 か\u3099 # plus voiced sound mark
55 = が # Hiragana Ga (U+304C)
58 <1 \uA00A\uA00B\u00050004
59 <1 \uA00A\uA00B\u00050005
61 <1 \uA00A\uA00B\u00050006
64 # Create some interesting mappings, and map some normalization-inert characters
65 # (which are not subject to canonical reordering)
66 # to some of the same CEs to check the sequence of CEs.
69 # Contractions starting with 'a' should not continue with any character < U+0300
70 # so that we can test a shortcut for that.
73 &d<dz\u0301=ⓓ # d+z+acute
75 <a\u0301=Ⓐ # a+acute sorts after z
76 <a\u0301\u0301=Ⓑ # a+acute+acute
77 <a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right
79 <a\u0323=Ⓔ # a+dot below
80 <a\u0323\u0358=Ⓕ # a+dot below+dot above right
81 <a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring
82 <a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z
84 &\U0001D158=⁰ # musical notehead black (has a symbol primary)
85 <\U0001D158\U0001D165=¼ # musical quarter note
87 # deliberately missing prefix contractions:
94 <<<\U0001D165=¹ # musical stem (ccc=216)
95 <<<\U0001D16D=² # musical augmentation dot (ccc=226)
96 <<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226)
97 &\u0301=❶ # acute (ccc=230)
98 &\u030a=❷ # ring (ccc=230)
99 &\u0308=❸ # diaeresis (ccc=230)
100 <<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230)
101 &\u0327=❺ # cedilla (ccc=202)
102 &\u0323=❻ # dot below (ccc=220)
103 &\u0331=❼ # macron below (ccc=220)
104 <<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232)
105 &\u0334=❾ # tilde overlay (ccc=1)
106 &\u0358=❿ # dot above right (ccc=232)
108 &\u0f71=① # tibetan vowel sign aa
109 &\u0f72=② # tibetan vowel sign i
110 # \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73
111 &\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129)
113 ** test: simple contractions
115 # Some strings are chosen to cause incremental contiguous contraction matching to
116 # go into partial matches for prefixes of contractions
117 # (where the prefixes are deliberately not also contractions).
118 # When there is no complete match, then the matching code must back out of those
119 # so that discontiguous contractions work as specified.
122 # contraction starter with no following text, or mismatch, or blocked
134 <2 a\u0308\u030a # ring blocked by diaeresis
142 <2 \u0308\u030a\u0301 # acute blocked by ring
148 <1 \U0001D158\U0001D165
151 # no discontiguous contraction because of missing prefix contraction d+z,
152 # and a starter ('z') after the 'd'
157 # contiguous contractions
171 <1 a\u0301\u0301\u0358
177 <1 a\u0327\u0323\u030a # match despite missing prefix
183 <2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second
187 <1 \U0001D158\U0001D165
191 <3 \U0001D165\U0001D16D
194 ** test: discontiguous contractions
196 <1 a\u0327\u030a # a+ring skips cedilla
198 <2 a\u0327\u0327\u030a # a+ring skips 2 cedillas
200 <2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas
202 <2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas
204 <1 a\u0327\u0323 # a+dot below skips cedilla
206 <1 a\u0323\u0301\u0358 # a+dot below+dot above right: 2-char match, then skips acute
208 <2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay
212 <2 \u0331\u0331\u0358 # macron below+dot above right skips the second macron below
216 <1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron)
218 <1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla
220 <2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas
222 <2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla
224 <2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla
228 <1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla
230 <1 a\U0001D165\u0323 # a+dot below skips stem
233 # partial contiguous match, backs up, matches discontiguous contraction
239 # a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks
241 <1 a\u0327\u0301\u0301\u0358
246 <1 a\u0f73\u0301 # a+acute skips tibetan ii
249 # FCD but the 0f71 inside the 0f73 must be skipped
250 # to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73
252 <1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72
255 ** test: discontiguous contractions with nested contractions
257 <1 a\u0323\u0308\u0301\u0358
259 <2 a\u0323\u0308\u0301\u0308\u0301\u0358
262 ** test: discontiguous contractions with interleaved contractions
264 # a+ring & cedilla & macron below+dot above right
265 <1 a\u0327\u0331\u030a\u0358
268 # a+ring & 1x..3x macron below+dot above right
269 <2 a\u0331\u030a\u0358
271 <2 a\u0331\u0331\u030a\u0358\u0358
274 <2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358
277 # a+dot below & stem+augmentation dot, followed by contiguous d+z+acute
278 <1 a\U0001D165\u0323\U0001D16Ddz\u0301
281 ** test: some simple string comparisons
284 # first string compares against ""
291 ** test: compare with strength=primary
298 ** test: compare with strength=secondary
305 ** test: compare with strength=tertiary
312 ** test: compare with strength=quaternary
313 % strength=quaternary
319 ** test: compare with strength=identical
326 ** test: côté with forwards secondary
334 ** test: côté with forwards secondary vs. U+FFFE merge separator
335 # Merged sort keys: On each level, any difference in the first segment
336 # must trump any further difference.
343 ** test: côté with backwards secondary
351 ** test: côté with backwards secondary vs. U+FFFE merge separator
352 # Merged sort keys: On each level, any difference in the first segment
353 # must trump any further difference.
360 ** test: U+FFFE on identical level
364 # All of these control codes are completely-ignorable, so that
365 # their low code points are compared with the merge separator.
366 # The merge separator must compare less than any other character.
367 <1 \uFFFE\u0001\u0002\u0003
368 <i \u0001\uFFFE\u0002\u0003
369 <i \u0001\u0002\uFFFE\u0003
370 <i \u0001\u0002\u0003\uFFFE
373 # The merge separator must even compare less than U+0000.
374 <1 \uFFFE\u0000\u0000
375 <i \u0000\uFFFE\u0000
376 <i \u0000\u0000\uFFFE
378 ** test: Hani < surrogates < U+FFFD
379 # Note: compareUTF8() treats unpaired surrogates like U+FFFD,
380 # so with that the strings with surrogates will compare equal to each other
381 # and equal to the string with U+FFFD.
394 ** test: script reordering
396 % reorder Hani Zzzz digit
422 ** test: very simple rules
424 &a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z
425 % strength=quaternary
438 ** test: tailoring twice before a root position: primary
448 ** test: tailoring twice before a root position: secondary
458 # secondary-before common weight
468 ** test: tailoring twice before a root position: tertiary
478 # tertiary-before common weight
499 ** test: tailor after completely ignorable
508 ** test: secondary tailoring gaps, ICU ticket 9362
511 &s<<r # secondary between s and ſ (long s)
512 &ſ<<*a-q # more than 15 between ſ and secondary CE boundary
513 &[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE
514 &[last primary ignorable]<<y<<z
519 <2 \u0332 # lowest secondary CE
535 ** test: tertiary tailoring gaps, ICU ticket 9362
538 &t<<<r # tertiary between t and fullwidth t
539 &ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary
540 &[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE
541 &[last secondary ignorable]<<<y<<<z
546 # Note: The root collator currently does not map any characters to tertiary CEs.
562 ** test: secondary & tertiary around root character
577 ** test: secondary & tertiary around tailored item
593 ** test: more nesting of secondary & tertiary before
618 &w<x # tailored CE getting case bits
619 =uv=uV=Uv=UV # 2 chars -> 1 CE
620 &ae=ch=cH=Ch=CH # 2 chars -> 2 CEs
621 &rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs
638 = Uv # mixed case on single CE cannot distinguish variations
641 ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower
643 &\u0001<<<t<<<T # tertiary CEs
654 ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper
665 ** test: reset on expansion, ICU tickets 9415 & 9593
667 &æ<x # tailor the last primary CE so that x sorts between ae and af
668 &æb=bæ # copy all reset CEs to make bæ sort the same
669 &각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂
670 &⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference
671 &l·=z # handle the pre-context for · when fetching reset CEs
672 <<u # copy/tailor 2 CEs
692 <1 · # by itself: primary CE
694 <2 l· # l+middle dot has only a secondary difference from l
700 <3 ⒀ # DUCET sets special tertiary weights in all CEs
709 = y # alternate=shifted removes the tailoring difference on the last CE
712 ** test: contraction inside extension, ICU ticket 9378
714 &а<<х/й # all letters are Cyrillic
719 ** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104
721 &t<x &ᵀ<y # same primary weights
722 &q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent
733 # Principle: Each rule builds on the state of preceding rules and ignores following rules.
735 ** test: later rule does not affect earlier reset position, ICU ticket 10105
737 &a < u < v < w &ov < x &b < v
741 <1 x # CE(o) followed by CE between u and w
746 ** test: later rule does not affect earlier extension (1), ICU ticket 10105
762 ** test: later rule does not affect earlier extension (2), ICU ticket 10105
764 &a <<< c / e &g <<< e / l
773 ** test: later rule does not affect earlier extension (3), ICU ticket 10105
775 &a = b / c &d = c / e
778 <1 AC # C is still only tertiary different from the original c
783 ** test: extension contains tailored character, ICU ticket 10105
793 ** test: add simple mappings for characters with root context
795 &z=· # middle dot has a prefix mapping in the CLDR root
796 &n=и # и (U+0438) has contractions in the root
799 <2 l· # root mapping for l|· still works
806 <1 и\u0306 # root mapping for й=и\u0306 still works
810 ** test: add context mappings around characters with root context
812 &z=·h # middle dot has a prefix mapping in the CLDR root
813 &n=ә|и # и (U+0438) has contractions in the root
816 <2 l· # root mapping for l|· still works
822 <1 и\u0306 # root mapping for й=и\u0306 still works
829 ** test: many secondary CEs at the top of their range
831 &[last primary ignorable]<<*\u2801-\u28ff
843 ** test: many tertiary CEs at the top of their range
845 &[last secondary ignorable]<<<*a-z
857 ** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101
869 = opx # first contraction op, then prefix p|x
873 ** test: reset position with prefix (pre-context), ICU ticket 10102
884 ** test: prefix+contraction together (1), ICU ticket 10071
904 ** test: prefix+contraction together (2), ICU ticket 10071
918 = abc # prefix match a|b takes precedence over contraction match bc
923 ** test: prefix+contraction together (3), ICU ticket 10071
925 &x=a|b &w=bc # reverse order of rules as previous test, order should not matter here
926 * compare # same "compare" sequences as previous test
937 = abc # prefix match a|b takes precedence over contraction match bc
942 ** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962
951 = pch # no-prefix contraction ch matches
954 = pci # prefix+contraction p|ci matches
957 ** test: tailor in & around compact ranges of root primaries
958 # The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs
959 # which should be reliably encoded as one range in the root elements data.
972 <1 ᣵ # U+18F5 last Canadian Aboriginal
988 <1 ᚠ # U+16A0 first Runic
990 ** test: suppressContractions
992 &z<ch<әж [suppressContractions [·cә]]
995 <3 cH # ch was suppressed
997 <1 l· # primary difference, not secondary, because l|· was suppressed
999 <2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed
1003 ** test: Hangul & Jamo
1005 &L=\u1100 # first Jamo L
1006 &V=\u1161 # first Jamo V
1007 &T=\u11A8 # first Jamo T
1008 &\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs
1016 = \u1100\u1161\u11A8
1027 ** test: adjust special reset positions according to previous rules, CLDR ticket 6070
1030 [maxVariable space] # has effect only after building, no effect on following rules
1032 &[before 1][first regular]<z
1034 <1 ? # some punctuation
1041 &[last primary ignorable]<<x<<<y
1042 &[last primary ignorable]<<z
1051 &[last secondary ignorable]<<<x
1052 &[last secondary ignorable]<<<y
1059 &[before 2][first variable]<<z
1060 &[before 2][first variable]<<y
1061 &[before 3][first variable]<<<x
1062 &[before 3][first variable]<<<w
1063 &[before 1][first variable]<v
1064 &[before 2][first variable]<<u
1065 &[before 3][first variable]<<<t
1066 &[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary
1081 &[before 2][first regular]<<z
1082 &[before 3][first regular]<<<y
1083 &[before 1][first regular]<x
1084 &[before 3][first regular]<<<w
1085 &[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary
1086 &[before 3][first regular]<<<u
1087 &[before 1][first regular]<p # primary before the boundary: becomes variable
1088 &[before 3][first regular]<<<t # not affected by p
1089 &[last variable]<q # after p!
1103 # check that p & q are indeed variable
1119 &[before 2][first trailing]<<z
1120 &[before 1][first trailing]<y
1121 &[before 3][first trailing]<<<x
1123 <1 \u4E00 # first Han, first implicit
1124 <1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary
1125 # Note: The root collator currently does not map any characters to the trailing first boundary primary.
1129 <2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary.
1132 &[before 2][first primary ignorable]<<z
1133 &[before 2][first primary ignorable]<<y
1134 &[before 3][first primary ignorable]<<<x
1135 &[before 3][first primary ignorable]<<<w
1145 &[before 3][first secondary ignorable]<<<y
1146 &[before 3][first secondary ignorable]<<<x
1153 ** test: canonical closure
1162 = Ấ # A with circumflex & acute
1167 <2 X\u030A # with ring above
1170 = \u212B # Angstrom sign
1179 = \uFA0C\uFA0D # CJK compatibility characters
1182 # canonical closure on prefix rules, ICU ticket 9444
1186 <1 äs # not tailored
1194 ** test: conjoining Jamo map to expansions
1196 &gg=\u1101 # Jamo Lead consonant GG
1197 &nj=\u11AC # Jamo Trail consonant NJ
1200 = \u1101\u1161\u11AC
1206 ** test: canonical tail closure, ICU ticket 5913
1213 <2 a\u0323\u0302 # discontiguous contraction
1214 = ạ\u0302 # equivalent
1224 <2 a\u0323\u0302 # contiguous contraction plus extra diacritic
1225 = ạ\u0302 # equivalent
1229 # Tail closure should work even if there is a prefix and/or contraction.
1232 # In order to find discontiguous contractions for \u5140|câ
1233 # there must exist a mapping for \u5140|ca, regardless of what it maps to.
1234 # (This follows from the UCA spec.)
1239 <1 \u5140câ # tailored
1243 <2 \u5140ca\u0323\u0302 # discontiguous contraction
1244 = \uFA0Cca\u0323\u0302
1254 # Double-check that without the extra mapping there will be no discontiguous match.
1260 <1 \u5140câ # tailored
1266 <1 \u5140ca\u0323\u0302 # no discontiguous contraction
1267 = \uFA0Cca\u0323\u0302
1279 <2 ca\u0323\u0302 # contiguous contraction plus extra diacritic
1280 = cạ\u0302 # equivalent
1284 # ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1285 # = 03C9 0313 0300 0345
1286 # ccc = 0, 230, 230, 240
1289 # In order to find discontiguous contractions for αῳ
1290 # there must exist a mapping for αω, regardless of what it maps to.
1291 # (This follows from the UCA spec.)
1297 <2 αω\u0313\u0300\u0345 # discontiguous contraction
1301 <2 αω\u0300\u0313\u0345
1303 = αῲ\u0313 # not FCD
1307 # Double-check that without the extra mapping there will be no discontiguous match.
1311 <1 αω\u0313\u0300\u0345 # no discontiguous contraction
1315 <2 αω\u0300\u0313\u0345
1317 = αῲ\u0313 # not FCD
1322 # Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232.
1323 # Tests code paths where the tailored string has a combining mark
1324 # that does not occur in any composite's decomposition.
1328 <1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above.
1334 <2 αω\u0300\u0315\u0345
1336 = αῲ\u0315 # not FCD
1338 ** test: danish a+a vs. a-umlaut, ICU ticket 9319
1347 ** test: Jamo L with and in prefix
1348 # Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L).
1350 # Jamo Lead consonant G after G or GG
1351 &[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100
1352 # Jamo Lead consonant GG sorts like G+G
1353 &\u1100\u1100=\u1101
1354 # Note: Making G|GG and GG|GG sort the same as G|G+G
1355 # would require the ability to reset on G|G+G,
1356 # or we could make G-after-G equal to some secondary-CE character,
1357 # and reset on a pair of those.
1358 # (It does not matter much if there are at most two G in a row in real text.)
1361 <2 \u1100\u1100 # only one primary from a sequence of G lead consonants
1363 <2 \u1100\u1100\u1100
1365 # but not = \u1100\u1101, see above
1368 <2 \u1100\u1100\u1161
1369 = \u1100\uAC00 # prefix match from the L of the LV syllable
1373 ** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546
1375 # Low secondary CEs for Jamo V & T.
1376 # Note: T should sort before V for proper syllable order.
1377 &\u0332 # COMBINING LOW LINE (first primary ignorable)
1380 # Korean Jamo lead consonant search rules, part 2:
1381 # Make modern compound L jamo primary equivalent to non-compound forms.
1383 # Secondary CEs for Jamo L-after-L, greater than Jamo V & T.
1384 &\u0313 # COMBINING COMMA ABOVE (second primary ignorable)
1391 # Compound L Jamo map to equivalent expansions of primary+secondary CE.
1392 &\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK
1393 &\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT
1394 &\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP
1395 &\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS
1396 &\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC
1403 <2 \u1100\u1100\u1161
1409 ** test: Hangul syllables in prefix & in the interior of a contraction
1411 &x=\u1100\u1161|a\u1102\u1162z
1414 = \u1100\u1161a\u1102\u1162z
1415 = \u1100\u1161a\uB0B4z
1416 = \uAC00a\u1102\u1162z
1419 ** test: digits are unsafe-backwards when numeric=on
1423 # If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a".
1424 # We need to back up before the identical prefix "1" and compare the full numbers.
1428 ** test: simple locale data test
1436 @ locale de-u-co-phonebk
1443 # The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt.
1445 ** test: DataDrivenCollationTest/TestMorePinyin
1446 # Testing the primary strength.
1463 ** test: DataDrivenCollationTest/TestLithuanian
1464 # Lithuanian sort order.
1478 ** test: DataDrivenCollationTest/TestLatvian
1479 # Latvian sort order.
1508 ** test: DataDrivenCollationTest/TestEstonian
1509 # Estonian sort order.
1531 ** test: DataDrivenCollationTest/TestAlbanian
1532 # Albanian sort order.
1568 ** test: DataDrivenCollationTest/TestSimplifiedChineseOrder
1569 # Sorted file has different order.
1571 # normalization=on turned on & off automatically.
1574 < \u5F20\u4E00\u8E3F
1576 ** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash
1577 # Regression test: these strings have (pretty much) caused a crash.
1580 < \u0f71\u0f72\u0f80\u0f71\u0f72
1583 ** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems
1584 # These are examples of strings that caused trouble in partial sort key testing.
1587 < \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C
1588 < \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18
1590 < \u0E01\u0E07\u0E01\u0E32\u0E23
1591 < \u0E01\u0E07\u0E42\u0E01\u0E49
1593 < \u0E01\u0E23\u0E19\u0E17\u0E32
1594 < \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32
1596 < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27
1597 < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27
1599 < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D
1600 < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32
1602 ** test: DataDrivenCollationTest/TestJavaStyleRule
1603 # java.text allows rules to start as '<<<x<<<y...'
1604 # we emulate this by assuming a &[first tertiary ignorable] in this case.
1606 &\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b
1612 = b # x had become the new first primary ignorable
1615 ** test: DataDrivenCollationTest/TestShiftedIgnorable
1616 # The UCA states that primary ignorables should be completely
1617 # ignorable when following a shifted code point.
1620 % strength=quaternary
1639 ** test: DataDrivenCollationTest/TestNShiftedIgnorable
1640 # The UCA states that primary ignorables should be completely
1641 # ignorable when following a shifted code point.
1643 % alternate=non-ignorable
1663 ** test: DataDrivenCollationTest/TestSafeSurrogates
1664 # It turned out that surrogates were not skipped properly
1665 # when iterating backwards if they were in the middle of a
1666 # contraction. This test ensures that this is fixed.
1673 ** test: DataDrivenCollationTest/da_TestPrimary
1674 # This test goes through primary strength cases
1687 ** test: DataDrivenCollationTest/da_TestTertiary
1688 # This test goes through tertiary strength cases
1729 < HAANDV\u00c6RKSBANKEN
1732 < NIELS\u0020J\u00d8RGEN
1741 < SCH\u00dcTT,\u0020H
1743 < SCH\u00dcTT,\u0020M
1747 < STORE\u0020VILDMOSE
1749 < STORM\u0020PETERSEN
1753 < \u00feORVAR\u00d0UR
1755 < VESTERG\u00c5RD,\u0020A
1756 < VESTERGAARD,\u0020A
1757 < VESTERG\u00c5RD,\u0020B
1771 < \u010du\u010d\u0113t
1819 ** test: DataDrivenCollationTest/hi_TestNewRules
1820 # This test goes through new rules and tests against old rules
1828 ** test: DataDrivenCollationTest/ro_TestNewRules
1829 # This test goes through new rules and tests against old rules
1883 ** test: DataDrivenCollationTest/testOffsets
1884 # This tests cases where forwards and backwards iteration get different offsets
1888 < a\uD800\uDC00\uDC00
1889 < b\uD800\uDC00\uDC00
1891 < \u0301A\u0301\u0301
1892 < \u0301B\u0301\u0301
1896 # TODO: test offsets in new CollationTest
1898 # End of test cases moved here from ICU 52's DataDrivenCollationTest.txt.
1900 ** test: was ICU 52 cmsccoll/TestRedundantRules
1902 & a < b < c < d& [before 1] c < m
1911 & a < b <<< c << d <<< e& [before 3] e <<< x
1921 & a < b <<< c << d <<< e <<< f < g& [before 1] g < x
1933 & a <<< b << c < d& a < m
1964 & a < b < c < d& r < c
1973 & a < b < c < d& c < m
1982 & a < b < c < d& a < m
1990 ** test: was ICU 52 cmsccoll/TestExpansionSyntax
1991 # The following two rules should sort the particular list of strings the same.
1993 &AE <<< a << b <<< c &d <<< f
2003 &A <<< a / E << b / E <<< c /E &d <<< f
2012 # The following two rules should sort the particular list of strings the same.
2014 &AE <<< a <<< b << c << d < e < f <<< g
2026 &A <<< a / E <<< b / E << c / E << d / E < e < f <<< g
2037 # The following two rules should sort the particular list of strings the same.
2039 &AE <<< B <<< C / D <<< F
2048 &A <<< B / E <<< C / ED <<< F / E
2056 ** test: never reorder trailing primaries
2066 ** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes
2079 # With the following rules, there is only one prefix per composite ĉ or ç,
2080 # but both prefixes apply to just c in NFD form.
2081 # We would get different results for composed vs. NFD input
2082 # if we fell back directly from longest-prefix mappings to no-prefix mappings.
2098 # The mapping used is the one with the longest matching prefix for which
2099 # there is also a suffix match; among several mappings for that prefix, the longest suffix match wins.
2139 ** test: prefix + discontiguous contraction with missing prefix contraction
2140 # Unfortunate terminology: The first "prefix" here is the pre-context,
2141 # the second "prefix" refers to the contraction/relation string that is
2142 # one shorter than the one being tested.
2147 # No mapping for op|e:
2148 # Discontiguous contraction matching should not match op|ê in opệ
2149 # because it would have to skip the dot below and extend a match on op|e by the circumflex,
2150 # but there is no match on op|e.
2156 = opy\u0323 # y not z
2160 # We cannot test for fallback by whether the contraction default CE32
2161 # is for another contraction. With the following rules, there is no mapping for op|e,
2162 # and the fallback to prefix p has no contractions.
2171 = opx\u0323\u0302 # x not z
2175 # One more variation: Fallback to the simple code point, no shorter non-empty prefix.
2185 = opx\u0323\u0302 # x not z
2189 ** test: maxVariable via rules
2191 [maxVariable space][alternate shifted]
2200 ** test: maxVariable via setting
2202 % maxVariable=currency
2212 ** test: ICU4J CollationMiscTest/TestContractionClosure (ää)
2213 # This tests canonical closure, but it also tests that CollationFastLatin
2214 # bails out properly for contractions with combining marks.
2215 # For that we need pairs of strings that remain in the Latin fastpath
2216 # long enough, hence the extra "= b" lines.
2229 ** test: ICU4J CollationMiscTest/TestContractionClosure (Å)
2240 ** test: reset-before on already-tailored characters, ICU ticket 10108
2242 &a<w<<x &[before 2]x<<y
2250 &a<<w<<<x &[before 2]x<<y
2258 &a<w<x &[before 2]x<<y
2266 &a<w<<<x &[before 2]x<<y
2273 ** test: numeric collation with other settings, ICU ticket 9092
2275 % strength=identical
2282 ** test: collation type fallback from unsupported type, ICU ticket 10149
2283 @ locale fr-CA-u-co-phonebk
2284 # Expect the same result as with fr-CA, using backwards-secondary order.
2285 # That is, we should fall back from the unsupported collation type
2286 # to the locale's default collation type.
2293 ** test: @ is equivalent to [backwards 2], ICU ticket 9956
2307 ** test: shifted+reordering, ICU ticket 9507
2309 % reorder Grek punct space
2311 % strength=quaternary
2312 # Which primaries are "variable" should be determined without script reordering,
2313 # and then primaries should be reordered whether they are shifted to quaternary or not.
2320 <1 $ # currency symbol
2334 ** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351
2342 ** test: secondary+case ignores secondary ignorables, ICU ticket 9355
2345 % strength=secondary
2352 ** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328
2354 &[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57
2361 ** test: quoted apostrophe in compact syntax, ICU ticket 8204
2373 # ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()"
2374 ** test: locale -u- with collation keywords, ICU ticket 8260
2375 @ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4
2377 <4 \u0020 # space is shifted, strength=quaternary
2378 <1 ! # punctuation is regular
2380 <1 12 # numeric sorting
2382 <c b # uppercase first on case level
2384 <2 x\u0308\u0301 # normalization off
2386 ** test: locale @ with collation keywords, ICU ticket 8260
2387 @ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted
2389 <4 $ # currency symbols are shifted, strength=quaternary
2391 <2 alà # backwards secondary level
2393 ** test: locale -u- with script reordering, ICU ticket 8260
2394 @ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai
2409 ** test: locale @collation=type should be case-insensitive
2410 @ locale de@coLLation=PhoneBook
2416 ** test: import root search rules plus German phonebook rules, ICU ticket 8962
2417 @ locale de-u-co-search
2425 # Once more, but with runtime builder.
2427 [import und-u-co-search][import de-u-co-phonebk]
2435 # Once again, with import from "root" not "und" (as in a proper language tag).
2437 [import root-u-co-search][import de-u-co-phonebk]
2445 ** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998
2446 # Greek should sort Greek first.
2454 # Import Greek, and then reset the reordering.
2456 [import el][reorder Zzzz]
2462 # "others" is a synonym for Zzzz.
2464 [import el][reorder others]
2470 ** test: regression test for CollationFastLatinBuilder, ICU ticket 11388
2473 % strength=secondary
2482 ** test: tailor tertiary-after a common tertiary where there is a lower one
2483 # Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one.
2484 # See ICU ticket 11448 & CLDR ticket 7222.
2496 ** test: tailor tertiary-after a below-common tertiary
2508 ** test: tailor tertiary-before a common tertiary where there is a lower one
2510 &[before 3]あ<<<x<<<y<<<z
2520 ** test: tailor tertiary-before a below-common tertiary
2522 &[before 3]ぁ<<<x<<<y<<<z
2532 ** test: reorder single scripts not groups, ICU ticket 11449
2540 # Before ICU 55, the following reordered together with Gothic.