]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | # Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 | # License & terms of use: http://www.unicode.org/copyright.html | |
3 | # Copyright (c) 2012-2015 International Business Machines | |
57a6839d A |
4 | # Corporation and others. All Rights Reserved. |
5 | # | |
6 | # This file should be in UTF-8 with a signature byte sequence ("BOM"). | |
7 | # | |
8 | # collationtest.txt: Collation test data. | |
9 | # | |
10 | # created on: 2012apr13 | |
11 | # created by: Markus W. Scherer | |
12 | ||
13 | # A line with "** test: description" is used for verbose and error output. | |
14 | ||
15 | # A collator can be set with "@ root" or "@ locale language-tag", | |
16 | # for example "@ locale de-u-co-phonebk". | |
b331163b | 17 | # An old-style locale ID can also be used, for example "@ locale de@collation=phonebook". |
57a6839d A |
18 | |
19 | # A collator can be built with "@ rules". | |
20 | # An "@ rules" line is followed by one or more lines with the tailoring rules. | |
21 | ||
22 | # A collator can be modified with "% attribute=value". | |
23 | ||
24 | # "* compare" tests the order (= or <) of the following strings. | |
25 | # The relation can be "=" or "<" (the level of the difference is not specified) | |
26 | # or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference). | |
27 | ||
28 | # Test sections ("* compare") are terminated by | |
29 | # definitions of new collators, changing attributes, or new test sections. | |
30 | ||
31 | ** test: simple CEs & expansions | |
32 | # Many types of mappings are tested elsewhere, including via the UCA conformance tests. | |
33 | # Here we mostly cover a few unusual mappings. | |
34 | @ rules | |
35 | &\x01 # most control codes are ignorable | |
36 | <<<\u0300 # tertiary CE | |
37 | &9<\x00 # NUL not ignorable | |
38 | &\uA00A\uA00B=\uA002 # two long-primary CEs | |
39 | &\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits | |
40 | ||
41 | * compare | |
42 | = \x01 | |
43 | = \x02 | |
44 | <3 \u0300 | |
45 | <1 9 | |
46 | <1 \x00 | |
47 | = \x01\x00\x02 | |
48 | <1 a | |
49 | <3 a\u0300 | |
50 | <2 a\u0308 | |
51 | = ä | |
52 | <1 b | |
53 | <1 か # Hiragana Ka (U+304B) | |
54 | <2 か\u3099 # plus voiced sound mark | |
55 | = が # Hiragana Ga (U+304C) | |
56 | <1 \uA00A\uA00B | |
57 | = \uA002 | |
58 | <1 \uA00A\uA00B\u00050004 | |
59 | <1 \uA00A\uA00B\u00050005 | |
60 | = \uA003 | |
61 | <1 \uA00A\uA00B\u00050006 | |
62 | ||
63 | ** test: contractions | |
64 | # Create some interesting mappings, and map some normalization-inert characters | |
65 | # (which are not subject to canonical reordering) | |
66 | # to some of the same CEs to check the sequence of CEs. | |
67 | @ rules | |
68 | ||
69 | # Contractions starting with 'a' should not continue with any character < U+0300 | |
70 | # so that we can test a shortcut for that. | |
71 | &a=ⓐ | |
72 | &b<bz=ⓑ | |
73 | &d<dz\u0301=ⓓ # d+z+acute | |
74 | &z | |
75 | <a\u0301=Ⓐ # a+acute sorts after z | |
76 | <a\u0301\u0301=Ⓑ # a+acute+acute | |
77 | <a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right | |
78 | <a\u030a=Ⓓ # a+ring | |
79 | <a\u0323=Ⓔ # a+dot below | |
80 | <a\u0323\u0358=Ⓕ # a+dot below+dot above right | |
81 | <a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring | |
82 | <a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z | |
83 | ||
84 | &\U0001D158=⁰ # musical notehead black (has a symbol primary) | |
85 | <\U0001D158\U0001D165=¼ # musical quarter note | |
86 | ||
87 | # deliberately missing prefix contractions: | |
88 | # dz | |
89 | # a\u0327 | |
90 | # a\u0327\u0323 | |
91 | # a\u0327\u0323b | |
92 | ||
93 | &\x01 | |
94 | <<<\U0001D165=¹ # musical stem (ccc=216) | |
95 | <<<\U0001D16D=² # musical augmentation dot (ccc=226) | |
96 | <<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226) | |
97 | &\u0301=❶ # acute (ccc=230) | |
98 | &\u030a=❷ # ring (ccc=230) | |
99 | &\u0308=❸ # diaeresis (ccc=230) | |
100 | <<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230) | |
101 | &\u0327=❺ # cedilla (ccc=202) | |
102 | &\u0323=❻ # dot below (ccc=220) | |
103 | &\u0331=❼ # macron below (ccc=220) | |
104 | <<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232) | |
105 | &\u0334=❾ # tilde overlay (ccc=1) | |
106 | &\u0358=❿ # dot above right (ccc=232) | |
107 | ||
108 | &\u0f71=① # tibetan vowel sign aa | |
109 | &\u0f72=② # tibetan vowel sign i | |
110 | # \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73 | |
111 | &\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129) | |
112 | ||
113 | ** test: simple contractions | |
114 | ||
115 | # Some strings are chosen to cause incremental contiguous contraction matching to | |
116 | # go into partial matches for prefixes of contractions | |
117 | # (where the prefixes are deliberately not also contractions). | |
118 | # When there is no complete match, then the matching code must back out of those | |
119 | # so that discontiguous contractions work as specified. | |
120 | ||
121 | * compare | |
122 | # contraction starter with no following text, or mismatch, or blocked | |
123 | <1 a | |
124 | = ⓐ | |
125 | <1 aa | |
126 | = ⓐⓐ | |
127 | <1 ab | |
128 | = ⓐb | |
129 | <1 az | |
130 | = ⓐz | |
131 | ||
132 | * compare | |
133 | <1 a | |
134 | <2 a\u0308\u030a # ring blocked by diaeresis | |
135 | = ⓐ❸❷ | |
136 | <2 a\u0327 | |
137 | = ⓐ❺ | |
138 | ||
139 | * compare | |
140 | <2 \u0308 | |
141 | = ❸ | |
142 | <2 \u0308\u030a\u0301 # acute blocked by ring | |
143 | = ❸❷❶ | |
144 | ||
145 | * compare | |
146 | <1 \U0001D158 | |
147 | = ⁰ | |
148 | <1 \U0001D158\U0001D165 | |
149 | = ¼ | |
150 | ||
151 | # no discontiguous contraction because of missing prefix contraction d+z, | |
152 | # and a starter ('z') after the 'd' | |
153 | * compare | |
154 | <1 dz\u0323\u0301 | |
155 | = dz❻❶ | |
156 | ||
157 | # contiguous contractions | |
158 | * compare | |
159 | <1 abz | |
160 | = ⓐⓑ | |
161 | <1 abzz | |
162 | = ⓐⓑz | |
163 | ||
164 | * compare | |
165 | <1 a | |
166 | <1 z | |
167 | <1 a\u0301 | |
168 | = Ⓐ | |
169 | <1 a\u0301\u0301 | |
170 | = Ⓑ | |
171 | <1 a\u0301\u0301\u0358 | |
172 | = Ⓒ | |
173 | <1 a\u030a | |
174 | = Ⓓ | |
175 | <1 a\u0323\u0358 | |
176 | = Ⓕ | |
177 | <1 a\u0327\u0323\u030a # match despite missing prefix | |
178 | = Ⓖ | |
179 | <1 a\u0327\u0323bz | |
180 | = Ⓗ | |
181 | ||
182 | * compare | |
183 | <2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second | |
184 | = ❸❹ | |
185 | ||
186 | * compare | |
187 | <1 \U0001D158\U0001D165 | |
188 | = ¼ | |
189 | ||
190 | * compare | |
191 | <3 \U0001D165\U0001D16D | |
192 | = ³ | |
193 | ||
194 | ** test: discontiguous contractions | |
195 | * compare | |
196 | <1 a\u0327\u030a # a+ring skips cedilla | |
197 | = Ⓓ❺ | |
198 | <2 a\u0327\u0327\u030a # a+ring skips 2 cedillas | |
199 | = Ⓓ❺❺ | |
200 | <2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas | |
201 | = Ⓓ❺❺❺ | |
202 | <2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas | |
203 | = Ⓓ❾❺❺ | |
204 | <1 a\u0327\u0323 # a+dot below skips cedilla | |
205 | = Ⓔ❺ | |
206 | <1 a\u0323\u0301\u0358 # a+dot below+dot above right: 2-char match, then skips acute | |
207 | = Ⓕ❶ | |
208 | <2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay | |
209 | = Ⓕ❾ | |
210 | ||
211 | * compare | |
212 | <2 \u0331\u0331\u0358 # macron below+dot above right skips the second macron below | |
213 | = ❽❼ | |
214 | ||
215 | * compare | |
216 | <1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron) | |
217 | = Ⓓ❺❼❻ | |
218 | <1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla | |
219 | = Ⓔ❺²❷ | |
220 | <2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas | |
221 | = Ⓔ❺❺❷ | |
222 | <2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla | |
223 | = Ⓔ❺❻❷ | |
224 | <2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla | |
225 | = Ⓔ❾❺❷ | |
226 | ||
227 | * compare | |
228 | <1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla | |
229 | = ¼❺ | |
230 | <1 a\U0001D165\u0323 # a+dot below skips stem | |
231 | = Ⓔ¹ | |
232 | ||
233 | # partial contiguous match, backs up, matches discontiguous contraction | |
234 | <1 a\u0327\u0323b | |
235 | = Ⓔ❺b | |
236 | <1 a\u0327\u0323ba | |
237 | = Ⓔ❺bⓐ | |
238 | ||
239 | # a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks | |
240 | * compare | |
241 | <1 a\u0327\u0301\u0301\u0358 | |
242 | = Ⓒ❺ | |
243 | ||
244 | # FCD but not NFD | |
245 | * compare | |
246 | <1 a\u0f73\u0301 # a+acute skips tibetan ii | |
247 | = Ⓐ③ | |
248 | ||
249 | # FCD but the 0f71 inside the 0f73 must be skipped | |
250 | # to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73 | |
251 | * compare | |
252 | <1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72 | |
253 | = ③① | |
254 | ||
255 | ** test: discontiguous contractions with nested contractions | |
256 | * compare | |
257 | <1 a\u0323\u0308\u0301\u0358 | |
258 | = Ⓕ❹ | |
259 | <2 a\u0323\u0308\u0301\u0308\u0301\u0358 | |
260 | = Ⓕ❹❹ | |
261 | ||
262 | ** test: discontiguous contractions with interleaved contractions | |
263 | * compare | |
264 | # a+ring & cedilla & macron below+dot above right | |
265 | <1 a\u0327\u0331\u030a\u0358 | |
266 | = Ⓓ❺❽ | |
267 | ||
268 | # a+ring & 1x..3x macron below+dot above right | |
269 | <2 a\u0331\u030a\u0358 | |
270 | = Ⓓ❽ | |
271 | <2 a\u0331\u0331\u030a\u0358\u0358 | |
272 | = Ⓓ❽❽ | |
273 | # also skips acute | |
274 | <2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358 | |
275 | = Ⓓ❽❽❽❶ | |
276 | ||
277 | # a+dot below & stem+augmentation dot, followed by contiguous d+z+acute | |
278 | <1 a\U0001D165\u0323\U0001D16Ddz\u0301 | |
279 | = Ⓔ³ⓓ | |
280 | ||
281 | ** test: some simple string comparisons | |
282 | @ root | |
283 | * compare | |
284 | # first string compares against "" | |
285 | = \u0000 | |
286 | < a | |
287 | <1 b | |
288 | <3 B | |
289 | = \u0000B\u0000 | |
290 | ||
291 | ** test: compare with strength=primary | |
292 | % strength=primary | |
293 | * compare | |
294 | <1 a | |
295 | <1 b | |
296 | = B | |
297 | ||
298 | ** test: compare with strength=secondary | |
299 | % strength=secondary | |
300 | * compare | |
301 | <1 a | |
302 | <1 b | |
303 | = B | |
304 | ||
305 | ** test: compare with strength=tertiary | |
306 | % strength=tertiary | |
307 | * compare | |
308 | <1 a | |
309 | <1 b | |
310 | <3 B | |
311 | ||
312 | ** test: compare with strength=quaternary | |
313 | % strength=quaternary | |
314 | * compare | |
315 | <1 a | |
316 | <1 b | |
317 | <3 B | |
318 | ||
319 | ** test: compare with strength=identical | |
320 | % strength=identical | |
321 | * compare | |
322 | <1 a | |
323 | <1 b | |
324 | <3 B | |
325 | ||
326 | ** test: côté with forwards secondary | |
327 | @ root | |
328 | * compare | |
329 | <1 cote | |
330 | <2 coté | |
331 | <2 côte | |
332 | <2 côté | |
333 | ||
334 | ** test: côté with forwards secondary vs. U+FFFE merge separator | |
335 | # Merged sort keys: On each level, any difference in the first segment | |
336 | # must trump any further difference. | |
337 | * compare | |
338 | <1 cote\uFFFEcôté | |
339 | <2 coté\uFFFEcôte | |
340 | <2 côte\uFFFEcoté | |
341 | <2 côté\uFFFEcote | |
342 | ||
343 | ** test: côté with backwards secondary | |
344 | % backwards=on | |
345 | * compare | |
346 | <1 cote | |
347 | <2 côte | |
348 | <2 coté | |
349 | <2 côté | |
350 | ||
351 | ** test: côté with backwards secondary vs. U+FFFE merge separator | |
352 | # Merged sort keys: On each level, any difference in the first segment | |
353 | # must trump any further difference. | |
354 | * compare | |
355 | <1 cote\uFFFEcôté | |
356 | <2 côte\uFFFEcoté | |
357 | <2 coté\uFFFEcôte | |
358 | <2 côté\uFFFEcote | |
359 | ||
360 | ** test: U+FFFE on identical level | |
361 | @ root | |
362 | % strength=identical | |
363 | * compare | |
364 | # All of these control codes are completely-ignorable, so that | |
365 | # their low code points are compared with the merge separator. | |
366 | # The merge separator must compare less than any other character. | |
367 | <1 \uFFFE\u0001\u0002\u0003 | |
368 | <i \u0001\uFFFE\u0002\u0003 | |
369 | <i \u0001\u0002\uFFFE\u0003 | |
370 | <i \u0001\u0002\u0003\uFFFE | |
371 | ||
372 | * compare | |
373 | # The merge separator must even compare less than U+0000. | |
374 | <1 \uFFFE\u0000\u0000 | |
375 | <i \u0000\uFFFE\u0000 | |
376 | <i \u0000\u0000\uFFFE | |
377 | ||
378 | ** test: Hani < surrogates < U+FFFD | |
379 | # Note: compareUTF8() treats unpaired surrogates like U+FFFD, | |
380 | # so with that the strings with surrogates will compare equal to each other | |
381 | # and equal to the string with U+FFFD. | |
382 | @ root | |
383 | % strength=identical | |
384 | * compare | |
385 | <1 abz | |
386 | <1 a\u4e00z | |
387 | <1 a\U00020000z | |
388 | <1 a\ud800z | |
389 | <1 a\udbffz | |
390 | <1 a\udc00z | |
391 | <1 a\udfffz | |
392 | <1 a\ufffdz | |
393 | ||
394 | ** test: script reordering | |
395 | @ root | |
396 | % reorder Hani Zzzz digit | |
397 | * compare | |
398 | <1 ? | |
399 | <1 + | |
400 | <1 丂 | |
401 | <1 a | |
402 | <1 α | |
403 | <1 5 | |
404 | ||
405 | % reorder default | |
406 | * compare | |
407 | <1 ? | |
408 | <1 + | |
409 | <1 5 | |
410 | <1 a | |
411 | <1 α | |
412 | <1 丂 | |
413 | ||
414 | ** test: empty rules | |
415 | @ rules | |
416 | * compare | |
417 | <1 a | |
418 | <2 ä | |
419 | <3 Ä | |
420 | <1 b | |
421 | ||
422 | ** test: very simple rules | |
423 | @ rules | |
424 | &a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z | |
425 | % strength=quaternary | |
426 | * compare | |
427 | <1 a | |
428 | = e | |
429 | <4 q | |
430 | <4 r | |
431 | <1 x | |
432 | <3 X | |
433 | <2 y | |
434 | <3 Y | |
435 | <2 z | |
436 | <3 Z | |
437 | ||
438 | ** test: tailoring twice before a root position: primary | |
439 | @ rules | |
440 | &[before 1]b<p | |
441 | &[before 1]b<q | |
442 | * compare | |
443 | <1 a | |
444 | <1 p | |
445 | <1 q | |
446 | <1 b | |
447 | ||
448 | ** test: tailoring twice before a root position: secondary | |
449 | @ rules | |
450 | &[before 2]ſ<<p | |
451 | &[before 2]ſ<<q | |
452 | * compare | |
453 | <1 s | |
454 | <2 p | |
455 | <2 q | |
456 | <2 ſ | |
457 | ||
458 | # secondary-before common weight | |
459 | @ rules | |
460 | &[before 2]b<<p | |
461 | &[before 2]b<<q | |
462 | * compare | |
463 | <1 a | |
464 | <1 p | |
465 | <2 q | |
466 | <2 b | |
467 | ||
468 | ** test: tailoring twice before a root position: tertiary | |
469 | @ rules | |
470 | &[before 3]B<<<p | |
471 | &[before 3]B<<<q | |
472 | * compare | |
473 | <1 b | |
474 | <3 p | |
475 | <3 q | |
476 | <3 B | |
477 | ||
478 | # tertiary-before common weight | |
479 | @ rules | |
480 | &[before 3]b<<<p | |
481 | &[before 3]b<<<q | |
482 | * compare | |
483 | <1 a | |
484 | <1 p | |
485 | <3 q | |
486 | <3 b | |
487 | ||
488 | @ rules | |
489 | &[before 2]b<<s | |
490 | &[before 3]s<<<p | |
491 | &[before 3]s<<<q | |
492 | * compare | |
493 | <1 a | |
494 | <1 p | |
495 | <3 q | |
496 | <3 s | |
497 | <2 b | |
498 | ||
499 | ** test: tailor after completely ignorable | |
500 | @ rules | |
501 | &\x00<<<x<<y | |
502 | * compare | |
503 | = \x00 | |
504 | = \x1F | |
505 | <3 x | |
506 | <2 y | |
507 | ||
508 | ** test: secondary tailoring gaps, ICU ticket 9362 | |
509 | @ rules | |
510 | &[before 2]s<<'_' | |
511 | &s<<r # secondary between s and ſ (long s) | |
512 | &ſ<<*a-q # more than 15 between ſ and secondary CE boundary | |
513 | &[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE | |
514 | &[last primary ignorable]<<y<<z | |
515 | ||
516 | * compare | |
517 | <2 u | |
518 | <2 v | |
519 | <2 \u0332 # lowest secondary CE | |
520 | <2 \u0308 | |
521 | <2 y | |
522 | <2 z | |
523 | <1 s_ | |
524 | <2 ss | |
525 | <2 sr | |
526 | <2 sſ | |
527 | <2 sa | |
528 | <2 sb | |
529 | <2 sp | |
530 | <2 sq | |
531 | <2 sus | |
532 | <2 svs | |
533 | <2 rs | |
534 | ||
535 | ** test: tertiary tailoring gaps, ICU ticket 9362 | |
536 | @ rules | |
537 | &[before 3]t<<<'_' | |
538 | &t<<<r # tertiary between t and fullwidth t | |
539 | &ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary | |
540 | &[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE | |
541 | &[last secondary ignorable]<<<y<<<z | |
542 | ||
543 | * compare | |
544 | <3 u | |
545 | <3 v | |
546 | # Note: The root collator currently does not map any characters to tertiary CEs. | |
547 | <3 y | |
548 | <3 z | |
549 | <1 t_ | |
550 | <3 tt | |
551 | <3 tr | |
552 | <3 t\uFF54 # fullwidth t (U+FF54) | |
553 | <3 tᵀ | |
554 | <3 ta | |
555 | <3 tb | |
556 | <3 tp | |
557 | <3 tq | |
558 | <3 tut | |
559 | <3 tvt | |
560 | <3 rt | |
561 | ||
562 | ** test: secondary & tertiary around root character | |
563 | @ rules | |
564 | &[before 2]m<<r | |
565 | &m<<s | |
566 | &[before 3]m<<<u | |
567 | &m<<<v | |
568 | * compare | |
569 | <1 l | |
570 | <1 r | |
571 | <2 u | |
572 | <3 m | |
573 | <3 v | |
574 | <2 s | |
575 | <1 n | |
576 | ||
577 | ** test: secondary & tertiary around tailored item | |
578 | @ rules | |
579 | &m<x | |
580 | &[before 2]x<<r | |
581 | &x<<s | |
582 | &[before 3]x<<<u | |
583 | &x<<<v | |
584 | * compare | |
585 | <1 m | |
586 | <1 r | |
587 | <2 u | |
588 | <3 x | |
589 | <3 v | |
590 | <2 s | |
591 | <1 n | |
592 | ||
593 | ** test: more nesting of secondary & tertiary before | |
594 | @ rules | |
595 | &[before 3]m<<<u | |
596 | &[before 2]m<<r | |
597 | &[before 3]r<<<q | |
598 | &m<<<w | |
599 | &m<<t | |
600 | &[before 3]w<<<v | |
601 | &w<<<x | |
602 | &w<<s | |
603 | * compare | |
604 | <1 l | |
605 | <1 q | |
606 | <3 r | |
607 | <2 u | |
608 | <3 m | |
609 | <3 v | |
610 | <3 w | |
611 | <3 x | |
612 | <2 s | |
613 | <2 t | |
614 | <1 n | |
615 | ||
616 | ** test: case bits | |
617 | @ rules | |
618 | &w<x # tailored CE getting case bits | |
619 | =uv=uV=Uv=UV # 2 chars -> 1 CE | |
620 | &ae=ch=cH=Ch=CH # 2 chars -> 2 CEs | |
621 | &rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs | |
622 | % caseFirst=lower | |
623 | * compare | |
624 | <1 ae | |
625 | = ch | |
626 | <3 cH | |
627 | <3 Ch | |
628 | <3 CH | |
629 | <1 rst | |
630 | = yz | |
631 | <3 yZ | |
632 | <3 Yz | |
633 | <3 YZ | |
634 | <1 w | |
635 | <1 x | |
636 | = uv | |
637 | <3 uV | |
638 | = Uv # mixed case on single CE cannot distinguish variations | |
639 | <3 UV | |
640 | ||
641 | ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower | |
642 | @ rules | |
643 | &\u0001<<<t<<<T # tertiary CEs | |
644 | % caseFirst=lower | |
645 | * compare | |
646 | <1 aa | |
647 | <3 aat | |
648 | <3 aaT | |
649 | <3 aA | |
650 | <3 aAt | |
651 | <3 ata | |
652 | <3 aTa | |
653 | ||
654 | ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper | |
655 | % caseFirst=upper | |
656 | * compare | |
657 | <1 aA | |
658 | <3 aAt | |
659 | <3 aa | |
660 | <3 aat | |
661 | <3 aaT | |
662 | <3 ata | |
663 | <3 aTa | |
664 | ||
665 | ** test: reset on expansion, ICU tickets 9415 & 9593 | |
666 | @ rules | |
667 | &æ<x # tailor the last primary CE so that x sorts between ae and af | |
668 | &æb=bæ # copy all reset CEs to make bæ sort the same | |
669 | &각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂 | |
670 | &⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference | |
671 | &l·=z # handle the pre-context for · when fetching reset CEs | |
672 | <<u # copy/tailor 2 CEs | |
673 | ||
674 | * compare | |
675 | <1 ae | |
676 | <2 æ | |
677 | <1 x | |
678 | <1 af | |
679 | ||
680 | * compare | |
681 | <1 aeb | |
682 | <2 æb | |
683 | = bæ | |
684 | ||
685 | * compare | |
686 | <1 각 | |
687 | <1 h | |
688 | <1 갂 | |
689 | <1 갃 | |
690 | ||
691 | * compare | |
692 | <1 · # by itself: primary CE | |
693 | <1 l | |
694 | <2 l· # l+middle dot has only a secondary difference from l | |
695 | = z | |
696 | <2 u | |
697 | ||
698 | * compare | |
699 | <1 (13) | |
700 | <3 ⒀ # DUCET sets special tertiary weights in all CEs | |
701 | <2 y | |
702 | <1 (13[ | |
703 | ||
704 | % alternate=shifted | |
705 | * compare | |
706 | <1 (13) | |
707 | = 13 | |
708 | <3 ⒀ | |
709 | = y # alternate=shifted removes the tailoring difference on the last CE | |
710 | <1 14 | |
711 | ||
712 | ** test: contraction inside extension, ICU ticket 9378 | |
713 | @ rules | |
714 | &а<<х/й # all letters are Cyrillic | |
715 | * compare | |
716 | <1 ай | |
717 | <2 х | |
718 | ||
719 | ** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104 | |
720 | @ rules | |
721 | &t<x &ᵀ<y # same primary weights | |
722 | &q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent | |
723 | * compare | |
724 | <1 q | |
725 | <1 u | |
726 | <1 v | |
727 | <1 ꝗ | |
728 | <1 t | |
729 | <3 ᵀ | |
730 | <1 y | |
731 | <1 x | |
732 | ||
733 | # Principle: Each rule builds on the state of preceding rules and ignores following rules. | |
734 | ||
735 | ** test: later rule does not affect earlier reset position, ICU ticket 10105 | |
736 | @ rules | |
737 | &a < u < v < w &ov < x &b < v | |
738 | * compare | |
739 | <1 oa | |
740 | <1 ou | |
741 | <1 x # CE(o) followed by CE between u and w | |
742 | <1 ow | |
743 | <1 ob | |
744 | <1 ov | |
745 | ||
746 | ** test: later rule does not affect earlier extension (1), ICU ticket 10105 | |
747 | @ rules | |
748 | &a=x/b &v=b | |
749 | % strength=secondary | |
750 | * compare | |
751 | <1 B | |
752 | <1 c | |
753 | <1 v | |
754 | = b | |
755 | * compare | |
756 | <1 AB | |
757 | = x | |
758 | <1 ac | |
759 | <1 av | |
760 | = ab | |
761 | ||
762 | ** test: later rule does not affect earlier extension (2), ICU ticket 10105 | |
763 | @ rules | |
764 | &a <<< c / e &g <<< e / l | |
765 | % strength=secondary | |
766 | * compare | |
767 | <1 AE | |
768 | = c | |
769 | <2 æ | |
770 | <1 agl | |
771 | = ae | |
772 | ||
773 | ** test: later rule does not affect earlier extension (3), ICU ticket 10105 | |
774 | @ rules | |
775 | &a = b / c &d = c / e | |
776 | % strength=secondary | |
777 | * compare | |
778 | <1 AC # C is still only tertiary different from the original c | |
779 | = b | |
780 | <1 ade | |
781 | = ac | |
782 | ||
783 | ** test: extension contains tailored character, ICU ticket 10105 | |
784 | @ rules | |
785 | &a=e &b=u/e | |
786 | * compare | |
787 | <1 a | |
788 | = e | |
789 | <1 ba | |
790 | = be | |
791 | = u | |
792 | ||
793 | ** test: add simple mappings for characters with root context | |
794 | @ rules | |
795 | &z=· # middle dot has a prefix mapping in the CLDR root | |
796 | &n=и # и (U+0438) has contractions in the root | |
797 | * compare | |
798 | <1 l | |
799 | <2 l· # root mapping for l|· still works | |
800 | <1 z | |
801 | = · | |
802 | * compare | |
803 | <1 n | |
804 | = и | |
805 | <1 И | |
806 | <1 и\u0306 # root mapping for й=и\u0306 still works | |
807 | = й | |
808 | <3 Й | |
809 | ||
810 | ** test: add context mappings around characters with root context | |
811 | @ rules | |
812 | &z=·h # middle dot has a prefix mapping in the CLDR root | |
813 | &n=ә|и # и (U+0438) has contractions in the root | |
814 | * compare | |
815 | <1 l | |
816 | <2 l· # root mapping for l|· still works | |
817 | <1 z | |
818 | = ·h | |
819 | * compare | |
820 | <1 и | |
821 | <3 И | |
822 | <1 и\u0306 # root mapping for й=и\u0306 still works | |
823 | = й | |
824 | * compare | |
825 | <1 әn | |
826 | = әи | |
827 | <1 әo | |
828 | ||
829 | ** test: many secondary CEs at the top of their range | |
830 | @ rules | |
831 | &[last primary ignorable]<<*\u2801-\u28ff | |
832 | * compare | |
833 | <2 \u0308 | |
834 | <2 \u2801 | |
835 | <2 \u2802 | |
836 | <2 \u2803 | |
837 | <2 \u2804 | |
838 | <2 \u28fd | |
839 | <2 \u28fe | |
840 | <2 \u28ff | |
841 | <1 \x20 | |
842 | ||
843 | ** test: many tertiary CEs at the top of their range | |
844 | @ rules | |
845 | &[last secondary ignorable]<<<*a-z | |
846 | * compare | |
847 | <3 a | |
848 | <3 b | |
849 | <3 c | |
850 | <3 d | |
851 | # e..w | |
852 | <3 x | |
853 | <3 y | |
854 | <3 z | |
855 | <2 \u0308 | |
856 | ||
857 | ** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101 | |
858 | @ rules | |
859 | &a=p|x &b=px &c=op | |
860 | * compare | |
861 | <1 b | |
862 | = px | |
863 | <3 B | |
864 | <1 c | |
865 | = op | |
866 | <3 C | |
867 | * compare | |
868 | <1 ca | |
869 | = opx # first contraction op, then prefix p|x | |
870 | <3 cA | |
871 | <3 Ca | |
872 | ||
873 | ** test: reset position with prefix (pre-context), ICU ticket 10102 | |
874 | @ rules | |
875 | &a=p|x &px=y | |
876 | * compare | |
877 | <1 pa | |
878 | = px | |
879 | = y | |
880 | <3 pA | |
881 | <1 q | |
882 | <1 x | |
883 | ||
884 | ** test: prefix+contraction together (1), ICU ticket 10071 | |
885 | @ rules | |
886 | &x=a|bc | |
887 | * compare | |
888 | <1 ab | |
889 | <1 Abc | |
890 | <1 abd | |
891 | <1 ac | |
892 | <1 aw | |
893 | <1 ax | |
894 | = abc | |
895 | <3 aX | |
896 | <3 Ax | |
897 | <1 b | |
898 | <1 bb | |
899 | <1 bc | |
900 | <3 bC | |
901 | <3 Bc | |
902 | <1 bd | |
903 | ||
904 | ** test: prefix+contraction together (2), ICU ticket 10071 | |
905 | @ rules | |
906 | &w=bc &x=a|b | |
907 | * compare | |
908 | <1 w | |
909 | = bc | |
910 | <3 W | |
911 | * compare | |
912 | <1 aw | |
913 | <1 ax | |
914 | = ab | |
915 | <3 aX | |
916 | <1 axb | |
917 | <1 axc | |
918 | = abc # prefix match a|b takes precedence over contraction match bc | |
919 | <3 abC | |
920 | <1 abd | |
921 | <1 ay | |
922 | ||
923 | ** test: prefix+contraction together (3), ICU ticket 10071 | |
924 | @ rules | |
925 | &x=a|b &w=bc # reverse order of rules as previous test, order should not matter here | |
926 | * compare # same "compare" sequences as previous test | |
927 | <1 w | |
928 | = bc | |
929 | <3 W | |
930 | * compare | |
931 | <1 aw | |
932 | <1 ax | |
933 | = ab | |
934 | <3 aX | |
935 | <1 axb | |
936 | <1 axc | |
937 | = abc # prefix match a|b takes precedence over contraction match bc | |
938 | <3 abC | |
939 | <1 abd | |
940 | <1 ay | |
941 | ||
942 | ** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962 | |
943 | @ rules | |
944 | &d=ch &v=p|ci | |
945 | * compare | |
946 | <1 pc | |
947 | <3 pC | |
948 | <1 pcH | |
949 | <1 pcI | |
950 | <1 pd | |
951 | = pch # no-prefix contraction ch matches | |
952 | <3 pD | |
953 | <1 pv | |
954 | = pci # prefix+contraction p|ci matches | |
955 | <3 pV | |
956 | ||
957 | ** test: tailor in & around compact ranges of root primaries | |
958 | # The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs | |
959 | # which should be reliably encoded as one range in the root elements data. | |
960 | @ rules | |
961 | &[before 1]ᚁ<a | |
962 | &ᚁ<b | |
963 | &[before 1]ᚂ<c | |
964 | &ᚂ<d | |
965 | &[before 1]ᚚ<y | |
966 | &ᚚ<z | |
967 | &[before 2]ᚁ<<r | |
968 | &ᚁ<<s | |
969 | &[before 3]ᚚ<<<t | |
970 | &ᚚ<<<u | |
971 | * compare | |
972 | <1 ᣵ # U+18F5 last Canadian Aboriginal | |
973 | <1 a | |
974 | <1 r | |
975 | <2 ᚁ | |
976 | <2 s | |
977 | <1 b | |
978 | <1 c | |
979 | <1 ᚂ | |
980 | <1 d | |
981 | <1 ᚃ | |
982 | <1 ᚙ | |
983 | <1 y | |
984 | <1 t | |
985 | <3 ᚚ | |
986 | <3 u | |
987 | <1 z | |
988 | <1 ᚠ # U+16A0 first Runic | |
989 | ||
990 | ** test: suppressContractions | |
991 | @ rules | |
992 | &z<ch<әж [suppressContractions [·cә]] | |
993 | * compare | |
994 | <1 ch | |
995 | <3 cH # ch was suppressed | |
996 | <1 l | |
997 | <1 l· # primary difference, not secondary, because l|· was suppressed | |
998 | <1 ә | |
999 | <2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed | |
1000 | <1 әж | |
1001 | <3 әЖ | |
1002 | ||
1003 | ** test: Hangul & Jamo | |
1004 | @ rules | |
1005 | &L=\u1100 # first Jamo L | |
1006 | &V=\u1161 # first Jamo V | |
1007 | &T=\u11A8 # first Jamo T | |
1008 | &\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs | |
1009 | * compare | |
1010 | <1 Lv | |
1011 | <3 LV | |
1012 | = \u1100\u1161 | |
1013 | = \uAC00 | |
1014 | <1 LVt | |
1015 | <3 LVT | |
1016 | = \u1100\u1161\u11A8 | |
1017 | = \uAC00\u11A8 | |
1018 | = \uAC01 | |
1019 | <2 LVT\u0308 | |
1020 | <2 \u4E00 | |
1021 | <2 \u4E01 | |
1022 | <2 \u4E80 | |
1023 | <2 \u4EFF | |
1024 | <2 LV\u0308T | |
1025 | <1 \uAC02 | |
1026 | ||
1027 | ** test: adjust special reset positions according to previous rules, CLDR ticket 6070 | |
1028 | @ rules | |
1029 | &[last variable]<x | |
1030 | [maxVariable space] # has effect only after building, no effect on following rules | |
1031 | &[last variable]<y | |
1032 | &[before 1][first regular]<z | |
1033 | * compare | |
1034 | <1 ? # some punctuation | |
1035 | <1 x | |
1036 | <1 y | |
1037 | <1 z | |
1038 | <1 $ # some symbol | |
1039 | ||
1040 | @ rules | |
1041 | &[last primary ignorable]<<x<<<y | |
1042 | &[last primary ignorable]<<z | |
1043 | * compare | |
1044 | <2 \u0358 | |
1045 | <2 x | |
1046 | <3 y | |
1047 | <2 z | |
1048 | <1 \x20 | |
1049 | ||
1050 | @ rules | |
1051 | &[last secondary ignorable]<<<x | |
1052 | &[last secondary ignorable]<<<y | |
1053 | * compare | |
1054 | <3 x | |
1055 | <3 y | |
1056 | <2 \u0358 | |
1057 | ||
1058 | @ rules | |
1059 | &[before 2][first variable]<<z | |
1060 | &[before 2][first variable]<<y | |
1061 | &[before 3][first variable]<<<x | |
1062 | &[before 3][first variable]<<<w | |
1063 | &[before 1][first variable]<v | |
1064 | &[before 2][first variable]<<u | |
1065 | &[before 3][first variable]<<<t | |
1066 | &[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary | |
1067 | * compare | |
1068 | <2 \u0358 | |
1069 | <1 s | |
1070 | <2 \uFDD1\xA0 | |
1071 | <1 t | |
1072 | <3 u | |
1073 | <2 v | |
1074 | <1 w | |
1075 | <3 x | |
1076 | <3 y | |
1077 | <2 z | |
1078 | <2 \t | |
1079 | ||
1080 | @ rules | |
1081 | &[before 2][first regular]<<z | |
1082 | &[before 3][first regular]<<<y | |
1083 | &[before 1][first regular]<x | |
1084 | &[before 3][first regular]<<<w | |
1085 | &[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary | |
1086 | &[before 3][first regular]<<<u | |
1087 | &[before 1][first regular]<p # primary before the boundary: becomes variable | |
1088 | &[before 3][first regular]<<<t # not affected by p | |
1089 | &[last variable]<q # after p! | |
1090 | * compare | |
1091 | <1 ? | |
1092 | <1 p | |
1093 | <1 q | |
1094 | <1 t | |
1095 | <3 u | |
1096 | <3 v | |
1097 | <1 w | |
1098 | <3 x | |
1099 | <1 y | |
1100 | <3 z | |
1101 | <1 $ | |
1102 | ||
1103 | # check that p & q are indeed variable | |
1104 | % alternate=shifted | |
1105 | * compare | |
1106 | = ? | |
1107 | = p | |
1108 | = q | |
1109 | <1 t | |
1110 | <3 u | |
1111 | <3 v | |
1112 | <1 w | |
1113 | <3 x | |
1114 | <1 y | |
1115 | <3 z | |
1116 | <1 $ | |
1117 | ||
1118 | @ rules | |
1119 | &[before 2][first trailing]<<z | |
1120 | &[before 1][first trailing]<y | |
1121 | &[before 3][first trailing]<<<x | |
1122 | * compare | |
1123 | <1 \u4E00 # first Han, first implicit | |
1124 | <1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary | |
1125 | # Note: The root collator currently does not map any characters to the trailing first boundary primary. | |
1126 | <1 x | |
1127 | <3 y | |
1128 | <1 z | |
1129 | <2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary. | |
1130 | ||
1131 | @ rules | |
1132 | &[before 2][first primary ignorable]<<z | |
1133 | &[before 2][first primary ignorable]<<y | |
1134 | &[before 3][first primary ignorable]<<<x | |
1135 | &[before 3][first primary ignorable]<<<w | |
1136 | * compare | |
1137 | = \x01 | |
1138 | <2 w | |
1139 | <3 x | |
1140 | <3 y | |
1141 | <2 z | |
1142 | <2 \u0301 | |
1143 | ||
1144 | @ rules | |
1145 | &[before 3][first secondary ignorable]<<<y | |
1146 | &[before 3][first secondary ignorable]<<<x | |
1147 | * compare | |
1148 | = \x01 | |
1149 | <3 x | |
1150 | <3 y | |
1151 | <2 \u0301 | |
1152 | ||
1153 | ** test: canonical closure | |
1154 | @ rules | |
1155 | &X=A &U=Â | |
1156 | * compare | |
1157 | <1 U | |
1158 | = Â | |
1159 | = A\u0302 | |
1160 | <2 Ú # U with acute | |
1161 | = U\u0301 | |
1162 | = Ấ # A with circumflex & acute | |
1163 | = Â\u0301 | |
1164 | = A\u0302\u0301 | |
1165 | <1 X | |
1166 | = A | |
1167 | <2 X\u030A # with ring above | |
1168 | = Å | |
1169 | = A\u030A | |
1170 | = \u212B # Angstrom sign | |
1171 | ||
1172 | @ rules | |
1173 | &x=\u5140\u55C0 | |
1174 | * compare | |
1175 | <1 x | |
1176 | = \u5140\u55C0 | |
1177 | = \u5140\uFA0D | |
1178 | = \uFA0C\u55C0 | |
1179 | = \uFA0C\uFA0D # CJK compatibility characters | |
1180 | <3 X | |
1181 | ||
1182 | # canonical closure on prefix rules, ICU ticket 9444 | |
1183 | @ rules | |
1184 | &x=ä|ŝ | |
1185 | * compare | |
1186 | <1 äs # not tailored | |
1187 | <1 äx | |
1188 | = äŝ | |
1189 | = a\u0308s\u0302 | |
1190 | = a\u0308ŝ | |
1191 | = äs\u0302 | |
1192 | <3 äX | |
1193 | ||
1194 | ** test: conjoining Jamo map to expansions | |
1195 | @ rules | |
1196 | &gg=\u1101 # Jamo Lead consonant GG | |
1197 | &nj=\u11AC # Jamo Trail consonant NJ | |
1198 | * compare | |
1199 | <1 gg\u1161nj | |
1200 | = \u1101\u1161\u11AC | |
1201 | = \uAE4C\u11AC | |
1202 | = \uAE51 | |
1203 | <3 gg\u1161nJ | |
1204 | <1 \u1100\u1100 | |
1205 | ||
1206 | ** test: canonical tail closure, ICU ticket 5913 | |
1207 | @ rules | |
1208 | &a<â | |
1209 | * compare | |
1210 | <1 a | |
1211 | <1 â # tailored | |
1212 | = a\u0302 | |
1213 | <2 a\u0323\u0302 # discontiguous contraction | |
1214 | = ạ\u0302 # equivalent | |
1215 | = ậ # equivalent | |
1216 | <1 b | |
1217 | ||
1218 | @ rules | |
1219 | &a<ạ | |
1220 | * compare | |
1221 | <1 a | |
1222 | <1 ạ # tailored | |
1223 | = a\u0323 | |
1224 | <2 a\u0323\u0302 # contiguous contraction plus extra diacritic | |
1225 | = ạ\u0302 # equivalent | |
1226 | = ậ # equivalent | |
1227 | <1 b | |
1228 | ||
1229 | # Tail closure should work even if there is a prefix and/or contraction. | |
1230 | @ rules | |
1231 | &a<\u5140|câ | |
1232 | # In order to find discontiguous contractions for \u5140|câ | |
1233 | # there must exist a mapping for \u5140|ca, regardless of what it maps to. | |
1234 | # (This follows from the UCA spec.) | |
1235 | &x=\u5140|ca | |
1236 | * compare | |
1237 | <1 \u5140a | |
1238 | = \uFA0Ca | |
1239 | <1 \u5140câ # tailored | |
1240 | = \uFA0Ccâ | |
1241 | = \u5140ca\u0302 | |
1242 | = \uFA0Cca\u0302 | |
1243 | <2 \u5140ca\u0323\u0302 # discontiguous contraction | |
1244 | = \uFA0Cca\u0323\u0302 | |
1245 | = \u5140cạ\u0302 | |
1246 | = \uFA0Ccạ\u0302 | |
1247 | = \u5140cậ | |
1248 | = \uFA0Ccậ | |
1249 | <1 \u5140b | |
1250 | = \uFA0Cb | |
1251 | <1 \u5140x | |
1252 | = \u5140ca | |
1253 | ||
1254 | # Double-check that without the extra mapping there will be no discontiguous match. | |
1255 | @ rules | |
1256 | &a<\u5140|câ | |
1257 | * compare | |
1258 | <1 \u5140a | |
1259 | = \uFA0Ca | |
1260 | <1 \u5140câ # tailored | |
1261 | = \uFA0Ccâ | |
1262 | = \u5140ca\u0302 | |
1263 | = \uFA0Cca\u0302 | |
1264 | <1 \u5140b | |
1265 | = \uFA0Cb | |
1266 | <1 \u5140ca\u0323\u0302 # no discontiguous contraction | |
1267 | = \uFA0Cca\u0323\u0302 | |
1268 | = \u5140cạ\u0302 | |
1269 | = \uFA0Ccạ\u0302 | |
1270 | = \u5140cậ | |
1271 | = \uFA0Ccậ | |
1272 | ||
1273 | @ rules | |
1274 | &a<cạ | |
1275 | * compare | |
1276 | <1 a | |
1277 | <1 cạ # tailored | |
1278 | = ca\u0323 | |
1279 | <2 ca\u0323\u0302 # contiguous contraction plus extra diacritic | |
1280 | = cạ\u0302 # equivalent | |
1281 | = cậ # equivalent | |
1282 | <1 b | |
1283 | ||
1284 | # ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI | |
1285 | # = 03C9 0313 0300 0345 | |
1286 | # ccc = 0, 230, 230, 240 | |
1287 | @ rules | |
1288 | &δ=αῳ | |
1289 | # In order to find discontiguous contractions for αῳ | |
1290 | # there must exist a mapping for αω, regardless of what it maps to. | |
1291 | # (This follows from the UCA spec.) | |
1292 | &ε=αω | |
1293 | * compare | |
1294 | <1 δ | |
1295 | = αῳ | |
1296 | = αω\u0345 | |
1297 | <2 αω\u0313\u0300\u0345 # discontiguous contraction | |
1298 | = αὠ\u0300\u0345 | |
1299 | = αὢ\u0345 | |
1300 | = αᾢ | |
1301 | <2 αω\u0300\u0313\u0345 | |
1302 | = αὼ\u0313\u0345 | |
1303 | = αῲ\u0313 # not FCD | |
1304 | <1 ε | |
1305 | = αω | |
1306 | ||
1307 | # Double-check that without the extra mapping there will be no discontiguous match. | |
1308 | @ rules | |
1309 | &δ=αῳ | |
1310 | * compare | |
1311 | <1 αω\u0313\u0300\u0345 # no discontiguous contraction | |
1312 | = αὠ\u0300\u0345 | |
1313 | = αὢ\u0345 | |
1314 | = αᾢ | |
1315 | <2 αω\u0300\u0313\u0345 | |
1316 | = αὼ\u0313\u0345 | |
1317 | = αῲ\u0313 # not FCD | |
1318 | <1 δ | |
1319 | = αῳ | |
1320 | = αω\u0345 | |
1321 | ||
1322 | # Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232. | |
1323 | # Tests code paths where the tailored string has a combining mark | |
1324 | # that does not occur in any composite's decomposition. | |
1325 | @ rules | |
1326 | &δ=αὼ\u0315 | |
1327 | * compare | |
1328 | <1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above. | |
1329 | = αὠ\u0300\u0315 | |
1330 | = αὢ\u0315 | |
1331 | <1 δ | |
1332 | = αὼ\u0315 | |
1333 | = αω\u0300\u0315 | |
1334 | <2 αω\u0300\u0315\u0345 | |
1335 | = αὼ\u0315\u0345 | |
1336 | = αῲ\u0315 # not FCD | |
1337 | ||
1338 | ** test: danish a+a vs. a-umlaut, ICU ticket 9319 | |
1339 | @ rules | |
1340 | &z<aa | |
1341 | * compare | |
1342 | <1 z | |
1343 | <1 aa | |
1344 | <2 aa\u0308 | |
1345 | = aä | |
1346 | ||
1347 | ** test: Jamo L with and in prefix | |
1348 | # Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L). | |
1349 | @ rules | |
1350 | # Jamo Lead consonant G after G or GG | |
1351 | &[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100 | |
1352 | # Jamo Lead consonant GG sorts like G+G | |
1353 | &\u1100\u1100=\u1101 | |
1354 | # Note: Making G|GG and GG|GG sort the same as G|G+G | |
1355 | # would require the ability to reset on G|G+G, | |
1356 | # or we could make G-after-G equal to some secondary-CE character, | |
1357 | # and reset on a pair of those. | |
1358 | # (It does not matter much if there are at most two G in a row in real text.) | |
1359 | * compare | |
1360 | <1 \u1100 | |
1361 | <2 \u1100\u1100 # only one primary from a sequence of G lead consonants | |
1362 | = \u1101 | |
1363 | <2 \u1100\u1100\u1100 | |
1364 | = \u1101\u1100 | |
1365 | # but not = \u1100\u1101, see above | |
1366 | <1 \u1100\u1161 | |
1367 | = \uAC00 | |
1368 | <2 \u1100\u1100\u1161 | |
1369 | = \u1100\uAC00 # prefix match from the L of the LV syllable | |
1370 | = \u1101\u1161 | |
1371 | = \uAE4C | |
1372 | ||
1373 | ** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546 | |
1374 | @ rules | |
1375 | # Low secondary CEs for Jamo V & T. | |
1376 | # Note: T should sort before V for proper syllable order. | |
1377 | &\u0332 # COMBINING LOW LINE (first primary ignorable) | |
1378 | <<\u1161<<\u1162 | |
1379 | ||
1380 | # Korean Jamo lead consonant search rules, part 2: | |
1381 | # Make modern compound L jamo primary equivalent to non-compound forms. | |
1382 | ||
1383 | # Secondary CEs for Jamo L-after-L, greater than Jamo V & T. | |
1384 | &\u0313 # COMBINING COMMA ABOVE (second primary ignorable) | |
1385 | =\u1100|\u1100 | |
1386 | =\u1103|\u1103 | |
1387 | =\u1107|\u1107 | |
1388 | =\u1109|\u1109 | |
1389 | =\u110C|\u110C | |
1390 | ||
1391 | # Compound L Jamo map to equivalent expansions of primary+secondary CE. | |
1392 | &\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK | |
1393 | &\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT | |
1394 | &\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP | |
1395 | &\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS | |
1396 | &\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC | |
1397 | ||
1398 | * compare | |
1399 | <1 \u1100\u1161 | |
1400 | = \uAC00 | |
1401 | <2 \u1100\u1162 | |
1402 | = \uAC1C | |
1403 | <2 \u1100\u1100\u1161 | |
1404 | = \u1100\uAC00 | |
1405 | = \u1101\u1161 | |
1406 | = \uAE4C | |
1407 | <3 \u3132\u1161 | |
1408 | ||
1409 | ** test: Hangul syllables in prefix & in the interior of a contraction | |
1410 | @ rules | |
1411 | &x=\u1100\u1161|a\u1102\u1162z | |
1412 | * compare | |
1413 | <1 \u1100\u1161x | |
1414 | = \u1100\u1161a\u1102\u1162z | |
1415 | = \u1100\u1161a\uB0B4z | |
1416 | = \uAC00a\u1102\u1162z | |
1417 | = \uAC00a\uB0B4z | |
1418 | ||
1419 | ** test: digits are unsafe-backwards when numeric=on | |
1420 | @ root | |
1421 | % numeric=on | |
1422 | * compare | |
1423 | # If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a". | |
1424 | # We need to back up before the identical prefix "1" and compare the full numbers. | |
1425 | <1 11b | |
1426 | <1 101a | |
1427 | ||
1428 | ** test: simple locale data test | |
1429 | @ locale de | |
1430 | * compare | |
1431 | <1 a | |
1432 | <2 ä | |
1433 | <1 ae | |
1434 | <2 æ | |
1435 | ||
1436 | @ locale de-u-co-phonebk | |
1437 | * compare | |
1438 | <1 a | |
1439 | <1 ae | |
1440 | <2 ä | |
1441 | <2 æ | |
1442 | ||
1443 | # The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt. | |
1444 | ||
1445 | ** test: DataDrivenCollationTest/TestMorePinyin | |
1446 | # Testing the primary strength. | |
1447 | @ locale zh | |
1448 | % strength=primary | |
1449 | * compare | |
1450 | < lā | |
1451 | = lĀ | |
1452 | = Lā | |
1453 | = LĀ | |
1454 | < lān | |
1455 | = lĀn | |
1456 | < lē | |
1457 | = lĒ | |
1458 | = Lē | |
1459 | = LĒ | |
1460 | < lēn | |
1461 | = lĒn | |
1462 | ||
1463 | ** test: DataDrivenCollationTest/TestLithuanian | |
1464 | # Lithuanian sort order. | |
1465 | @ locale lt | |
1466 | * compare | |
1467 | < cz | |
1468 | < č | |
1469 | < d | |
1470 | < iz | |
1471 | < j | |
1472 | < sz | |
1473 | < š | |
1474 | < t | |
1475 | < zz | |
1476 | < ž | |
1477 | ||
1478 | ** test: DataDrivenCollationTest/TestLatvian | |
1479 | # Latvian sort order. | |
1480 | @ locale lv | |
1481 | * compare | |
1482 | < cz | |
1483 | < č | |
1484 | < d | |
1485 | < gz | |
1486 | < ģ | |
1487 | < h | |
1488 | < iz | |
1489 | < j | |
1490 | < kz | |
1491 | < ķ | |
1492 | < l | |
1493 | < lz | |
1494 | < ļ | |
1495 | < m | |
1496 | < nz | |
1497 | < ņ | |
1498 | < o | |
1499 | < rz | |
1500 | < ŗ | |
1501 | < s | |
1502 | < sz | |
1503 | < š | |
1504 | < t | |
1505 | < zz | |
1506 | < ž | |
1507 | ||
1508 | ** test: DataDrivenCollationTest/TestEstonian | |
1509 | # Estonian sort order. | |
1510 | @ locale et | |
1511 | * compare | |
1512 | < sy | |
1513 | < š | |
1514 | < šy | |
1515 | < z | |
1516 | < zy | |
1517 | < ž | |
1518 | < v | |
57a6839d | 1519 | < va |
b331163b | 1520 | < w |
57a6839d A |
1521 | < õ |
1522 | < õy | |
1523 | < ä | |
1524 | < äy | |
1525 | < ö | |
1526 | < öy | |
1527 | < ü | |
1528 | < üy | |
1529 | < x | |
1530 | ||
1531 | ** test: DataDrivenCollationTest/TestAlbanian | |
1532 | # Albanian sort order. | |
1533 | @ locale sq | |
1534 | * compare | |
1535 | < cz | |
1536 | < ç | |
1537 | < d | |
1538 | < dz | |
1539 | < dh | |
1540 | < e | |
1541 | < ez | |
1542 | < ë | |
1543 | < f | |
1544 | < gz | |
1545 | < gj | |
1546 | < h | |
1547 | < lz | |
1548 | < ll | |
1549 | < m | |
1550 | < nz | |
1551 | < nj | |
1552 | < o | |
1553 | < rz | |
1554 | < rr | |
1555 | < s | |
1556 | < sz | |
1557 | < sh | |
1558 | < t | |
1559 | < tz | |
1560 | < th | |
1561 | < u | |
1562 | < xz | |
1563 | < xh | |
1564 | < y | |
1565 | < zz | |
1566 | < zh | |
1567 | ||
1568 | ** test: DataDrivenCollationTest/TestSimplifiedChineseOrder | |
1569 | # Sorted file has different order. | |
1570 | @ root | |
1571 | # normalization=on turned on & off automatically. | |
1572 | * compare | |
1573 | < \u5F20 | |
1574 | < \u5F20\u4E00\u8E3F | |
1575 | ||
1576 | ** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash | |
1577 | # This pretty much crashes. | |
1578 | @ root | |
1579 | * compare | |
1580 | < \u0f71\u0f72\u0f80\u0f71\u0f72 | |
1581 | < \u0f80 | |
1582 | ||
1583 | ** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems | |
1584 | # These are examples of strings that caused trouble in partial sort key testing. | |
1585 | @ locale th-TH | |
1586 | * compare | |
1587 | < \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C | |
1588 | < \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18 | |
1589 | * compare | |
1590 | < \u0E01\u0E07\u0E01\u0E32\u0E23 | |
1591 | < \u0E01\u0E07\u0E42\u0E01\u0E49 | |
1592 | * compare | |
1593 | < \u0E01\u0E23\u0E19\u0E17\u0E32 | |
1594 | < \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32 | |
1595 | * compare | |
1596 | < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27 | |
1597 | < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27 | |
1598 | * compare | |
1599 | < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D | |
1600 | < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32 | |
1601 | ||
1602 | ** test: DataDrivenCollationTest/TestJavaStyleRule | |
1603 | # java.text allows rules to start as '<<<x<<<y...' | |
1604 | # we emulate this by assuming a &[first tertiary ignorable] in this case. | |
1605 | @ rules | |
1606 | &\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b | |
1607 | * compare | |
1608 | = a | |
1609 | = equal | |
1610 | < z | |
1611 | < x | |
1612 | = b # x had become the new first primary ignorable | |
1613 | < w | |
1614 | ||
1615 | ** test: DataDrivenCollationTest/TestShiftedIgnorable | |
1616 | # The UCA states that primary ignorables should be completely | |
1617 | # ignorable when following a shifted code point. | |
1618 | @ root | |
1619 | % alternate=shifted | |
1620 | % strength=quaternary | |
1621 | * compare | |
1622 | < a\u0020b | |
1623 | = a\u0020\u0300b | |
1624 | = a\u0020\u0301b | |
1625 | < a_b | |
1626 | = a_\u0300b | |
1627 | = a_\u0301b | |
1628 | < A\u0020b | |
1629 | = A\u0020\u0300b | |
1630 | = A\u0020\u0301b | |
1631 | < A_b | |
1632 | = A_\u0300b | |
1633 | = A_\u0301b | |
1634 | < a\u0301b | |
1635 | < A\u0301b | |
1636 | < a\u0300b | |
1637 | < A\u0300b | |
1638 | ||
1639 | ** test: DataDrivenCollationTest/TestNShiftedIgnorable | |
1640 | # The UCA states that primary ignorables should be completely ignorable when following | |
1641 | # a shifted code point. With alternate=non-ignorable nothing is shifted, so they remain significant here. | |
1642 | @ root | |
1643 | % alternate=non-ignorable | |
1644 | % strength=tertiary | |
1645 | * compare | |
1646 | < a\u0020b | |
1647 | < A\u0020b | |
1648 | < a\u0020\u0301b | |
1649 | < A\u0020\u0301b | |
1650 | < a\u0020\u0300b | |
1651 | < A\u0020\u0300b | |
1652 | < a_b | |
1653 | < A_b | |
1654 | < a_\u0301b | |
1655 | < A_\u0301b | |
1656 | < a_\u0300b | |
1657 | < A_\u0300b | |
1658 | < a\u0301b | |
1659 | < A\u0301b | |
1660 | < a\u0300b | |
1661 | < A\u0300b | |
1662 | ||
1663 | ** test: DataDrivenCollationTest/TestSafeSurrogates | |
1664 | # It turned out that surrogates were not skipped properly | |
1665 | # when iterating backwards if they were in the middle of a | |
1666 | # contraction. This test verifies that this is fixed. | |
1667 | @ rules | |
1668 | &a < x\ud800\udc00b | |
1669 | * compare | |
1670 | < a | |
1671 | < x\ud800\udc00b | |
1672 | ||
1673 | ** test: DataDrivenCollationTest/da_TestPrimary | |
1674 | # This test goes through primary strength cases | |
1675 | @ locale da | |
1676 | % strength=primary | |
1677 | * compare | |
1678 | < Lvi | |
1679 | < Lwi | |
1680 | * compare | |
1681 | < L\u00e4vi | |
1682 | < L\u00f6wi | |
1683 | * compare | |
1684 | < L\u00fcbeck | |
1685 | = Lybeck | |
1686 | ||
1687 | ** test: DataDrivenCollationTest/da_TestTertiary | |
1688 | # This test goes through tertiary strength cases | |
1689 | @ locale da | |
1690 | % strength=tertiary | |
1691 | * compare | |
1692 | < Luc | |
1693 | < luck | |
1694 | * compare | |
1695 | < luck | |
1696 | < L\u00fcbeck | |
1697 | * compare | |
1698 | < lybeck | |
1699 | < L\u00fcbeck | |
1700 | * compare | |
1701 | < L\u00e4vi | |
1702 | < L\u00f6we | |
1703 | * compare | |
1704 | < L\u00f6ww | |
1705 | < mast | |
1706 | ||
1707 | * compare | |
1708 | < A/S | |
1709 | < ANDRE | |
1710 | < ANDR\u00c9 | |
1711 | < ANDREAS | |
1712 | < AS | |
1713 | < CA | |
1714 | < \u00c7A | |
1715 | < CB | |
1716 | < \u00c7C | |
1717 | < D.S.B. | |
1718 | < DA | |
1719 | < \u00d0A | |
1720 | < DB | |
1721 | < \u00d0C | |
1722 | < DSB | |
1723 | < DSC | |
1724 | < EKSTRA_ARBEJDE | |
1725 | < EKSTRABUD0 | |
1726 | < H\u00d8ST | |
1727 | < HAAG | |
1728 | < H\u00c5NDBOG | |
1729 | < HAANDV\u00c6RKSBANKEN | |
1730 | < Karl | |
1731 | < karl | |
1732 | < NIELS\u0020J\u00d8RGEN | |
1733 | < NIELS-J\u00d8RGEN | |
1734 | < NIELSEN | |
1735 | < R\u00c9E,\u0020A | |
1736 | < REE,\u0020B | |
1737 | < R\u00c9E,\u0020L | |
1738 | < REE,\u0020V | |
1739 | < SCHYTT,\u0020B | |
1740 | < SCHYTT,\u0020H | |
1741 | < SCH\u00dcTT,\u0020H | |
1742 | < SCHYTT,\u0020L | |
1743 | < SCH\u00dcTT,\u0020M | |
1744 | < SS | |
1745 | < \u00df | |
1746 | < SSA | |
1747 | < STORE\u0020VILDMOSE | |
1748 | < STOREK\u00c6R0 | |
1749 | < STORM\u0020PETERSEN | |
1750 | < STORMLY | |
1751 | < THORVALD | |
1752 | < THORVARDUR | |
1753 | < \u00feORVAR\u00d0UR | |
1754 | < THYGESEN | |
1755 | < VESTERG\u00c5RD,\u0020A | |
1756 | < VESTERGAARD,\u0020A | |
1757 | < VESTERG\u00c5RD,\u0020B | |
1758 | < \u00c6BLE | |
1759 | < \u00c4BLE | |
1760 | < \u00d8BERG | |
1761 | < \u00d6BERG | |
1762 | ||
1763 | * compare | |
1764 | < andere | |
1765 | < chaque | |
1766 | < chemin | |
1767 | < cote | |
1768 | < cot\u00e9 | |
1769 | < c\u00f4te | |
1770 | < c\u00f4t\u00e9 | |
1771 | < \u010du\u010d\u0113t | |
1772 | < Czech | |
1773 | < hi\u0161a | |
1774 | < irdisch | |
1775 | < lie | |
1776 | < lire | |
1777 | < llama | |
1778 | < l\u00f5ug | |
1779 | < l\u00f2za | |
1780 | < lu\u010d | |
1781 | < luck | |
1782 | < L\u00fcbeck | |
1783 | < lye | |
1784 | < l\u00e4vi | |
1785 | < L\u00f6wen | |
1786 | < m\u00e0\u0161ta | |
1787 | < m\u00eer | |
1788 | < myndig | |
1789 | < M\u00e4nner | |
1790 | < m\u00f6chten | |
1791 | < pi\u00f1a | |
1792 | < pint | |
1793 | < pylon | |
1794 | < \u0161\u00e0ran | |
1795 | < savoir | |
1796 | < \u0160erb\u016bra | |
1797 | < Sietla | |
1798 | < \u015blub | |
1799 | < subtle | |
1800 | < symbol | |
1801 | < s\u00e4mtlich | |
1802 | < verkehrt | |
1803 | < vox | |
1804 | < v\u00e4ga | |
1805 | < waffle | |
1806 | < wood | |
1807 | < yen | |
1808 | < yuan | |
1809 | < yucca | |
1810 | < \u017eal | |
1811 | < \u017eena | |
1812 | < \u017den\u0113va | |
1813 | < zoo0 | |
1814 | < Zviedrija | |
1815 | < Z\u00fcrich | |
1816 | < zysk0 | |
1817 | < \u00e4ndere | |
1818 | ||
1819 | ** test: DataDrivenCollationTest/hi_TestNewRules | |
1820 | # This test goes through new rules and tests against old rules | |
1821 | @ locale hi | |
1822 | * compare | |
1823 | < कॐ | |
1824 | < कं | |
1825 | < कँ | |
1826 | < कः | |
1827 | ||
1828 | ** test: DataDrivenCollationTest/ro_TestNewRules | |
1829 | # This test goes through new rules and tests against old rules | |
1830 | @ locale ro | |
1831 | * compare | |
1832 | < xAx | |
1833 | < xă | |
1834 | < xĂ | |
1835 | < Xă | |
1836 | < XĂ | |
1837 | < xăx | |
1838 | < xĂx | |
1839 | < xâ | |
1840 | < x | |
1841 | < Xâ | |
1842 | < XÂ | |
1843 | < xâx | |
1844 | < xÂx | |
1845 | < xb | |
1846 | < xIx | |
1847 | < xî | |
1848 | < xÎ | |
1849 | < Xî | |
1850 | < XÎ | |
1851 | < xîx | |
1852 | < xÎx | |
1853 | < xj | |
1854 | < xSx | |
1855 | < xș | |
1856 | = xş | |
1857 | < xȘ | |
1858 | = xŞ | |
1859 | < Xș | |
1860 | = Xş | |
1861 | < XȘ | |
1862 | = XŞ | |
1863 | < xșx | |
1864 | = xşx | |
1865 | < xȘx | |
1866 | = xŞx | |
1867 | < xT | |
1868 | < xTx | |
1869 | < xț | |
1870 | = xţ | |
1871 | < xȚ | |
1872 | = xŢ | |
1873 | < Xț | |
1874 | = Xţ | |
1875 | < XȚ | |
1876 | = XŢ | |
1877 | < xțx | |
1878 | = xţx | |
1879 | < xȚx | |
1880 | = xŢx | |
1881 | < xU | |
1882 | ||
1883 | ** test: DataDrivenCollationTest/testOffsets | |
1884 | # This tests cases where forwards and backwards iteration get different offsets | |
1885 | @ locale en | |
1886 | % strength=tertiary | |
1887 | * compare | |
1888 | < a\uD800\uDC00\uDC00 | |
1889 | < b\uD800\uDC00\uDC00 | |
1890 | * compare | |
1891 | < \u0301A\u0301\u0301 | |
1892 | < \u0301B\u0301\u0301 | |
1893 | * compare | |
1894 | < abcd\r\u0301 | |
1895 | < abce\r\u0301 | |
1896 | # TODO: test offsets in new CollationTest | |
1897 | ||
1898 | # End of test cases moved here from ICU 52's DataDrivenCollationTest.txt. | |
1899 | ||
1900 | ** test: was ICU 52 cmsccoll/TestRedundantRules | |
1901 | @ rules | |
1902 | & a < b < c < d& [before 1] c < m | |
1903 | * compare | |
1904 | <1 a | |
1905 | <1 b | |
1906 | <1 m | |
1907 | <1 c | |
1908 | <1 d | |
1909 | ||
1910 | @ rules | |
1911 | & a < b <<< c << d <<< e& [before 3] e <<< x | |
1912 | * compare | |
1913 | <1 a | |
1914 | <1 b | |
1915 | <3 c | |
1916 | <2 d | |
1917 | <3 x | |
1918 | <3 e | |
1919 | ||
1920 | @ rules | |
1921 | & a < b <<< c << d <<< e <<< f < g& [before 1] g < x | |
1922 | * compare | |
1923 | <1 a | |
1924 | <1 b | |
1925 | <3 c | |
1926 | <2 d | |
1927 | <3 e | |
1928 | <3 f | |
1929 | <1 x | |
1930 | <1 g | |
1931 | ||
1932 | @ rules | |
1933 | & a <<< b << c < d& a < m | |
1934 | * compare | |
1935 | <1 a | |
1936 | <3 b | |
1937 | <2 c | |
1938 | <1 m | |
1939 | <1 d | |
1940 | ||
1941 | @ rules | |
1942 | &a<b<<b\u0301 &z<b | |
1943 | * compare | |
1944 | <1 a | |
1945 | <1 b\u0301 | |
1946 | <1 z | |
1947 | <1 b | |
1948 | ||
1949 | @ rules | |
1950 | &z<m<<<q<<<m | |
1951 | * compare | |
1952 | <1 z | |
1953 | <1 q | |
1954 | <3 m | |
1955 | ||
1956 | @ rules | |
1957 | &z<<<m<q<<<m | |
1958 | * compare | |
1959 | <1 z | |
1960 | <1 q | |
1961 | <3 m | |
1962 | ||
1963 | @ rules | |
1964 | & a < b < c < d& r < c | |
1965 | * compare | |
1966 | <1 a | |
1967 | <1 b | |
1968 | <1 d | |
1969 | <1 r | |
1970 | <1 c | |
1971 | ||
1972 | @ rules | |
1973 | & a < b < c < d& c < m | |
1974 | * compare | |
1975 | <1 a | |
1976 | <1 b | |
1977 | <1 c | |
1978 | <1 m | |
1979 | <1 d | |
1980 | ||
1981 | @ rules | |
1982 | & a < b < c < d& a < m | |
1983 | * compare | |
1984 | <1 a | |
1985 | <1 m | |
1986 | <1 b | |
1987 | <1 c | |
1988 | <1 d | |
1989 | ||
1990 | ** test: was ICU 52 cmsccoll/TestExpansionSyntax | |
1991 | # The following two rules should sort the particular list of strings the same. | |
1992 | @ rules | |
1993 | &AE <<< a << b <<< c &d <<< f | |
1994 | * compare | |
1995 | <1 AE | |
1996 | <3 a | |
1997 | <2 b | |
1998 | <3 c | |
1999 | <1 d | |
2000 | <3 f | |
2001 | ||
2002 | @ rules | |
2003 | &A <<< a / E << b / E <<< c /E &d <<< f | |
2004 | * compare | |
2005 | <1 AE | |
2006 | <3 a | |
2007 | <2 b | |
2008 | <3 c | |
2009 | <1 d | |
2010 | <3 f | |
2011 | ||
2012 | # The following two rules should sort the particular list of strings the same. | |
2013 | @ rules | |
2014 | &AE <<< a <<< b << c << d < e < f <<< g | |
2015 | * compare | |
2016 | <1 AE | |
2017 | <3 a | |
2018 | <3 b | |
2019 | <2 c | |
2020 | <2 d | |
2021 | <1 e | |
2022 | <1 f | |
2023 | <3 g | |
2024 | ||
2025 | @ rules | |
2026 | &A <<< a / E <<< b / E << c / E << d / E < e < f <<< g | |
2027 | * compare | |
2028 | <1 AE | |
2029 | <3 a | |
2030 | <3 b | |
2031 | <2 c | |
2032 | <2 d | |
2033 | <1 e | |
2034 | <1 f | |
2035 | <3 g | |
2036 | ||
2037 | # The following two rules should sort the particular list of strings the same. | |
2038 | @ rules | |
2039 | &AE <<< B <<< C / D <<< F | |
2040 | * compare | |
2041 | <1 AE | |
2042 | <3 B | |
2043 | <3 F | |
2044 | <1 AED | |
2045 | <3 C | |
2046 | ||
2047 | @ rules | |
2048 | &A <<< B / E <<< C / ED <<< F / E | |
2049 | * compare | |
2050 | <1 AE | |
2051 | <3 B | |
2052 | <3 F | |
2053 | <1 AED | |
2054 | <3 C | |
2055 | ||
2056 | ** test: never reorder trailing primaries | |
2057 | @ root | |
2058 | % reorder Zzzz Grek | |
2059 | * compare | |
2060 | <1 L | |
2061 | <1 字 | |
2062 | <1 Ω | |
2063 | <1 \uFFFD | |
2064 | <1 \uFFFF | |
2065 | ||
2066 | ** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes | |
2067 | @ rules | |
2068 | &u=ab|cd | |
2069 | &v=b|ce | |
2070 | * compare | |
2071 | <1 abc | |
2072 | <1 abcc | |
2073 | <1 abcf | |
2074 | <1 abcd | |
2075 | = abu | |
2076 | <1 abce | |
2077 | = abv | |
2078 | ||
2079 | # With the following rules, there is only one prefix per composite ĉ or ç, | |
2080 | # but both prefixes apply to just c in NFD form. | |
2081 | # We would get different results for composed vs. NFD input | |
2082 | # if we fell back directly from longest-prefix mappings to no-prefix mappings. | |
2083 | @ rules | |
2084 | &x=op|ĉ | |
2085 | &y=p|ç | |
2086 | * compare | |
2087 | <1 opc | |
2088 | <2 opć | |
2089 | <1 opcz | |
2090 | <1 opd | |
2091 | <1 opĉ | |
2092 | = opc\u0302 | |
2093 | = opx | |
2094 | <1 opç | |
2095 | = opc\u0327 | |
2096 | = opy | |
2097 | ||
2098 | # The mapping is used which has the longest matching prefix for which | |
2099 | # there is also a suffix match, with the longest suffix match among several for that prefix. | |
2100 | @ rules | |
2101 | &❶=d | |
2102 | &❷=de | |
2103 | &❸=def | |
2104 | &①=c|d | |
2105 | &②=c|de | |
2106 | &③=c|def | |
2107 | &④=bc|d | |
2108 | &⑤=bc|de | |
2109 | &⑥=bc|def | |
2110 | &⑦=abc|d | |
2111 | &⑧=abc|de | |
2112 | &⑨=abc|def | |
2113 | * compare | |
2114 | <1 9aadzz | |
2115 | = 9aa❶zz | |
2116 | <1 9aadez | |
2117 | = 9aa❷z | |
2118 | <1 9aadef | |
2119 | = 9aa❸ | |
2120 | <1 9acdzz | |
2121 | = 9ac①zz | |
2122 | <1 9acdez | |
2123 | = 9ac②z | |
2124 | <1 9acdef | |
2125 | = 9ac③ | |
2126 | <1 9bcdzz | |
2127 | = 9bc④zz | |
2128 | <1 9bcdez | |
2129 | = 9bc⑤z | |
2130 | <1 9bcdef | |
2131 | = 9bc⑥ | |
2132 | <1 abcdzz | |
2133 | = abc⑦zz | |
2134 | <1 abcdez | |
2135 | = abc⑧z | |
2136 | <1 abcdef | |
2137 | = abc⑨ | |
2138 | ||
2139 | ** test: prefix + discontiguous contraction with missing prefix contraction | |
2140 | # Unfortunate terminology: The first "prefix" here is the pre-context, | |
2141 | # the second "prefix" refers to the contraction/relation string that is | |
2142 | # one shorter than the one being tested. | |
2143 | @ rules | |
2144 | &x=p|e | |
2145 | &y=p|ê | |
2146 | &z=op|ê | |
2147 | # No mapping for op|e: | |
2148 | # Discontiguous contraction matching should not match op|ê in opệ | |
2149 | # because it would have to skip the dot below and extend a match on op|e by the circumflex, | |
2150 | # but there is no match on op|e. | |
2151 | * compare | |
2152 | <1 oPe | |
2153 | <1 ope | |
2154 | = opx | |
2155 | <1 opệ | |
2156 | = opy\u0323 # y not z | |
2157 | <1 opê | |
2158 | = opz | |
2159 | ||
2160 | # We cannot test for fallback by whether the contraction default CE32 | |
2161 | # is for another contraction. With the following rules, there is no mapping for op|e, | |
2162 | # and the fallback to prefix p has no contractions. | |
2163 | @ rules | |
2164 | &x=p|e | |
2165 | &z=op|ê | |
2166 | * compare | |
2167 | <1 oPe | |
2168 | <1 ope | |
2169 | = opx | |
2170 | <2 opệ | |
2171 | = opx\u0323\u0302 # x not z | |
2172 | <1 opê | |
2173 | = opz | |
2174 | ||
2175 | # One more variation: Fallback to the simple code point, no shorter non-empty prefix. | |
2176 | @ rules | |
2177 | &x=e | |
2178 | &z=op|ê | |
2179 | * compare | |
2180 | <1 ope | |
2181 | = opx | |
2182 | <3 oPe | |
2183 | = oPx | |
2184 | <2 opệ | |
2185 | = opx\u0323\u0302 # x not z | |
2186 | <1 opê | |
2187 | = opz | |
2188 | ||
2189 | ** test: maxVariable via rules | |
2190 | @ rules | |
2191 | [maxVariable space][alternate shifted] | |
2192 | * compare | |
2193 | = \u0020 | |
2194 | = \u000A | |
2195 | <1 . | |
2196 | <1 ° # degree sign | |
2197 | <1 $ | |
2198 | <1 0 | |
2199 | ||
2200 | ** test: maxVariable via setting | |
2201 | @ root | |
2202 | % maxVariable=currency | |
2203 | % alternate=shifted | |
2204 | * compare | |
2205 | = \u0020 | |
2206 | = \u000A | |
2207 | = . | |
2208 | = ° # degree sign | |
2209 | = $ | |
2210 | <1 0 | |
2211 | ||
2212 | ** test: ICU4J CollationMiscTest/TestContractionClosure (ää) | |
2213 | # This tests canonical closure, but it also tests that CollationFastLatin | |
2214 | # bails out properly for contractions with combining marks. | |
2215 | # For that we need pairs of strings that remain in the Latin fastpath | |
2216 | # long enough, hence the extra "= b" lines. | |
2217 | @ rules | |
2218 | &b=\u00e4\u00e4 | |
2219 | * compare | |
2220 | <1 b | |
2221 | = \u00e4\u00e4 | |
2222 | = b | |
2223 | = a\u0308a\u0308 | |
2224 | = b | |
2225 | = \u00e4a\u0308 | |
2226 | = b | |
2227 | = a\u0308\u00e4 | |
2228 | ||
2229 | ** test: ICU4J CollationMiscTest/TestContractionClosure (Å) | |
2230 | @ rules | |
2231 | &b=\u00C5 | |
2232 | * compare | |
2233 | <1 b | |
2234 | = \u00C5 | |
2235 | = b | |
2236 | = A\u030A | |
2237 | = b | |
2238 | = \u212B | |
2239 | ||
2240 | ** test: reset-before on already-tailored characters, ICU ticket 10108 | |
2241 | @ rules | |
2242 | &a<w<<x &[before 2]x<<y | |
2243 | * compare | |
2244 | <1 a | |
2245 | <1 w | |
2246 | <2 y | |
2247 | <2 x | |
2248 | ||
2249 | @ rules | |
2250 | &a<<w<<<x &[before 2]x<<y | |
2251 | * compare | |
2252 | <1 a | |
2253 | <2 y | |
2254 | <2 w | |
2255 | <3 x | |
2256 | ||
2257 | @ rules | |
2258 | &a<w<x &[before 2]x<<y | |
2259 | * compare | |
2260 | <1 a | |
2261 | <1 w | |
2262 | <1 y | |
2263 | <2 x | |
2264 | ||
2265 | @ rules | |
2266 | &a<w<<<x &[before 2]x<<y | |
2267 | * compare | |
2268 | <1 a | |
2269 | <1 y | |
2270 | <2 w | |
2271 | <3 x | |
2272 | ||
2273 | ** test: numeric collation with other settings, ICU ticket 9092 | |
2274 | @ root | |
2275 | % strength=identical | |
2276 | % caseFirst=upper | |
2277 | % numeric=on | |
2278 | * compare | |
2279 | <1 100\u0020a | |
2280 | <1 101 | |
2281 | ||
2282 | ** test: collation type fallback from unsupported type, ICU ticket 10149 | |
2283 | @ locale fr-CA-u-co-phonebk | |
2284 | # Expect the same result as with fr-CA, using backwards-secondary order. | |
2285 | # That is, we should fall back from the unsupported collation type | |
2286 | # to the locale's default collation type. | |
2287 | * compare | |
2288 | <1 cote | |
2289 | <2 côte | |
2290 | <2 coté | |
2291 | <2 côté | |
2292 | ||
2293 | ** test: @ is equivalent to [backwards 2], ICU ticket 9956 | |
2294 | @ rules | |
2295 | &b<a @ &v<<w | |
2296 | * compare | |
2297 | <1 b | |
2298 | <1 a | |
2299 | <1 cote | |
2300 | <2 côte | |
2301 | <2 coté | |
2302 | <2 côté | |
2303 | <1 v | |
2304 | <2 w | |
2305 | <1 x | |
2306 | ||
2307 | ** test: shifted+reordering, ICU ticket 9507 | |
2308 | @ root | |
2309 | % reorder Grek punct space | |
2310 | % alternate=shifted | |
2311 | % strength=quaternary | |
2312 | # Which primaries are "variable" should be determined without script reordering, | |
2313 | # and then primaries should be reordered whether they are shifted to quaternary or not. | |
2314 | * compare | |
2315 | <4 ( # punctuation | |
2316 | <4 ) | |
2317 | <4 \u0020 # space | |
2318 | <1 ` # symbol | |
2319 | <1 ^ | |
2320 | <1 $ # currency symbol | |
2321 | <1 € | |
2322 | <1 0 # numbers | |
2323 | <1 ε # Greek | |
2324 | <1 e # Latin | |
2325 | <1 e(e | |
2326 | <4 e)e | |
2327 | <4 e\u0020e | |
2328 | <4 ee | |
2329 | <3 e(E | |
2330 | <4 e)E | |
2331 | <4 e\u0020E | |
2332 | <4 eE | |
2333 | ||
2334 | ** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351 | |
2335 | @ rules | |
2336 | &\u0001<<<b<<<B | |
2337 | % caseFirst=upper | |
2338 | * compare | |
2339 | <1 aaa | |
2340 | <3 aaaB | |
2341 | ||
2342 | ** test: secondary+case ignores secondary ignorables, ICU ticket 9355 | |
2343 | @ rules | |
2344 | &\u0001<<<b<<<B | |
2345 | % strength=secondary | |
2346 | % caseLevel=on | |
2347 | * compare | |
2348 | <1 a | |
2349 | = ab | |
2350 | = aB | |
2351 | ||
2352 | ** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328 | |
2353 | @ rules | |
2354 | &[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57 | |
2355 | * compare | |
2356 | <1 ൗx | |
2357 | <2 ൌx | |
2358 | <1 ൗy | |
2359 | <2 ൌy | |
2360 | ||
2361 | ** test: quoted apostrophe in compact syntax, ICU ticket 8204 | |
2362 | @ rules | |
2363 | &q<<*a''c | |
2364 | * compare | |
2365 | <1 d | |
2366 | <1 p | |
2367 | <1 q | |
2368 | <2 a | |
2369 | <2 \u0027 | |
2370 | <2 c | |
2371 | <1 r | |
b331163b A |
2372 | |
2373 | # ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" | |
2374 | ** test: locale -u- with collation keywords, ICU ticket 8260 | |
2375 | @ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 | |
2376 | * compare | |
2377 | <4 \u0020 # space is shifted, strength=quaternary | |
2378 | <1 ! # punctuation is regular | |
2379 | <1 2 | |
2380 | <1 12 # numeric sorting | |
2381 | <1 B | |
2382 | <c b # uppercase first on case level | |
2383 | <1 x\u0301\u0308 | |
2384 | <2 x\u0308\u0301 # normalization off | |
2385 | ||
2386 | ** test: locale @ with collation keywords, ICU ticket 8260 | |
2387 | @ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted | |
2388 | * compare | |
2389 | <4 $ # currency symbols are shifted, strength=quaternary | |
2390 | <1 àla | |
2391 | <2 alà # backwards secondary level | |
2392 | ||
2393 | ** test: locale -u- with script reordering, ICU ticket 8260 | |
2394 | @ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai | |
2395 | * compare | |
2396 | <1 \u0020 | |
2397 | <1 あ | |
2398 | <1 ☂ | |
2399 | <1 Ω | |
2400 | <1 丂 | |
2401 | <1 ж | |
2402 | <1 L | |
2403 | <1 4 | |
2404 | <1 Ձ | |
2405 | <1 अ | |
2406 | <1 ሄ | |
2407 | <1 ฉ | |
2408 | ||
2409 | ** test: locale @collation=type should be case-insensitive | |
2410 | @ locale de@coLLation=PhoneBook | |
2411 | * compare | |
2412 | <1 ae | |
2413 | <2 ä | |
2414 | <3 Ä | |
2415 | ||
2416 | ** test: import root search rules plus German phonebook rules, ICU ticket 8962 | |
2417 | @ locale de-u-co-search | |
2418 | * compare | |
2419 | <1 = | |
2420 | <1 ≠ | |
2421 | <1 a | |
2422 | <1 ae | |
2423 | <2 ä | |
2424 | ||
2425 | # Once more, but with the runtime builder. | |
2426 | @ rules | |
2427 | [import und-u-co-search][import de-u-co-phonebk] | |
2428 | * compare | |
2429 | <1 = | |
2430 | <1 ≠ | |
2431 | <1 a | |
2432 | <1 ae | |
2433 | <2 ä | |
2434 | ||
2435 | # Once again, but importing from "root" instead of "und" (the form a proper language tag would use). | |
2436 | @ rules | |
2437 | [import root-u-co-search][import de-u-co-phonebk] | |
2438 | * compare | |
2439 | <1 = | |
2440 | <1 ≠ | |
2441 | <1 a | |
2442 | <1 ae | |
2443 | <2 ä | |
2444 | ||
2445 | ** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998 | |
2446 | # Greek should sort Greek first. | |
2447 | @ rules | |
2448 | [import el] | |
2449 | * compare | |
2450 | <1 4 | |
2451 | <1 Ω | |
2452 | <1 L | |
2453 | ||
2454 | # Import Greek, and then reset the reordering. | |
2455 | @ rules | |
2456 | [import el][reorder Zzzz] | |
2457 | * compare | |
2458 | <1 4 | |
2459 | <1 L | |
2460 | <1 Ω | |
2461 | ||
2462 | # "others" is a synonym for Zzzz. | |
2463 | @ rules | |
2464 | [import el][reorder others] | |
2465 | * compare | |
2466 | <1 4 | |
2467 | <1 L | |
2468 | <1 Ω | |
2469 | ||
2470 | ** test: regression test for CollationFastLatinBuilder, ICU ticket 11388 | |
2471 | @ rules | |
2472 | &x<<aa<<<Aa<<<AA | |
2473 | % strength=secondary | |
2474 | * compare | |
2475 | <1 AA | |
2476 | <2 Aẩ | |
2477 | <2 aą | |
2478 | * compare | |
2479 | <1 AA | |
2480 | <2 aą | |
2481 | ||
2482 | ** test: tailor tertiary-after a common tertiary where there is a lower one | |
2483 | # Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one. | |
2484 | # See ICU ticket 11448 & CLDR ticket 7222. | |
2485 | @ rules | |
2486 | &あ<<<x<<<y<<<z | |
2487 | * compare | |
2488 | <1 ぁ | |
2489 | <3 あ | |
2490 | <3 x | |
2491 | <3 y | |
2492 | <3 z | |
2493 | <3 ァ | |
2494 | <1 い | |
2495 | ||
2496 | ** test: tailor tertiary-after a below-common tertiary | |
2497 | @ rules | |
2498 | &ぁ<<<x<<<y<<<z | |
2499 | * compare | |
2500 | <1 ぁ | |
2501 | <3 x | |
2502 | <3 y | |
2503 | <3 z | |
2504 | <3 あ | |
2505 | <3 ァ | |
2506 | <1 い | |
2507 | ||
2508 | ** test: tailor tertiary-before a common tertiary where there is a lower one | |
2509 | @ rules | |
2510 | &[before 3]あ<<<x<<<y<<<z | |
2511 | * compare | |
2512 | <1 ぁ | |
2513 | <3 x | |
2514 | <3 y | |
2515 | <3 z | |
2516 | <3 あ | |
2517 | <3 ァ | |
2518 | <1 い | |
2519 | ||
2520 | ** test: tailor tertiary-before a below-common tertiary | |
2521 | @ rules | |
2522 | &[before 3]ぁ<<<x<<<y<<<z | |
2523 | * compare | |
2524 | <1 x | |
2525 | <3 y | |
2526 | <3 z | |
2527 | <3 ぁ | |
2528 | <3 あ | |
2529 | <3 ァ | |
2530 | <1 い | |
2531 | ||
2532 | ** test: reorder single scripts not groups, ICU ticket 11449 | |
2533 | @ root | |
2534 | % reorder Goth Latn | |
2535 | * compare | |
2536 | <1 4 | |
2537 | <1 𐌰 # Gothic | |
2538 | <1 L | |
2539 | <1 Ω | |
2540 | # Before ICU 55, the following reordered together with Gothic. | |
2541 | <1 𐌈 # Old Italic | |
2542 | <1 𐑐 # Shavian |