]>
Commit | Line | Data |
---|---|---|
b331163b | 1 | # Copyright (c) 2012-2015 International Business Machines |
57a6839d A |
2 | # Corporation and others. All Rights Reserved. |
3 | # | |
4 | # This file should be in UTF-8 with a signature byte sequence ("BOM"). | |
5 | # | |
6 | # collationtest.txt: Collation test data. | |
7 | # | |
8 | # created on: 2012apr13 | |
9 | # created by: Markus W. Scherer | |
10 | ||
11 | # A line with "** test: description" is used for verbose and error output. | |
12 | ||
13 | # A collator can be set with "@ root" or "@ locale language-tag", | |
14 | # for example "@ locale de-u-co-phonebk". | |
b331163b | 15 | # An old-style locale ID can also be used, for example "@ locale de@collation=phonebook". |
57a6839d A |
16 | |
17 | # A collator can be built with "@ rules". | |
18 | # An "@ rules" line is followed by one or more lines with the tailoring rules. | |
19 | ||
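20 | # A collator can be modified with "% attribute=value", or with "% reorder" followed by reorder codes (for example "% reorder Hani Zzzz digit" or "% reorder default"). | |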
21 | ||
22 | # "* compare" tests the order (= or <) of the following strings. | |
23 | # The relation can be "=" or "<" (the level of the difference is not specified) | |
24 | # or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference). | |
25 | ||
26 | # Test sections ("* compare") are terminated by | |
27 | # definitions of new collators, changing attributes, or new test sections. | |
28 | ||
29 | ** test: simple CEs & expansions | |
30 | # Many types of mappings are tested elsewhere, including via the UCA conformance tests. | |
31 | # Here we mostly cover a few unusual mappings. | |
32 | @ rules | |
33 | &\x01 # most control codes are ignorable | |
34 | <<<\u0300 # tertiary CE | |
35 | &9<\x00 # NUL not ignorable | |
36 | &\uA00A\uA00B=\uA002 # two long-primary CEs | |
37 | &\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits | |
38 | ||
39 | * compare | |
40 | = \x01 | |
41 | = \x02 | |
42 | <3 \u0300 | |
43 | <1 9 | |
44 | <1 \x00 | |
45 | = \x01\x00\x02 | |
46 | <1 a | |
47 | <3 a\u0300 | |
48 | <2 a\u0308 | |
49 | = ä | |
50 | <1 b | |
51 | <1 か # Hiragana Ka (U+304B) | |
52 | <2 か\u3099 # plus voiced sound mark | |
53 | = が # Hiragana Ga (U+304C) | |
54 | <1 \uA00A\uA00B | |
55 | = \uA002 | |
56 | <1 \uA00A\uA00B\u00050004 | |
57 | <1 \uA00A\uA00B\u00050005 | |
58 | = \uA003 | |
59 | <1 \uA00A\uA00B\u00050006 | |
60 | ||
61 | ** test: contractions | |
62 | # Create some interesting mappings, and map some normalization-inert characters | |
63 | # (which are not subject to canonical reordering) | |
64 | # to some of the same CEs to check the sequence of CEs. | |
65 | @ rules | |
66 | ||
67 | # Contractions starting with 'a' should not continue with any character < U+0300 | |
68 | # so that we can test a shortcut for that. | |
69 | &a=ⓐ | |
70 | &b<bz=ⓑ | |
71 | &d<dz\u0301=ⓓ # d+z+acute | |
72 | &z | |
73 | <a\u0301=Ⓐ # a+acute sorts after z | |
74 | <a\u0301\u0301=Ⓑ # a+acute+acute | |
75 | <a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right | |
76 | <a\u030a=Ⓓ # a+ring | |
77 | <a\u0323=Ⓔ # a+dot below | |
78 | <a\u0323\u0358=Ⓕ # a+dot below+dot above right | |
79 | <a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring | |
80 | <a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z | |
81 | ||
82 | &\U0001D158=⁰ # musical notehead black (has a symbol primary) | |
83 | <\U0001D158\U0001D165=¼ # musical quarter note | |
84 | ||
85 | # deliberately missing prefix contractions: | |
86 | # dz | |
87 | # a\u0327 | |
88 | # a\u0327\u0323 | |
89 | # a\u0327\u0323b | |
90 | ||
91 | &\x01 | |
92 | <<<\U0001D165=¹ # musical stem (ccc=216) | |
93 | <<<\U0001D16D=² # musical augmentation dot (ccc=226) | |
94 | <<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226) | |
95 | &\u0301=❶ # acute (ccc=230) | |
96 | &\u030a=❷ # ring (ccc=230) | |
97 | &\u0308=❸ # diaeresis (ccc=230) | |
98 | <<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230) | |
99 | &\u0327=❺ # cedilla (ccc=202) | |
100 | &\u0323=❻ # dot below (ccc=220) | |
101 | &\u0331=❼ # macron below (ccc=220) | |
102 | <<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232) | |
103 | &\u0334=❾ # tilde overlay (ccc=1) | |
104 | &\u0358=❿ # dot above right (ccc=232) | |
105 | ||
106 | &\u0f71=① # tibetan vowel sign aa | |
107 | &\u0f72=② # tibetan vowel sign i | |
108 | # \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73 | |
109 | &\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129) | |
110 | ||
111 | ** test: simple contractions | |
112 | ||
113 | # Some strings are chosen to cause incremental contiguous contraction matching to | |
114 | # go into partial matches for prefixes of contractions | |
115 | # (where the prefixes are deliberately not also contractions). | |
116 | # When there is no complete match, then the matching code must back out of those | |
117 | # so that discontiguous contractions work as specified. | |
118 | ||
119 | * compare | |
120 | # contraction starter with no following text, or mismatch, or blocked | |
121 | <1 a | |
122 | = ⓐ | |
123 | <1 aa | |
124 | = ⓐⓐ | |
125 | <1 ab | |
126 | = ⓐb | |
127 | <1 az | |
128 | = ⓐz | |
129 | ||
130 | * compare | |
131 | <1 a | |
132 | <2 a\u0308\u030a # ring blocked by diaeresis | |
133 | = ⓐ❸❷ | |
134 | <2 a\u0327 | |
135 | = ⓐ❺ | |
136 | ||
137 | * compare | |
138 | <2 \u0308 | |
139 | = ❸ | |
140 | <2 \u0308\u030a\u0301 # acute blocked by ring | |
141 | = ❸❷❶ | |
142 | ||
143 | * compare | |
144 | <1 \U0001D158 | |
145 | = ⁰ | |
146 | <1 \U0001D158\U0001D165 | |
147 | = ¼ | |
148 | ||
149 | # no discontiguous contraction because of missing prefix contraction d+z, | |
150 | # and a starter ('z') after the 'd' | |
151 | * compare | |
152 | <1 dz\u0323\u0301 | |
153 | = dz❻❶ | |
154 | ||
155 | # contiguous contractions | |
156 | * compare | |
157 | <1 abz | |
158 | = ⓐⓑ | |
159 | <1 abzz | |
160 | = ⓐⓑz | |
161 | ||
162 | * compare | |
163 | <1 a | |
164 | <1 z | |
165 | <1 a\u0301 | |
166 | = Ⓐ | |
167 | <1 a\u0301\u0301 | |
168 | = Ⓑ | |
169 | <1 a\u0301\u0301\u0358 | |
170 | = Ⓒ | |
171 | <1 a\u030a | |
172 | = Ⓓ | |
173 | <1 a\u0323\u0358 | |
174 | = Ⓕ | |
175 | <1 a\u0327\u0323\u030a # match despite missing prefix | |
176 | = Ⓖ | |
177 | <1 a\u0327\u0323bz | |
178 | = Ⓗ | |
179 | ||
180 | * compare | |
181 | <2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second | |
182 | = ❸❹ | |
183 | ||
184 | * compare | |
185 | <1 \U0001D158\U0001D165 | |
186 | = ¼ | |
187 | ||
188 | * compare | |
189 | <3 \U0001D165\U0001D16D | |
190 | = ³ | |
191 | ||
192 | ** test: discontiguous contractions | |
193 | * compare | |
194 | <1 a\u0327\u030a # a+ring skips cedilla | |
195 | = Ⓓ❺ | |
196 | <2 a\u0327\u0327\u030a # a+ring skips 2 cedillas | |
197 | = Ⓓ❺❺ | |
198 | <2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas | |
199 | = Ⓓ❺❺❺ | |
200 | <2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas | |
201 | = Ⓓ❾❺❺ | |
202 | <1 a\u0327\u0323 # a+dot below skips cedilla | |
203 | = Ⓔ❺ | |
204 | <1 a\u0323\u0301\u0358 # a+dot below+dot ab.r.: 2-char match, then skips acute | |
205 | = Ⓕ❶ | |
206 | <2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay | |
207 | = Ⓕ❾ | |
208 | ||
209 | * compare | |
210 | <2 \u0331\u0331\u0358 # macron below+dot ab.r. skips the second macron below | |
211 | = ❽❼ | |
212 | ||
213 | * compare | |
214 | <1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron) | |
215 | = Ⓓ❺❼❻ | |
216 | <1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla | |
217 | = Ⓔ❺²❷ | |
218 | <2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas | |
219 | = Ⓔ❺❺❷ | |
220 | <2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla | |
221 | = Ⓔ❺❻❷ | |
222 | <2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla | |
223 | = Ⓔ❾❺❷ | |
224 | ||
225 | * compare | |
226 | <1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla | |
227 | = ¼❺ | |
228 | <1 a\U0001D165\u0323 # a+dot below skips stem | |
229 | = Ⓔ¹ | |
230 | ||
231 | # partial contiguous match, backs up, matches discontiguous contraction | |
232 | <1 a\u0327\u0323b | |
233 | = Ⓔ❺b | |
234 | <1 a\u0327\u0323ba | |
235 | = Ⓔ❺bⓐ | |
236 | ||
237 | # a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks | |
238 | * compare | |
239 | <1 a\u0327\u0301\u0301\u0358 | |
240 | = Ⓒ❺ | |
241 | ||
242 | # FCD but not NFD | |
243 | * compare | |
244 | <1 a\u0f73\u0301 # a+acute skips tibetan ii | |
245 | = Ⓐ③ | |
246 | ||
247 | # FCD but the 0f71 inside the 0f73 must be skipped | |
248 | # to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73 | |
249 | * compare | |
250 | <1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72 | |
251 | = ③① | |
252 | ||
253 | ** test: discontiguous contractions with nested contractions | |
254 | * compare | |
255 | <1 a\u0323\u0308\u0301\u0358 | |
256 | = Ⓕ❹ | |
257 | <2 a\u0323\u0308\u0301\u0308\u0301\u0358 | |
258 | = Ⓕ❹❹ | |
259 | ||
260 | ** test: discontiguous contractions with interleaved contractions | |
261 | * compare | |
262 | # a+ring & cedilla & macron below+dot above right | |
263 | <1 a\u0327\u0331\u030a\u0358 | |
264 | = Ⓓ❺❽ | |
265 | ||
266 | # a+ring & 1x..3x macron below+dot above right | |
267 | <2 a\u0331\u030a\u0358 | |
268 | = Ⓓ❽ | |
269 | <2 a\u0331\u0331\u030a\u0358\u0358 | |
270 | = Ⓓ❽❽ | |
271 | # also skips acute | |
272 | <2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358 | |
273 | = Ⓓ❽❽❽❶ | |
274 | ||
275 | # a+dot below & stem+augmentation dot, followed by contiguous d+z+acute | |
276 | <1 a\U0001D165\u0323\U0001D16Ddz\u0301 | |
277 | = Ⓔ³ⓓ | |
278 | ||
279 | ** test: some simple string comparisons | |
280 | @ root | |
281 | * compare | |
282 | # first string compares against "" | |
283 | = \u0000 | |
284 | < a | |
285 | <1 b | |
286 | <3 B | |
287 | = \u0000B\u0000 | |
288 | ||
289 | ** test: compare with strength=primary | |
290 | % strength=primary | |
291 | * compare | |
292 | <1 a | |
293 | <1 b | |
294 | = B | |
295 | ||
296 | ** test: compare with strength=secondary | |
297 | % strength=secondary | |
298 | * compare | |
299 | <1 a | |
300 | <1 b | |
301 | = B | |
302 | ||
303 | ** test: compare with strength=tertiary | |
304 | % strength=tertiary | |
305 | * compare | |
306 | <1 a | |
307 | <1 b | |
308 | <3 B | |
309 | ||
310 | ** test: compare with strength=quaternary | |
311 | % strength=quaternary | |
312 | * compare | |
313 | <1 a | |
314 | <1 b | |
315 | <3 B | |
316 | ||
317 | ** test: compare with strength=identical | |
318 | % strength=identical | |
319 | * compare | |
320 | <1 a | |
321 | <1 b | |
322 | <3 B | |
323 | ||
324 | ** test: côté with forwards secondary | |
325 | @ root | |
326 | * compare | |
327 | <1 cote | |
328 | <2 coté | |
329 | <2 côte | |
330 | <2 côté | |
331 | ||
332 | ** test: côté with forwards secondary vs. U+FFFE merge separator | |
333 | # Merged sort keys: On each level, any difference in the first segment | |
334 | # must trump any further difference. | |
335 | * compare | |
336 | <1 cote\uFFFEcôté | |
337 | <2 coté\uFFFEcôte | |
338 | <2 côte\uFFFEcoté | |
339 | <2 côté\uFFFEcote | |
340 | ||
341 | ** test: côté with backwards secondary | |
342 | % backwards=on | |
343 | * compare | |
344 | <1 cote | |
345 | <2 côte | |
346 | <2 coté | |
347 | <2 côté | |
348 | ||
349 | ** test: côté with backwards secondary vs. U+FFFE merge separator | |
350 | # Merged sort keys: On each level, any difference in the first segment | |
351 | # must trump any further difference. | |
352 | * compare | |
353 | <1 cote\uFFFEcôté | |
354 | <2 côte\uFFFEcoté | |
355 | <2 coté\uFFFEcôte | |
356 | <2 côté\uFFFEcote | |
357 | ||
358 | ** test: U+FFFE on identical level | |
359 | @ root | |
360 | % strength=identical | |
361 | * compare | |
362 | # All of these control codes are completely-ignorable, so that | |
363 | # their low code points are compared with the merge separator. | |
364 | # The merge separator must compare less than any other character. | |
365 | <1 \uFFFE\u0001\u0002\u0003 | |
366 | <i \u0001\uFFFE\u0002\u0003 | |
367 | <i \u0001\u0002\uFFFE\u0003 | |
368 | <i \u0001\u0002\u0003\uFFFE | |
369 | ||
370 | * compare | |
371 | # The merge separator must even compare less than U+0000. | |
372 | <1 \uFFFE\u0000\u0000 | |
373 | <i \u0000\uFFFE\u0000 | |
374 | <i \u0000\u0000\uFFFE | |
375 | ||
376 | ** test: Hani < surrogates < U+FFFD | |
377 | # Note: compareUTF8() treats unpaired surrogates like U+FFFD, | |
378 | # so with that the strings with surrogates will compare equal to each other | |
379 | # and equal to the string with U+FFFD. | |
380 | @ root | |
381 | % strength=identical | |
382 | * compare | |
383 | <1 abz | |
384 | <1 a\u4e00z | |
385 | <1 a\U00020000z | |
386 | <1 a\ud800z | |
387 | <1 a\udbffz | |
388 | <1 a\udc00z | |
389 | <1 a\udfffz | |
390 | <1 a\ufffdz | |
391 | ||
392 | ** test: script reordering | |
393 | @ root | |
394 | % reorder Hani Zzzz digit | |
395 | * compare | |
396 | <1 ? | |
397 | <1 + | |
398 | <1 丂 | |
399 | <1 a | |
400 | <1 α | |
401 | <1 5 | |
402 | ||
403 | % reorder default | |
404 | * compare | |
405 | <1 ? | |
406 | <1 + | |
407 | <1 5 | |
408 | <1 a | |
409 | <1 α | |
410 | <1 丂 | |
411 | ||
412 | ** test: empty rules | |
413 | @ rules | |
414 | * compare | |
415 | <1 a | |
416 | <2 ä | |
417 | <3 Ä | |
418 | <1 b | |
419 | ||
420 | ** test: very simple rules | |
421 | @ rules | |
422 | &a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z | |
423 | % strength=quaternary | |
424 | * compare | |
425 | <1 a | |
426 | = e | |
427 | <4 q | |
428 | <4 r | |
429 | <1 x | |
430 | <3 X | |
431 | <2 y | |
432 | <3 Y | |
433 | <2 z | |
434 | <3 Z | |
435 | ||
436 | ** test: tailoring twice before a root position: primary | |
437 | @ rules | |
438 | &[before 1]b<p | |
439 | &[before 1]b<q | |
440 | * compare | |
441 | <1 a | |
442 | <1 p | |
443 | <1 q | |
444 | <1 b | |
445 | ||
446 | ** test: tailoring twice before a root position: secondary | |
447 | @ rules | |
448 | &[before 2]ſ<<p | |
449 | &[before 2]ſ<<q | |
450 | * compare | |
451 | <1 s | |
452 | <2 p | |
453 | <2 q | |
454 | <2 ſ | |
455 | ||
456 | # secondary-before common weight | |
457 | @ rules | |
458 | &[before 2]b<<p | |
459 | &[before 2]b<<q | |
460 | * compare | |
461 | <1 a | |
462 | <1 p | |
463 | <2 q | |
464 | <2 b | |
465 | ||
466 | ** test: tailoring twice before a root position: tertiary | |
467 | @ rules | |
468 | &[before 3]B<<<p | |
469 | &[before 3]B<<<q | |
470 | * compare | |
471 | <1 b | |
472 | <3 p | |
473 | <3 q | |
474 | <3 B | |
475 | ||
476 | # tertiary-before common weight | |
477 | @ rules | |
478 | &[before 3]b<<<p | |
479 | &[before 3]b<<<q | |
480 | * compare | |
481 | <1 a | |
482 | <1 p | |
483 | <3 q | |
484 | <3 b | |
485 | ||
486 | @ rules | |
487 | &[before 2]b<<s | |
488 | &[before 3]s<<<p | |
489 | &[before 3]s<<<q | |
490 | * compare | |
491 | <1 a | |
492 | <1 p | |
493 | <3 q | |
494 | <3 s | |
495 | <2 b | |
496 | ||
497 | ** test: tailor after completely ignorable | |
498 | @ rules | |
499 | &\x00<<<x<<y | |
500 | * compare | |
501 | = \x00 | |
502 | = \x1F | |
503 | <3 x | |
504 | <2 y | |
505 | ||
506 | ** test: secondary tailoring gaps, ICU ticket 9362 | |
507 | @ rules | |
508 | &[before 2]s<<'_' | |
509 | &s<<r # secondary between s and ſ (long s) | |
510 | &ſ<<*a-q # more than 15 between ſ and secondary CE boundary | |
511 | &[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE | |
512 | &[last primary ignorable]<<y<<z | |
513 | ||
514 | * compare | |
515 | <2 u | |
516 | <2 v | |
517 | <2 \u0332 # lowest secondary CE | |
518 | <2 \u0308 | |
519 | <2 y | |
520 | <2 z | |
521 | <1 s_ | |
522 | <2 ss | |
523 | <2 sr | |
524 | <2 sſ | |
525 | <2 sa | |
526 | <2 sb | |
527 | <2 sp | |
528 | <2 sq | |
529 | <2 sus | |
530 | <2 svs | |
531 | <2 rs | |
532 | ||
533 | ** test: tertiary tailoring gaps, ICU ticket 9362 | |
534 | @ rules | |
535 | &[before 3]t<<<'_' | |
536 | &t<<<r # tertiary between t and fullwidth t | |
537 | &ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary | |
538 | &[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE | |
539 | &[last secondary ignorable]<<<y<<<z | |
540 | ||
541 | * compare | |
542 | <3 u | |
543 | <3 v | |
544 | # Note: The root collator currently does not map any characters to tertiary CEs. | |
545 | <3 y | |
546 | <3 z | |
547 | <1 t_ | |
548 | <3 tt | |
549 | <3 tr | |
550 | <3 tｔ | |
551 | <3 tᵀ | |
552 | <3 ta | |
553 | <3 tb | |
554 | <3 tp | |
555 | <3 tq | |
556 | <3 tut | |
557 | <3 tvt | |
558 | <3 rt | |
559 | ||
560 | ** test: secondary & tertiary around root character | |
561 | @ rules | |
562 | &[before 2]m<<r | |
563 | &m<<s | |
564 | &[before 3]m<<<u | |
565 | &m<<<v | |
566 | * compare | |
567 | <1 l | |
568 | <1 r | |
569 | <2 u | |
570 | <3 m | |
571 | <3 v | |
572 | <2 s | |
573 | <1 n | |
574 | ||
575 | ** test: secondary & tertiary around tailored item | |
576 | @ rules | |
577 | &m<x | |
578 | &[before 2]x<<r | |
579 | &x<<s | |
580 | &[before 3]x<<<u | |
581 | &x<<<v | |
582 | * compare | |
583 | <1 m | |
584 | <1 r | |
585 | <2 u | |
586 | <3 x | |
587 | <3 v | |
588 | <2 s | |
589 | <1 n | |
590 | ||
591 | ** test: more nesting of secondary & tertiary before | |
592 | @ rules | |
593 | &[before 3]m<<<u | |
594 | &[before 2]m<<r | |
595 | &[before 3]r<<<q | |
596 | &m<<<w | |
597 | &m<<t | |
598 | &[before 3]w<<<v | |
599 | &w<<<x | |
600 | &w<<s | |
601 | * compare | |
602 | <1 l | |
603 | <1 q | |
604 | <3 r | |
605 | <2 u | |
606 | <3 m | |
607 | <3 v | |
608 | <3 w | |
609 | <3 x | |
610 | <2 s | |
611 | <2 t | |
612 | <1 n | |
613 | ||
614 | ** test: case bits | |
615 | @ rules | |
616 | &w<x # tailored CE getting case bits | |
617 | =uv=uV=Uv=UV # 2 chars -> 1 CE | |
618 | &ae=ch=cH=Ch=CH # 2 chars -> 2 CEs | |
619 | &rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs | |
620 | % caseFirst=lower | |
621 | * compare | |
622 | <1 ae | |
623 | = ch | |
624 | <3 cH | |
625 | <3 Ch | |
626 | <3 CH | |
627 | <1 rst | |
628 | = yz | |
629 | <3 yZ | |
630 | <3 Yz | |
631 | <3 YZ | |
632 | <1 w | |
633 | <1 x | |
634 | = uv | |
635 | <3 uV | |
636 | = Uv # mixed case on single CE cannot distinguish variations | |
637 | <3 UV | |
638 | ||
639 | ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower | |
640 | @ rules | |
641 | &\u0001<<<t<<<T # tertiary CEs | |
642 | % caseFirst=lower | |
643 | * compare | |
644 | <1 aa | |
645 | <3 aat | |
646 | <3 aaT | |
647 | <3 aA | |
648 | <3 aAt | |
649 | <3 ata | |
650 | <3 aTa | |
651 | ||
652 | ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper | |
653 | % caseFirst=upper | |
654 | * compare | |
655 | <1 aA | |
656 | <3 aAt | |
657 | <3 aa | |
658 | <3 aat | |
659 | <3 aaT | |
660 | <3 ata | |
661 | <3 aTa | |
662 | ||
663 | ** test: reset on expansion, ICU tickets 9415 & 9593 | |
664 | @ rules | |
665 | &æ<x # tailor the last primary CE so that x sorts between ae and af | |
666 | &æb=bæ # copy all reset CEs to make bæ sort the same | |
667 | &각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂 | |
668 | &⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference | |
669 | &l·=z # handle the pre-context for · when fetching reset CEs | |
670 | <<u # copy/tailor 2 CEs | |
671 | ||
672 | * compare | |
673 | <1 ae | |
674 | <2 æ | |
675 | <1 x | |
676 | <1 af | |
677 | ||
678 | * compare | |
679 | <1 aeb | |
680 | <2 æb | |
681 | = bæ | |
682 | ||
683 | * compare | |
684 | <1 각 | |
685 | <1 h | |
686 | <1 갂 | |
687 | <1 갃 | |
688 | ||
689 | * compare | |
690 | <1 · # by itself: primary CE | |
691 | <1 l | |
692 | <2 l· # l+middle dot has only a secondary difference from l | |
693 | = z | |
694 | <2 u | |
695 | ||
696 | * compare | |
697 | <1 (13) | |
698 | <3 ⒀ # DUCET sets special tertiary weights in all CEs | |
699 | <2 y | |
700 | <1 (13[ | |
701 | ||
702 | % alternate=shifted | |
703 | * compare | |
704 | <1 (13) | |
705 | = 13 | |
706 | <3 ⒀ | |
707 | = y # alternate=shifted removes the tailoring difference on the last CE | |
708 | <1 14 | |
709 | ||
710 | ** test: contraction inside extension, ICU ticket 9378 | |
711 | @ rules | |
712 | &а<<х/й # all letters are Cyrillic | |
713 | * compare | |
714 | <1 ай | |
715 | <2 х | |
716 | ||
717 | ** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104 | |
718 | @ rules | |
719 | &t<x &ᵀ<y # same primary weights | |
720 | &q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent | |
721 | * compare | |
722 | <1 q | |
723 | <1 u | |
724 | <1 v | |
725 | <1 ꝗ | |
726 | <1 t | |
727 | <3 ᵀ | |
728 | <1 y | |
729 | <1 x | |
730 | ||
731 | # Principle: Each rule builds on the state of preceding rules and ignores following rules. | |
732 | ||
733 | ** test: later rule does not affect earlier reset position, ICU ticket 10105 | |
734 | @ rules | |
735 | &a < u < v < w &ov < x &b < v | |
736 | * compare | |
737 | <1 oa | |
738 | <1 ou | |
739 | <1 x # CE(o) followed by CE between u and w | |
740 | <1 ow | |
741 | <1 ob | |
742 | <1 ov | |
743 | ||
744 | ** test: later rule does not affect earlier extension (1), ICU ticket 10105 | |
745 | @ rules | |
746 | &a=x/b &v=b | |
747 | % strength=secondary | |
748 | * compare | |
749 | <1 B | |
750 | <1 c | |
751 | <1 v | |
752 | = b | |
753 | * compare | |
754 | <1 AB | |
755 | = x | |
756 | <1 ac | |
757 | <1 av | |
758 | = ab | |
759 | ||
760 | ** test: later rule does not affect earlier extension (2), ICU ticket 10105 | |
761 | @ rules | |
762 | &a <<< c / e &g <<< e / l | |
763 | % strength=secondary | |
764 | * compare | |
765 | <1 AE | |
766 | = c | |
767 | <2 æ | |
768 | <1 agl | |
769 | = ae | |
770 | ||
771 | ** test: later rule does not affect earlier extension (3), ICU ticket 10105 | |
772 | @ rules | |
773 | &a = b / c &d = c / e | |
774 | % strength=secondary | |
775 | * compare | |
776 | <1 AC # C is still only tertiary different from the original c | |
777 | = b | |
778 | <1 ade | |
779 | = ac | |
780 | ||
781 | ** test: extension contains tailored character, ICU ticket 10105 | |
782 | @ rules | |
783 | &a=e &b=u/e | |
784 | * compare | |
785 | <1 a | |
786 | = e | |
787 | <1 ba | |
788 | = be | |
789 | = u | |
790 | ||
791 | ** test: add simple mappings for characters with root context | |
792 | @ rules | |
793 | &z=· # middle dot has a prefix mapping in the CLDR root | |
794 | &n=и # и (U+0438) has contractions in the root | |
795 | * compare | |
796 | <1 l | |
797 | <2 l· # root mapping for l|· still works | |
798 | <1 z | |
799 | = · | |
800 | * compare | |
801 | <1 n | |
802 | = и | |
803 | <1 И | |
804 | <1 и\u0306 # root mapping for й=и\u0306 still works | |
805 | = й | |
806 | <3 Й | |
807 | ||
808 | ** test: add context mappings around characters with root context | |
809 | @ rules | |
810 | &z=·h # middle dot has a prefix mapping in the CLDR root | |
811 | &n=ә|и # и (U+0438) has contractions in the root | |
812 | * compare | |
813 | <1 l | |
814 | <2 l· # root mapping for l|· still works | |
815 | <1 z | |
816 | = ·h | |
817 | * compare | |
818 | <1 и | |
819 | <3 И | |
820 | <1 и\u0306 # root mapping for й=и\u0306 still works | |
821 | = й | |
822 | * compare | |
823 | <1 әn | |
824 | = әи | |
825 | <1 әo | |
826 | ||
827 | ** test: many secondary CEs at the top of their range | |
828 | @ rules | |
829 | &[last primary ignorable]<<*\u2801-\u28ff | |
830 | * compare | |
831 | <2 \u0308 | |
832 | <2 \u2801 | |
833 | <2 \u2802 | |
834 | <2 \u2803 | |
835 | <2 \u2804 | |
836 | <2 \u28fd | |
837 | <2 \u28fe | |
838 | <2 \u28ff | |
839 | <1 \x20 | |
840 | ||
841 | ** test: many tertiary CEs at the top of their range | |
842 | @ rules | |
843 | &[last secondary ignorable]<<<*a-z | |
844 | * compare | |
845 | <3 a | |
846 | <3 b | |
847 | <3 c | |
848 | <3 d | |
849 | # e..w | |
850 | <3 x | |
851 | <3 y | |
852 | <3 z | |
853 | <2 \u0308 | |
854 | ||
855 | ** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101 | |
856 | @ rules | |
857 | &a=p|x &b=px &c=op | |
858 | * compare | |
859 | <1 b | |
860 | = px | |
861 | <3 B | |
862 | <1 c | |
863 | = op | |
864 | <3 C | |
865 | * compare | |
866 | <1 ca | |
867 | = opx # first contraction op, then prefix p|x | |
868 | <3 cA | |
869 | <3 Ca | |
870 | ||
871 | ** test: reset position with prefix (pre-context), ICU ticket 10102 | |
872 | @ rules | |
873 | &a=p|x &px=y | |
874 | * compare | |
875 | <1 pa | |
876 | = px | |
877 | = y | |
878 | <3 pA | |
879 | <1 q | |
880 | <1 x | |
881 | ||
882 | ** test: prefix+contraction together (1), ICU ticket 10071 | |
883 | @ rules | |
884 | &x=a|bc | |
885 | * compare | |
886 | <1 ab | |
887 | <1 Abc | |
888 | <1 abd | |
889 | <1 ac | |
890 | <1 aw | |
891 | <1 ax | |
892 | = abc | |
893 | <3 aX | |
894 | <3 Ax | |
895 | <1 b | |
896 | <1 bb | |
897 | <1 bc | |
898 | <3 bC | |
899 | <3 Bc | |
900 | <1 bd | |
901 | ||
902 | ** test: prefix+contraction together (2), ICU ticket 10071 | |
903 | @ rules | |
904 | &w=bc &x=a|b | |
905 | * compare | |
906 | <1 w | |
907 | = bc | |
908 | <3 W | |
909 | * compare | |
910 | <1 aw | |
911 | <1 ax | |
912 | = ab | |
913 | <3 aX | |
914 | <1 axb | |
915 | <1 axc | |
916 | = abc # prefix match a|b takes precedence over contraction match bc | |
917 | <3 abC | |
918 | <1 abd | |
919 | <1 ay | |
920 | ||
921 | ** test: prefix+contraction together (3), ICU ticket 10071 | |
922 | @ rules | |
923 | &x=a|b &w=bc # reverse order of rules as previous test, order should not matter here | |
924 | * compare # same "compare" sequences as previous test | |
925 | <1 w | |
926 | = bc | |
927 | <3 W | |
928 | * compare | |
929 | <1 aw | |
930 | <1 ax | |
931 | = ab | |
932 | <3 aX | |
933 | <1 axb | |
934 | <1 axc | |
935 | = abc # prefix match a|b takes precedence over contraction match bc | |
936 | <3 abC | |
937 | <1 abd | |
938 | <1 ay | |
939 | ||
940 | ** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962 | |
941 | @ rules | |
942 | &d=ch &v=p|ci | |
943 | * compare | |
944 | <1 pc | |
945 | <3 pC | |
946 | <1 pcH | |
947 | <1 pcI | |
948 | <1 pd | |
949 | = pch # no-prefix contraction ch matches | |
950 | <3 pD | |
951 | <1 pv | |
952 | = pci # prefix+contraction p|ci matches | |
953 | <3 pV | |
954 | ||
955 | ** test: tailor in & around compact ranges of root primaries | |
956 | # The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs | |
957 | # which should be reliably encoded as one range in the root elements data. | |
958 | @ rules | |
959 | &[before 1]ᚁ<a | |
960 | &ᚁ<b | |
961 | &[before 1]ᚂ<c | |
962 | &ᚂ<d | |
963 | &[before 1]ᚚ<y | |
964 | &ᚚ<z | |
965 | &[before 2]ᚁ<<r | |
966 | &ᚁ<<s | |
967 | &[before 3]ᚚ<<<t | |
968 | &ᚚ<<<u | |
969 | * compare | |
970 | <1 ᣵ # U+18F5 last Canadian Aboriginal | |
971 | <1 a | |
972 | <1 r | |
973 | <2 ᚁ | |
974 | <2 s | |
975 | <1 b | |
976 | <1 c | |
977 | <1 ᚂ | |
978 | <1 d | |
979 | <1 ᚃ | |
980 | <1 ᚙ | |
981 | <1 y | |
982 | <1 t | |
983 | <3 ᚚ | |
984 | <3 u | |
985 | <1 z | |
986 | <1 ᚠ # U+16A0 first Runic | |
987 | ||
988 | ** test: suppressContractions | |
989 | @ rules | |
990 | &z<ch<әж [suppressContractions [·cә]] | |
991 | * compare | |
992 | <1 ch | |
993 | <3 cH # ch was suppressed | |
994 | <1 l | |
995 | <1 l· # primary difference, not secondary, because l|· was suppressed | |
996 | <1 ә | |
997 | <2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed | |
998 | <1 әж | |
999 | <3 әЖ | |
1000 | ||
1001 | ** test: Hangul & Jamo | |
1002 | @ rules | |
1003 | &L=\u1100 # first Jamo L | |
1004 | &V=\u1161 # first Jamo V | |
1005 | &T=\u11A8 # first Jamo T | |
1006 | &\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs | |
1007 | * compare | |
1008 | <1 Lv | |
1009 | <3 LV | |
1010 | = \u1100\u1161 | |
1011 | = \uAC00 | |
1012 | <1 LVt | |
1013 | <3 LVT | |
1014 | = \u1100\u1161\u11A8 | |
1015 | = \uAC00\u11A8 | |
1016 | = \uAC01 | |
1017 | <2 LVT\u0308 | |
1018 | <2 \u4E00 | |
1019 | <2 \u4E01 | |
1020 | <2 \u4E80 | |
1021 | <2 \u4EFF | |
1022 | <2 LV\u0308T | |
1023 | <1 \uAC02 | |
1024 | ||
1025 | ** test: adjust special reset positions according to previous rules, CLDR ticket 6070 | |
1026 | @ rules | |
1027 | &[last variable]<x | |
1028 | [maxVariable space] # has effect only after building, no effect on following rules | |
1029 | &[last variable]<y | |
1030 | &[before 1][first regular]<z | |
1031 | * compare | |
1032 | <1 ? # some punctuation | |
1033 | <1 x | |
1034 | <1 y | |
1035 | <1 z | |
1036 | <1 $ # some symbol | |
1037 | ||
1038 | @ rules | |
1039 | &[last primary ignorable]<<x<<<y | |
1040 | &[last primary ignorable]<<z | |
1041 | * compare | |
1042 | <2 \u0358 | |
1043 | <2 x | |
1044 | <3 y | |
1045 | <2 z | |
1046 | <1 \x20 | |
1047 | ||
1048 | @ rules | |
1049 | &[last secondary ignorable]<<<x | |
1050 | &[last secondary ignorable]<<<y | |
1051 | * compare | |
1052 | <3 x | |
1053 | <3 y | |
1054 | <2 \u0358 | |
1055 | ||
1056 | @ rules | |
1057 | &[before 2][first variable]<<z | |
1058 | &[before 2][first variable]<<y | |
1059 | &[before 3][first variable]<<<x | |
1060 | &[before 3][first variable]<<<w | |
1061 | &[before 1][first variable]<v | |
1062 | &[before 2][first variable]<<u | |
1063 | &[before 3][first variable]<<<t | |
1064 | &[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary | |
1065 | * compare | |
1066 | <2 \u0358 | |
1067 | <1 s | |
1068 | <2 \uFDD1\xA0 | |
1069 | <1 t | |
1070 | <3 u | |
1071 | <2 v | |
1072 | <1 w | |
1073 | <3 x | |
1074 | <3 y | |
1075 | <2 z | |
1076 | <2 \t | |
1077 | ||
1078 | @ rules | |
1079 | &[before 2][first regular]<<z | |
1080 | &[before 3][first regular]<<<y | |
1081 | &[before 1][first regular]<x | |
1082 | &[before 3][first regular]<<<w | |
1083 | &[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary | |
1084 | &[before 3][first regular]<<<u | |
1085 | &[before 1][first regular]<p # primary before the boundary: becomes variable | |
1086 | &[before 3][first regular]<<<t # not affected by p | |
1087 | &[last variable]<q # after p! | |
1088 | * compare | |
1089 | <1 ? | |
1090 | <1 p | |
1091 | <1 q | |
1092 | <1 t | |
1093 | <3 u | |
1094 | <3 v | |
1095 | <1 w | |
1096 | <3 x | |
1097 | <1 y | |
1098 | <3 z | |
1099 | <1 $ | |
1100 | ||
1101 | # check that p & q are indeed variable | |
1102 | % alternate=shifted | |
1103 | * compare | |
1104 | = ? | |
1105 | = p | |
1106 | = q | |
1107 | <1 t | |
1108 | <3 u | |
1109 | <3 v | |
1110 | <1 w | |
1111 | <3 x | |
1112 | <1 y | |
1113 | <3 z | |
1114 | <1 $ | |
1115 | ||
1116 | @ rules | |
1117 | &[before 2][first trailing]<<z | |
1118 | &[before 1][first trailing]<y | |
1119 | &[before 3][first trailing]<<<x | |
1120 | * compare | |
1121 | <1 \u4E00 # first Han, first implicit | |
1122 | <1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary | |
1123 | # Note: The root collator currently does not map any characters to the trailing first boundary primary. | |
1124 | <1 x | |
1125 | <3 y | |
1126 | <1 z | |
1127 | <2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary. | |
1128 | ||
1129 | @ rules | |
1130 | &[before 2][first primary ignorable]<<z | |
1131 | &[before 2][first primary ignorable]<<y | |
1132 | &[before 3][first primary ignorable]<<<x | |
1133 | &[before 3][first primary ignorable]<<<w | |
1134 | * compare | |
1135 | = \x01 | |
1136 | <2 w | |
1137 | <3 x | |
1138 | <3 y | |
1139 | <2 z | |
1140 | <2 \u0301 | |
1141 | ||
1142 | @ rules | |
1143 | &[before 3][first secondary ignorable]<<<y | |
1144 | &[before 3][first secondary ignorable]<<<x | |
1145 | * compare | |
1146 | = \x01 | |
1147 | <3 x | |
1148 | <3 y | |
1149 | <2 \u0301 | |
1150 | ||
1151 | ** test: canonical closure | |
1152 | @ rules | |
1153 | &X=A &U=Â | |
1154 | * compare | |
1155 | <1 U | |
1156 | = Â | |
1157 | = A\u0302 | |
1158 | <2 Ú # U with acute | |
1159 | = U\u0301 | |
1160 | = Ấ # A with circumflex & acute | |
1161 | = Â\u0301 | |
1162 | = A\u0302\u0301 | |
1163 | <1 X | |
1164 | = A | |
1165 | <2 X\u030A # with ring above | |
1166 | = Å | |
1167 | = A\u030A | |
1168 | = \u212B # Angstrom sign | |
1169 | ||
1170 | @ rules | |
1171 | &x=\u5140\u55C0 | |
1172 | * compare | |
1173 | <1 x | |
1174 | = \u5140\u55C0 | |
1175 | = \u5140\uFA0D | |
1176 | = \uFA0C\u55C0 | |
1177 | = \uFA0C\uFA0D # CJK compatibility characters | |
1178 | <3 X | |
1179 | ||
1180 | # canonical closure on prefix rules, ICU ticket 9444 | |
1181 | @ rules | |
1182 | &x=ä|ŝ | |
1183 | * compare | |
1184 | <1 äs # not tailored | |
1185 | <1 äx | |
1186 | = äŝ | |
1187 | = a\u0308s\u0302 | |
1188 | = a\u0308ŝ | |
1189 | = äs\u0302 | |
1190 | <3 äX | |
1191 | ||
1192 | ** test: conjoining Jamo map to expansions | |
1193 | @ rules | |
1194 | &gg=\u1101 # Jamo Lead consonant GG | |
1195 | &nj=\u11AC # Jamo Trail consonant NJ | |
1196 | * compare | |
1197 | <1 gg\u1161nj | |
1198 | = \u1101\u1161\u11AC | |
1199 | = \uAE4C\u11AC | |
1200 | = \uAE51 | |
1201 | <3 gg\u1161nJ | |
1202 | <1 \u1100\u1100 | |
1203 | ||
1204 | ** test: canonical tail closure, ICU ticket 5913 | |
1205 | @ rules | |
1206 | &a<â | |
1207 | * compare | |
1208 | <1 a | |
1209 | <1 â # tailored | |
1210 | = a\u0302 | |
1211 | <2 a\u0323\u0302 # discontiguous contraction | |
1212 | = ạ\u0302 # equivalent | |
1213 | = ậ # equivalent | |
1214 | <1 b | |
1215 | ||
1216 | @ rules | |
1217 | &a<ạ | |
1218 | * compare | |
1219 | <1 a | |
1220 | <1 ạ # tailored | |
1221 | = a\u0323 | |
1222 | <2 a\u0323\u0302 # contiguous contraction plus extra diacritic | |
1223 | = ạ\u0302 # equivalent | |
1224 | = ậ # equivalent | |
1225 | <1 b | |
1226 | ||
1227 | # Tail closure should work even if there is a prefix and/or contraction. | |
1228 | @ rules | |
1229 | &a<\u5140|câ | |
1230 | # In order to find discontiguous contractions for \u5140|câ | |
1231 | # there must exist a mapping for \u5140|ca, regardless of what it maps to. | |
1232 | # (This follows from the UCA spec.) | |
1233 | &x=\u5140|ca | |
1234 | * compare | |
1235 | <1 \u5140a | |
1236 | = \uFA0Ca | |
1237 | <1 \u5140câ # tailored | |
1238 | = \uFA0Ccâ | |
1239 | = \u5140ca\u0302 | |
1240 | = \uFA0Cca\u0302 | |
1241 | <2 \u5140ca\u0323\u0302 # discontiguous contraction | |
1242 | = \uFA0Cca\u0323\u0302 | |
1243 | = \u5140cạ\u0302 | |
1244 | = \uFA0Ccạ\u0302 | |
1245 | = \u5140cậ | |
1246 | = \uFA0Ccậ | |
1247 | <1 \u5140b | |
1248 | = \uFA0Cb | |
1249 | <1 \u5140x | |
1250 | = \u5140ca | |
1251 | ||
1252 | # Double-check that without the extra mapping there will be no discontiguous match. | |
1253 | @ rules | |
1254 | &a<\u5140|câ | |
1255 | * compare | |
1256 | <1 \u5140a | |
1257 | = \uFA0Ca | |
1258 | <1 \u5140câ # tailored | |
1259 | = \uFA0Ccâ | |
1260 | = \u5140ca\u0302 | |
1261 | = \uFA0Cca\u0302 | |
1262 | <1 \u5140b | |
1263 | = \uFA0Cb | |
1264 | <1 \u5140ca\u0323\u0302 # no discontiguous contraction | |
1265 | = \uFA0Cca\u0323\u0302 | |
1266 | = \u5140cạ\u0302 | |
1267 | = \uFA0Ccạ\u0302 | |
1268 | = \u5140cậ | |
1269 | = \uFA0Ccậ | |
1270 | ||
1271 | @ rules | |
1272 | &a<cạ | |
1273 | * compare | |
1274 | <1 a | |
1275 | <1 cạ # tailored | |
1276 | = ca\u0323 | |
1277 | <2 ca\u0323\u0302 # contiguous contraction plus extra diacritic | |
1278 | = cạ\u0302 # equivalent | |
1279 | = cậ # equivalent | |
1280 | <1 b | |
1281 | ||
1282 | # ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI | |
1283 | # = 03C9 0313 0300 0345 | |
1284 | # ccc = 0, 230, 230, 240 | |
1285 | @ rules | |
1286 | &δ=αῳ | |
1287 | # In order to find discontiguous contractions for αῳ | |
1288 | # there must exist a mapping for αω, regardless of what it maps to. | |
1289 | # (This follows from the UCA spec.) | |
1290 | &ε=αω | |
1291 | * compare | |
1292 | <1 δ | |
1293 | = αῳ | |
1294 | = αω\u0345 | |
1295 | <2 αω\u0313\u0300\u0345 # discontiguous contraction | |
1296 | = αὠ\u0300\u0345 | |
1297 | = αὢ\u0345 | |
1298 | = αᾢ | |
1299 | <2 αω\u0300\u0313\u0345 | |
1300 | = αὼ\u0313\u0345 | |
1301 | = αῲ\u0313 # not FCD | |
1302 | <1 ε | |
1303 | = αω | |
1304 | ||
1305 | # Double-check that without the extra mapping there will be no discontiguous match. | |
1306 | @ rules | |
1307 | &δ=αῳ | |
1308 | * compare | |
1309 | <1 αω\u0313\u0300\u0345 # no discontiguous contraction | |
1310 | = αὠ\u0300\u0345 | |
1311 | = αὢ\u0345 | |
1312 | = αᾢ | |
1313 | <2 αω\u0300\u0313\u0345 | |
1314 | = αὼ\u0313\u0345 | |
1315 | = αῲ\u0313 # not FCD | |
1316 | <1 δ | |
1317 | = αῳ | |
1318 | = αω\u0345 | |
1319 | ||
1320 | # Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232. | |
1321 | # Tests code paths where the tailored string has a combining mark | |
1322 | # that does not occur in any composite's decomposition. | |
1323 | @ rules | |
1324 | &δ=αὼ\u0315 | |
1325 | * compare | |
1326 | <1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above. | |
1327 | = αὠ\u0300\u0315 | |
1328 | = αὢ\u0315 | |
1329 | <1 δ | |
1330 | = αὼ\u0315 | |
1331 | = αω\u0300\u0315 | |
1332 | <2 αω\u0300\u0315\u0345 | |
1333 | = αὼ\u0315\u0345 | |
1334 | = αῲ\u0315 # not FCD | |
1335 | ||
1336 | ** test: danish a+a vs. a-umlaut, ICU ticket 9319 | |
1337 | @ rules | |
1338 | &z<aa | |
1339 | * compare | |
1340 | <1 z | |
1341 | <1 aa | |
1342 | <2 aa\u0308 | |
1343 | = aä | |
1344 | ||
1345 | ** test: Jamo L with and in prefix | |
1346 | # Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L). | |
1347 | @ rules | |
1348 | # Jamo Lead consonant G after G or GG | |
1349 | &[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100 | |
1350 | # Jamo Lead consonant GG sorts like G+G | |
1351 | &\u1100\u1100=\u1101 | |
1352 | # Note: Making G|GG and GG|GG sort the same as G|G+G | |
1353 | # would require the ability to reset on G|G+G, | |
1354 | # or we could make G-after-G equal to some secondary-CE character, | |
1355 | # and reset on a pair of those. | |
1356 | # (It does not matter much if there are at most two G in a row in real text.) | |
1357 | * compare | |
1358 | <1 \u1100 | |
1359 | <2 \u1100\u1100 # only one primary from a sequence of G lead consonants | |
1360 | = \u1101 | |
1361 | <2 \u1100\u1100\u1100 | |
1362 | = \u1101\u1100 | |
1363 | # but not = \u1100\u1101, see above | |
1364 | <1 \u1100\u1161 | |
1365 | = \uAC00 | |
1366 | <2 \u1100\u1100\u1161 | |
1367 | = \u1100\uAC00 # prefix match from the L of the LV syllable | |
1368 | = \u1101\u1161 | |
1369 | = \uAE4C | |
1370 | ||
1371 | ** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546 | |
1372 | @ rules | |
1373 | # Low secondary CEs for Jamo V & T. | |
1374 | # Note: T should sort before V for proper syllable order. | |
1375 | &\u0332 # COMBINING LOW LINE (first primary ignorable) | |
1376 | <<\u1161<<\u1162 | |
1377 | ||
1378 | # Korean Jamo lead consonant search rules, part 2: | |
1379 | # Make modern compound L jamo primary equivalent to non-compound forms. | |
1380 | ||
1381 | # Secondary CEs for Jamo L-after-L, greater than Jamo V & T. | |
1382 | &\u0313 # COMBINING COMMA ABOVE (second primary ignorable) | |
1383 | =\u1100|\u1100 | |
1384 | =\u1103|\u1103 | |
1385 | =\u1107|\u1107 | |
1386 | =\u1109|\u1109 | |
1387 | =\u110C|\u110C | |
1388 | ||
1389 | # Compound L Jamo map to equivalent expansions of primary+secondary CE. | |
1390 | &\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK | |
1391 | &\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT | |
1392 | &\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP | |
1393 | &\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS | |
1394 | &\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC | |
1395 | ||
1396 | * compare | |
1397 | <1 \u1100\u1161 | |
1398 | = \uAC00 | |
1399 | <2 \u1100\u1162 | |
1400 | = \uAC1C | |
1401 | <2 \u1100\u1100\u1161 | |
1402 | = \u1100\uAC00 | |
1403 | = \u1101\u1161 | |
1404 | = \uAE4C | |
1405 | <3 \u3132\u1161 | |
1406 | ||
1407 | ** test: Hangul syllables in prefix & in the interior of a contraction | |
1408 | @ rules | |
1409 | &x=\u1100\u1161|a\u1102\u1162z | |
1410 | * compare | |
1411 | <1 \u1100\u1161x | |
1412 | = \u1100\u1161a\u1102\u1162z | |
1413 | = \u1100\u1161a\uB0B4z | |
1414 | = \uAC00a\u1102\u1162z | |
1415 | = \uAC00a\uB0B4z | |
1416 | ||
1417 | ** test: digits are unsafe-backwards when numeric=on | |
1418 | @ root | |
1419 | % numeric=on | |
1420 | * compare | |
1421 | # If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a". | |
1422 | # We need to back up before the identical prefix "1" and compare the full numbers. | |
1423 | <1 11b | |
1424 | <1 101a | |
1425 | ||
1426 | ** test: simple locale data test | |
1427 | @ locale de | |
1428 | * compare | |
1429 | <1 a | |
1430 | <2 ä | |
1431 | <1 ae | |
1432 | <2 æ | |
1433 | ||
1434 | @ locale de-u-co-phonebk | |
1435 | * compare | |
1436 | <1 a | |
1437 | <1 ae | |
1438 | <2 ä | |
1439 | <2 æ | |
1440 | ||
1441 | # The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt. | |
1442 | ||
1443 | ** test: DataDrivenCollationTest/TestMorePinyin | |
1444 | # Testing the primary strength. | |
1445 | @ locale zh | |
1446 | % strength=primary | |
1447 | * compare | |
1448 | < lā | |
1449 | = lĀ | |
1450 | = Lā | |
1451 | = LĀ | |
1452 | < lān | |
1453 | = lĀn | |
1454 | < lē | |
1455 | = lĒ | |
1456 | = Lē | |
1457 | = LĒ | |
1458 | < lēn | |
1459 | = lĒn | |
1460 | ||
1461 | ** test: DataDrivenCollationTest/TestLithuanian | |
1462 | # Lithuanian sort order. | |
1463 | @ locale lt | |
1464 | * compare | |
1465 | < cz | |
1466 | < č | |
1467 | < d | |
1468 | < iz | |
1469 | < j | |
1470 | < sz | |
1471 | < š | |
1472 | < t | |
1473 | < zz | |
1474 | < ž | |
1475 | ||
1476 | ** test: DataDrivenCollationTest/TestLatvian | |
1477 | # Latvian sort order. | |
1478 | @ locale lv | |
1479 | * compare | |
1480 | < cz | |
1481 | < č | |
1482 | < d | |
1483 | < gz | |
1484 | < ģ | |
1485 | < h | |
1486 | < iz | |
1487 | < j | |
1488 | < kz | |
1489 | < ķ | |
1490 | < l | |
1491 | < lz | |
1492 | < ļ | |
1493 | < m | |
1494 | < nz | |
1495 | < ņ | |
1496 | < o | |
1497 | < rz | |
1498 | < ŗ | |
1499 | < s | |
1500 | < sz | |
1501 | < š | |
1502 | < t | |
1503 | < zz | |
1504 | < ž | |
1505 | ||
1506 | ** test: DataDrivenCollationTest/TestEstonian | |
1507 | # Estonian sort order. | |
1508 | @ locale et | |
1509 | * compare | |
1510 | < sy | |
1511 | < š | |
1512 | < šy | |
1513 | < z | |
1514 | < zy | |
1515 | < ž | |
1516 | < v | |
57a6839d | 1517 | < va |
b331163b | 1518 | < w |
57a6839d A |
1519 | < õ |
1520 | < õy | |
1521 | < ä | |
1522 | < äy | |
1523 | < ö | |
1524 | < öy | |
1525 | < ü | |
1526 | < üy | |
1527 | < x | |
1528 | ||
1529 | ** test: DataDrivenCollationTest/TestAlbanian | |
1530 | # Albanian sort order. | |
1531 | @ locale sq | |
1532 | * compare | |
1533 | < cz | |
1534 | < ç | |
1535 | < d | |
1536 | < dz | |
1537 | < dh | |
1538 | < e | |
1539 | < ez | |
1540 | < ë | |
1541 | < f | |
1542 | < gz | |
1543 | < gj | |
1544 | < h | |
1545 | < lz | |
1546 | < ll | |
1547 | < m | |
1548 | < nz | |
1549 | < nj | |
1550 | < o | |
1551 | < rz | |
1552 | < rr | |
1553 | < s | |
1554 | < sz | |
1555 | < sh | |
1556 | < t | |
1557 | < tz | |
1558 | < th | |
1559 | < u | |
1560 | < xz | |
1561 | < xh | |
1562 | < y | |
1563 | < zz | |
1564 | < zh | |
1565 | ||
1566 | ** test: DataDrivenCollationTest/TestSimplifiedChineseOrder | |
1567 | # Sorted file has different order. | |
1568 | @ root | |
1569 | # The normalization attribute is turned on & off automatically. | |
1570 | * compare | |
1571 | < \u5F20 | |
1572 | < \u5F20\u4E00\u8E3F | |
1573 | ||
1574 | ** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash | |
1575 | # This pretty much crashes. | |
1576 | @ root | |
1577 | * compare | |
1578 | < \u0f71\u0f72\u0f80\u0f71\u0f72 | |
1579 | < \u0f80 | |
1580 | ||
1581 | ** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems | |
1582 | # These are examples of strings that caused trouble in partial sort key testing. | |
1583 | @ locale th-TH | |
1584 | * compare | |
1585 | < \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C | |
1586 | < \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18 | |
1587 | * compare | |
1588 | < \u0E01\u0E07\u0E01\u0E32\u0E23 | |
1589 | < \u0E01\u0E07\u0E42\u0E01\u0E49 | |
1590 | * compare | |
1591 | < \u0E01\u0E23\u0E19\u0E17\u0E32 | |
1592 | < \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32 | |
1593 | * compare | |
1594 | < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27 | |
1595 | < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27 | |
1596 | * compare | |
1597 | < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D | |
1598 | < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32 | |
1599 | ||
1600 | ** test: DataDrivenCollationTest/TestJavaStyleRule | |
1601 | # java.text allows rules to start as '<<<x<<<y...' | |
1602 | # we emulate this by assuming a &[first tertiary ignorable] in this case. | |
1603 | @ rules | |
1604 | &\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b | |
1605 | * compare | |
1606 | = a | |
1607 | = equal | |
1608 | < z | |
1609 | < x | |
1610 | = b # x had become the new first primary ignorable | |
1611 | < w | |
1612 | ||
1613 | ** test: DataDrivenCollationTest/TestShiftedIgnorable | |
1614 | # The UCA states that primary ignorables should be completely | |
1615 | # ignorable when following a shifted code point. | |
1616 | @ root | |
1617 | % alternate=shifted | |
1618 | % strength=quaternary | |
1619 | * compare | |
1620 | < a\u0020b | |
1621 | = a\u0020\u0300b | |
1622 | = a\u0020\u0301b | |
1623 | < a_b | |
1624 | = a_\u0300b | |
1625 | = a_\u0301b | |
1626 | < A\u0020b | |
1627 | = A\u0020\u0300b | |
1628 | = A\u0020\u0301b | |
1629 | < A_b | |
1630 | = A_\u0300b | |
1631 | = A_\u0301b | |
1632 | < a\u0301b | |
1633 | < A\u0301b | |
1634 | < a\u0300b | |
1635 | < A\u0300b | |
1636 | ||
1637 | ** test: DataDrivenCollationTest/TestNShiftedIgnorable | |
1638 | # The UCA states that primary ignorables should be completely ignorable when following | |
1639 | # a shifted code point. Here alternate=non-ignorable, so nothing is shifted and the accents still count. | |
1640 | @ root | |
1641 | % alternate=non-ignorable | |
1642 | % strength=tertiary | |
1643 | * compare | |
1644 | < a\u0020b | |
1645 | < A\u0020b | |
1646 | < a\u0020\u0301b | |
1647 | < A\u0020\u0301b | |
1648 | < a\u0020\u0300b | |
1649 | < A\u0020\u0300b | |
1650 | < a_b | |
1651 | < A_b | |
1652 | < a_\u0301b | |
1653 | < A_\u0301b | |
1654 | < a_\u0300b | |
1655 | < A_\u0300b | |
1656 | < a\u0301b | |
1657 | < A\u0301b | |
1658 | < a\u0300b | |
1659 | < A\u0300b | |
1660 | ||
1661 | ** test: DataDrivenCollationTest/TestSafeSurrogates | |
1662 | # It turned out that surrogates were not skipped properly | |
1663 | # when iterating backwards if they were in the middle of a | |
1664 | # contraction. This test ensures that this is fixed. | |
1665 | @ rules | |
1666 | &a < x\ud800\udc00b | |
1667 | * compare | |
1668 | < a | |
1669 | < x\ud800\udc00b | |
1670 | ||
1671 | ** test: DataDrivenCollationTest/da_TestPrimary | |
1672 | # This test goes through primary strength cases | |
1673 | @ locale da | |
1674 | % strength=primary | |
1675 | * compare | |
1676 | < Lvi | |
1677 | < Lwi | |
1678 | * compare | |
1679 | < L\u00e4vi | |
1680 | < L\u00f6wi | |
1681 | * compare | |
1682 | < L\u00fcbeck | |
1683 | = Lybeck | |
1684 | ||
1685 | ** test: DataDrivenCollationTest/da_TestTertiary | |
1686 | # This test goes through tertiary strength cases | |
1687 | @ locale da | |
1688 | % strength=tertiary | |
1689 | * compare | |
1690 | < Luc | |
1691 | < luck | |
1692 | * compare | |
1693 | < luck | |
1694 | < L\u00fcbeck | |
1695 | * compare | |
1696 | < lybeck | |
1697 | < L\u00fcbeck | |
1698 | * compare | |
1699 | < L\u00e4vi | |
1700 | < L\u00f6we | |
1701 | * compare | |
1702 | < L\u00f6ww | |
1703 | < mast | |
1704 | ||
1705 | * compare | |
1706 | < A/S | |
1707 | < ANDRE | |
1708 | < ANDR\u00c9 | |
1709 | < ANDREAS | |
1710 | < AS | |
1711 | < CA | |
1712 | < \u00c7A | |
1713 | < CB | |
1714 | < \u00c7C | |
1715 | < D.S.B. | |
1716 | < DA | |
1717 | < \u00d0A | |
1718 | < DB | |
1719 | < \u00d0C | |
1720 | < DSB | |
1721 | < DSC | |
1722 | < EKSTRA_ARBEJDE | |
1723 | < EKSTRABUD0 | |
1724 | < H\u00d8ST | |
1725 | < HAAG | |
1726 | < H\u00c5NDBOG | |
1727 | < HAANDV\u00c6RKSBANKEN | |
1728 | < Karl | |
1729 | < karl | |
1730 | < NIELS\u0020J\u00d8RGEN | |
1731 | < NIELS-J\u00d8RGEN | |
1732 | < NIELSEN | |
1733 | < R\u00c9E,\u0020A | |
1734 | < REE,\u0020B | |
1735 | < R\u00c9E,\u0020L | |
1736 | < REE,\u0020V | |
1737 | < SCHYTT,\u0020B | |
1738 | < SCHYTT,\u0020H | |
1739 | < SCH\u00dcTT,\u0020H | |
1740 | < SCHYTT,\u0020L | |
1741 | < SCH\u00dcTT,\u0020M | |
1742 | < SS | |
1743 | < \u00df | |
1744 | < SSA | |
1745 | < STORE\u0020VILDMOSE | |
1746 | < STOREK\u00c6R0 | |
1747 | < STORM\u0020PETERSEN | |
1748 | < STORMLY | |
1749 | < THORVALD | |
1750 | < THORVARDUR | |
1751 | < \u00feORVAR\u00d0UR | |
1752 | < THYGESEN | |
1753 | < VESTERG\u00c5RD,\u0020A | |
1754 | < VESTERGAARD,\u0020A | |
1755 | < VESTERG\u00c5RD,\u0020B | |
1756 | < \u00c6BLE | |
1757 | < \u00c4BLE | |
1758 | < \u00d8BERG | |
1759 | < \u00d6BERG | |
1760 | ||
1761 | * compare | |
1762 | < andere | |
1763 | < chaque | |
1764 | < chemin | |
1765 | < cote | |
1766 | < cot\u00e9 | |
1767 | < c\u00f4te | |
1768 | < c\u00f4t\u00e9 | |
1769 | < \u010du\u010d\u0113t | |
1770 | < Czech | |
1771 | < hi\u0161a | |
1772 | < irdisch | |
1773 | < lie | |
1774 | < lire | |
1775 | < llama | |
1776 | < l\u00f5ug | |
1777 | < l\u00f2za | |
1778 | < lu\u010d | |
1779 | < luck | |
1780 | < L\u00fcbeck | |
1781 | < lye | |
1782 | < l\u00e4vi | |
1783 | < L\u00f6wen | |
1784 | < m\u00e0\u0161ta | |
1785 | < m\u00eer | |
1786 | < myndig | |
1787 | < M\u00e4nner | |
1788 | < m\u00f6chten | |
1789 | < pi\u00f1a | |
1790 | < pint | |
1791 | < pylon | |
1792 | < \u0161\u00e0ran | |
1793 | < savoir | |
1794 | < \u0160erb\u016bra | |
1795 | < Sietla | |
1796 | < \u015blub | |
1797 | < subtle | |
1798 | < symbol | |
1799 | < s\u00e4mtlich | |
1800 | < verkehrt | |
1801 | < vox | |
1802 | < v\u00e4ga | |
1803 | < waffle | |
1804 | < wood | |
1805 | < yen | |
1806 | < yuan | |
1807 | < yucca | |
1808 | < \u017eal | |
1809 | < \u017eena | |
1810 | < \u017den\u0113va | |
1811 | < zoo0 | |
1812 | < Zviedrija | |
1813 | < Z\u00fcrich | |
1814 | < zysk0 | |
1815 | < \u00e4ndere | |
1816 | ||
1817 | ** test: DataDrivenCollationTest/hi_TestNewRules | |
1818 | # This test goes through new rules and tests against old rules | |
1819 | @ locale hi | |
1820 | * compare | |
1821 | < कॐ | |
1822 | < कं | |
1823 | < कँ | |
1824 | < कः | |
1825 | ||
1826 | ** test: DataDrivenCollationTest/ro_TestNewRules | |
1827 | # This test goes through new rules and tests against old rules | |
1828 | @ locale ro | |
1829 | * compare | |
1830 | < xAx | |
1831 | < xă | |
1832 | < xĂ | |
1833 | < Xă | |
1834 | < XĂ | |
1835 | < xăx | |
1836 | < xĂx | |
1837 | < xâ | |
1838 | < x | |
1839 | < Xâ | |
1840 | < XÂ | |
1841 | < xâx | |
1842 | < xÂx | |
1843 | < xb | |
1844 | < xIx | |
1845 | < xî | |
1846 | < xÎ | |
1847 | < Xî | |
1848 | < XÎ | |
1849 | < xîx | |
1850 | < xÎx | |
1851 | < xj | |
1852 | < xSx | |
1853 | < xș | |
1854 | = xş | |
1855 | < xȘ | |
1856 | = xŞ | |
1857 | < Xș | |
1858 | = Xş | |
1859 | < XȘ | |
1860 | = XŞ | |
1861 | < xșx | |
1862 | = xşx | |
1863 | < xȘx | |
1864 | = xŞx | |
1865 | < xT | |
1866 | < xTx | |
1867 | < xț | |
1868 | = xţ | |
1869 | < xȚ | |
1870 | = xŢ | |
1871 | < Xț | |
1872 | = Xţ | |
1873 | < XȚ | |
1874 | = XŢ | |
1875 | < xțx | |
1876 | = xţx | |
1877 | < xȚx | |
1878 | = xŢx | |
1879 | < xU | |
1880 | ||
1881 | ** test: DataDrivenCollationTest/testOffsets | |
1882 | # This tests cases where forwards and backwards iteration get different offsets | |
1883 | @ locale en | |
1884 | % strength=tertiary | |
1885 | * compare | |
1886 | < a\uD800\uDC00\uDC00 | |
1887 | < b\uD800\uDC00\uDC00 | |
1888 | * compare | |
1889 | < \u0301A\u0301\u0301 | |
1890 | < \u0301B\u0301\u0301 | |
1891 | * compare | |
1892 | < abcd\r\u0301 | |
1893 | < abce\r\u0301 | |
1894 | # TODO: test offsets in new CollationTest | |
1895 | ||
1896 | # End of test cases moved here from ICU 52's DataDrivenCollationTest.txt. | |
1897 | ||
1898 | ** test: was ICU 52 cmsccoll/TestRedundantRules | |
1899 | @ rules | |
1900 | & a < b < c < d& [before 1] c < m | |
1901 | * compare | |
1902 | <1 a | |
1903 | <1 b | |
1904 | <1 m | |
1905 | <1 c | |
1906 | <1 d | |
1907 | ||
1908 | @ rules | |
1909 | & a < b <<< c << d <<< e& [before 3] e <<< x | |
1910 | * compare | |
1911 | <1 a | |
1912 | <1 b | |
1913 | <3 c | |
1914 | <2 d | |
1915 | <3 x | |
1916 | <3 e | |
1917 | ||
1918 | @ rules | |
1919 | & a < b <<< c << d <<< e <<< f < g& [before 1] g < x | |
1920 | * compare | |
1921 | <1 a | |
1922 | <1 b | |
1923 | <3 c | |
1924 | <2 d | |
1925 | <3 e | |
1926 | <3 f | |
1927 | <1 x | |
1928 | <1 g | |
1929 | ||
1930 | @ rules | |
1931 | & a <<< b << c < d& a < m | |
1932 | * compare | |
1933 | <1 a | |
1934 | <3 b | |
1935 | <2 c | |
1936 | <1 m | |
1937 | <1 d | |
1938 | ||
1939 | @ rules | |
1940 | &a<b<<b\u0301 &z<b | |
1941 | * compare | |
1942 | <1 a | |
1943 | <1 b\u0301 | |
1944 | <1 z | |
1945 | <1 b | |
1946 | ||
1947 | @ rules | |
1948 | &z<m<<<q<<<m | |
1949 | * compare | |
1950 | <1 z | |
1951 | <1 q | |
1952 | <3 m | |
1953 | ||
1954 | @ rules | |
1955 | &z<<<m<q<<<m | |
1956 | * compare | |
1957 | <1 z | |
1958 | <1 q | |
1959 | <3 m | |
1960 | ||
1961 | @ rules | |
1962 | & a < b < c < d& r < c | |
1963 | * compare | |
1964 | <1 a | |
1965 | <1 b | |
1966 | <1 d | |
1967 | <1 r | |
1968 | <1 c | |
1969 | ||
1970 | @ rules | |
1971 | & a < b < c < d& c < m | |
1972 | * compare | |
1973 | <1 a | |
1974 | <1 b | |
1975 | <1 c | |
1976 | <1 m | |
1977 | <1 d | |
1978 | ||
1979 | @ rules | |
1980 | & a < b < c < d& a < m | |
1981 | * compare | |
1982 | <1 a | |
1983 | <1 m | |
1984 | <1 b | |
1985 | <1 c | |
1986 | <1 d | |
1987 | ||
1988 | ** test: was ICU 52 cmsccoll/TestExpansionSyntax | |
1989 | # The following two rules should sort the particular list of strings the same. | |
1990 | @ rules | |
1991 | &AE <<< a << b <<< c &d <<< f | |
1992 | * compare | |
1993 | <1 AE | |
1994 | <3 a | |
1995 | <2 b | |
1996 | <3 c | |
1997 | <1 d | |
1998 | <3 f | |
1999 | ||
2000 | @ rules | |
2001 | &A <<< a / E << b / E <<< c /E &d <<< f | |
2002 | * compare | |
2003 | <1 AE | |
2004 | <3 a | |
2005 | <2 b | |
2006 | <3 c | |
2007 | <1 d | |
2008 | <3 f | |
2009 | ||
2010 | # The following two rules should sort the particular list of strings the same. | |
2011 | @ rules | |
2012 | &AE <<< a <<< b << c << d < e < f <<< g | |
2013 | * compare | |
2014 | <1 AE | |
2015 | <3 a | |
2016 | <3 b | |
2017 | <2 c | |
2018 | <2 d | |
2019 | <1 e | |
2020 | <1 f | |
2021 | <3 g | |
2022 | ||
2023 | @ rules | |
2024 | &A <<< a / E <<< b / E << c / E << d / E < e < f <<< g | |
2025 | * compare | |
2026 | <1 AE | |
2027 | <3 a | |
2028 | <3 b | |
2029 | <2 c | |
2030 | <2 d | |
2031 | <1 e | |
2032 | <1 f | |
2033 | <3 g | |
2034 | ||
2035 | # The following two rules should sort the particular list of strings the same. | |
2036 | @ rules | |
2037 | &AE <<< B <<< C / D <<< F | |
2038 | * compare | |
2039 | <1 AE | |
2040 | <3 B | |
2041 | <3 F | |
2042 | <1 AED | |
2043 | <3 C | |
2044 | ||
2045 | @ rules | |
2046 | &A <<< B / E <<< C / ED <<< F / E | |
2047 | * compare | |
2048 | <1 AE | |
2049 | <3 B | |
2050 | <3 F | |
2051 | <1 AED | |
2052 | <3 C | |
2053 | ||
2054 | ** test: never reorder trailing primaries | |
2055 | @ root | |
2056 | % reorder Zzzz Grek | |
2057 | * compare | |
2058 | <1 L | |
2059 | <1 字 | |
2060 | <1 Ω | |
2061 | <1 \uFFFD | |
2062 | <1 \uFFFF | |
2063 | ||
2064 | ** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes | |
2065 | @ rules | |
2066 | &u=ab|cd | |
2067 | &v=b|ce | |
2068 | * compare | |
2069 | <1 abc | |
2070 | <1 abcc | |
2071 | <1 abcf | |
2072 | <1 abcd | |
2073 | = abu | |
2074 | <1 abce | |
2075 | = abv | |
2076 | ||
2077 | # With the following rules, there is only one prefix per composite ĉ or ç, | |
2078 | # but both prefixes apply to just c in NFD form. | |
2079 | # We would get different results for composed vs. NFD input | |
2080 | # if we fell back directly from longest-prefix mappings to no-prefix mappings. | |
2081 | @ rules | |
2082 | &x=op|ĉ | |
2083 | &y=p|ç | |
2084 | * compare | |
2085 | <1 opc | |
2086 | <2 opć | |
2087 | <1 opcz | |
2088 | <1 opd | |
2089 | <1 opĉ | |
2090 | = opc\u0302 | |
2091 | = opx | |
2092 | <1 opç | |
2093 | = opc\u0327 | |
2094 | = opy | |
2095 | ||
2096 | # The mapping that is used is the one with the longest matching prefix for which | |
2097 | # there is also a suffix match; among several mappings for that prefix, the longest suffix match wins. | |
2098 | @ rules | |
2099 | &❶=d | |
2100 | &❷=de | |
2101 | &❸=def | |
2102 | &①=c|d | |
2103 | &②=c|de | |
2104 | &③=c|def | |
2105 | &④=bc|d | |
2106 | &⑤=bc|de | |
2107 | &⑥=bc|def | |
2108 | &⑦=abc|d | |
2109 | &⑧=abc|de | |
2110 | &⑨=abc|def | |
2111 | * compare | |
2112 | <1 9aadzz | |
2113 | = 9aa❶zz | |
2114 | <1 9aadez | |
2115 | = 9aa❷z | |
2116 | <1 9aadef | |
2117 | = 9aa❸ | |
2118 | <1 9acdzz | |
2119 | = 9ac①zz | |
2120 | <1 9acdez | |
2121 | = 9ac②z | |
2122 | <1 9acdef | |
2123 | = 9ac③ | |
2124 | <1 9bcdzz | |
2125 | = 9bc④zz | |
2126 | <1 9bcdez | |
2127 | = 9bc⑤z | |
2128 | <1 9bcdef | |
2129 | = 9bc⑥ | |
2130 | <1 abcdzz | |
2131 | = abc⑦zz | |
2132 | <1 abcdez | |
2133 | = abc⑧z | |
2134 | <1 abcdef | |
2135 | = abc⑨ | |
2136 | ||
2137 | ** test: prefix + discontiguous contraction with missing prefix contraction | |
2138 | # Unfortunate terminology: The first "prefix" here is the pre-context, | |
2139 | # the second "prefix" refers to the contraction/relation string that is | |
2140 | # one shorter than the one being tested. | |
2141 | @ rules | |
2142 | &x=p|e | |
2143 | &y=p|ê | |
2144 | &z=op|ê | |
2145 | # No mapping for op|e: | |
2146 | # Discontiguous contraction matching should not match op|ê in opệ | |
2147 | # because it would have to skip the dot below and extend a match on op|e by the circumflex, | |
2148 | # but there is no match on op|e. | |
2149 | * compare | |
2150 | <1 oPe | |
2151 | <1 ope | |
2152 | = opx | |
2153 | <1 opệ | |
2154 | = opy\u0323 # y not z | |
2155 | <1 opê | |
2156 | = opz | |
2157 | ||
2158 | # We cannot test for fallback by whether the contraction default CE32 | |
2159 | # is for another contraction. With the following rules, there is no mapping for op|e, | |
2160 | # and the fallback to prefix p has no contractions. | |
2161 | @ rules | |
2162 | &x=p|e | |
2163 | &z=op|ê | |
2164 | * compare | |
2165 | <1 oPe | |
2166 | <1 ope | |
2167 | = opx | |
2168 | <2 opệ | |
2169 | = opx\u0323\u0302 # x not z | |
2170 | <1 opê | |
2171 | = opz | |
2172 | ||
2173 | # One more variation: Fallback to the simple code point, no shorter non-empty prefix. | |
2174 | @ rules | |
2175 | &x=e | |
2176 | &z=op|ê | |
2177 | * compare | |
2178 | <1 ope | |
2179 | = opx | |
2180 | <3 oPe | |
2181 | = oPx | |
2182 | <2 opệ | |
2183 | = opx\u0323\u0302 # x not z | |
2184 | <1 opê | |
2185 | = opz | |
2186 | ||
2187 | ** test: maxVariable via rules | |
2188 | @ rules | |
2189 | [maxVariable space][alternate shifted] | |
2190 | * compare | |
2191 | = \u0020 | |
2192 | = \u000A | |
2193 | <1 . | |
2194 | <1 ° # degree sign | |
2195 | <1 $ | |
2196 | <1 0 | |
2197 | ||
2198 | ** test: maxVariable via setting | |
2199 | @ root | |
2200 | % maxVariable=currency | |
2201 | % alternate=shifted | |
2202 | * compare | |
2203 | = \u0020 | |
2204 | = \u000A | |
2205 | = . | |
2206 | = ° # degree sign | |
2207 | = $ | |
2208 | <1 0 | |
2209 | ||
2210 | ** test: ICU4J CollationMiscTest/TestContractionClosure (ää) | |
2211 | # This tests canonical closure, but it also tests that CollationFastLatin | |
2212 | # bails out properly for contractions with combining marks. | |
2213 | # For that we need pairs of strings that remain in the Latin fastpath | |
2214 | # long enough, hence the extra "= b" lines. | |
2215 | @ rules | |
2216 | &b=\u00e4\u00e4 | |
2217 | * compare | |
2218 | <1 b | |
2219 | = \u00e4\u00e4 | |
2220 | = b | |
2221 | = a\u0308a\u0308 | |
2222 | = b | |
2223 | = \u00e4a\u0308 | |
2224 | = b | |
2225 | = a\u0308\u00e4 | |
2226 | ||
2227 | ** test: ICU4J CollationMiscTest/TestContractionClosure (Å) | |
2228 | @ rules | |
2229 | &b=\u00C5 | |
2230 | * compare | |
2231 | <1 b | |
2232 | = \u00C5 | |
2233 | = b | |
2234 | = A\u030A | |
2235 | = b | |
2236 | = \u212B | |
2237 | ||
2238 | ** test: reset-before on already-tailored characters, ICU ticket 10108 | |
2239 | @ rules | |
2240 | &a<w<<x &[before 2]x<<y | |
2241 | * compare | |
2242 | <1 a | |
2243 | <1 w | |
2244 | <2 y | |
2245 | <2 x | |
2246 | ||
2247 | @ rules | |
2248 | &a<<w<<<x &[before 2]x<<y | |
2249 | * compare | |
2250 | <1 a | |
2251 | <2 y | |
2252 | <2 w | |
2253 | <3 x | |
2254 | ||
2255 | @ rules | |
2256 | &a<w<x &[before 2]x<<y | |
2257 | * compare | |
2258 | <1 a | |
2259 | <1 w | |
2260 | <1 y | |
2261 | <2 x | |
2262 | ||
2263 | @ rules | |
2264 | &a<w<<<x &[before 2]x<<y | |
2265 | * compare | |
2266 | <1 a | |
2267 | <1 y | |
2268 | <2 w | |
2269 | <3 x | |
2270 | ||
2271 | ** test: numeric collation with other settings, ICU ticket 9092 | |
2272 | @ root | |
2273 | % strength=identical | |
2274 | % caseFirst=upper | |
2275 | % numeric=on | |
2276 | * compare | |
2277 | <1 100\u0020a | |
2278 | <1 101 | |
2279 | ||
2280 | ** test: collation type fallback from unsupported type, ICU ticket 10149 | |
2281 | @ locale fr-CA-u-co-phonebk | |
2282 | # Expect the same result as with fr-CA, using backwards-secondary order. | |
2283 | # That is, we should fall back from the unsupported collation type | |
2284 | # to the locale's default collation type. | |
2285 | * compare | |
2286 | <1 cote | |
2287 | <2 côte | |
2288 | <2 coté | |
2289 | <2 côté | |
2290 | ||
2291 | ** test: @ is equivalent to [backwards 2], ICU ticket 9956 | |
2292 | @ rules | |
2293 | &b<a @ &v<<w | |
2294 | * compare | |
2295 | <1 b | |
2296 | <1 a | |
2297 | <1 cote | |
2298 | <2 côte | |
2299 | <2 coté | |
2300 | <2 côté | |
2301 | <1 v | |
2302 | <2 w | |
2303 | <1 x | |
2304 | ||
2305 | ** test: shifted+reordering, ICU ticket 9507 | |
2306 | @ root | |
2307 | % reorder Grek punct space | |
2308 | % alternate=shifted | |
2309 | % strength=quaternary | |
2310 | # Which primaries are "variable" should be determined without script reordering, | |
2311 | # and then primaries should be reordered whether they are shifted to quaternary or not. | |
2312 | * compare | |
2313 | <4 ( # punctuation | |
2314 | <4 ) | |
2315 | <4 \u0020 # space | |
2316 | <1 ` # symbol | |
2317 | <1 ^ | |
2318 | <1 $ # currency symbol | |
2319 | <1 € | |
2320 | <1 0 # numbers | |
2321 | <1 ε # Greek | |
2322 | <1 e # Latin | |
2323 | <1 e(e | |
2324 | <4 e)e | |
2325 | <4 e\u0020e | |
2326 | <4 ee | |
2327 | <3 e(E | |
2328 | <4 e)E | |
2329 | <4 e\u0020E | |
2330 | <4 eE | |
2331 | ||
2332 | ** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351 | |
2333 | @ rules | |
2334 | &\u0001<<<b<<<B | |
2335 | % caseFirst=upper | |
2336 | * compare | |
2337 | <1 aaa | |
2338 | <3 aaaB | |
2339 | ||
2340 | ** test: secondary+case ignores secondary ignorables, ICU ticket 9355 | |
2341 | @ rules | |
2342 | &\u0001<<<b<<<B | |
2343 | % strength=secondary | |
2344 | % caseLevel=on | |
2345 | * compare | |
2346 | <1 a | |
2347 | = ab | |
2348 | = aB | |
2349 | ||
2350 | ** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328 | |
2351 | @ rules | |
2352 | &[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57 | |
2353 | * compare | |
2354 | <1 ൗx | |
2355 | <2 ൌx | |
2356 | <1 ൗy | |
2357 | <2 ൌy | |
2358 | ||
2359 | ** test: quoted apostrophe in compact syntax, ICU ticket 8204 | |
2360 | @ rules | |
2361 | &q<<*a''c | |
2362 | * compare | |
2363 | <1 d | |
2364 | <1 p | |
2365 | <1 q | |
2366 | <2 a | |
2367 | <2 \u0027 | |
2368 | <2 c | |
2369 | <1 r | |
b331163b A |
2370 | |
2371 | # ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" | |
2372 | ** test: locale -u- with collation keywords, ICU ticket 8260 | |
2373 | @ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 | |
2374 | * compare | |
2375 | <4 \u0020 # space is shifted, strength=quaternary | |
2376 | <1 ! # punctuation is regular | |
2377 | <1 2 | |
2378 | <1 12 # numeric sorting | |
2379 | <1 B | |
2380 | <c b # uppercase first on case level | |
2381 | <1 x\u0301\u0308 | |
2382 | <2 x\u0308\u0301 # normalization off | |
2383 | ||
2384 | ** test: locale @ with collation keywords, ICU ticket 8260 | |
2385 | @ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted | |
2386 | * compare | |
2387 | <4 $ # currency symbols are shifted, strength=quaternary | |
2388 | <1 àla | |
2389 | <2 alà # backwards secondary level | |
2390 | ||
2391 | ** test: locale -u- with script reordering, ICU ticket 8260 | |
2392 | @ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai | |
2393 | * compare | |
2394 | <1 \u0020 | |
2395 | <1 あ | |
2396 | <1 ☂ | |
2397 | <1 Ω | |
2398 | <1 丂 | |
2399 | <1 ж | |
2400 | <1 L | |
2401 | <1 4 | |
2402 | <1 Ձ | |
2403 | <1 अ | |
2404 | <1 ሄ | |
2405 | <1 ฉ | |
2406 | ||
2407 | ** test: locale @collation=type should be case-insensitive | |
2408 | @ locale de@coLLation=PhoneBook | |
2409 | * compare | |
2410 | <1 ae | |
2411 | <2 ä | |
2412 | <3 Ä | |
2413 | ||
2414 | ** test: import root search rules plus German phonebook rules, ICU ticket 8962 | |
2415 | @ locale de-u-co-search | |
2416 | * compare | |
2417 | <1 = | |
2418 | <1 ≠ | |
2419 | <1 a | |
2420 | <1 ae | |
2421 | <2 ä | |
2422 | ||
2423 | # Once more, but with the runtime builder: build the equivalent collator from rules instead of loading the locale data. | |
2424 | @ rules | |
2425 | [import und-u-co-search][import de-u-co-phonebk] | |
2426 | * compare | |
2427 | <1 = | |
2428 | <1 ≠ | |
2429 | <1 a | |
2430 | <1 ae | |
2431 | <2 ä | |
2432 | ||
2433 | # Once again, but importing from "root" instead of "und" ("und" is what a proper language tag would use). | |
2434 | @ rules | |
2435 | [import root-u-co-search][import de-u-co-phonebk] | |
2436 | * compare | |
2437 | <1 = | |
2438 | <1 ≠ | |
2439 | <1 a | |
2440 | <1 ae | |
2441 | <2 ä | |
2442 | ||
2443 | ** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998 | |
2444 | # The imported Greek (el) tailoring should sort Greek script first: after digits but before Latin. | |
2445 | @ rules | |
2446 | [import el] | |
2447 | * compare | |
2448 | <1 4 | |
2449 | <1 Ω | |
2450 | <1 L | |
2451 | ||
2452 | # Import Greek, and then reset the reordering. | |
2453 | @ rules | |
2454 | [import el][reorder Zzzz] | |
2455 | * compare | |
2456 | <1 4 | |
2457 | <1 L | |
2458 | <1 Ω | |
2459 | ||
2460 | # "others" is a synonym for Zzzz. | |
2461 | @ rules | |
2462 | [import el][reorder others] | |
2463 | * compare | |
2464 | <1 4 | |
2465 | <1 L | |
2466 | <1 Ω | |
2467 | ||
2468 | ** test: regression test for CollationFastLatinBuilder, ICU ticket 11388 | |
2469 | @ rules | |
2470 | &x<<aa<<<Aa<<<AA | |
2471 | % strength=secondary | |
2472 | * compare | |
2473 | <1 AA | |
2474 | <2 Aẩ | |
2475 | <2 aą | |
2476 | * compare | |
2477 | <1 AA | |
2478 | <2 aą | |
2479 | ||
2480 | ** test: tailor tertiary-after a common tertiary where there is a lower one | |
2481 | # Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one. | |
2482 | # See ICU ticket 11448 & CLDR ticket 7222. | |
2483 | @ rules | |
2484 | &あ<<<x<<<y<<<z | |
2485 | * compare | |
2486 | <1 ぁ | |
2487 | <3 あ | |
2488 | <3 x | |
2489 | <3 y | |
2490 | <3 z | |
2491 | <3 ァ | |
2492 | <1 い | |
2493 | ||
2494 | ** test: tailor tertiary-after a below-common tertiary | |
2495 | @ rules | |
2496 | &ぁ<<<x<<<y<<<z | |
2497 | * compare | |
2498 | <1 ぁ | |
2499 | <3 x | |
2500 | <3 y | |
2501 | <3 z | |
2502 | <3 あ | |
2503 | <3 ァ | |
2504 | <1 い | |
2505 | ||
2506 | ** test: tailor tertiary-before a common tertiary where there is a lower one | |
2507 | @ rules | |
2508 | &[before 3]あ<<<x<<<y<<<z | |
2509 | * compare | |
2510 | <1 ぁ | |
2511 | <3 x | |
2512 | <3 y | |
2513 | <3 z | |
2514 | <3 あ | |
2515 | <3 ァ | |
2516 | <1 い | |
2517 | ||
2518 | ** test: tailor tertiary-before a below-common tertiary | |
2519 | @ rules | |
2520 | &[before 3]ぁ<<<x<<<y<<<z | |
2521 | * compare | |
2522 | <1 x | |
2523 | <3 y | |
2524 | <3 z | |
2525 | <3 ぁ | |
2526 | <3 あ | |
2527 | <3 ァ | |
2528 | <1 い | |
2529 | ||
2530 | ** test: reorder single scripts not groups, ICU ticket 11449 | |
2531 | @ root | |
2532 | % reorder Goth Latn | |
2533 | * compare | |
2534 | <1 4 | |
2535 | <1 𐌰 # Gothic | |
2536 | <1 L | |
2537 | <1 Ω | |
2538 | # Before ICU 55, the following reordered together with Gothic. | |
2539 | <1 𐌈 # Old Italic | |
2540 | <1 𐑐 # Shavian |