1 <?xml version=
"1.0" encoding=
"UTF-8"?>
3 <!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
5 <!-- Test data file for string search -->
6 <!DOCTYPE stringsearch-tests [
7 <!ELEMENT stringsearch-tests (test-case+)
>
8 <!ATTLIST stringsearch-tests debug IDREF #IMPLIED
>
9 <!ELEMENT test-case (pattern, pre?, m?, post?)
>
13 strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL)
"TERTIARY"
15 alternate_handling (NON_IGNORABLE | SHIFTED)
"NON_IGNORABLE"
18 <!ELEMENT pattern (#PCDATA)
>
19 <!ELEMENT pre (#PCDATA)
>
20 <!ELEMENT m (#PCDATA)
>
21 <!ELEMENT post (#PCDATA)
>
25 <!-- debug="test11" (for copying into the above element) -->
27 <!-- Very simple match -->
28 <test-case id=
"test01" >
29 <pattern>abc
</pattern>
30 <pre>xxx
</pre><m>abc
</m><post>yyy
</post>
33 <!-- Very simple no-match -->
34 <test-case id=
"test02" >
35 <pattern>abc
</pattern>
36 <pre>xxx
</pre><post>yyy
</post>
39 <!-- Match after several near-misses. -->
40 <test-case id=
"test03" >
41 <pattern>string
</pattern>
42 <pre>silly spring stling strxng strilg strinx stri
</pre><m>string
</m><post> fling
</post>
45 <test-case id=
"test04" strength=
"PRIMARY" >
46 <pattern>FUSS
</pattern>
47 <pre>abc
</pre><m>fuss
</m><post>sss
</post>
50 <test-case id=
"test05" strength=
"PRIMARY" >
51 <pattern>FUSS
</pattern>
52 <pre>abc
</pre><m>fuß
</m><post>sss
</post>
55 <test-case id=
"test05.5" strength=
"PRIMARY" >
56 <pattern>fuss
</pattern>
59 <post>ball table
</post>
62 <test-case id=
"test06" strength=
"PRIMARY" >
63 <pattern>fuß
</pattern>
64 <pre>abc
</pre><m>fuss
</m><post>xyz
</post>
67 <test-case id=
"test07" strength=
"SECONDARY" >
68 <pattern>fuß
</pattern>
72 <test-case id=
"test08" strength=
"PRIMARY" >
73 <pattern>fus
</pattern>
74 <pre>abcfuß
</pre><post>xyz
</post>
77 <!-- A good match following an initial match that failed because
78 it did not end on a character boundary -->
79 <test-case id=
"test09" strength=
"PRIMARY">
80 <pattern>fus
</pattern>
81 <pre>fuß
</pre><m>fus
</m><post>sss
</post>
85 <!-- Test cases from usrchdat.c BREAKITERATOREXACT -->
87 <test-case id=
"test10" strength=
"TERTIARY">
88 <pattern>fox
</pattern>
89 <m>fox
</m><post>y fox
</post>
92 <test-case id=
"test11" strength=
"PRIMARY" locale=
"de_DE@collation=phonebook">
93 <pattern>toe
</pattern>
94 <pre>This is a
</pre><m>Tö
</m><post>ne
</post>
97 <test-case id=
"test11a" strength=
"SECONDARY" locale=
"de_DE@collation=phonebook">
98 <pattern>toe
</pattern>
99 <pre>This is a
</pre><post>Töne
</post>
102 <test-case id=
"test12" strength=
"TERTIARY">
104 <pre>tésting that é doés not match
</pre><m>e
</m><post></post>
107 <test-case id=
"test13" strength=
"PRIMARY" locale=
"fr">
109 <pre></pre><m>É
</m><post>É
</post>
112 <test-case id=
"test14" strength=
"PRIMARY" locale=
"fr">
114 <pre>C
</pre><m>O\u0302
</m><post>TÉ
</post>
118 <!-- Test cases from usrchdat.c STRENGTH -->
121 <test-case id=
"test15" strength=
"PRIMARY" locale=
"en">
122 <pattern>fox
</pattern>
123 <pre>The quick brown
</pre><m>fox
</m><post> jumps over the lazy foxes
</post>
126 <test-case id=
"test16" strength=
"PRIMARY" locale=
"fr">
127 <pattern>peche
</pattern>
128 <pre>blackbirds pat
</pre><m>p\u00E9ch\u00E9
</m><post> </post>
131 <test-case id=
"test17" strength=
"PRIMARY" locale=
"fr">
132 <pattern>peche
</pattern>
133 <pre>blackbirds pat
</pre><m>p\u00EAche
</m><post> </post>
136 <test-case id=
"test18" strength=
"PRIMARY" locale=
"fr">
137 <pattern>peche
</pattern>
138 <pre>blackbirds pat
</pre><m>p\u00E9che
</m><post>r
</post>
141 <test-case id=
"test19" strength=
"PRIMARY" locale=
"fr">
142 <pattern>peche
</pattern>
143 <pre>blackbirds pat
</pre><m>p\u00EAche
</m><post>r
</post>
146 <test-case id=
"test20" strength=
"PRIMARY" locale=
"es">
147 <pattern>channel
</pattern>
148 <pre>A
</pre><m>channel
</m><post>,
</post>
151 <test-case id=
"test21" strength=
"PRIMARY" locale=
"es">
152 <pattern>channel
</pattern>
153 <pre>A
</pre><m>CHANNEL
</m><post>,
</post>
156 <test-case id=
"test22" strength=
"PRIMARY" locale=
"es">
157 <pattern>channel
</pattern>
158 <pre>A
</pre><m>Channel
</m><post>s,
</post>
161 <test-case id=
"test23" strength=
"PRIMARY" locale=
"es">
162 <pattern>channel
</pattern>
163 <pre>A
</pre><m>channel
</m><post>...
</post>
166 <test-case id=
"test24" strength=
"TERTIARY" locale=
"en">
167 <pattern>A\u0300
</pattern>
168 <pre>A miss, and then
</pre><m>\u00c0
</m><post> should match but not A"
</post>
171 <!-- TODO: In the original test data, this test matched at IDENTICAL strength.
172 Doesn't seem right. The characters are different.
174 <test-case id=
"test24a" strength=
"IDENTICAL" locale=
"en">
175 <pattern>A\u0300
</pattern>
176 <pre>At IDENTICAL, shoud this match?
</pre><m>\u00c0
</m><post></post>
179 <test-case id=
"test24b" strength=
"IDENTICAL" alternate_handling=
"SHIFTED" locale=
"en">
180 <pattern>A\u0300
</pattern>
181 <pre>At IDENTICAL, shoud this match?
</pre>
186 <test-case id=
"test25" strength=
"SECONDARY" locale=
"en">
188 <pre>12</pre><m>ű
</m><post> Ű
</post>
191 <test-case id=
"test26" strength=
"SECONDARY" locale=
"en">
193 <pre>12</pre><m>a
</m><post>...
</post>
197 <!-- Test Cases from usrchdat.c, VARIABLE -->
198 <test-case id=
"test27" strength=
"TERTIARY" locale=
"en">
199 <pattern>blackbird
</pattern>
200 <pre>black-bird
</pre><m>blackbird
</m><post>...
</post>
203 <test-case id=
"test28" strength=
"TERTIARY" locale=
"en">
204 <pattern>go
</pattern>
208 <!-- TODO: this gives a U_ILLEGAL_ARGUMENT error when opening
209 the UStringSearch. How did the original test run? -->
211 <test-case id="test29" strength="PRIMARY" locale="en">
213 <pre></pre><m></m><post>abc</post>
217 <test-case id=
"test30" strength=
"SECONDARY" locale=
"en">
218 <pattern>abc
</pattern>
219 <pre> a bc ab c a bc ab c"
</pre>
222 <test-case id=
"test31" strength=
"SECONDARY" locale=
"en">
223 <pattern>abc
</pattern>
224 <pre> ---------------
</pre>
228 <!-- Normalization test cases from usrchdat.c -->
229 <test-case id=
"test32" strength=
"TERTIARY" norm=
"ON">
230 <pattern>a\u0325\u0300
</pattern>
231 <pre></pre><m>a\u0300\u0325
</m>
235 <test-case id=
"test32a" strength=
"TERTIARY" norm=
"OFF">
236 <pattern>a\u0325\u0300
</pattern>
237 <pre>a\u0300\u0325
</pre>
241 <!-- COMPOSITEBOUNDARIES from usrchdat.c
242 Boundaries are not identical to the original test data because
243 of matching only full combining sequences
245 <test-case id=
"test40" strength=
"TERTIARY">
247 <pre>À
</pre> <!-- \u00C0 -->
250 <test-case id=
"test41" strength=
"TERTIARY">
252 <pre>À
</pre><m>A
</m><post>C
</post>
255 <test-case id=
"test42" strength=
"TERTIARY">
256 <pattern>A\u030A
</pattern>
262 <!-- SUPPLEMENTARYCANONICAL from usrchdat.c -->
263 <test-case id=
"test50" strength=
"TERTIARY">
264 <pattern>\uD800\uDC00
</pattern>
265 <pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00
</pre><m>\uD800\uDC00
</m>
266 <post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00
</post>
269 <test-case id=
"test51" strength=
"TERTIARY">
270 <pattern>\\uD834\\uDDB9
</pattern>
271 <pre>and
</pre><m>\\uD834\\uDDB9
</m><post>this sentence
</post>
274 <test-case id=
"test52" strength=
"TERTIARY">
275 <pattern> \\uD834\\uDDB9
</pattern>
276 <pre>and
</pre><m> \\uD834\\uDDB9
</m><post>this sentence
</post>
279 <test-case id=
"test53" strength=
"TERTIARY">
280 <pattern>-\\uD834\\uDDB9-
</pattern>
281 <pre>and
</pre><m>-\\uD834\\uDDB9-
</m><post>this sentence
</post>
284 <test-case id=
"test54" strength=
"TERTIARY">
285 <pattern>,\\uD834\\uDDB9,
</pattern>
286 <pre>and
</pre><m>,\\uD834\\uDDB9,
</m><post>this sentence
</post>
289 <test-case id=
"test55" strength=
"TERTIARY">
290 <pattern>?\\uD834\\uDDB9?
</pattern>
291 <pre>and
</pre><m>?\\uD834\\uDDB9?
</m><post>this sentence
</post>
295 <!-- Long combining sequences -->
296 <!-- Backwards search fails because the pattern ends with ignorables
297 <test-case id="test60" strength="PRIMARY">
298 <pattern>A\u0301\u0301\u0301\u0301</pattern>
299 <m>A\u0301\u0301\u0301\u0301\u0301</m>
303 <test-case id=
"test61" strength=
"TERTIARY">
304 <pattern>A\u0301\u0301\u0301\u0301
</pattern>
305 <pre>A\u0301\u0301\u0301\u0301\u0301
</pre>
308 <test-case id=
"test62" strength=
"TERTIARY">
309 <pattern>A\u0301\u0301\u0301\u0301
</pattern>
310 <m>A\u0301\u0301\u0301\u0301
</m>
313 <!-- stand-alone combining marks don't match attached marks -->
314 <test-case id=
"test63" strength=
"TERTIARY">
315 <pattern>\u0301
</pattern>
316 <pre>A\u0301\u0301\u0301\u0301
</pre>
319 <test-case id=
"test64" strength=
"TERTIARY">
320 <pattern>\u0301
</pattern>
321 <post>\u0301\u0301\u0301\u0301
</post>
324 <!-- stand-alone combining mark does match an un-attached combining mark -->
325 <test-case id=
"test65" strength=
"TERTIARY">
326 <pattern>\u0301
</pattern>
327 <m>\u0301
</m><post>A\u0301\u0301
</post>
330 <test-case id=
"test66" strength=
"TERTIARY">
331 <pattern>\u0301
</pattern>
335 <!-- stand-alone combining marks at end of the target text -->
336 <test-case id=
"test67" strength=
"TERTIARY">
337 <pattern>\u0301
</pattern>
338 <pre>abcd\r
</pre><m>\u0301
</m>
341 <!-- attached combining marks at end of the target text, no match -->
342 <test-case id=
"test68" strength=
"TERTIARY">
343 <pattern>\u0301
</pattern>
344 <pre>abcd\u0301
</pre>
349 <!-- no match within expansions at the start -->
350 <test-case id=
"test70" strength=
"PRIMARY">
351 <pattern>Eligature
</pattern>
355 <test-case id=
"test71" strength=
"PRIMARY">
356 <pattern>AEligature
</pattern>
360 <test-case id=
"test72" strength=
"PRIMARY">
361 <pattern>AEligature
</pattern>
365 <!-- unattached combining Tilde will not match a Tilde that is
366 part of a composed Ñ (\u00D1) -->
367 <test-case id=
"test73" strength=
"SECONDARY">
368 <pattern>\u0303
</pattern> <!-- combining tilde -->
369 <pre>Ñ

</pre><m>\u0303
</m>
372 <test-case id=
"test74" strength=
"SECONDARY">
373 <pattern>\u0303
</pattern> <!-- combining tilde -->
374 <pre>Ñ

</pre><m>\u0303
</m><post>a
</post>
377 <test-case id=
"test75" strength=
"TERTIARY" locale=
"fr">
378 <pattern>\u00EA
</pattern>
379 <pre>p
</pre><m>\u00EA
</m><post>che
</post>
382 <test-case id=
"test76" strength=
"TERTIARY" locale=
"fr">
383 <pattern>\u00EA
</pattern>
384 <pre>p
</pre><m>e\u0302
</m><post>che
</post>
387 <test-case id=
"test77" strength=
"TERTIARY" locale=
"fr">
388 <pattern>e\u0302
</pattern>
389 <pre>p
</pre><m>\u00EA
</m><post>che
</post>
392 <!-- Test cases from ticket:5382 -->
393 <test-case id=
"test78" strength=
"SECONDARY" locale=
"hu_HU">
394 <pattern>\u0170
</pattern>
399 <test-case id=
"test79" strength=
"SECONDARY" locale=
"hu_HU">
400 <pattern>\u0170
</pattern>
406 <test-case id=
"test80" strength=
"SECONDARY" locale=
"hu_HU">
407 <pattern>\u0170
</pattern>
412 <!-- Test cases from ticket:5959 -->
413 <test-case id=
"test81" strength=
"SECONDARY">
414 <pattern>\u2166
</pattern>
418 <test-case id=
"test82" strength=
"SECONDARY">
419 <pattern>VII
</pattern>
423 <test-case id=
"test83" strength=
"IDENTICAL" alternate_handling=
"SHIFTED" locale=
"en">
424 <pattern>Universal Declaration of Human Rights
</pattern>
425 <pre>Proclaims this
</pre><m>Universal Declaration of Human Rights
</m><post> as a common standard of achievement for all peoples and all nations
</post>
428 <test-case id=
"test83b" strength=
"TERTIARY" alternate_handling=
"SHIFTED" locale=
"en">
429 <pattern>Universal Declaration of Human Rights
</pattern>
430 <pre>Proclaims this
</pre>
431 <m>Universal-Declaration-of-Human-Rights
</m>
432 <post> as a common standard of achievement for all peoples and all nations
</post>
435 <test-case id=
"test84" strength=
"TERTIARY" locale=
"en">
436 <pattern>\u05E9\u0591\u05E9
</pattern>
437 <m>\u05E9\u0592\u05E9
</m>
440 <test-case id=
"test84b" strength=
"IDENTICAL" locale=
"en">
441 <pattern>\u05E9\u0591\u05E9
</pattern>
442 <pre>\u05E9\u0592\u05E9
</pre>
444 </stringsearch-tests>