]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/testdata/ssearch.xml
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / test / testdata / ssearch.xml
1 <?xml version="1.0" encoding="UTF-8"?>
2
3 <!-- Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html -->
4 <!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
5
6 <!-- Test data file for string search -->
7 <!DOCTYPE stringsearch-tests [
8 <!ELEMENT stringsearch-tests (test-case+)>
9 <!ATTLIST stringsearch-tests debug IDREF #IMPLIED >
10 <!ELEMENT test-case (pattern, pre?, m?, post?)>
11 <!ATTLIST test-case
12 id ID #REQUIRED
13 locale CDATA "en"
14 strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
15 norm (ON | OFF) "OFF"
16 alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE"
17 >
18
19 <!ELEMENT pattern (#PCDATA)>
20 <!ELEMENT pre (#PCDATA)>
21 <!ELEMENT m (#PCDATA)>
22 <!ELEMENT post (#PCDATA)>
23 ]>
24
25 <stringsearch-tests>
26 <!-- debug="test11" (for copying into the above element) -->
27
28 <!-- Very simple match -->
29 <test-case id="test01" >
30 <pattern>abc</pattern>
31 <pre>xxx</pre><m>abc</m><post>yyy</post>
32 </test-case>
33
34 <!-- Very simple no-match -->
35 <test-case id="test02" >
36 <pattern>abc</pattern>
37 <pre>xxx</pre><post>yyy</post>
38 </test-case>
39
40 <!-- Match after several near-misses. -->
41 <test-case id="test03" >
42 <pattern>string</pattern>
43 <pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post>
44 </test-case>
45
46 <test-case id="test04" strength="PRIMARY" >
47 <pattern>FUSS</pattern>
48 <pre>abc</pre><m>fuss</m><post>sss</post>
49 </test-case>
50
51 <test-case id="test05" strength="PRIMARY" >
52 <pattern>FUSS</pattern>
53 <pre>abc</pre><m>fuß</m><post>sss</post>
54 </test-case>
55
56 <test-case id="test05.5" strength="PRIMARY" >
57 <pattern>fuss</pattern>
58 <pre>a </pre>
59 <m>fuß</m>
60 <post>ball table</post>
61 </test-case>
62
63 <test-case id="test06" strength="PRIMARY" >
64 <pattern>fuß</pattern>
65 <pre>abc</pre><m>fuss</m><post>xyz</post>
66 </test-case>
67
68 <test-case id="test07" strength="SECONDARY" >
69 <pattern>fuß</pattern>
70 <pre>abcfussxyz</pre>
71 </test-case>
72
73 <test-case id="test08" strength="PRIMARY" >
74 <pattern>fus</pattern>
75 <pre>abcfuß</pre><post>xyz</post>
76 </test-case>
77
78 <!-- A good match that follows an initial candidate match which was
79 rejected because it did not end on a character boundary -->
80 <test-case id="test09" strength="PRIMARY">
81 <pattern>fus</pattern>
82 <pre>fuß </pre><m>fus</m><post>sss</post>
83 </test-case>
84
85
86 <!-- Test cases from usrchdat.c BREAKITERATOREXACT -->
87
88 <test-case id="test10" strength="TERTIARY">
89 <pattern>fox</pattern>
90 <m>fox</m><post>y fox</post>
91 </test-case>
92
93 <test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook">
94 <pattern>toe</pattern>
95 <pre>This is a </pre><m>Tö</m><post>ne</post>
96 </test-case>
97
98 <test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook">
99 <pattern>toe</pattern>
100 <pre>This is a </pre><post>Töne</post>
101 </test-case>
102
103 <test-case id="test12" strength="TERTIARY">
104 <pattern>e</pattern>
105 <pre>tésting that é doés not match </pre><m>e</m><post></post>
106 </test-case>
107
108 <test-case id="test13" strength="PRIMARY" locale="fr">
109 <pattern>e</pattern>
110 <pre></pre><m>É</m><post>É</post>
111 </test-case>
112
113 <test-case id="test14" strength="PRIMARY" locale="fr">
114 <pattern>O</pattern>
115 <pre>C</pre><m>O\u0302</m><post></post>
116 </test-case>
117
118
119 <!-- Test cases from usrchdat.c STRENGTH -->
120
121
122 <test-case id="test15" strength="PRIMARY" locale="en">
123 <pattern>fox</pattern>
124 <pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post>
125 </test-case>
126
127 <test-case id="test16" strength="PRIMARY" locale="fr">
128 <pattern>peche</pattern>
129 <pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post>
130 </test-case>
131
132 <test-case id="test17" strength="PRIMARY" locale="fr">
133 <pattern>peche</pattern>
134 <pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post>
135 </test-case>
136
137 <test-case id="test18" strength="PRIMARY" locale="fr">
138 <pattern>peche</pattern>
139 <pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post>
140 </test-case>
141
142 <test-case id="test19" strength="PRIMARY" locale="fr">
143 <pattern>peche</pattern>
144 <pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post>
145 </test-case>
146
147 <test-case id="test20" strength="PRIMARY" locale="es">
148 <pattern>channel</pattern>
149 <pre>A </pre><m>channel</m><post>, </post>
150 </test-case>
151
152 <test-case id="test21" strength="PRIMARY" locale="es">
153 <pattern>channel</pattern>
154 <pre>A </pre><m>CHANNEL</m><post>, </post>
155 </test-case>
156
157 <test-case id="test22" strength="PRIMARY" locale="es">
158 <pattern>channel</pattern>
159 <pre>A </pre><m>Channel</m><post>s, </post>
160 </test-case>
161
162 <test-case id="test23" strength="PRIMARY" locale="es">
163 <pattern>channel</pattern>
164 <pre>A </pre><m>channel</m><post>... </post>
165 </test-case>
166
167 <test-case id="test24" strength="TERTIARY" locale="en">
168 <pattern>A\u0300</pattern>
169 <pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A"</post>
170 </test-case>
171
172 <!-- TODO: In the original test data, this test matched at IDENTICAL strength.
173 Doesn't seem right. The characters are different.
174 -->
175 <test-case id="test24a" strength="IDENTICAL" locale="en">
176 <pattern>A\u0300</pattern>
177 <pre>At IDENTICAL, shoud this match? </pre><m>\u00c0</m><post></post>
178 </test-case>
179
180 <test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
181 <pattern>A\u0300</pattern>
182 <pre>At IDENTICAL, shoud this match? </pre>
183 <m>\u00c0</m>
184 <post></post>
185 </test-case>
186
187 <test-case id="test25" strength="SECONDARY" locale="en">
188 <pattern>Ű</pattern>
189 <pre>12</pre><m>ű</m><post> Ű</post>
190 </test-case>
191
192 <test-case id="test26" strength="SECONDARY" locale="en">
193 <pattern>A</pattern>
194 <pre>12</pre><m>a</m><post>...</post>
195 </test-case>
196
197
198 <!-- Test Cases from usrchdat.c, VARIABLE -->
199 <test-case id="test27" strength="TERTIARY" locale="en">
200 <pattern>blackbird</pattern>
201 <pre>black-bird </pre><m>blackbird</m><post>...</post>
202 </test-case>
203
204 <test-case id="test28" strength="TERTIARY" locale="en">
205 <pattern>go</pattern>
206 <pre> on</pre>
207 </test-case>
208
209 <!-- TODO: this gives a U_ILLEGAL_ARGUMENT_ERROR when opening
210 the UStringSearch. How did the original test run? -->
211 <!--
212 <test-case id="test29" strength="PRIMARY" locale="en">
213 <pattern> </pattern>
214 <pre></pre><m></m><post>abc</post>
215 </test-case>
216 -->
217
218 <test-case id="test30" strength="SECONDARY" locale="en">
219 <pattern>abc</pattern>
220 <pre> a bc ab c a bc ab c"</pre>
221 </test-case>
222
223 <test-case id="test31" strength="SECONDARY" locale="en">
224 <pattern>abc</pattern>
225 <pre> ---------------</pre>
226 </test-case>
227
228
229 <!-- Normalization test cases from usrchdat.c -->
230 <test-case id="test32" strength="TERTIARY" norm="ON">
231 <pattern>a\u0325\u0300</pattern>
232 <pre></pre><m>a\u0300\u0325</m>
233 </test-case>
234
235
236 <test-case id="test32a" strength="TERTIARY" norm="OFF">
237 <pattern>a\u0325\u0300</pattern>
238 <pre>a\u0300\u0325</pre>
239 </test-case>
240
241
242 <!-- COMPOSITEBOUNDARIES from usrchdat.c
243 Boundaries are not identical to the original test data because
244 of matching only full combining sequences
245 -->
246 <test-case id="test40" strength="TERTIARY">
247 <pattern>A</pattern>
248 <pre>À</pre> <!-- \u00C0 -->
249 </test-case>
250
251 <test-case id="test41" strength="TERTIARY">
252 <pattern>A</pattern>
253 <pre>À</pre><m>A</m><post>C</post>
254 </test-case>
255
256 <test-case id="test42" strength="TERTIARY">
257 <pattern>A\u030A</pattern>
258 <pre>À\u01FA</pre>
259 </test-case>
260
261
262
263 <!-- SUPPLEMENTARYCANONICAL from usrchdat.c -->
264 <test-case id="test50" strength="TERTIARY">
265 <pattern>\uD800\uDC00</pattern>
266 <pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m>
267 <post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
268 </test-case>
269
270 <test-case id="test51" strength="TERTIARY">
271 <pattern>\\uD834\\uDDB9</pattern>
272 <pre>and</pre><m>\\uD834\\uDDB9</m><post>this sentence</post>
273 </test-case>
274
275 <test-case id="test52" strength="TERTIARY">
276 <pattern> \\uD834\\uDDB9 </pattern>
277 <pre>and</pre><m> \\uD834\\uDDB9 </m><post>this sentence</post>
278 </test-case>
279
280 <test-case id="test53" strength="TERTIARY">
281 <pattern>-\\uD834\\uDDB9-</pattern>
282 <pre>and</pre><m>-\\uD834\\uDDB9-</m><post>this sentence</post>
283 </test-case>
284
285 <test-case id="test54" strength="TERTIARY">
286 <pattern>,\\uD834\\uDDB9,</pattern>
287 <pre>and</pre><m>,\\uD834\\uDDB9,</m><post>this sentence</post>
288 </test-case>
289
290 <test-case id="test55" strength="TERTIARY">
291 <pattern>?\\uD834\\uDDB9?</pattern>
292 <pre>and</pre><m>?\\uD834\\uDDB9?</m><post>this sentence</post>
293 </test-case>
294
295
296 <!-- Long combining sequences -->
297 <!-- Backward search fails because the pattern ends with ignorables
298 <test-case id="test60" strength="PRIMARY">
299 <pattern>A\u0301\u0301\u0301\u0301</pattern>
300 <m>A\u0301\u0301\u0301\u0301\u0301</m>
301 </test-case>
302 -->
303
304 <test-case id="test61" strength="TERTIARY">
305 <pattern>A\u0301\u0301\u0301\u0301</pattern>
306 <pre>A\u0301\u0301\u0301\u0301\u0301</pre>
307 </test-case>
308
309 <test-case id="test62" strength="TERTIARY">
310 <pattern>A\u0301\u0301\u0301\u0301</pattern>
311 <m>A\u0301\u0301\u0301\u0301</m>
312 </test-case>
313
314 <!-- stand-alone combining marks don't match attached marks -->
315 <test-case id="test63" strength="TERTIARY">
316 <pattern>\u0301</pattern>
317 <pre>A\u0301\u0301\u0301\u0301</pre>
318 </test-case>
319
320 <test-case id="test64" strength="TERTIARY">
321 <pattern>\u0301</pattern>
322 <post>\u0301\u0301\u0301\u0301</post>
323 </test-case>
324
325 <!-- stand-alone combining mark does match an un-attached combining mark -->
326 <test-case id="test65" strength="TERTIARY">
327 <pattern>\u0301</pattern>
328 <m>\u0301</m><post>A\u0301\u0301</post>
329 </test-case>
330
331 <test-case id="test66" strength="TERTIARY">
332 <pattern>\u0301</pattern>
333 <m>\u0301</m>
334 </test-case>
335
336 <!-- stand-alone combining marks at end of the target text -->
337 <test-case id="test67" strength="TERTIARY">
338 <pattern>\u0301</pattern>
339 <pre>abcd\r</pre><m>\u0301</m>
340 </test-case>
341
342 <!-- attached combining marks at end of the target text, no match -->
343 <test-case id="test68" strength="TERTIARY">
344 <pattern>\u0301</pattern>
345 <pre>abcd\u0301</pre>
346 </test-case>
347
348
349
350 <!-- no match within expansions at the start -->
351 <test-case id="test70" strength="PRIMARY">
352 <pattern>Eligature</pattern>
353 <pre>Æligature</pre>
354 </test-case>
355
356 <test-case id="test71" strength="PRIMARY">
357 <pattern>AEligature</pattern>
358 <m>Æligature</m>
359 </test-case>
360
361 <test-case id="test72" strength="PRIMARY">
362 <pattern>AEligature</pattern>
363 <m>Æligature</m>
364 </test-case>
365
366 <!-- unattached combining Tilde will not match a Tilde that is
367 part of a composed Ñ (\u00D1) -->
368 <test-case id="test73" strength="SECONDARY">
369 <pattern>\u0303</pattern> <!-- combining tilde -->
370 <pre>Ñ&#x0d;</pre><m>\u0303</m>
371 </test-case>
372
373 <test-case id="test74" strength="SECONDARY">
374 <pattern>\u0303</pattern> <!-- combining tilde -->
375 <pre>Ñ &#x0d;</pre><m>\u0303</m><post>a</post>
376 </test-case>
377
378 <test-case id="test75" strength="TERTIARY" locale="fr">
379 <pattern>\u00EA</pattern>
380 <pre>p</pre><m>\u00EA</m><post>che</post>
381 </test-case>
382
383 <test-case id="test76" strength="TERTIARY" locale="fr">
384 <pattern>\u00EA</pattern>
385 <pre>p</pre><m>e\u0302</m><post>che</post>
386 </test-case>
387
388 <test-case id="test77" strength="TERTIARY" locale="fr">
389 <pattern>e\u0302</pattern>
390 <pre>p</pre><m>\u00EA</m><post>che</post>
391 </test-case>
392
393 <!-- Test cases from ticket:5382 -->
394 <test-case id="test78" strength="SECONDARY" locale="hu_HU">
395 <pattern>\u0170</pattern>
396 <m>\u0171</m>
397 <post>12</post>
398 </test-case>
399
400 <test-case id="test79" strength="SECONDARY" locale="hu_HU">
401 <pattern>\u0170</pattern>
402 <pre>1</pre>
403 <m>\u0171</m>
404 <post>2</post>
405 </test-case>
406
407 <test-case id="test80" strength="SECONDARY" locale="hu_HU">
408 <pattern>\u0170</pattern>
409 <pre>12</pre>
410 <m>\u0171</m>
411 </test-case>
412
413 <!-- Test cases from ticket:5959 -->
414 <test-case id="test81" strength="SECONDARY">
415 <pattern>\u2166</pattern>
416 <m>VII</m>
417 </test-case>
418
419 <test-case id="test82" strength="SECONDARY">
420 <pattern>VII</pattern>
421 <m>\u2166</m>
422 </test-case>
423
424 <test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
425 <pattern>Universal Declaration of Human Rights</pattern>
426 <pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post>
427 </test-case>
428
429 <test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en">
430 <pattern>Universal Declaration of Human Rights</pattern>
431 <pre>Proclaims this </pre>
432 <m>Universal-Declaration-of-Human-Rights</m>
433 <post> as a common standard of achievement for all peoples and all nations</post>
434 </test-case>
435
436 <test-case id="test84" strength="TERTIARY" locale="en">
437 <pattern>\u05E9\u0591\u05E9</pattern>
438 <m>\u05E9\u0592\u05E9</m>
439 </test-case>
440
441 <test-case id="test84b" strength="IDENTICAL" locale="en">
442 <pattern>\u05E9\u0591\u05E9</pattern>
443 <pre>\u05E9\u0592\u05E9</pre>
444 </test-case>
445 </stringsearch-tests>
446