]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/testdata/ssearch.xml
ICU-511.35.tar.gz
[apple/icu.git] / icuSources / test / testdata / ssearch.xml
1 <?xml version="1.0" encoding="UTF-8"?>
2
3 <!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
4
5 <!-- Test data file for string search -->
<!-- Internal DTD subset describing the layout of this file.
     stringsearch-tests: root element holding one or more test-case elements; its
       optional debug attribute is an IDREF, so it must name the id of a test-case.
     test-case: one pattern followed by optional pre, m and post elements, in that
       order. Attribute defaults: locale "en", strength TERTIARY, norm OFF,
       alternate_handling NON_IGNORABLE.
     pattern, pre, m, post: character data only.
     NOTE(review): judging by the commented test cases further down, m holds the text
     the pattern is expected to match, and pre / post hold the target text before and
     after it; confirm against the test harness. -->
6 <!DOCTYPE stringsearch-tests [
7 <!ELEMENT stringsearch-tests (test-case+)>
8 <!ATTLIST stringsearch-tests debug IDREF #IMPLIED >
9 <!ELEMENT test-case (pattern, pre?, m?, post?)>
10 <!ATTLIST test-case
11 id ID #REQUIRED
12 locale CDATA "en"
13 strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
14 norm (ON | OFF) "OFF"
15 alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE"
16 >
17
18 <!ELEMENT pattern (#PCDATA)>
19 <!ELEMENT pre (#PCDATA)>
20 <!ELEMENT m (#PCDATA)>
21 <!ELEMENT post (#PCDATA)>
22 ]>
23
24 <stringsearch-tests>
25 <!-- debug="test11" (for copying into the above element) -->
26
27 <!-- Very simple match: the m element marks the text the pattern is expected to match;
     pre and post are the target text before and after it. -->
28 <test-case id="test01" >
29 <pattern>abc</pattern>
30 <pre>xxx</pre><m>abc</m><post>yyy</post>
31 </test-case>
32
33 <!-- Very simple no-match: there is no m element, so no match is expected. -->
34 <test-case id="test02" >
35 <pattern>abc</pattern>
36 <pre>xxx</pre><post>yyy</post>
37 </test-case>
38
39 <!-- Match after several near-misses. -->
40 <test-case id="test03" >
41 <pattern>string</pattern>
42 <pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post>
43 </test-case>
44
<!-- PRIMARY strength: upper-case pattern FUSS is expected to match lower-case fuss. -->
45 <test-case id="test04" strength="PRIMARY" >
46 <pattern>FUSS</pattern>
47 <pre>abc</pre><m>fuss</m><post>sss</post>
48 </test-case>
49
<!-- PRIMARY strength: FUSS is expected to match fuß (sharp s in the target). -->
50 <test-case id="test05" strength="PRIMARY" >
51 <pattern>FUSS</pattern>
52 <pre>abc</pre><m>fuß</m><post>sss</post>
53 </test-case>
54
<!-- Same idea as test05 with a lower-case pattern; pre, m and post are written on
     separate lines here. -->
55 <test-case id="test05.5" strength="PRIMARY" >
56 <pattern>fuss</pattern>
57 <pre>a </pre>
58 <m>fuß</m>
59 <post>ball table</post>
60 </test-case>
61
<!-- Reverse direction: a pattern containing sharp s is expected to match fuss at PRIMARY. -->
62 <test-case id="test06" strength="PRIMARY" >
63 <pattern>fuß</pattern>
64 <pre>abc</pre><m>fuss</m><post>xyz</post>
65 </test-case>
66
<!-- At SECONDARY strength the same pattern is expected NOT to match fuss (no m element). -->
67 <test-case id="test07" strength="SECONDARY" >
68 <pattern>fuß</pattern>
69 <pre>abcfussxyz</pre>
70 </test-case>
71
<!-- No match expected for fus inside fuß. NOTE(review): presumably because the match
     would have to end in the middle of the sharp s, as described for test09; confirm. -->
72 <test-case id="test08" strength="PRIMARY" >
73 <pattern>fus</pattern>
74 <pre>abcfuß</pre><post>xyz</post>
75 </test-case>
76
77 <!-- A good match following an initial match that failed because
78 of not ending on a character boundary -->
79 <test-case id="test09" strength="PRIMARY">
80 <pattern>fus</pattern>
81 <pre>fuß </pre><m>fus</m><post>sss</post>
82 </test-case>
83
84
85 <!-- Test cases from usrchdat.c BREAKITERATOREXACT -->
86
<!-- Match at the very start of the target (no pre element); the second "fox" is only
     part of the post text. -->
87 <test-case id="test10" strength="TERTIARY">
88 <pattern>fox</pattern>
89 <m>fox</m><post>y fox</post>
90 </test-case>
91
<!-- German phonebook collation at PRIMARY strength.
     NOTE(review): the m element below is empty, which marks a zero-length expected
     match between "This is a " and "ne". Compared with test11a (target "This is a Töne"),
     the matched text may have been lost from this copy; verify against the upstream
     ICU file before relying on this case. -->
92 <test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook">
93 <pattern>toe</pattern>
94 <pre>This is a </pre><m></m><post>ne</post>
95 </test-case>
96
<!-- Same locale at SECONDARY strength: toe is expected NOT to match inside Töne. -->
97 <test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook">
98 <pattern>toe</pattern>
99 <pre>This is a </pre><post>Töne</post>
100 </test-case>
101
<!-- TERTIARY strength: the accented letters in the pre text are skipped and only the
     final unaccented e is the expected match. -->
102 <test-case id="test12" strength="TERTIARY">
103 <pattern>e</pattern>
104 <pre>tésting that é doés not match </pre><m>e</m><post></post>
105 </test-case>
106
<!-- French, PRIMARY strength: e is expected to match the first É; the second É is post text. -->
107 <test-case id="test13" strength="PRIMARY" locale="fr">
108 <pattern>e</pattern>
109 <pre></pre><m>É</m><post>É</post>
110 </test-case>
111
<!-- French, PRIMARY strength: O is expected to match O followed by combining U+0302. -->
112 <test-case id="test14" strength="PRIMARY" locale="fr">
113 <pattern>O</pattern>
114 <pre>C</pre><m>O\u0302</m><post></post>
115 </test-case>
116
117
118 <!-- Test cases from usrchdat.c STRENGTH -->
119
120
<!-- First occurrence of fox is the expected match; "foxes" later in the text is post text. -->
121 <test-case id="test15" strength="PRIMARY" locale="en">
122 <pattern>fox</pattern>
123 <pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post>
124 </test-case>
125
<!-- test16 to test19: French, PRIMARY strength. The unaccented pattern peche is expected
     to match accented spellings (escaped as U+00E9 and U+00EA), both when followed by a
     space and when followed by the letter r. -->
126 <test-case id="test16" strength="PRIMARY" locale="fr">
127 <pattern>peche</pattern>
128 <pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post>
129 </test-case>
130
131 <test-case id="test17" strength="PRIMARY" locale="fr">
132 <pattern>peche</pattern>
133 <pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post>
134 </test-case>
135
136 <test-case id="test18" strength="PRIMARY" locale="fr">
137 <pattern>peche</pattern>
138 <pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post>
139 </test-case>
140
141 <test-case id="test19" strength="PRIMARY" locale="fr">
142 <pattern>peche</pattern>
143 <pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post>
144 </test-case>
145
<!-- test20 to test23: Spanish locale, PRIMARY strength. The pattern channel is expected to
     match lower-case, upper-case and capitalised spellings, followed by punctuation or
     by a letter. -->
146 <test-case id="test20" strength="PRIMARY" locale="es">
147 <pattern>channel</pattern>
148 <pre>A </pre><m>channel</m><post>, </post>
149 </test-case>
150
151 <test-case id="test21" strength="PRIMARY" locale="es">
152 <pattern>channel</pattern>
153 <pre>A </pre><m>CHANNEL</m><post>, </post>
154 </test-case>
155
156 <test-case id="test22" strength="PRIMARY" locale="es">
157 <pattern>channel</pattern>
158 <pre>A </pre><m>Channel</m><post>s, </post>
159 </test-case>
160
161 <test-case id="test23" strength="PRIMARY" locale="es">
162 <pattern>channel</pattern>
163 <pre>A </pre><m>channel</m><post>... </post>
164 </test-case>
165
<!-- TERTIARY strength: A followed by combining U+0300 is expected to match the single
     code point U+00C0, but not the plain A in the surrounding text.
     NOTE(review): the double quote at the end of the post text looks like a leftover
     from a C string literal; it is part of the target text as written. -->
166 <test-case id="test24" strength="TERTIARY" locale="en">
167 <pattern>A\u0300</pattern>
168 <pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A"</post>
169 </test-case>
170
171 <!-- TODO: In the original test data, this test matched at IDENTICAL strength.
172 Doesn't seem right. The characters are different.
173 -->
174 <test-case id="test24a" strength="IDENTICAL" locale="en">
175 <pattern>A\u0300</pattern>
176 <pre>At IDENTICAL, shoud this match? </pre><m>\u00c0</m><post></post>
177 </test-case>
178
<!-- Same data as test24a with alternate_handling set to SHIFTED. -->
179 <test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
180 <pattern>A\u0300</pattern>
181 <pre>At IDENTICAL, shoud this match? </pre>
182 <m>\u00c0</m>
183 <post></post>
184 </test-case>
185
<!-- SECONDARY strength: upper-case pattern is expected to match the lower-case letter;
     the upper-case occurrence afterwards is post text. -->
186 <test-case id="test25" strength="SECONDARY" locale="en">
187 <pattern>Ű</pattern>
188 <pre>12</pre><m>ű</m><post> Ű</post>
189 </test-case>
190
<!-- SECONDARY strength: A is expected to match a. -->
191 <test-case id="test26" strength="SECONDARY" locale="en">
192 <pattern>A</pattern>
193 <pre>12</pre><m>a</m><post>...</post>
194 </test-case>
195
196
197 <!-- Test Cases from usrchdat.c, VARIABLE -->
<!-- The hyphenated "black-bird" in the pre text is not the expected match; the later
     unhyphenated "blackbird" is. -->
198 <test-case id="test27" strength="TERTIARY" locale="en">
199 <pattern>blackbird</pattern>
200 <pre>black-bird </pre><m>blackbird</m><post>...</post>
201 </test-case>
202
<!-- No match expected: the target " on" does not contain the pattern. -->
203 <test-case id="test28" strength="TERTIARY" locale="en">
204 <pattern>go</pattern>
205 <pre> on</pre>
206 </test-case>
207
208 <!-- TODO: this gives an U_ILLEGAL_ARGUMENT error when opening
209 the UStringSearch. How did the original test run? -->
210 <!--
211 <test-case id="test29" strength="PRIMARY" locale="en">
212 <pattern> </pattern>
213 <pre></pre><m></m><post>abc</post>
214 </test-case>
215 -->
216
<!-- No match expected: the letters a, b, c appear only with spaces between them.
     NOTE(review): the trailing double quote in the pre text looks like a leftover from
     a C string literal; it is part of the target text as written. -->
217 <test-case id="test30" strength="SECONDARY" locale="en">
218 <pattern>abc</pattern>
219 <pre> a bc ab c a bc ab c"</pre>
220 </test-case>
221
<!-- No match expected: the target consists only of a space and hyphens. -->
222 <test-case id="test31" strength="SECONDARY" locale="en">
223 <pattern>abc</pattern>
224 <pre> ---------------</pre>
225 </test-case>
226
227
228 <!-- Normalization test cases from usrchdat.c -->
<!-- norm ON: the pattern and the target carry the same two combining marks
     (U+0325, U+0300) in opposite order; a match is expected. -->
229 <test-case id="test32" strength="TERTIARY" norm="ON">
230 <pattern>a\u0325\u0300</pattern>
231 <pre></pre><m>a\u0300\u0325</m>
232 </test-case>
233
234
<!-- norm OFF: same pattern and target text as test32, but no match is expected. -->
235 <test-case id="test32a" strength="TERTIARY" norm="OFF">
236 <pattern>a\u0325\u0300</pattern>
237 <pre>a\u0300\u0325</pre>
238 </test-case>
239
240
241 <!-- COMPOSITEBOUNDARIES from usrchdat.c
242 Boundaries are not identical to original test data because
243 of matching only full combining sequences
244 -->
<!-- No match expected: plain A against the precomposed letter alone. -->
245 <test-case id="test40" strength="TERTIARY">
246 <pattern>A</pattern>
247 <pre>À</pre> <!-- \u00C0 -->
248 </test-case>
249
<!-- The precomposed letter is skipped; the following plain A is the expected match. -->
250 <test-case id="test41" strength="TERTIARY">
251 <pattern>A</pattern>
252 <pre>À</pre><m>A</m><post>C</post>
253 </test-case>
254
<!-- No match expected: A plus combining U+030A against a target made of the precomposed
     letter followed by the single code point U+01FA. -->
255 <test-case id="test42" strength="TERTIARY">
256 <pattern>A\u030A</pattern>
257 <pre>À\u01FA</pre>
258 </test-case>
259
260
261
262 <!-- SUPPLEMENTARYCANONICAL from usrchdat.c -->
<!-- The pattern is the surrogate pair D800 DC00, written as two escapes. The pre text
     holds other surrogate pairs that are not expected to match; the post text holds
     further occurrences of the pair, some next to extra unpaired surrogates. -->
263 <test-case id="test50" strength="TERTIARY">
264 <pattern>\uD800\uDC00</pattern>
265 <pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m>
266 <post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
267 </test-case>
268
<!-- test51 to test55: the supplementary character written as the escaped surrogate pair
     D834 DDB9, matched on its own and when wrapped in spaces, hyphens, commas or
     question marks.
     The escapes use a single backslash, like every other escape in this file.
     NOTE(review): a doubled backslash (as found in C string literals) would presumably
     be unescaped to a literal backslash followed by the letters uD834, so the cases
     would not exercise a supplementary character at all; confirm against the harness. -->
269 <test-case id="test51" strength="TERTIARY">
270 <pattern>\uD834\uDDB9</pattern>
271 <pre>and</pre><m>\uD834\uDDB9</m><post>this sentence</post>
272 </test-case>
273
274 <test-case id="test52" strength="TERTIARY">
275 <pattern> \uD834\uDDB9 </pattern>
276 <pre>and</pre><m> \uD834\uDDB9 </m><post>this sentence</post>
277 </test-case>
278
279 <test-case id="test53" strength="TERTIARY">
280 <pattern>-\uD834\uDDB9-</pattern>
281 <pre>and</pre><m>-\uD834\uDDB9-</m><post>this sentence</post>
282 </test-case>
283
284 <test-case id="test54" strength="TERTIARY">
285 <pattern>,\uD834\uDDB9,</pattern>
286 <pre>and</pre><m>,\uD834\uDDB9,</m><post>this sentence</post>
287 </test-case>
288
289 <test-case id="test55" strength="TERTIARY">
290 <pattern>?\uD834\uDDB9?</pattern>
291 <pre>and</pre><m>?\uD834\uDDB9?</m><post>this sentence</post>
292 </test-case>
293
294
295 <!-- Long combining sequences -->
296 <!-- Backwards search fails because the pattern ends with ignorables
297 <test-case id="test60" strength="PRIMARY">
298 <pattern>A\u0301\u0301\u0301\u0301</pattern>
299 <m>A\u0301\u0301\u0301\u0301\u0301</m>
300 </test-case>
301 -->
302
<!-- No match expected: the target carries five U+0301 marks, the pattern only four. -->
303 <test-case id="test61" strength="TERTIARY">
304 <pattern>A\u0301\u0301\u0301\u0301</pattern>
305 <pre>A\u0301\u0301\u0301\u0301\u0301</pre>
306 </test-case>
307
<!-- Match expected: pattern and target are the same sequence (A plus four U+0301). -->
308 <test-case id="test62" strength="TERTIARY">
309 <pattern>A\u0301\u0301\u0301\u0301</pattern>
310 <m>A\u0301\u0301\u0301\u0301</m>
311 </test-case>
312
313 <!-- stand-alone combining marks don't match attached marks -->
314 <test-case id="test63" strength="TERTIARY">
315 <pattern>\u0301</pattern>
316 <pre>A\u0301\u0301\u0301\u0301</pre>
317 </test-case>
318
<!-- No match expected: a single U+0301 against a run of four U+0301 with no base letter. -->
319 <test-case id="test64" strength="TERTIARY">
320 <pattern>\u0301</pattern>
321 <post>\u0301\u0301\u0301\u0301</post>
322 </test-case>
323
324 <!-- stand-alone combining mark does match an un-attached combining mark -->
325 <test-case id="test65" strength="TERTIARY">
326 <pattern>\u0301</pattern>
327 <m>\u0301</m><post>A\u0301\u0301</post>
328 </test-case>
329
<!-- The whole target is the single mark; a match is expected. -->
330 <test-case id="test66" strength="TERTIARY">
331 <pattern>\u0301</pattern>
332 <m>\u0301</m>
333 </test-case>
334
335 <!-- stand-alone combining marks at end of the target text -->
<!-- The mark follows an escaped carriage return (\r) rather than a letter. -->
336 <test-case id="test67" strength="TERTIARY">
337 <pattern>\u0301</pattern>
338 <pre>abcd\r</pre><m>\u0301</m>
339 </test-case>
340
341 <!-- attached combining marks at end of the target text, no match -->
342 <test-case id="test68" strength="TERTIARY">
343 <pattern>\u0301</pattern>
344 <pre>abcd\u0301</pre>
345 </test-case>
346
347
348
349 <!-- no match within expansions at the start -->
350 <test-case id="test70" strength="PRIMARY">
351 <pattern>Eligature</pattern>
352 <pre>Æligature</pre>
353 </test-case>
354
<!-- PRIMARY strength: the two-letter spelling AE in the pattern is expected to match
     the single ligature letter in the target. -->
355 <test-case id="test71" strength="PRIMARY">
356 <pattern>AEligature</pattern>
357 <m>Æligature</m>
358 </test-case>
359
<!-- NOTE(review): apart from its id, test72 is identical to test71; possibly one of the
     two was meant to differ. Verify against the upstream ICU file. -->
360 <test-case id="test72" strength="PRIMARY">
361 <pattern>AEligature</pattern>
362 <m>Æligature</m>
363 </test-case>
364
365 <!-- unattached combining Tilde will not match a Tilde that is
366 part of a composed Ñ (\u00D1) -->
<!-- In test73 and test74 the expected match is the combining tilde that follows a
     carriage return (character reference x0d), not the composed letter before it. -->
367 <test-case id="test73" strength="SECONDARY">
368 <pattern>\u0303</pattern> <!-- combining tilde -->
369 <pre>Ñ&#x0d;</pre><m>\u0303</m>
370 </test-case>
371
372 <test-case id="test74" strength="SECONDARY">
373 <pattern>\u0303</pattern> <!-- combining tilde -->
374 <pre>Ñ &#x0d;</pre><m>\u0303</m><post>a</post>
375 </test-case>
376
<!-- test75 to test77: French, TERTIARY strength. The single code point U+00EA and the
     sequence e plus combining U+0302 are expected to match each other in either
     direction, as well as themselves. -->
377 <test-case id="test75" strength="TERTIARY" locale="fr">
378 <pattern>\u00EA</pattern>
379 <pre>p</pre><m>\u00EA</m><post>che</post>
380 </test-case>
381
382 <test-case id="test76" strength="TERTIARY" locale="fr">
383 <pattern>\u00EA</pattern>
384 <pre>p</pre><m>e\u0302</m><post>che</post>
385 </test-case>
386
387 <test-case id="test77" strength="TERTIARY" locale="fr">
388 <pattern>e\u0302</pattern>
389 <pre>p</pre><m>\u00EA</m><post>che</post>
390 </test-case>
391
392 <!-- Test cases from ticket:5382 -->
<!-- Hungarian locale, SECONDARY strength: pattern U+0170 is expected to match U+0171
     when the match is at the start (test78), in the middle (test79) and at the end
     (test80) of the target text. -->
393 <test-case id="test78" strength="SECONDARY" locale="hu_HU">
394 <pattern>\u0170</pattern>
395 <m>\u0171</m>
396 <post>12</post>
397 </test-case>
398
399 <test-case id="test79" strength="SECONDARY" locale="hu_HU">
400 <pattern>\u0170</pattern>
401 <pre>1</pre>
402 <m>\u0171</m>
403 <post>2</post>
404 </test-case>
405
406 <test-case id="test80" strength="SECONDARY" locale="hu_HU">
407 <pattern>\u0170</pattern>
408 <pre>12</pre>
409 <m>\u0171</m>
410 </test-case>
411
412 <!-- Test cases from ticket:5959 -->
<!-- SECONDARY strength: the single code point U+2166 and the three letters VII are
     expected to match each other in both directions (test81, test82). -->
413 <test-case id="test81" strength="SECONDARY">
414 <pattern>\u2166</pattern>
415 <m>VII</m>
416 </test-case>
417
418 <test-case id="test82" strength="SECONDARY">
419 <pattern>VII</pattern>
420 <m>\u2166</m>
421 </test-case>
422
<!-- IDENTICAL strength with SHIFTED alternate handling: a multi-word pattern is expected
     to match the identical phrase inside a longer sentence. -->
423 <test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
424 <pattern>Universal Declaration of Human Rights</pattern>
425 <pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post>
426 </test-case>
427
<!-- TERTIARY strength with SHIFTED alternate handling: the space-separated pattern is
     expected to match the same words joined by hyphens. -->
428 <test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en">
429 <pattern>Universal Declaration of Human Rights</pattern>
430 <pre>Proclaims this </pre>
431 <m>Universal-Declaration-of-Human-Rights</m>
432 <post> as a common standard of achievement for all peoples and all nations</post>
433 </test-case>
434
<!-- Pattern and target differ only in the middle code point (U+0591 versus U+0592).
     A match is expected at TERTIARY strength (test84) and no match at IDENTICAL
     strength (test84b). -->
435 <test-case id="test84" strength="TERTIARY" locale="en">
436 <pattern>\u05E9\u0591\u05E9</pattern>
437 <m>\u05E9\u0592\u05E9</m>
438 </test-case>
439
440 <test-case id="test84b" strength="IDENTICAL" locale="en">
441 <pattern>\u05E9\u0591\u05E9</pattern>
442 <pre>\u05E9\u0592\u05E9</pre>
443 </test-case>
444 </stringsearch-tests>
445