]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/usrchdat.c
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / test / cintltst / usrchdat.c
/********************************************************************
 * Copyright (c) 2001-2008 International Business Machines
 * Corporation and others. All Rights Reserved.
 ********************************************************************
 * File USRCHDAT.C
 * Modification History:
 * Name date Description
 * synwee July 31 2001 creation
 ********************************************************************/
10
11
/*
 Note: This file is included by other C and C++ files. It should not be compiled directly.
*/
15 #ifndef USRCHDAT_C
16 #define USRCHDAT_C
17
18 #include "unicode/ucol.h"
19
20 #if !UCONFIG_NO_COLLATION
21
/*
 * Set to 1 if matches must be on grapheme boundaries.
 * The tables in this file use it to choose between alternative expected
 * results in their "#if GRAPHEME_BOUNDARIES ... #else ... #endif" sections.
 */
#define GRAPHEME_BOUNDARIES 1
24
25 U_CDECL_BEGIN
26 struct SearchData {
27 const char *text;
28 const char *pattern;
29 const char *collator;
30 UCollationStrength strength;
31 const char *breaker;
32 int8_t offset[32];
33 uint8_t size[32];
34 };
35 U_CDECL_END
36
37 typedef struct SearchData SearchData;
38
/*
 * Collation rule text relating o/O and p/P.
 * NOTE(review): presumably the tailoring behind the "tailored" rows of the
 * COLLATOR tables ("fox" also matching "fpx") -- confirm in the including test.
 */
static const char *TESTCOLLATORRULE = "& o,O ; p,P";
40
/*
 * Collation rule text pairing the digraphs ae/AE, oe/OE and ue/UE with the
 * umlauted letters U+00E4/U+00C4, U+00F6/U+00D6 and U+00FC/U+00DC.
 * The escapes are stored as literal backslash sequences; presumably they are
 * unescaped by the consumer before the rule is parsed.
 */
static const char *EXTRACOLLATIONRULE = " & ae ; \\u00e4 & AE ; \\u00c4 & oe ; \\u00f6 & OE ; \\u00d6 & ue ; \\u00fc & UE ; \\u00dc";
42
43 static const SearchData BASIC[] = {
44 {"xxxxxxxxxxxxxxxxxxxx", "fisher", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
45 {"silly spring string", "string", NULL, UCOL_TERTIARY, NULL, {13, -1},
46 {6}},
47 {"silly spring string string", "string", NULL, UCOL_TERTIARY, NULL,
48 {13, 20, -1}, {6, 6}},
49 {"silly string spring string", "string", NULL, UCOL_TERTIARY, NULL,
50 {6, 20, -1}, {6, 6}},
51 {"string spring string", "string", NULL, UCOL_TERTIARY, NULL, {0, 14, -1},
52 {6, 6}},
53 {"Scott Ganyo", "c", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
54 {"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}},
55 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
56 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
57
58 #if GRAPHEME_BOUNDARIES
59 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
60 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
61 #else
62 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
63 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
64 #endif
65
66 {"\\u00c9", "e", NULL, UCOL_PRIMARY, NULL, {0, -1}, {1}},
67 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
68 };
69
70 static const SearchData BREAKITERATOREXACT[] = {
71 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "characterbreaker", {0, 5, -1},
72 {3, 3}},
73 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "wordbreaker", {5, -1}, {3}},
74 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY,
75 "characterbreaker", {10, 14, -1}, {3, 2}},
76 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, "wordbreaker",
77 {10, -1}, {3}},
78 {"Channel, another channel, more channels, and one last Channel",
79 "Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}},
80 /* jitterbug 1745 */
81 {"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY,
82 "characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
83 {"testing that string ab\\u00e9cd does not match e", "e", NULL,
84 UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
85 {"\\u00c9", "e", "fr", UCOL_PRIMARY, "characterbreaker", {0, -1}, {1}},
86 #if 0
87 /* Problem reported by Dave Bertoni, same as ticket 4279? */
88 {"\\u0043\\u004F\\u0302\\u0054\\u00C9", "\\u004F", NULL, UCOL_TERTIARY, "characterbreaker", {1, -1}, {2}},
89 #endif
90 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
91 };
92
93 static const SearchData STRENGTH[] = {
94 /*012345678901234567890123456789012345678901234567890123456789*/
95 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
96 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}},
97 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
98 UCOL_PRIMARY, "wordbreaker", {16, -1}, {3}},
99 {"blackbirds Pat p\\u00E9ch\\u00E9 p\\u00EAche p\\u00E9cher p\\u00EAcher Tod T\\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
100 "peche", "fr", UCOL_PRIMARY, NULL, {15, 21, 27, 34, -1}, {5, 5, 5, 5}},
101 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, NULL,
102 {10, 14, -1}, {3, 2}},
103 {"A channel, another CHANNEL, more Channels, and one last channel...",
104 "channel", "es", UCOL_PRIMARY, NULL, {2, 19, 33, 56, -1},
105 {7, 7, 7, 7}},
106 {"\\u00c0 should match but not A", "A\\u0300", "en", UCOL_IDENTICAL,
107 NULL, {0, -1}, {1, 0}},
108
109 #if 0
110 /* Ticket 5382 */
111 {"12\\u0171", "\\u0170", NULL, UCOL_SECONDARY, NULL, {2, -1}, {2}},
112 #endif
113
114 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
115 };
116
117 static const SearchData VARIABLE[] = {
118 /*012345678901234567890123456789012345678901234567890123456789*/
119 {"blackbirds black blackbirds blackbird black-bird",
120 "blackbird", NULL, UCOL_TERTIARY, NULL, {0, 17, 28, 38, -1},
121 {9, 9, 9, 10}},
122 /* to see that it doesn't go into an infinite loop if the start of text
123 is a ignorable character */
124 {" on", "go", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
125 {"abcdefghijklmnopqrstuvwxyz", " ", NULL, UCOL_PRIMARY, NULL,
126 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
127 20, 21, 22, 23, 24, 25, -1}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
129 /* testing tightest match */
130 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_QUATERNARY,
131 NULL, {1, -1}, {3}},
132 /*012345678901234567890123456789012345678901234567890123456789 */
133 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_SECONDARY,
134 NULL, {1, 6, 13, 21, 31, -1}, {3, 4, 4, 5, 5}},
135 /* totally ignorable text */
136 {" ---------------", "abc", NULL, UCOL_SECONDARY,
137 NULL, {-1}, {0}},
138 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
139 };
140
141 static const SearchData NORMEXACT[] = {
142 {"a\\u0300\\u0325", "a\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
143
144 #if GRAPHEME_BOUNDARIES
145 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
146 #else
147 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
148 #endif
149
150 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
151 };
152
153 static const SearchData NONNORMEXACT[] = {
154 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
155 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
156 };
157
158 static const SearchData OVERLAP[] = {
159 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 2, 4, -1},
160 {4, 4, 4}},
161 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
162 };
163
164 static const SearchData NONOVERLAP[] = {
165 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 4, -1}, {4, 4}},
166 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
167 };
168
169 static const SearchData COLLATOR[] = {
170 /* english */
171 {"fox fpx", "fox", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
172 /* tailored */
173 {"fox fpx", "fox", NULL, UCOL_PRIMARY, NULL, {0, 4, -1}, {3, 3}},
174 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
175 };
176
177 static const SearchData PATTERN[] = {
178 {"The quick brown fox jumps over the lazy foxes", "the", NULL,
179 UCOL_PRIMARY, NULL, {0, 31, -1}, {3, 3}},
180 {"The quick brown fox jumps over the lazy foxes", "fox", NULL,
181 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}},
182 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
183 };
184
185 static const SearchData TEXT[] = {
186 {"the foxy brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {4, 15, -1},
187 {3, 3}},
188 {"the quick brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {16, -1},
189 {3}},
190 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
191 };
192
193 static const SearchData COMPOSITEBOUNDARIES[] = {
194 #if GRAPHEME_BOUNDARIES
195 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
196 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
197 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
198 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
199 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
200 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
201 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
202 #else
203 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
204 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
205 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
206 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
207 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
208 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
209 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, 1, -1},
210 {1, 1}},
211 #endif
212
213 {"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
214 /* A + 030A + 0301 */
215 {"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
216 {"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
217 {"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
218 {"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
219 {"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
220 {"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
221 {"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
222 {"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
223
224 #if GRAPHEME_BOUNDARIES
225 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
226 #else
227 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
228 #endif
229
230 {"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
231 {"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
232 {"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
233 {"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
234 {"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
235 {"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
236 {"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
237 {"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
238
239 /* Ticket 5024 */
240 {"a\\u00e1", "a\\u00e1", NULL, UCOL_SECONDARY, NULL, {0, -1}, {2}},
241
242 /* Ticket 5420 */
243 {"fu\\u00dfball", "fu\\u00df", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
244 {"fu\\u00dfball", "fuss", NULL, UCOL_PRIMARY, NULL, {0, -1}, {3}},
245 {"fu\\u00dfball", "uss", NULL, UCOL_PRIMARY, NULL, {1, -1}, {2}},
246
247 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
248 };
249
250 static const SearchData MATCH[] = {
251 {"a busy bee is a very busy beeee", "bee", NULL, UCOL_TERTIARY, NULL,
252 {7, 26, -1}, {3, 3}},
253 /* 012345678901234567890123456789012345678901234567890 */
254 {"a busy bee is a very busy beeee with no bee life", "bee", NULL,
255 UCOL_TERTIARY, NULL, {7, 26, 40, -1}, {3, 3, 3}},
256 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
257 };
258
259 static const SearchData SUPPLEMENTARY[] = {
260 /* 012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */
261 {"abc \\uD800\\uDC00 \\uD800\\uDC01 \\uD801\\uDC00 \\uD800\\uDC00abc abc\\uD800\\uDC00 \\uD800\\uD800\\uDC00 \\uD800\\uDC00\\uDC00",
262 "\\uD800\\uDC00", NULL, UCOL_TERTIARY, NULL, {4, 13, 22, 26, 29, -1},
263 {2, 2, 2, 2, 2}},
264 {"and\\uD834\\uDDB9this sentence", "\\uD834\\uDDB9", NULL,
265 UCOL_TERTIARY, NULL, {3, -1}, {2}},
266 {"and \\uD834\\uDDB9 this sentence", " \\uD834\\uDDB9 ", NULL,
267 UCOL_TERTIARY, NULL, {3, -1}, {4}},
268 {"and-\\uD834\\uDDB9-this sentence", "-\\uD834\\uDDB9-", NULL,
269 UCOL_TERTIARY, NULL, {3, -1}, {4}},
270 {"and,\\uD834\\uDDB9,this sentence", ",\\uD834\\uDDB9,", NULL,
271 UCOL_TERTIARY, NULL, {3, -1}, {4}},
272 {"and?\\uD834\\uDDB9?this sentence", "?\\uD834\\uDDB9?", NULL,
273 UCOL_TERTIARY, NULL, {3, -1}, {4}},
274 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
275 };
276
/*
 * Collation rule text defining the multi-character sequences "ab" (with
 * expansion "/c"), "AB", "X" + U+0300, "ABC" and "X" + U+0300 + U+0315.
 * NOTE(review): presumably the tailoring used with the CONTRACTION tables --
 * confirm in the including test.
 */
static const char *CONTRACTIONRULE =
    "&z = ab/c < AB < X\\u0300 < ABC < X\\u0300\\u0315";
279
280 static const SearchData CONTRACTION[] = {
281 /* common discontiguous */
282 {"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
283
284 #if GRAPHEME_BOUNDARIES
285 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
286 #else
287 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
288 #endif
289
290 /* contraction prefix */
291 {"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
292
293 #if GRAPHEME_BOUNDARIES
294 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
295 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
296 #else
297 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
298 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}},
299 #endif
300
301 /* discontiguous problem here for backwards iteration.
302 accents not found because discontiguous stores all information */
303 {"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {-1},
304 {0}},
305 /* ends not with a contraction character */
306 {"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1},
307 {0}},
308 {"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL,
309 {0, -1}, {3}},
310 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {-1},
311 {0}},
312 /* blocked discontiguous */
313 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL,
314 {-1}, {0}},
315
316 #if GRAPHEME_BOUNDARIES
317 /*
318 * "ab" generates a contraction that's an expansion. The "z" matches the
319 * first CE of the expansion but the match fails because it ends in the
320 * middle of an expansion...
321 */
322 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
323 #else
324 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
325 #endif
326
327 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
328 };
329
/*
 * Collation rule text that makes U+0300 equal to "a".
 * NOTE(review): presumably the tailoring used with the IGNORABLE table --
 * confirm in the including test.
 */
static const char *IGNORABLERULE = "&a = \\u0300";
331
332 static const SearchData IGNORABLE[] = {
333 #if GRAPHEME_BOUNDARIES
334 /*
335 * This isn't much of a test when matches have to be on
336 * grapheme boundiaries. The match at 0 only works because
337 * it's at the start of the text.
338 */
339 {"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL,
340 {0, -1}, {2}},
341 #else
342 {"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL,
343 {0, 3, -1}, {2, 2}},
344 #endif
345
346 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
347 };
348
349 static const SearchData BASICCANONICAL[] = {
350 {"xxxxxxxxxxxxxxxxxxxx", "fisher", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
351 {"silly spring string", "string", NULL, UCOL_TERTIARY, NULL, {13, -1},
352 {6}},
353 {"silly spring string string", "string", NULL, UCOL_TERTIARY, NULL,
354 {13, 20, -1}, {6, 6}},
355 {"silly string spring string", "string", NULL, UCOL_TERTIARY, NULL,
356 {6, 20, -1}, {6, 6}},
357 {"string spring string", "string", NULL, UCOL_TERTIARY, NULL, {0, 14, -1},
358 {6, 6}},
359 {"Scott Ganyo", "c", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
360 {"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}},
361
362 #if GRAPHEME_BOUNDARIES
363 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
364 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
365 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
366 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
367 {"a\\u0300\\u0325b", "\\u0300b", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
368 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL, UCOL_TERTIARY,
369 NULL, {-1}, {0}},
370 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL, UCOL_TERTIARY,
371 NULL, {-1}, {0}},
372 {"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
373 "\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
374 #else
375 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
376 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
377 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1},
378 {2}},
379 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
380 {"a\\u0300\\u0325b", "\\u0300b", NULL, UCOL_TERTIARY, NULL, {1, -1}, {3}},
381 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL, UCOL_TERTIARY,
382 NULL, {0, -1}, {5}},
383 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL, UCOL_TERTIARY,
384 NULL, {0, -1}, {5}},
385 {"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
386 "\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {1, 12, -1}, {5, 3}},
387 #endif
388
389 {"\\u00c4\\u0323", "A\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
390 {"\\u0308\\u0323", "\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
391 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
392 };
393
394
395 static const SearchData NORMCANONICAL[] = {
396 #if GRAPHEME_BOUNDARIES
397 /*
398 * These tests don't really mean anything. With matches restricted to grapheme
399 * boundaries, isCanonicalMatch doesn't mean anything unless normalization is
400 * also turned on...
401 */
402 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
403 {"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
404 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
405 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
406 {"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
407 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
408 #else
409 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
410 {"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
411 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1},
412 {2}},
413 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1},
414 {2}},
415 {"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
416 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
417 #endif
418
419 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
420 };
421
422 static const SearchData BREAKITERATORCANONICAL[] = {
423 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "characterbreaker", {0, 5, -1},
424 {3, 3}},
425 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "wordbreaker", {5, -1}, {3}},
426 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY,
427 "characterbreaker", {10, 14, -1}, {3, 2}},
428 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, "wordbreaker",
429 {10, -1}, {3}},
430 {"Channel, another channel, more channels, and one last Channel",
431 "Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}},
432 /* jitterbug 1745 */
433 {"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY,
434 "characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
435 {"testing that string ab\\u00e9cd does not match e", "e", NULL,
436 UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
437 {"\\u00c9", "e", "fr", UCOL_PRIMARY, "characterbreaker", {0, -1}, {1}},
438 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
439 };
440
441 static const SearchData STRENGTHCANONICAL[] = {
442 /*012345678901234567890123456789012345678901234567890123456789 */
443 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
444 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}},
445 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
446 UCOL_PRIMARY, "wordbreaker", {16, -1}, {3}},
447 {"blackbirds Pat p\\u00E9ch\\u00E9 p\\u00EAche p\\u00E9cher p\\u00EAcher Tod T\\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
448 "peche", "fr", UCOL_PRIMARY, NULL, {15, 21, 27, 34, -1}, {5, 5, 5, 5}},
449 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, NULL,
450 {10, 14, -1}, {3, 2}},
451 {"A channel, another CHANNEL, more Channels, and one last channel...",
452 "channel", "es", UCOL_PRIMARY, NULL, {2, 19, 33, 56, -1},
453 {7, 7, 7, 7}},
454 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
455 };
456
457 static const SearchData VARIABLECANONICAL[] = {
458 /*012345678901234567890123456789012345678901234567890123456789 */
459 {"blackbirds black blackbirds blackbird black-bird",
460 "blackbird", NULL, UCOL_TERTIARY, NULL, {0, 17, 28, 38, -1},
461 {9, 9, 9, 10}},
462 /* to see that it doesn't go into an infinite loop if the start of text
463 is a ignorable character */
464 {" on", "go", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
465 {"abcdefghijklmnopqrstuvwxyz", " ", NULL, UCOL_PRIMARY, NULL,
466 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
467 20, 21, 22, 23, 24, 25, -1}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
469 /* testing tightest match */
470 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_QUATERNARY,
471 NULL, {1, -1}, {3}},
472 /*012345678901234567890123456789012345678901234567890123456789 */
473 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_SECONDARY,
474 NULL, {1, 6, 13, 21, 31, -1}, {3, 4, 4, 5, 5}},
475 /* totally ignorable text */
476 {" ---------------", "abc", NULL, UCOL_SECONDARY,
477 NULL, {-1}, {0}},
478 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
479 };
480
481 static const SearchData OVERLAPCANONICAL[] = {
482 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 2, 4, -1},
483 {4, 4, 4}},
484 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
485 };
486
487 static const SearchData NONOVERLAPCANONICAL[] = {
488 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 4, -1}, {4, 4}},
489 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
490 };
491
492 static const SearchData COLLATORCANONICAL[] = {
493 /* english */
494 {"fox fpx", "fox", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
495 /* tailored */
496 {"fox fpx", "fox", NULL, UCOL_PRIMARY, NULL, {0, 4, -1}, {3, 3}},
497 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
498 };
499
500 static const SearchData PATTERNCANONICAL[] = {
501 {"The quick brown fox jumps over the lazy foxes", "the", NULL,
502 UCOL_PRIMARY, NULL, {0, 31, -1}, {3, 3}},
503 {"The quick brown fox jumps over the lazy foxes", "fox", NULL,
504 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}},
505 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
506 };
507
508 static const SearchData TEXTCANONICAL[] = {
509 {"the foxy brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {4, 15, -1},
510 {3, 3}},
511 {"the quick brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {16, -1},
512 {3}},
513 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
514 };
515
516 static const SearchData COMPOSITEBOUNDARIESCANONICAL[] = {
517 #if GRAPHEME_BOUNDARIES
518 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
519 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
520 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
521 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
522 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
523 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
524
525 /* first one matches only because it's at the start of the text */
526 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
527
528 /* \\u0300 blocked by \\u0300 */
529 {"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
530 #else
531 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
532 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
533 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
534 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
535 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
536 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
537 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, 1, -1},
538 {1, 1}},
539 /* \\u0300 blocked by \\u0300 */
540 {"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
541 #endif
542
543 /* A + 030A + 0301 */
544 {"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
545 {"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
546
547 #if GRAPHEME_BOUNDARIES
548 {"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
549 {"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
550 #else
551 {"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
552 {"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
553 #endif
554
555 {"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
556
557 #if GRAPHEME_BOUNDARIES
558 {"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
559 #else
560 {"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
561 #endif
562
563 /* blocked accent */
564 {"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
565 {"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
566
567 #if GRAPHEME_BOUNDARIES
568 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
569 {"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
570 {"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
571 #else
572 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
573 {"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
574 {"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
575 #endif
576
577 {"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
578
579 #if GRAPHEME_BOUNDARIES
580 {"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
581 {"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
582 #else
583 {"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
584 {"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
585 #endif
586
587 {"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
588
589 #if GRAPHEME_BOUNDARIES
590 {"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
591 {"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
592 {"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
593 NULL, UCOL_TERTIARY, NULL, {10, -1}, {2}},
594 #else
595 {"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
596 {"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
597 {"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
598 NULL, UCOL_TERTIARY, NULL, {0, 6, 10, 13, -1}, {1, 3, 2, 1}},
599 #endif
600
601 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
602 };
603
604 static const SearchData MATCHCANONICAL[] = {
605 {"a busy bee is a very busy beeee", "bee", NULL, UCOL_TERTIARY, NULL,
606 {7, 26, -1}, {3, 3}},
607 /*012345678901234567890123456789012345678901234567890 */
608 {"a busy bee is a very busy beeee with no bee life", "bee", NULL,
609 UCOL_TERTIARY, NULL, {7, 26, 40, -1}, {3, 3, 3}},
610 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
611 };
612
613 static const SearchData SUPPLEMENTARYCANONICAL[] = {
614 /*012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */
615 {"abc \\uD800\\uDC00 \\uD800\\uDC01 \\uD801\\uDC00 \\uD800\\uDC00abc abc\\uD800\\uDC00 \\uD800\\uD800\\uDC00 \\uD800\\uDC00\\uDC00",
616 "\\uD800\\uDC00", NULL, UCOL_TERTIARY, NULL, {4, 13, 22, 26, 29, -1},
617 {2, 2, 2, 2, 2}},
618 {"and\\uD834\\uDDB9this sentence", "\\uD834\\uDDB9", NULL,
619 UCOL_TERTIARY, NULL, {3, -1}, {2}},
620 {"and \\uD834\\uDDB9 this sentence", " \\uD834\\uDDB9 ", NULL,
621 UCOL_TERTIARY, NULL, {3, -1}, {4}},
622 {"and-\\uD834\\uDDB9-this sentence", "-\\uD834\\uDDB9-", NULL,
623 UCOL_TERTIARY, NULL, {3, -1}, {4}},
624 {"and,\\uD834\\uDDB9,this sentence", ",\\uD834\\uDDB9,", NULL,
625 UCOL_TERTIARY, NULL, {3, -1}, {4}},
626 {"and?\\uD834\\uDDB9?this sentence", "?\\uD834\\uDDB9?", NULL,
627 UCOL_TERTIARY, NULL, {3, -1}, {4}},
628 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
629 };
630
631 static const SearchData CONTRACTIONCANONICAL[] = {
632 /* common discontiguous */
633 #if GRAPHEME_BOUNDARIES
634 {"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
635 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
636 #else
637 {"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
638 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
639 #endif
640
641 /* contraction prefix */
642 {"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
643
644 #if GRAPHEME_BOUNDARIES
645 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
646 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
647 #else
648 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
649 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}},
650 #endif
651
652 /* discontiguous problem here for backwards iteration.
653 forwards gives 0, 4 but backwards give 1, 3 */
654 /* {"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {0, -1},
655 {4}}, */
656
657 /* ends not with a contraction character */
658 {"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
659 {"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
660
661 #if GRAPHEME_BOUNDARIES
662 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
663
664 /* blocked discontiguous */
665 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
666
667 /*
668 * "ab" generates a contraction that's an expansion. The "z" matches the
669 * first CE of the expansion but the match fails because it ends in the
670 * middle of an expansion...
671 */
672 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {2}},
673 #else
674 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {4}},
675
676 /* blocked discontiguous */
677 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {1, -1}, {4}},
678
679 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
680 #endif
681
682 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
683 };
684
685 static const SearchData DIACRITICMATCH[] = {
686 {"\\u03BA\\u03B1\\u03B9\\u0300\\u0020\\u03BA\\u03B1\\u1F76", "\\u03BA\\u03B1\\u03B9", NULL, UCOL_PRIMARY, NULL, {0, 5,-1}, {4, 3}},
687 {"\\u0061\\u0061\\u00E1", "\\u0061\\u00E1", NULL, UCOL_SECONDARY, NULL, {1, -1}, {2}},
688 {"\\u0020\\u00C2\\u0303\\u0020\\u0041\\u0061\\u1EAA\\u0041\\u0302\\u0303\\u00C2\\u0303\\u1EAB\\u0061\\u0302\\u0303\\u00E2\\u0303\\uD806\\uDC01\\u0300\\u0020",
689 "\\u00C2\\u0303", "LDE_AN_CX_EX_FX_HX_NX_S1", UCOL_PRIMARY, NULL, {1, 4, 5, 6, 7, 10, 12, 13, 16,-1}, {2, 1, 1, 1, 3, 2, 1, 3, 2}},
690 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
691 };
692
693 #endif /* #if !UCONFIG_NO_COLLATION */
694
695 #endif