1 /********************************************************************
2 * Copyright (c) 2001-2008 International Business Machines
3 * Corporation and others. All Rights Reserved.
4 ********************************************************************
6 * Modification History:
7 * Name date Description
8 * synwee July 31 2001 creation
9 ********************************************************************/
13 Note: This file is included by other C and C++ files. This file should not be directly compiled.
18 #include "unicode/ucol.h"
20 #if !UCONFIG_NO_COLLATION
22 /* Set to 1 if matches must be on grapheme boundaries */
23 #define GRAPHEME_BOUNDARIES 1
30 UCollationStrength strength
;
/* Lets the test tables in this file name the record type as plain
   `SearchData` instead of `struct SearchData`. */
37 typedef struct SearchData SearchData
;
/* Collation rule string relating o/O and p/P.
   NOTE(review): presumably the tailoring applied for the COLLATOR and
   COLLATORCANONICAL tables, whose primary-strength entry expects "fox" to
   match "fpx" as well -- confirm against the file that includes this one. */
39 static const char *TESTCOLLATORRULE
= "& o,O ; p,P";
/* Collation rule string relating the digraphs ae/oe/ue (and AE/OE/UE) to the
   umlauted letters U+00E4/U+00F6/U+00FC (and U+00C4/U+00D6/U+00DC).
   The code points are spelled as backslash-u escape text inside the C string,
   not as compiler-expanded characters.
   NOTE(review): the consumer of this rule is not visible in this file;
   presumably it unescapes the text before building a collator -- confirm. */
41 static const char *EXTRACOLLATIONRULE
= " & ae ; \\u00e4 & AE ; \\u00c4 & oe ; \\u00f6 & OE ; \\u00d6 & ue ; \\u00fc & UE ; \\u00dc";
43 static const SearchData BASIC
[] = {
44 {"xxxxxxxxxxxxxxxxxxxx", "fisher", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
45 {"silly spring string", "string", NULL
, UCOL_TERTIARY
, NULL
, {13, -1},
47 {"silly spring string string", "string", NULL
, UCOL_TERTIARY
, NULL
,
48 {13, 20, -1}, {6, 6}},
49 {"silly string spring string", "string", NULL
, UCOL_TERTIARY
, NULL
,
51 {"string spring string", "string", NULL
, UCOL_TERTIARY
, NULL
, {0, 14, -1},
53 {"Scott Ganyo", "c", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
54 {"Scott Ganyo", " ", NULL
, UCOL_TERTIARY
, NULL
, {5, -1}, {1}},
55 {"\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
56 {"a\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
58 #if GRAPHEME_BOUNDARIES
59 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
60 {"a\\u0300b", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
62 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
63 {"a\\u0300b", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
66 {"\\u00c9", "e", NULL
, UCOL_PRIMARY
, NULL
, {0, -1}, {1}},
67 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
70 static const SearchData BREAKITERATOREXACT
[] = {
71 {"foxy fox", "fox", NULL
, UCOL_TERTIARY
, "characterbreaker", {0, 5, -1},
73 {"foxy fox", "fox", NULL
, UCOL_TERTIARY
, "wordbreaker", {5, -1}, {3}},
74 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY
,
75 "characterbreaker", {10, 14, -1}, {3, 2}},
76 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY
, "wordbreaker",
78 {"Channel, another channel, more channels, and one last Channel",
79 "Channel", "es", UCOL_TERTIARY
, "wordbreaker", {0, 54, -1}, {7, 7}},
81 {"testing that \\u00e9 does not match e", "e", NULL
, UCOL_TERTIARY
,
82 "characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
83 {"testing that string ab\\u00e9cd does not match e", "e", NULL
,
84 UCOL_TERTIARY
, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
85 {"\\u00c9", "e", "fr", UCOL_PRIMARY
, "characterbreaker", {0, -1}, {1}},
87 /* Problem reported by Dave Bertoni, same as ticket 4279? */
88 {"\\u0043\\u004F\\u0302\\u0054\\u00C9", "\\u004F", NULL
, UCOL_TERTIARY
, "characterbreaker", {1, -1}, {2}},
90 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
93 static const SearchData STRENGTH
[] = {
94 /*012345678901234567890123456789012345678901234567890123456789*/
95 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
96 UCOL_PRIMARY
, NULL
, {16, 40, -1}, {3, 3}},
97 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
98 UCOL_PRIMARY
, "wordbreaker", {16, -1}, {3}},
99 {"blackbirds Pat p\\u00E9ch\\u00E9 p\\u00EAche p\\u00E9cher p\\u00EAcher Tod T\\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
100 "peche", "fr", UCOL_PRIMARY
, NULL
, {15, 21, 27, 34, -1}, {5, 5, 5, 5}},
101 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY
, NULL
,
102 {10, 14, -1}, {3, 2}},
103 {"A channel, another CHANNEL, more Channels, and one last channel...",
104 "channel", "es", UCOL_PRIMARY
, NULL
, {2, 19, 33, 56, -1},
106 {"\\u00c0 should match but not A", "A\\u0300", "en", UCOL_IDENTICAL
,
107 NULL
, {0, -1}, {1, 0}},
111 {"12\\u0171", "\\u0170", NULL
, UCOL_SECONDARY
, NULL
, {2, -1}, {2}},
114 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
117 static const SearchData VARIABLE
[] = {
118 /*012345678901234567890123456789012345678901234567890123456789*/
119 {"blackbirds black blackbirds blackbird black-bird",
120 "blackbird", NULL
, UCOL_TERTIARY
, NULL
, {0, 17, 28, 38, -1},
122 /* to see that it doesn't go into an infinite loop if the start of text
123 is an ignorable character */
124 {" on", "go", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
125 {"abcdefghijklmnopqrstuvwxyz", " ", NULL
, UCOL_PRIMARY
, NULL
,
126 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
127 20, 21, 22, 23, 24, 25, -1}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
129 /* testing tightest match */
130 {" abc a bc ab c a bc ab c", "abc", NULL
, UCOL_QUATERNARY
,
132 /*012345678901234567890123456789012345678901234567890123456789 */
133 {" abc a bc ab c a bc ab c", "abc", NULL
, UCOL_SECONDARY
,
134 NULL
, {1, 6, 13, 21, 31, -1}, {3, 4, 4, 5, 5}},
135 /* totally ignorable text */
136 {" ---------------", "abc", NULL
, UCOL_SECONDARY
,
138 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
141 static const SearchData NORMEXACT
[] = {
142 {"a\\u0300\\u0325", "a\\u0325\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {3}},
144 #if GRAPHEME_BOUNDARIES
145 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
147 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
150 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
153 static const SearchData NONNORMEXACT
[] = {
154 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
155 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
158 static const SearchData OVERLAP
[] = {
159 {"abababab", "abab", NULL
, UCOL_TERTIARY
, NULL
, {0, 2, 4, -1},
161 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
164 static const SearchData NONOVERLAP
[] = {
165 {"abababab", "abab", NULL
, UCOL_TERTIARY
, NULL
, {0, 4, -1}, {4, 4}},
166 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
169 static const SearchData COLLATOR
[] = {
171 {"fox fpx", "fox", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {3}},
173 {"fox fpx", "fox", NULL
, UCOL_PRIMARY
, NULL
, {0, 4, -1}, {3, 3}},
174 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
177 static const SearchData PATTERN
[] = {
178 {"The quick brown fox jumps over the lazy foxes", "the", NULL
,
179 UCOL_PRIMARY
, NULL
, {0, 31, -1}, {3, 3}},
180 {"The quick brown fox jumps over the lazy foxes", "fox", NULL
,
181 UCOL_PRIMARY
, NULL
, {16, 40, -1}, {3, 3}},
182 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
185 static const SearchData TEXT
[] = {
186 {"the foxy brown fox", "fox", NULL
, UCOL_TERTIARY
, NULL
, {4, 15, -1},
188 {"the quick brown fox", "fox", NULL
, UCOL_TERTIARY
, NULL
, {16, -1},
190 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
193 static const SearchData COMPOSITEBOUNDARIES
[] = {
194 #if GRAPHEME_BOUNDARIES
195 {"\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
196 {"A\\u00C0C", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
197 {"\\u00C0A", "A", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
198 {"B\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
199 {"\\u00C0B", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
200 {"\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
201 {"\\u0300\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
203 {"\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
204 {"A\\u00C0C", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, 1, -1}, {1, 1}},
205 {"\\u00C0A", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, 1, -1}, {1, 1}},
206 {"B\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
207 {"\\u00C0B", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
208 {"\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
209 {"\\u0300\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, 1, -1},
213 {"\\u00C0\\u0300", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
214 /* A + 030A + 0301 */
215 {"\\u01FA", "\\u01FA", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
216 {"\\u01FA", "A\\u030A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
217 {"\\u01FA", "\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
218 {"\\u01FA", "A\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
219 {"\\u01FA", "\\u030AA", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
220 {"\\u01FA", "\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
221 {"\\u01FA", "A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
222 {"\\u01FA", "\\u0301A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
224 #if GRAPHEME_BOUNDARIES
225 {"\\u01FA", "\\u030A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
227 {"\\u01FA", "\\u030A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
230 {"A\\u01FA", "A\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
231 {"\\u01FAA", "\\u0301A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
232 {"\\u0F73", "\\u0F73", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
233 {"\\u0F73", "\\u0F71", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
234 {"\\u0F73", "\\u0F72", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
235 {"\\u0F73", "\\u0F71\\u0F72", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
236 {"A\\u0F73", "A\\u0F71", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
237 {"\\u0F73A", "\\u0F72A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
240 {"a\\u00e1", "a\\u00e1", NULL
, UCOL_SECONDARY
, NULL
, {0, -1}, {2}},
243 {"fu\\u00dfball", "fu\\u00df", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {3}},
244 {"fu\\u00dfball", "fuss", NULL
, UCOL_PRIMARY
, NULL
, {0, -1}, {3}},
245 {"fu\\u00dfball", "uss", NULL
, UCOL_PRIMARY
, NULL
, {1, -1}, {2}},
247 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
250 static const SearchData MATCH
[] = {
251 {"a busy bee is a very busy beeee", "bee", NULL
, UCOL_TERTIARY
, NULL
,
252 {7, 26, -1}, {3, 3}},
253 /* 012345678901234567890123456789012345678901234567890 */
254 {"a busy bee is a very busy beeee with no bee life", "bee", NULL
,
255 UCOL_TERTIARY
, NULL
, {7, 26, 40, -1}, {3, 3, 3}},
256 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
259 static const SearchData SUPPLEMENTARY
[] = {
260 /* 012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */
261 {"abc \\uD800\\uDC00 \\uD800\\uDC01 \\uD801\\uDC00 \\uD800\\uDC00abc abc\\uD800\\uDC00 \\uD800\\uD800\\uDC00 \\uD800\\uDC00\\uDC00",
262 "\\uD800\\uDC00", NULL
, UCOL_TERTIARY
, NULL
, {4, 13, 22, 26, 29, -1},
264 {"and\\uD834\\uDDB9this sentence", "\\uD834\\uDDB9", NULL
,
265 UCOL_TERTIARY
, NULL
, {3, -1}, {2}},
266 {"and \\uD834\\uDDB9 this sentence", " \\uD834\\uDDB9 ", NULL
,
267 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
268 {"and-\\uD834\\uDDB9-this sentence", "-\\uD834\\uDDB9-", NULL
,
269 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
270 {"and,\\uD834\\uDDB9,this sentence", ",\\uD834\\uDDB9,", NULL
,
271 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
272 {"and?\\uD834\\uDDB9?this sentence", "?\\uD834\\uDDB9?", NULL
,
273 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
274 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
/* Collation rule string that defines the multi-character sequences "ab"
   (equal to z, with an expansion), "AB", "ABC", X + U+0300 and
   X + U+0300 + U+0315.
   NOTE(review): presumably the tailoring behind the CONTRACTION and
   CONTRACTIONCANONICAL tables -- confirm against the including test file. */
277 static const char *CONTRACTIONRULE
=
278 "&z = ab/c < AB < X\\u0300 < ABC < X\\u0300\\u0315";
280 static const SearchData CONTRACTION
[] = {
281 /* common discontiguous */
282 {"A\\u0300\\u0315", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
284 #if GRAPHEME_BOUNDARIES
285 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
287 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
290 /* contraction prefix */
291 {"AB\\u0315C", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
293 #if GRAPHEME_BOUNDARIES
294 {"AB\\u0315C", "AB", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
295 {"AB\\u0315C", "\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
297 {"AB\\u0315C", "AB", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
298 {"AB\\u0315C", "\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {2, -1}, {1}},
301 /* discontiguous problem here for backwards iteration.
302 accents not found because discontiguous stores all information */
303 {"X\\u0300\\u0319\\u0315", "\\u0319", NULL
, UCOL_TERTIARY
, NULL
, {-1},
305 /* ends not with a contraction character */
306 {"X\\u0315\\u0300D", "\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {-1},
308 {"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
,
310 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1},
312 /* blocked discontiguous */
313 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL
, UCOL_TERTIARY
, NULL
,
316 #if GRAPHEME_BOUNDARIES
318 * "ab" generates a contraction that's an expansion. The "z" matches the
319 * first CE of the expansion but the match fails because it ends in the
320 * middle of an expansion...
322 {"ab", "z", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
324 {"ab", "z", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
327 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
/* Collation rule string that makes U+0300 (combining grave accent) equal
   to the letter 'a'.
   NOTE(review): presumably the tailoring used with the IGNORABLE table --
   confirm against the including test file. */
330 static const char *IGNORABLERULE
= "&a = \\u0300";
332 static const SearchData IGNORABLE
[] = {
333 #if GRAPHEME_BOUNDARIES
335 * This isn't much of a test when matches have to be on
336 * grapheme boundaries. The match at 0 only works because
337 * it's at the start of the text.
339 {"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL
, UCOL_PRIMARY
, NULL
,
342 {"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL
, UCOL_PRIMARY
, NULL
,
346 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
349 static const SearchData BASICCANONICAL
[] = {
350 {"xxxxxxxxxxxxxxxxxxxx", "fisher", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
351 {"silly spring string", "string", NULL
, UCOL_TERTIARY
, NULL
, {13, -1},
353 {"silly spring string string", "string", NULL
, UCOL_TERTIARY
, NULL
,
354 {13, 20, -1}, {6, 6}},
355 {"silly string spring string", "string", NULL
, UCOL_TERTIARY
, NULL
,
356 {6, 20, -1}, {6, 6}},
357 {"string spring string", "string", NULL
, UCOL_TERTIARY
, NULL
, {0, 14, -1},
359 {"Scott Ganyo", "c", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
360 {"Scott Ganyo", " ", NULL
, UCOL_TERTIARY
, NULL
, {5, -1}, {1}},
362 #if GRAPHEME_BOUNDARIES
363 {"\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
364 {"a\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
365 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
366 {"a\\u0300b", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
367 {"a\\u0300\\u0325b", "\\u0300b", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
368 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL
, UCOL_TERTIARY
,
370 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL
, UCOL_TERTIARY
,
372 {"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
373 "\\u0300b\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
375 {"\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
376 {"a\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
377 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {1, -1},
379 {"a\\u0300b", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
380 {"a\\u0300\\u0325b", "\\u0300b", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {3}},
381 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL
, UCOL_TERTIARY
,
383 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL
, UCOL_TERTIARY
,
385 {"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
386 "\\u0300b\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {1, 12, -1}, {5, 3}},
389 {"\\u00c4\\u0323", "A\\u0323\\u0308", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
390 {"\\u0308\\u0323", "\\u0323\\u0308", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
391 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
395 static const SearchData NORMCANONICAL
[] = {
396 #if GRAPHEME_BOUNDARIES
398 * These tests don't really mean anything. With matches restricted to grapheme
399 * boundaries, isCanonicalMatch doesn't mean anything unless normalization is
402 {"\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
403 {"\\u0300\\u0325", "\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
404 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
405 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
406 {"a\\u0300\\u0325", "\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
407 {"a\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
409 {"\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
410 {"\\u0300\\u0325", "\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
411 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {1, -1},
413 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {1, -1},
415 {"a\\u0300\\u0325", "\\u0325", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
416 {"a\\u0300\\u0325", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
419 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
422 static const SearchData BREAKITERATORCANONICAL
[] = {
423 {"foxy fox", "fox", NULL
, UCOL_TERTIARY
, "characterbreaker", {0, 5, -1},
425 {"foxy fox", "fox", NULL
, UCOL_TERTIARY
, "wordbreaker", {5, -1}, {3}},
426 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY
,
427 "characterbreaker", {10, 14, -1}, {3, 2}},
428 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY
, "wordbreaker",
430 {"Channel, another channel, more channels, and one last Channel",
431 "Channel", "es", UCOL_TERTIARY
, "wordbreaker", {0, 54, -1}, {7, 7}},
433 {"testing that \\u00e9 does not match e", "e", NULL
, UCOL_TERTIARY
,
434 "characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
435 {"testing that string ab\\u00e9cd does not match e", "e", NULL
,
436 UCOL_TERTIARY
, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
437 {"\\u00c9", "e", "fr", UCOL_PRIMARY
, "characterbreaker", {0, -1}, {1}},
438 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
441 static const SearchData STRENGTHCANONICAL
[] = {
442 /*012345678901234567890123456789012345678901234567890123456789 */
443 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
444 UCOL_PRIMARY
, NULL
, {16, 40, -1}, {3, 3}},
445 {"The quick brown fox jumps over the lazy foxes", "fox", "en",
446 UCOL_PRIMARY
, "wordbreaker", {16, -1}, {3}},
447 {"blackbirds Pat p\\u00E9ch\\u00E9 p\\u00EAche p\\u00E9cher p\\u00EAcher Tod T\\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe",
448 "peche", "fr", UCOL_PRIMARY
, NULL
, {15, 21, 27, 34, -1}, {5, 5, 5, 5}},
449 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY
, NULL
,
450 {10, 14, -1}, {3, 2}},
451 {"A channel, another CHANNEL, more Channels, and one last channel...",
452 "channel", "es", UCOL_PRIMARY
, NULL
, {2, 19, 33, 56, -1},
454 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
457 static const SearchData VARIABLECANONICAL
[] = {
458 /*012345678901234567890123456789012345678901234567890123456789 */
459 {"blackbirds black blackbirds blackbird black-bird",
460 "blackbird", NULL
, UCOL_TERTIARY
, NULL
, {0, 17, 28, 38, -1},
462 /* to see that it doesn't go into an infinite loop if the start of text
463 is an ignorable character */
464 {" on", "go", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
465 {"abcdefghijklmnopqrstuvwxyz", " ", NULL
, UCOL_PRIMARY
, NULL
,
466 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
467 20, 21, 22, 23, 24, 25, -1}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
469 /* testing tightest match */
470 {" abc a bc ab c a bc ab c", "abc", NULL
, UCOL_QUATERNARY
,
472 /*012345678901234567890123456789012345678901234567890123456789 */
473 {" abc a bc ab c a bc ab c", "abc", NULL
, UCOL_SECONDARY
,
474 NULL
, {1, 6, 13, 21, 31, -1}, {3, 4, 4, 5, 5}},
475 /* totally ignorable text */
476 {" ---------------", "abc", NULL
, UCOL_SECONDARY
,
478 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
481 static const SearchData OVERLAPCANONICAL
[] = {
482 {"abababab", "abab", NULL
, UCOL_TERTIARY
, NULL
, {0, 2, 4, -1},
484 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
487 static const SearchData NONOVERLAPCANONICAL
[] = {
488 {"abababab", "abab", NULL
, UCOL_TERTIARY
, NULL
, {0, 4, -1}, {4, 4}},
489 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
492 static const SearchData COLLATORCANONICAL
[] = {
494 {"fox fpx", "fox", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {3}},
496 {"fox fpx", "fox", NULL
, UCOL_PRIMARY
, NULL
, {0, 4, -1}, {3, 3}},
497 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
500 static const SearchData PATTERNCANONICAL
[] = {
501 {"The quick brown fox jumps over the lazy foxes", "the", NULL
,
502 UCOL_PRIMARY
, NULL
, {0, 31, -1}, {3, 3}},
503 {"The quick brown fox jumps over the lazy foxes", "fox", NULL
,
504 UCOL_PRIMARY
, NULL
, {16, 40, -1}, {3, 3}},
505 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
508 static const SearchData TEXTCANONICAL
[] = {
509 {"the foxy brown fox", "fox", NULL
, UCOL_TERTIARY
, NULL
, {4, 15, -1},
511 {"the quick brown fox", "fox", NULL
, UCOL_TERTIARY
, NULL
, {16, -1},
513 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
516 static const SearchData COMPOSITEBOUNDARIESCANONICAL
[] = {
517 #if GRAPHEME_BOUNDARIES
518 {"\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
519 {"A\\u00C0C", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
520 {"\\u00C0A", "A", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
521 {"B\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
522 {"\\u00C0B", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
523 {"\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
525 /* first one matches only because it's at the start of the text */
526 {"\\u0300\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
528 /* \\u0300 blocked by \\u0300 */
529 {"\\u00C0\\u0300", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
531 {"\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
532 {"A\\u00C0C", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, 1, -1}, {1, 1}},
533 {"\\u00C0A", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, 1, -1}, {1, 1}},
534 {"B\\u00C0", "A", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
535 {"\\u00C0B", "A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
536 {"\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
537 {"\\u0300\\u00C0", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, 1, -1},
539 /* \\u0300 blocked by \\u0300 */
540 {"\\u00C0\\u0300", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
543 /* A + 030A + 0301 */
544 {"\\u01FA", "\\u01FA", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
545 {"\\u01FA", "A\\u030A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
547 #if GRAPHEME_BOUNDARIES
548 {"\\u01FA", "\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
549 {"\\u01FA", "A\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
551 {"\\u01FA", "\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
552 {"\\u01FA", "A\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
555 {"\\u01FA", "\\u030AA", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
557 #if GRAPHEME_BOUNDARIES
558 {"\\u01FA", "\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
560 {"\\u01FA", "\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
564 {"\\u01FA", "A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
565 {"\\u01FA", "\\u0301A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
567 #if GRAPHEME_BOUNDARIES
568 {"\\u01FA", "\\u030A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
569 {"A\\u01FA", "A\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
570 {"\\u01FAA", "\\u0301A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
572 {"\\u01FA", "\\u030A\\u0301", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
573 {"A\\u01FA", "A\\u030A", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {1}},
574 {"\\u01FAA", "\\u0301A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
577 {"\\u0F73", "\\u0F73", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
579 #if GRAPHEME_BOUNDARIES
580 {"\\u0F73", "\\u0F71", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
581 {"\\u0F73", "\\u0F72", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
583 {"\\u0F73", "\\u0F71", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
584 {"\\u0F73", "\\u0F72", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
587 {"\\u0F73", "\\u0F71\\u0F72", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {1}},
589 #if GRAPHEME_BOUNDARIES
590 {"A\\u0F73", "A\\u0F71", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
591 {"\\u0F73A", "\\u0F72A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
592 {"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
593 NULL
, UCOL_TERTIARY
, NULL
, {10, -1}, {2}},
595 {"A\\u0F73", "A\\u0F71", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
596 {"\\u0F73A", "\\u0F72A", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
597 {"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
598 NULL
, UCOL_TERTIARY
, NULL
, {0, 6, 10, 13, -1}, {1, 3, 2, 1}},
601 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
604 static const SearchData MATCHCANONICAL
[] = {
605 {"a busy bee is a very busy beeee", "bee", NULL
, UCOL_TERTIARY
, NULL
,
606 {7, 26, -1}, {3, 3}},
607 /*012345678901234567890123456789012345678901234567890 */
608 {"a busy bee is a very busy beeee with no bee life", "bee", NULL
,
609 UCOL_TERTIARY
, NULL
, {7, 26, 40, -1}, {3, 3, 3}},
610 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
613 static const SearchData SUPPLEMENTARYCANONICAL
[] = {
614 /*012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */
615 {"abc \\uD800\\uDC00 \\uD800\\uDC01 \\uD801\\uDC00 \\uD800\\uDC00abc abc\\uD800\\uDC00 \\uD800\\uD800\\uDC00 \\uD800\\uDC00\\uDC00",
616 "\\uD800\\uDC00", NULL
, UCOL_TERTIARY
, NULL
, {4, 13, 22, 26, 29, -1},
618 {"and\\uD834\\uDDB9this sentence", "\\uD834\\uDDB9", NULL
,
619 UCOL_TERTIARY
, NULL
, {3, -1}, {2}},
620 {"and \\uD834\\uDDB9 this sentence", " \\uD834\\uDDB9 ", NULL
,
621 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
622 {"and-\\uD834\\uDDB9-this sentence", "-\\uD834\\uDDB9-", NULL
,
623 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
624 {"and,\\uD834\\uDDB9,this sentence", ",\\uD834\\uDDB9,", NULL
,
625 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
626 {"and?\\uD834\\uDDB9?this sentence", "?\\uD834\\uDDB9?", NULL
,
627 UCOL_TERTIARY
, NULL
, {3, -1}, {4}},
628 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
631 static const SearchData CONTRACTIONCANONICAL
[] = {
632 /* common discontiguous */
633 #if GRAPHEME_BOUNDARIES
634 {"A\\u0300\\u0315", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
635 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
637 {"A\\u0300\\u0315", "\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
638 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {2}},
641 /* contraction prefix */
642 {"AB\\u0315C", "A", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
644 #if GRAPHEME_BOUNDARIES
645 {"AB\\u0315C", "AB", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
646 {"AB\\u0315C", "\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
648 {"AB\\u0315C", "AB", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
649 {"AB\\u0315C", "\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {2, -1}, {1}},
652 /* discontiguous problem here for backwards iteration.
653 forwards gives 0, 4 but backwards gives 1, 3 */
654 /* {"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {0, -1},
657 /* ends not with a contraction character */
658 {"X\\u0315\\u0300D", "\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
659 {"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {3}},
661 #if GRAPHEME_BOUNDARIES
662 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
664 /* blocked discontiguous */
665 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}},
668 * "ab" generates a contraction that's an expansion. The "z" matches the
669 * first CE of the expansion but the match fails because it ends in the
670 * middle of an expansion...
672 {"ab", "z", NULL
, UCOL_TERTIARY
, NULL
, {-1}, {2}},
674 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {4}},
676 /* blocked discontiguous */
677 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL
, UCOL_TERTIARY
, NULL
, {1, -1}, {4}},
679 {"ab", "z", NULL
, UCOL_TERTIARY
, NULL
, {0, -1}, {2}},
682 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
685 static const SearchData DIACRITICMATCH
[] = {
686 {"\\u03BA\\u03B1\\u03B9\\u0300\\u0020\\u03BA\\u03B1\\u1F76", "\\u03BA\\u03B1\\u03B9", NULL
, UCOL_PRIMARY
, NULL
, {0, 5,-1}, {4, 3}},
687 {"\\u0061\\u0061\\u00E1", "\\u0061\\u00E1", NULL
, UCOL_SECONDARY
, NULL
, {1, -1}, {2}},
688 {"\\u0020\\u00C2\\u0303\\u0020\\u0041\\u0061\\u1EAA\\u0041\\u0302\\u0303\\u00C2\\u0303\\u1EAB\\u0061\\u0302\\u0303\\u00E2\\u0303\\uD806\\uDC01\\u0300\\u0020",
689 "\\u00C2\\u0303", "LDE_AN_CX_EX_FX_HX_NX_S1", UCOL_PRIMARY
, NULL
, {1, 4, 5, 6, 7, 10, 12, 13, 16,-1}, {2, 1, 1, 1, 3, 2, 1, 3, 2}},
690 {NULL
, NULL
, NULL
, UCOL_TERTIARY
, NULL
, {-1}, {0}}
693 #endif /* #if !UCONFIG_NO_COLLATION */