]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
b331163b | 3 | * Copyright (C) 1996-2015, International Business Machines |
57a6839d A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * collationcompare.cpp | |
7 | * | |
8 | * created on: 2012feb14 with new and old collation code | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | ||
14 | #if !UCONFIG_NO_COLLATION | |
15 | ||
16 | #include "unicode/ucol.h" | |
17 | #include "cmemory.h" | |
18 | #include "collation.h" | |
19 | #include "collationcompare.h" | |
20 | #include "collationiterator.h" | |
21 | #include "collationsettings.h" | |
22 | #include "uassert.h" | |
23 | ||
24 | U_NAMESPACE_BEGIN | |
25 | ||
26 | UCollationResult | |
27 | CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right, | |
28 | const CollationSettings &settings, | |
29 | UErrorCode &errorCode) { | |
30 | if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } | |
31 | ||
32 | int32_t options = settings.options; | |
33 | uint32_t variableTop; | |
34 | if((options & CollationSettings::ALTERNATE_MASK) == 0) { | |
35 | variableTop = 0; | |
36 | } else { | |
37 | // +1 so that we can use "<" and primary ignorables test out early. | |
38 | variableTop = settings.variableTop + 1; | |
39 | } | |
40 | UBool anyVariable = FALSE; | |
41 | ||
42 | // Fetch CEs, compare primaries, store secondary & tertiary weights. | |
43 | U_ALIGN_CODE(16); | |
44 | for(;;) { | |
45 | // We fetch CEs until we get a non-ignorable primary or reach the end. | |
46 | uint32_t leftPrimary; | |
47 | do { | |
48 | int64_t ce = left.nextCE(errorCode); | |
49 | leftPrimary = (uint32_t)(ce >> 32); | |
50 | if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { | |
51 | // Variable CE, shift it to quaternary level. | |
52 | // Ignore all following primary ignorables, and shift further variable CEs. | |
53 | anyVariable = TRUE; | |
54 | do { | |
55 | // Store only the primary of the variable CE. | |
56 | left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); | |
57 | for(;;) { | |
58 | ce = left.nextCE(errorCode); | |
59 | leftPrimary = (uint32_t)(ce >> 32); | |
60 | if(leftPrimary == 0) { | |
61 | left.setCurrentCE(0); | |
62 | } else { | |
63 | break; | |
64 | } | |
65 | } | |
66 | } while(leftPrimary < variableTop && | |
67 | leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); | |
68 | } | |
69 | } while(leftPrimary == 0); | |
70 | ||
71 | uint32_t rightPrimary; | |
72 | do { | |
73 | int64_t ce = right.nextCE(errorCode); | |
74 | rightPrimary = (uint32_t)(ce >> 32); | |
75 | if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { | |
76 | // Variable CE, shift it to quaternary level. | |
77 | // Ignore all following primary ignorables, and shift further variable CEs. | |
78 | anyVariable = TRUE; | |
79 | do { | |
80 | // Store only the primary of the variable CE. | |
81 | right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); | |
82 | for(;;) { | |
83 | ce = right.nextCE(errorCode); | |
84 | rightPrimary = (uint32_t)(ce >> 32); | |
85 | if(rightPrimary == 0) { | |
86 | right.setCurrentCE(0); | |
87 | } else { | |
88 | break; | |
89 | } | |
90 | } | |
91 | } while(rightPrimary < variableTop && | |
92 | rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); | |
93 | } | |
94 | } while(rightPrimary == 0); | |
95 | ||
96 | if(leftPrimary != rightPrimary) { | |
97 | // Return the primary difference, with script reordering. | |
b331163b A |
98 | if(settings.hasReordering()) { |
99 | leftPrimary = settings.reorder(leftPrimary); | |
100 | rightPrimary = settings.reorder(rightPrimary); | |
57a6839d A |
101 | } |
102 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | |
103 | } | |
104 | if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } | |
105 | } | |
106 | if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } | |
107 | ||
108 | // Compare the buffered secondary & tertiary weights. | |
109 | // We might skip the secondary level but continue with the case level | |
110 | // which is turned on separately. | |
111 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | |
112 | if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { | |
113 | int32_t leftIndex = 0; | |
114 | int32_t rightIndex = 0; | |
115 | for(;;) { | |
116 | uint32_t leftSecondary; | |
117 | do { | |
118 | leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; | |
119 | } while(leftSecondary == 0); | |
120 | ||
121 | uint32_t rightSecondary; | |
122 | do { | |
123 | rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16; | |
124 | } while(rightSecondary == 0); | |
125 | ||
126 | if(leftSecondary != rightSecondary) { | |
127 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
128 | } | |
129 | if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } | |
130 | } | |
131 | } else { | |
132 | // The backwards secondary level compares secondary weights backwards | |
133 | // within segments separated by the merge separator (U+FFFE, weight 02). | |
134 | int32_t leftStart = 0; | |
135 | int32_t rightStart = 0; | |
136 | for(;;) { | |
137 | // Find the merge separator or the NO_CE terminator. | |
b331163b | 138 | uint32_t p; |
57a6839d | 139 | int32_t leftLimit = leftStart; |
b331163b A |
140 | while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) > |
141 | Collation::MERGE_SEPARATOR_PRIMARY || | |
142 | p == 0) { | |
57a6839d A |
143 | ++leftLimit; |
144 | } | |
145 | int32_t rightLimit = rightStart; | |
b331163b A |
146 | while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) > |
147 | Collation::MERGE_SEPARATOR_PRIMARY || | |
148 | p == 0) { | |
57a6839d A |
149 | ++rightLimit; |
150 | } | |
151 | ||
152 | // Compare the segments. | |
153 | int32_t leftIndex = leftLimit; | |
154 | int32_t rightIndex = rightLimit; | |
155 | for(;;) { | |
156 | int32_t leftSecondary = 0; | |
157 | while(leftSecondary == 0 && leftIndex > leftStart) { | |
158 | leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16; | |
159 | } | |
160 | ||
161 | int32_t rightSecondary = 0; | |
162 | while(rightSecondary == 0 && rightIndex > rightStart) { | |
163 | rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16; | |
164 | } | |
165 | ||
166 | if(leftSecondary != rightSecondary) { | |
167 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
168 | } | |
169 | if(leftSecondary == 0) { break; } | |
170 | } | |
171 | ||
172 | // Did we reach the end of either string? | |
173 | // Both strings have the same number of merge separators, | |
174 | // or else there would have been a primary-level difference. | |
175 | U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); | |
b331163b | 176 | if(p == Collation::NO_CE_PRIMARY) { break; } |
57a6839d A |
177 | // Skip both merge separators and continue. |
178 | leftStart = leftLimit + 1; | |
179 | rightStart = rightLimit + 1; | |
180 | } | |
181 | } | |
182 | } | |
183 | ||
184 | if((options & CollationSettings::CASE_LEVEL) != 0) { | |
185 | int32_t strength = CollationSettings::getStrength(options); | |
186 | int32_t leftIndex = 0; | |
187 | int32_t rightIndex = 0; | |
188 | for(;;) { | |
189 | uint32_t leftCase, leftLower32, rightCase; | |
190 | if(strength == UCOL_PRIMARY) { | |
191 | // Primary+caseLevel: Ignore case level weights of primary ignorables. | |
192 | // Otherwise we would get a-umlaut > a | |
193 | // which is not desirable for accent-insensitive sorting. | |
194 | // Check for (lower 32 bits) == 0 as well because variable CEs are stored | |
195 | // with only primary weights. | |
196 | int64_t ce; | |
197 | do { | |
198 | ce = left.getCE(leftIndex++); | |
199 | leftCase = (uint32_t)ce; | |
200 | } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); | |
201 | leftLower32 = leftCase; | |
202 | leftCase &= 0xc000; | |
203 | ||
204 | do { | |
205 | ce = right.getCE(rightIndex++); | |
206 | rightCase = (uint32_t)ce; | |
207 | } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); | |
208 | rightCase &= 0xc000; | |
209 | } else { | |
210 | // Secondary+caseLevel: By analogy with the above, | |
211 | // ignore case level weights of secondary ignorables. | |
212 | // | |
213 | // Note: A tertiary CE has uppercase case bits (0.0.ut) | |
214 | // to keep tertiary+caseFirst well-formed. | |
215 | // | |
216 | // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. | |
217 | // Otherwise a tertiary CE's uppercase would be no greater than | |
218 | // a primary/secondary CE's uppercase. | |
219 | // (See UCA well-formedness condition 2.) | |
220 | // We could construct a special case weight higher than uppercase, | |
221 | // but it's simpler to always ignore case weights of secondary ignorables, | |
222 | // turning 0.0.ut into 0.0.0.t. | |
223 | // (See LDML Collation, Case Parameters.) | |
224 | do { | |
225 | leftCase = (uint32_t)left.getCE(leftIndex++); | |
226 | } while(leftCase <= 0xffff); | |
227 | leftLower32 = leftCase; | |
228 | leftCase &= 0xc000; | |
229 | ||
230 | do { | |
231 | rightCase = (uint32_t)right.getCE(rightIndex++); | |
232 | } while(rightCase <= 0xffff); | |
233 | rightCase &= 0xc000; | |
234 | } | |
235 | ||
236 | // No need to handle NO_CE and MERGE_SEPARATOR specially: | |
237 | // There is one case weight for each previous-level weight, | |
238 | // so level length differences were handled there. | |
239 | if(leftCase != rightCase) { | |
240 | if((options & CollationSettings::UPPER_FIRST) == 0) { | |
241 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | |
242 | } else { | |
243 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | |
244 | } | |
245 | } | |
246 | if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } | |
247 | } | |
248 | } | |
249 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | |
250 | ||
251 | uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); | |
252 | ||
253 | int32_t leftIndex = 0; | |
254 | int32_t rightIndex = 0; | |
255 | uint32_t anyQuaternaries = 0; | |
256 | for(;;) { | |
257 | uint32_t leftLower32, leftTertiary; | |
258 | do { | |
259 | leftLower32 = (uint32_t)left.getCE(leftIndex++); | |
260 | anyQuaternaries |= leftLower32; | |
261 | U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || | |
262 | (leftLower32 & 0xc0c0) == 0); | |
263 | leftTertiary = leftLower32 & tertiaryMask; | |
264 | } while(leftTertiary == 0); | |
265 | ||
266 | uint32_t rightLower32, rightTertiary; | |
267 | do { | |
268 | rightLower32 = (uint32_t)right.getCE(rightIndex++); | |
269 | anyQuaternaries |= rightLower32; | |
270 | U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || | |
271 | (rightLower32 & 0xc0c0) == 0); | |
272 | rightTertiary = rightLower32 & tertiaryMask; | |
273 | } while(rightTertiary == 0); | |
274 | ||
275 | if(leftTertiary != rightTertiary) { | |
276 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | |
b331163b | 277 | // Pass through NO_CE and keep real tertiary weights larger than that. |
57a6839d A |
278 | // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), |
279 | // to keep tertiary CEs well-formed. | |
280 | // Their case+tertiary weights must be greater than those of | |
281 | // primary and secondary CEs. | |
b331163b | 282 | if(leftTertiary > Collation::NO_CE_WEIGHT16) { |
57a6839d A |
283 | if(leftLower32 > 0xffff) { |
284 | leftTertiary ^= 0xc000; | |
285 | } else { | |
286 | leftTertiary += 0x4000; | |
287 | } | |
288 | } | |
b331163b | 289 | if(rightTertiary > Collation::NO_CE_WEIGHT16) { |
57a6839d A |
290 | if(rightLower32 > 0xffff) { |
291 | rightTertiary ^= 0xc000; | |
292 | } else { | |
293 | rightTertiary += 0x4000; | |
294 | } | |
295 | } | |
296 | } | |
297 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | |
298 | } | |
299 | if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } | |
300 | } | |
301 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | |
302 | ||
303 | if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { | |
304 | // If there are no "variable" CEs and no non-zero quaternary weights, | |
305 | // then there are no quaternary differences. | |
306 | return UCOL_EQUAL; | |
307 | } | |
308 | ||
309 | leftIndex = 0; | |
310 | rightIndex = 0; | |
311 | for(;;) { | |
312 | uint32_t leftQuaternary; | |
313 | do { | |
314 | int64_t ce = left.getCE(leftIndex++); | |
315 | leftQuaternary = (uint32_t)ce & 0xffff; | |
b331163b A |
316 | if(leftQuaternary <= Collation::NO_CE_WEIGHT16) { |
317 | // Variable primary or completely ignorable or NO_CE. | |
57a6839d | 318 | leftQuaternary = (uint32_t)(ce >> 32); |
57a6839d A |
319 | } else { |
320 | // Regular CE, not tertiary ignorable. | |
321 | // Preserve the quaternary weight in bits 7..6. | |
322 | leftQuaternary |= 0xffffff3f; | |
323 | } | |
324 | } while(leftQuaternary == 0); | |
325 | ||
326 | uint32_t rightQuaternary; | |
327 | do { | |
328 | int64_t ce = right.getCE(rightIndex++); | |
329 | rightQuaternary = (uint32_t)ce & 0xffff; | |
b331163b A |
330 | if(rightQuaternary <= Collation::NO_CE_WEIGHT16) { |
331 | // Variable primary or completely ignorable or NO_CE. | |
57a6839d | 332 | rightQuaternary = (uint32_t)(ce >> 32); |
57a6839d A |
333 | } else { |
334 | // Regular CE, not tertiary ignorable. | |
335 | // Preserve the quaternary weight in bits 7..6. | |
336 | rightQuaternary |= 0xffffff3f; | |
337 | } | |
338 | } while(rightQuaternary == 0); | |
339 | ||
340 | if(leftQuaternary != rightQuaternary) { | |
341 | // Return the difference, with script reordering. | |
b331163b A |
342 | if(settings.hasReordering()) { |
343 | leftQuaternary = settings.reorder(leftQuaternary); | |
344 | rightQuaternary = settings.reorder(rightQuaternary); | |
57a6839d A |
345 | } |
346 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | |
347 | } | |
b331163b | 348 | if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; } |
57a6839d A |
349 | } |
350 | return UCOL_EQUAL; | |
351 | } | |
352 | ||
353 | U_NAMESPACE_END | |
354 | ||
355 | #endif // !UCONFIG_NO_COLLATION |