]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/collationcompare.cpp
ICU-57149.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / collationcompare.cpp
... / ...
CommitLineData
1/*
2*******************************************************************************
3* Copyright (C) 1996-2015, International Business Machines
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* collationcompare.cpp
7*
8* created on: 2012feb14 with new and old collation code
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/ucol.h"
17#include "cmemory.h"
18#include "collation.h"
19#include "collationcompare.h"
20#include "collationiterator.h"
21#include "collationsettings.h"
22#include "uassert.h"
23
24U_NAMESPACE_BEGIN
25
26UCollationResult
27CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
28 const CollationSettings &settings,
29 UErrorCode &errorCode) {
30 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
31
32 int32_t options = settings.options;
33 uint32_t variableTop;
34 if((options & CollationSettings::ALTERNATE_MASK) == 0) {
35 variableTop = 0;
36 } else {
37 // +1 so that we can use "<" and primary ignorables test out early.
38 variableTop = settings.variableTop + 1;
39 }
40 UBool anyVariable = FALSE;
41
42 // Fetch CEs, compare primaries, store secondary & tertiary weights.
43 for(;;) {
44 // We fetch CEs until we get a non-ignorable primary or reach the end.
45 uint32_t leftPrimary;
46 do {
47 int64_t ce = left.nextCE(errorCode);
48 leftPrimary = (uint32_t)(ce >> 32);
49 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
50 // Variable CE, shift it to quaternary level.
51 // Ignore all following primary ignorables, and shift further variable CEs.
52 anyVariable = TRUE;
53 do {
54 // Store only the primary of the variable CE.
55 left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
56 for(;;) {
57 ce = left.nextCE(errorCode);
58 leftPrimary = (uint32_t)(ce >> 32);
59 if(leftPrimary == 0) {
60 left.setCurrentCE(0);
61 } else {
62 break;
63 }
64 }
65 } while(leftPrimary < variableTop &&
66 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
67 }
68 } while(leftPrimary == 0);
69
70 uint32_t rightPrimary;
71 do {
72 int64_t ce = right.nextCE(errorCode);
73 rightPrimary = (uint32_t)(ce >> 32);
74 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
75 // Variable CE, shift it to quaternary level.
76 // Ignore all following primary ignorables, and shift further variable CEs.
77 anyVariable = TRUE;
78 do {
79 // Store only the primary of the variable CE.
80 right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
81 for(;;) {
82 ce = right.nextCE(errorCode);
83 rightPrimary = (uint32_t)(ce >> 32);
84 if(rightPrimary == 0) {
85 right.setCurrentCE(0);
86 } else {
87 break;
88 }
89 }
90 } while(rightPrimary < variableTop &&
91 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
92 }
93 } while(rightPrimary == 0);
94
95 if(leftPrimary != rightPrimary) {
96 // Return the primary difference, with script reordering.
97 if(settings.hasReordering()) {
98 leftPrimary = settings.reorder(leftPrimary);
99 rightPrimary = settings.reorder(rightPrimary);
100 }
101 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
102 }
103 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
104 }
105 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
106
107 // Compare the buffered secondary & tertiary weights.
108 // We might skip the secondary level but continue with the case level
109 // which is turned on separately.
110 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
111 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
112 int32_t leftIndex = 0;
113 int32_t rightIndex = 0;
114 for(;;) {
115 uint32_t leftSecondary;
116 do {
117 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
118 } while(leftSecondary == 0);
119
120 uint32_t rightSecondary;
121 do {
122 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
123 } while(rightSecondary == 0);
124
125 if(leftSecondary != rightSecondary) {
126 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
127 }
128 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
129 }
130 } else {
131 // The backwards secondary level compares secondary weights backwards
132 // within segments separated by the merge separator (U+FFFE, weight 02).
133 int32_t leftStart = 0;
134 int32_t rightStart = 0;
135 for(;;) {
136 // Find the merge separator or the NO_CE terminator.
137 uint32_t p;
138 int32_t leftLimit = leftStart;
139 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
140 Collation::MERGE_SEPARATOR_PRIMARY ||
141 p == 0) {
142 ++leftLimit;
143 }
144 int32_t rightLimit = rightStart;
145 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
146 Collation::MERGE_SEPARATOR_PRIMARY ||
147 p == 0) {
148 ++rightLimit;
149 }
150
151 // Compare the segments.
152 int32_t leftIndex = leftLimit;
153 int32_t rightIndex = rightLimit;
154 for(;;) {
155 int32_t leftSecondary = 0;
156 while(leftSecondary == 0 && leftIndex > leftStart) {
157 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
158 }
159
160 int32_t rightSecondary = 0;
161 while(rightSecondary == 0 && rightIndex > rightStart) {
162 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
163 }
164
165 if(leftSecondary != rightSecondary) {
166 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
167 }
168 if(leftSecondary == 0) { break; }
169 }
170
171 // Did we reach the end of either string?
172 // Both strings have the same number of merge separators,
173 // or else there would have been a primary-level difference.
174 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
175 if(p == Collation::NO_CE_PRIMARY) { break; }
176 // Skip both merge separators and continue.
177 leftStart = leftLimit + 1;
178 rightStart = rightLimit + 1;
179 }
180 }
181 }
182
183 if((options & CollationSettings::CASE_LEVEL) != 0) {
184 int32_t strength = CollationSettings::getStrength(options);
185 int32_t leftIndex = 0;
186 int32_t rightIndex = 0;
187 for(;;) {
188 uint32_t leftCase, leftLower32, rightCase;
189 if(strength == UCOL_PRIMARY) {
190 // Primary+caseLevel: Ignore case level weights of primary ignorables.
191 // Otherwise we would get a-umlaut > a
192 // which is not desirable for accent-insensitive sorting.
193 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
194 // with only primary weights.
195 int64_t ce;
196 do {
197 ce = left.getCE(leftIndex++);
198 leftCase = (uint32_t)ce;
199 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
200 leftLower32 = leftCase;
201 leftCase &= 0xc000;
202
203 do {
204 ce = right.getCE(rightIndex++);
205 rightCase = (uint32_t)ce;
206 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
207 rightCase &= 0xc000;
208 } else {
209 // Secondary+caseLevel: By analogy with the above,
210 // ignore case level weights of secondary ignorables.
211 //
212 // Note: A tertiary CE has uppercase case bits (0.0.ut)
213 // to keep tertiary+caseFirst well-formed.
214 //
215 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
216 // Otherwise a tertiary CE's uppercase would be no greater than
217 // a primary/secondary CE's uppercase.
218 // (See UCA well-formedness condition 2.)
219 // We could construct a special case weight higher than uppercase,
220 // but it's simpler to always ignore case weights of secondary ignorables,
221 // turning 0.0.ut into 0.0.0.t.
222 // (See LDML Collation, Case Parameters.)
223 do {
224 leftCase = (uint32_t)left.getCE(leftIndex++);
225 } while(leftCase <= 0xffff);
226 leftLower32 = leftCase;
227 leftCase &= 0xc000;
228
229 do {
230 rightCase = (uint32_t)right.getCE(rightIndex++);
231 } while(rightCase <= 0xffff);
232 rightCase &= 0xc000;
233 }
234
235 // No need to handle NO_CE and MERGE_SEPARATOR specially:
236 // There is one case weight for each previous-level weight,
237 // so level length differences were handled there.
238 if(leftCase != rightCase) {
239 if((options & CollationSettings::UPPER_FIRST) == 0) {
240 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
241 } else {
242 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
243 }
244 }
245 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
246 }
247 }
248 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
249
250 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
251
252 int32_t leftIndex = 0;
253 int32_t rightIndex = 0;
254 uint32_t anyQuaternaries = 0;
255 for(;;) {
256 uint32_t leftLower32, leftTertiary;
257 do {
258 leftLower32 = (uint32_t)left.getCE(leftIndex++);
259 anyQuaternaries |= leftLower32;
260 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
261 (leftLower32 & 0xc0c0) == 0);
262 leftTertiary = leftLower32 & tertiaryMask;
263 } while(leftTertiary == 0);
264
265 uint32_t rightLower32, rightTertiary;
266 do {
267 rightLower32 = (uint32_t)right.getCE(rightIndex++);
268 anyQuaternaries |= rightLower32;
269 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
270 (rightLower32 & 0xc0c0) == 0);
271 rightTertiary = rightLower32 & tertiaryMask;
272 } while(rightTertiary == 0);
273
274 if(leftTertiary != rightTertiary) {
275 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
276 // Pass through NO_CE and keep real tertiary weights larger than that.
277 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
278 // to keep tertiary CEs well-formed.
279 // Their case+tertiary weights must be greater than those of
280 // primary and secondary CEs.
281 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
282 if(leftLower32 > 0xffff) {
283 leftTertiary ^= 0xc000;
284 } else {
285 leftTertiary += 0x4000;
286 }
287 }
288 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
289 if(rightLower32 > 0xffff) {
290 rightTertiary ^= 0xc000;
291 } else {
292 rightTertiary += 0x4000;
293 }
294 }
295 }
296 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
297 }
298 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
299 }
300 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
301
302 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
303 // If there are no "variable" CEs and no non-zero quaternary weights,
304 // then there are no quaternary differences.
305 return UCOL_EQUAL;
306 }
307
308 leftIndex = 0;
309 rightIndex = 0;
310 for(;;) {
311 uint32_t leftQuaternary;
312 do {
313 int64_t ce = left.getCE(leftIndex++);
314 leftQuaternary = (uint32_t)ce & 0xffff;
315 if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
316 // Variable primary or completely ignorable or NO_CE.
317 leftQuaternary = (uint32_t)(ce >> 32);
318 } else {
319 // Regular CE, not tertiary ignorable.
320 // Preserve the quaternary weight in bits 7..6.
321 leftQuaternary |= 0xffffff3f;
322 }
323 } while(leftQuaternary == 0);
324
325 uint32_t rightQuaternary;
326 do {
327 int64_t ce = right.getCE(rightIndex++);
328 rightQuaternary = (uint32_t)ce & 0xffff;
329 if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
330 // Variable primary or completely ignorable or NO_CE.
331 rightQuaternary = (uint32_t)(ce >> 32);
332 } else {
333 // Regular CE, not tertiary ignorable.
334 // Preserve the quaternary weight in bits 7..6.
335 rightQuaternary |= 0xffffff3f;
336 }
337 } while(rightQuaternary == 0);
338
339 if(leftQuaternary != rightQuaternary) {
340 // Return the difference, with script reordering.
341 if(settings.hasReordering()) {
342 leftQuaternary = settings.reorder(leftQuaternary);
343 rightQuaternary = settings.reorder(rightQuaternary);
344 }
345 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
346 }
347 if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
348 }
349 return UCOL_EQUAL;
350}
351
352U_NAMESPACE_END
353
354#endif // !UCONFIG_NO_COLLATION