]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationcompare.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / i18n / collationcompare.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationcompare.cpp
7 *
8 * created on: 2012feb14 with new and old collation code
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/ucol.h"
17 #include "cmemory.h"
18 #include "collation.h"
19 #include "collationcompare.h"
20 #include "collationiterator.h"
21 #include "collationsettings.h"
22 #include "uassert.h"
23
24 U_NAMESPACE_BEGIN
25
26 UCollationResult
27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
28 const CollationSettings &settings,
29 UErrorCode &errorCode) {
30 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
31
32 int32_t options = settings.options;
33 uint32_t variableTop;
34 if((options & CollationSettings::ALTERNATE_MASK) == 0) {
35 variableTop = 0;
36 } else {
37 // +1 so that we can use "<" and primary ignorables test out early.
38 variableTop = settings.variableTop + 1;
39 }
40 UBool anyVariable = FALSE;
41
42 // Fetch CEs, compare primaries, store secondary & tertiary weights.
43 U_ALIGN_CODE(16);
44 for(;;) {
45 // We fetch CEs until we get a non-ignorable primary or reach the end.
46 uint32_t leftPrimary;
47 do {
48 int64_t ce = left.nextCE(errorCode);
49 leftPrimary = (uint32_t)(ce >> 32);
50 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
51 // Variable CE, shift it to quaternary level.
52 // Ignore all following primary ignorables, and shift further variable CEs.
53 anyVariable = TRUE;
54 do {
55 // Store only the primary of the variable CE.
56 left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
57 for(;;) {
58 ce = left.nextCE(errorCode);
59 leftPrimary = (uint32_t)(ce >> 32);
60 if(leftPrimary == 0) {
61 left.setCurrentCE(0);
62 } else {
63 break;
64 }
65 }
66 } while(leftPrimary < variableTop &&
67 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
68 }
69 } while(leftPrimary == 0);
70
71 uint32_t rightPrimary;
72 do {
73 int64_t ce = right.nextCE(errorCode);
74 rightPrimary = (uint32_t)(ce >> 32);
75 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
76 // Variable CE, shift it to quaternary level.
77 // Ignore all following primary ignorables, and shift further variable CEs.
78 anyVariable = TRUE;
79 do {
80 // Store only the primary of the variable CE.
81 right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
82 for(;;) {
83 ce = right.nextCE(errorCode);
84 rightPrimary = (uint32_t)(ce >> 32);
85 if(rightPrimary == 0) {
86 right.setCurrentCE(0);
87 } else {
88 break;
89 }
90 }
91 } while(rightPrimary < variableTop &&
92 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
93 }
94 } while(rightPrimary == 0);
95
96 if(leftPrimary != rightPrimary) {
97 // Return the primary difference, with script reordering.
98 const uint8_t *reorderTable = settings.reorderTable;
99 if (reorderTable != NULL) {
100 leftPrimary = Collation::reorder(reorderTable, leftPrimary);
101 rightPrimary = Collation::reorder(reorderTable, rightPrimary);
102 }
103 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
104 }
105 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
106 }
107 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
108
109 // Compare the buffered secondary & tertiary weights.
110 // We might skip the secondary level but continue with the case level
111 // which is turned on separately.
112 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
113 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
114 int32_t leftIndex = 0;
115 int32_t rightIndex = 0;
116 for(;;) {
117 uint32_t leftSecondary;
118 do {
119 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
120 } while(leftSecondary == 0);
121
122 uint32_t rightSecondary;
123 do {
124 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
125 } while(rightSecondary == 0);
126
127 if(leftSecondary != rightSecondary) {
128 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
129 }
130 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
131 }
132 } else {
133 // The backwards secondary level compares secondary weights backwards
134 // within segments separated by the merge separator (U+FFFE, weight 02).
135 int32_t leftStart = 0;
136 int32_t rightStart = 0;
137 for(;;) {
138 // Find the merge separator or the NO_CE terminator.
139 int32_t leftLimit = leftStart;
140 uint32_t leftLower32;
141 while((leftLower32 = (uint32_t)left.getCE(leftLimit)) >
142 Collation::MERGE_SEPARATOR_LOWER32 ||
143 leftLower32 == 0) {
144 ++leftLimit;
145 }
146 int32_t rightLimit = rightStart;
147 uint32_t rightLower32;
148 while((rightLower32 = (uint32_t)right.getCE(rightLimit)) >
149 Collation::MERGE_SEPARATOR_LOWER32 ||
150 rightLower32 == 0) {
151 ++rightLimit;
152 }
153
154 // Compare the segments.
155 int32_t leftIndex = leftLimit;
156 int32_t rightIndex = rightLimit;
157 for(;;) {
158 int32_t leftSecondary = 0;
159 while(leftSecondary == 0 && leftIndex > leftStart) {
160 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
161 }
162
163 int32_t rightSecondary = 0;
164 while(rightSecondary == 0 && rightIndex > rightStart) {
165 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
166 }
167
168 if(leftSecondary != rightSecondary) {
169 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
170 }
171 if(leftSecondary == 0) { break; }
172 }
173
174 // Did we reach the end of either string?
175 // Both strings have the same number of merge separators,
176 // or else there would have been a primary-level difference.
177 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
178 if(left.getCE(leftLimit) == Collation::NO_CE) { break; }
179 // Skip both merge separators and continue.
180 leftStart = leftLimit + 1;
181 rightStart = rightLimit + 1;
182 }
183 }
184 }
185
186 if((options & CollationSettings::CASE_LEVEL) != 0) {
187 int32_t strength = CollationSettings::getStrength(options);
188 int32_t leftIndex = 0;
189 int32_t rightIndex = 0;
190 for(;;) {
191 uint32_t leftCase, leftLower32, rightCase;
192 if(strength == UCOL_PRIMARY) {
193 // Primary+caseLevel: Ignore case level weights of primary ignorables.
194 // Otherwise we would get a-umlaut > a
195 // which is not desirable for accent-insensitive sorting.
196 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
197 // with only primary weights.
198 int64_t ce;
199 do {
200 ce = left.getCE(leftIndex++);
201 leftCase = (uint32_t)ce;
202 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
203 leftLower32 = leftCase;
204 leftCase &= 0xc000;
205
206 do {
207 ce = right.getCE(rightIndex++);
208 rightCase = (uint32_t)ce;
209 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
210 rightCase &= 0xc000;
211 } else {
212 // Secondary+caseLevel: By analogy with the above,
213 // ignore case level weights of secondary ignorables.
214 //
215 // Note: A tertiary CE has uppercase case bits (0.0.ut)
216 // to keep tertiary+caseFirst well-formed.
217 //
218 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
219 // Otherwise a tertiary CE's uppercase would be no greater than
220 // a primary/secondary CE's uppercase.
221 // (See UCA well-formedness condition 2.)
222 // We could construct a special case weight higher than uppercase,
223 // but it's simpler to always ignore case weights of secondary ignorables,
224 // turning 0.0.ut into 0.0.0.t.
225 // (See LDML Collation, Case Parameters.)
226 do {
227 leftCase = (uint32_t)left.getCE(leftIndex++);
228 } while(leftCase <= 0xffff);
229 leftLower32 = leftCase;
230 leftCase &= 0xc000;
231
232 do {
233 rightCase = (uint32_t)right.getCE(rightIndex++);
234 } while(rightCase <= 0xffff);
235 rightCase &= 0xc000;
236 }
237
238 // No need to handle NO_CE and MERGE_SEPARATOR specially:
239 // There is one case weight for each previous-level weight,
240 // so level length differences were handled there.
241 if(leftCase != rightCase) {
242 if((options & CollationSettings::UPPER_FIRST) == 0) {
243 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
244 } else {
245 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
246 }
247 }
248 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
249 }
250 }
251 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
252
253 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
254
255 int32_t leftIndex = 0;
256 int32_t rightIndex = 0;
257 uint32_t anyQuaternaries = 0;
258 for(;;) {
259 uint32_t leftLower32, leftTertiary;
260 do {
261 leftLower32 = (uint32_t)left.getCE(leftIndex++);
262 anyQuaternaries |= leftLower32;
263 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
264 (leftLower32 & 0xc0c0) == 0);
265 leftTertiary = leftLower32 & tertiaryMask;
266 } while(leftTertiary == 0);
267
268 uint32_t rightLower32, rightTertiary;
269 do {
270 rightLower32 = (uint32_t)right.getCE(rightIndex++);
271 anyQuaternaries |= rightLower32;
272 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
273 (rightLower32 & 0xc0c0) == 0);
274 rightTertiary = rightLower32 & tertiaryMask;
275 } while(rightTertiary == 0);
276
277 if(leftTertiary != rightTertiary) {
278 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
279 // Pass through NO_CE and MERGE_SEPARATOR
280 // and keep real tertiary weights larger than the MERGE_SEPARATOR.
281 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
282 // to keep tertiary CEs well-formed.
283 // Their case+tertiary weights must be greater than those of
284 // primary and secondary CEs.
285 if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
286 if(leftLower32 > 0xffff) {
287 leftTertiary ^= 0xc000;
288 } else {
289 leftTertiary += 0x4000;
290 }
291 }
292 if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
293 if(rightLower32 > 0xffff) {
294 rightTertiary ^= 0xc000;
295 } else {
296 rightTertiary += 0x4000;
297 }
298 }
299 }
300 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
301 }
302 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
303 }
304 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
305
306 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
307 // If there are no "variable" CEs and no non-zero quaternary weights,
308 // then there are no quaternary differences.
309 return UCOL_EQUAL;
310 }
311
312 leftIndex = 0;
313 rightIndex = 0;
314 for(;;) {
315 uint32_t leftQuaternary;
316 do {
317 int64_t ce = left.getCE(leftIndex++);
318 leftQuaternary = (uint32_t)ce & 0xffff;
319 if(leftQuaternary == 0) {
320 // Variable primary or completely ignorable.
321 leftQuaternary = (uint32_t)(ce >> 32);
322 } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
323 // Leave NO_CE or MERGE_SEPARATOR as is.
324 } else {
325 // Regular CE, not tertiary ignorable.
326 // Preserve the quaternary weight in bits 7..6.
327 leftQuaternary |= 0xffffff3f;
328 }
329 } while(leftQuaternary == 0);
330
331 uint32_t rightQuaternary;
332 do {
333 int64_t ce = right.getCE(rightIndex++);
334 rightQuaternary = (uint32_t)ce & 0xffff;
335 if(rightQuaternary == 0) {
336 // Variable primary or completely ignorable.
337 rightQuaternary = (uint32_t)(ce >> 32);
338 } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
339 // Leave NO_CE or MERGE_SEPARATOR as is.
340 } else {
341 // Regular CE, not tertiary ignorable.
342 // Preserve the quaternary weight in bits 7..6.
343 rightQuaternary |= 0xffffff3f;
344 }
345 } while(rightQuaternary == 0);
346
347 if(leftQuaternary != rightQuaternary) {
348 // Return the difference, with script reordering.
349 const uint8_t *reorderTable = settings.reorderTable;
350 if (reorderTable != NULL) {
351 leftQuaternary = Collation::reorder(reorderTable, leftQuaternary);
352 rightQuaternary = Collation::reorder(reorderTable, rightQuaternary);
353 }
354 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
355 }
356 if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; }
357 }
358 return UCOL_EQUAL;
359 }
360
361 U_NAMESPACE_END
362
363 #endif // !UCONFIG_NO_COLLATION