]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationcompare.cpp
2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * created on: 2012feb14 with new and old collation code
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucol.h"
18 #include "collation.h"
19 #include "collationcompare.h"
20 #include "collationiterator.h"
21 #include "collationsettings.h"
27 CollationCompare::compareUpToQuaternary(CollationIterator
&left
, CollationIterator
&right
,
28 const CollationSettings
&settings
,
29 UErrorCode
&errorCode
) {
30 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
32 int32_t options
= settings
.options
;
34 if((options
& CollationSettings::ALTERNATE_MASK
) == 0) {
37 // +1 so that we can use "<" and primary ignorables test out early.
38 variableTop
= settings
.variableTop
+ 1;
40 UBool anyVariable
= FALSE
;
42 // Fetch CEs, compare primaries, store secondary & tertiary weights.
45 // We fetch CEs until we get a non-ignorable primary or reach the end.
48 int64_t ce
= left
.nextCE(errorCode
);
49 leftPrimary
= (uint32_t)(ce
>> 32);
50 if(leftPrimary
< variableTop
&& leftPrimary
> Collation::MERGE_SEPARATOR_PRIMARY
) {
51 // Variable CE, shift it to quaternary level.
52 // Ignore all following primary ignorables, and shift further variable CEs.
55 // Store only the primary of the variable CE.
56 left
.setCurrentCE(ce
& INT64_C(0xffffffff00000000));
58 ce
= left
.nextCE(errorCode
);
59 leftPrimary
= (uint32_t)(ce
>> 32);
60 if(leftPrimary
== 0) {
66 } while(leftPrimary
< variableTop
&&
67 leftPrimary
> Collation::MERGE_SEPARATOR_PRIMARY
);
69 } while(leftPrimary
== 0);
71 uint32_t rightPrimary
;
73 int64_t ce
= right
.nextCE(errorCode
);
74 rightPrimary
= (uint32_t)(ce
>> 32);
75 if(rightPrimary
< variableTop
&& rightPrimary
> Collation::MERGE_SEPARATOR_PRIMARY
) {
76 // Variable CE, shift it to quaternary level.
77 // Ignore all following primary ignorables, and shift further variable CEs.
80 // Store only the primary of the variable CE.
81 right
.setCurrentCE(ce
& INT64_C(0xffffffff00000000));
83 ce
= right
.nextCE(errorCode
);
84 rightPrimary
= (uint32_t)(ce
>> 32);
85 if(rightPrimary
== 0) {
86 right
.setCurrentCE(0);
91 } while(rightPrimary
< variableTop
&&
92 rightPrimary
> Collation::MERGE_SEPARATOR_PRIMARY
);
94 } while(rightPrimary
== 0);
96 if(leftPrimary
!= rightPrimary
) {
97 // Return the primary difference, with script reordering.
98 const uint8_t *reorderTable
= settings
.reorderTable
;
99 if (reorderTable
!= NULL
) {
100 leftPrimary
= Collation::reorder(reorderTable
, leftPrimary
);
101 rightPrimary
= Collation::reorder(reorderTable
, rightPrimary
);
103 return (leftPrimary
< rightPrimary
) ? UCOL_LESS
: UCOL_GREATER
;
105 if(leftPrimary
== Collation::NO_CE_PRIMARY
) { break; }
107 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
109 // Compare the buffered secondary & tertiary weights.
110 // We might skip the secondary level but continue with the case level
111 // which is turned on separately.
112 if(CollationSettings::getStrength(options
) >= UCOL_SECONDARY
) {
113 if((options
& CollationSettings::BACKWARD_SECONDARY
) == 0) {
114 int32_t leftIndex
= 0;
115 int32_t rightIndex
= 0;
117 uint32_t leftSecondary
;
119 leftSecondary
= ((uint32_t)left
.getCE(leftIndex
++)) >> 16;
120 } while(leftSecondary
== 0);
122 uint32_t rightSecondary
;
124 rightSecondary
= ((uint32_t)right
.getCE(rightIndex
++)) >> 16;
125 } while(rightSecondary
== 0);
127 if(leftSecondary
!= rightSecondary
) {
128 return (leftSecondary
< rightSecondary
) ? UCOL_LESS
: UCOL_GREATER
;
130 if(leftSecondary
== Collation::NO_CE_WEIGHT16
) { break; }
133 // The backwards secondary level compares secondary weights backwards
134 // within segments separated by the merge separator (U+FFFE, weight 02).
135 int32_t leftStart
= 0;
136 int32_t rightStart
= 0;
138 // Find the merge separator or the NO_CE terminator.
139 int32_t leftLimit
= leftStart
;
140 uint32_t leftLower32
;
141 while((leftLower32
= (uint32_t)left
.getCE(leftLimit
)) >
142 Collation::MERGE_SEPARATOR_LOWER32
||
146 int32_t rightLimit
= rightStart
;
147 uint32_t rightLower32
;
148 while((rightLower32
= (uint32_t)right
.getCE(rightLimit
)) >
149 Collation::MERGE_SEPARATOR_LOWER32
||
154 // Compare the segments.
155 int32_t leftIndex
= leftLimit
;
156 int32_t rightIndex
= rightLimit
;
158 int32_t leftSecondary
= 0;
159 while(leftSecondary
== 0 && leftIndex
> leftStart
) {
160 leftSecondary
= ((uint32_t)left
.getCE(--leftIndex
)) >> 16;
163 int32_t rightSecondary
= 0;
164 while(rightSecondary
== 0 && rightIndex
> rightStart
) {
165 rightSecondary
= ((uint32_t)right
.getCE(--rightIndex
)) >> 16;
168 if(leftSecondary
!= rightSecondary
) {
169 return (leftSecondary
< rightSecondary
) ? UCOL_LESS
: UCOL_GREATER
;
171 if(leftSecondary
== 0) { break; }
174 // Did we reach the end of either string?
175 // Both strings have the same number of merge separators,
176 // or else there would have been a primary-level difference.
177 U_ASSERT(left
.getCE(leftLimit
) == right
.getCE(rightLimit
));
178 if(left
.getCE(leftLimit
) == Collation::NO_CE
) { break; }
179 // Skip both merge separators and continue.
180 leftStart
= leftLimit
+ 1;
181 rightStart
= rightLimit
+ 1;
186 if((options
& CollationSettings::CASE_LEVEL
) != 0) {
187 int32_t strength
= CollationSettings::getStrength(options
);
188 int32_t leftIndex
= 0;
189 int32_t rightIndex
= 0;
191 uint32_t leftCase
, leftLower32
, rightCase
;
192 if(strength
== UCOL_PRIMARY
) {
193 // Primary+caseLevel: Ignore case level weights of primary ignorables.
194 // Otherwise we would get a-umlaut > a
195 // which is not desirable for accent-insensitive sorting.
196 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
197 // with only primary weights.
200 ce
= left
.getCE(leftIndex
++);
201 leftCase
= (uint32_t)ce
;
202 } while((uint32_t)(ce
>> 32) == 0 || leftCase
== 0);
203 leftLower32
= leftCase
;
207 ce
= right
.getCE(rightIndex
++);
208 rightCase
= (uint32_t)ce
;
209 } while((uint32_t)(ce
>> 32) == 0 || rightCase
== 0);
212 // Secondary+caseLevel: By analogy with the above,
213 // ignore case level weights of secondary ignorables.
215 // Note: A tertiary CE has uppercase case bits (0.0.ut)
216 // to keep tertiary+caseFirst well-formed.
218 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
219 // Otherwise a tertiary CE's uppercase would be no greater than
220 // a primary/secondary CE's uppercase.
221 // (See UCA well-formedness condition 2.)
222 // We could construct a special case weight higher than uppercase,
223 // but it's simpler to always ignore case weights of secondary ignorables,
224 // turning 0.0.ut into 0.0.0.t.
225 // (See LDML Collation, Case Parameters.)
227 leftCase
= (uint32_t)left
.getCE(leftIndex
++);
228 } while(leftCase
<= 0xffff);
229 leftLower32
= leftCase
;
233 rightCase
= (uint32_t)right
.getCE(rightIndex
++);
234 } while(rightCase
<= 0xffff);
238 // No need to handle NO_CE and MERGE_SEPARATOR specially:
239 // There is one case weight for each previous-level weight,
240 // so level length differences were handled there.
241 if(leftCase
!= rightCase
) {
242 if((options
& CollationSettings::UPPER_FIRST
) == 0) {
243 return (leftCase
< rightCase
) ? UCOL_LESS
: UCOL_GREATER
;
245 return (leftCase
< rightCase
) ? UCOL_GREATER
: UCOL_LESS
;
248 if((leftLower32
>> 16) == Collation::NO_CE_WEIGHT16
) { break; }
251 if(CollationSettings::getStrength(options
) <= UCOL_SECONDARY
) { return UCOL_EQUAL
; }
253 uint32_t tertiaryMask
= CollationSettings::getTertiaryMask(options
);
255 int32_t leftIndex
= 0;
256 int32_t rightIndex
= 0;
257 uint32_t anyQuaternaries
= 0;
259 uint32_t leftLower32
, leftTertiary
;
261 leftLower32
= (uint32_t)left
.getCE(leftIndex
++);
262 anyQuaternaries
|= leftLower32
;
263 U_ASSERT((leftLower32
& Collation::ONLY_TERTIARY_MASK
) != 0 ||
264 (leftLower32
& 0xc0c0) == 0);
265 leftTertiary
= leftLower32
& tertiaryMask
;
266 } while(leftTertiary
== 0);
268 uint32_t rightLower32
, rightTertiary
;
270 rightLower32
= (uint32_t)right
.getCE(rightIndex
++);
271 anyQuaternaries
|= rightLower32
;
272 U_ASSERT((rightLower32
& Collation::ONLY_TERTIARY_MASK
) != 0 ||
273 (rightLower32
& 0xc0c0) == 0);
274 rightTertiary
= rightLower32
& tertiaryMask
;
275 } while(rightTertiary
== 0);
277 if(leftTertiary
!= rightTertiary
) {
278 if(CollationSettings::sortsTertiaryUpperCaseFirst(options
)) {
279 // Pass through NO_CE and MERGE_SEPARATOR
280 // and keep real tertiary weights larger than the MERGE_SEPARATOR.
281 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
282 // to keep tertiary CEs well-formed.
283 // Their case+tertiary weights must be greater than those of
284 // primary and secondary CEs.
285 if(leftTertiary
> Collation::MERGE_SEPARATOR_WEIGHT16
) {
286 if(leftLower32
> 0xffff) {
287 leftTertiary
^= 0xc000;
289 leftTertiary
+= 0x4000;
292 if(rightTertiary
> Collation::MERGE_SEPARATOR_WEIGHT16
) {
293 if(rightLower32
> 0xffff) {
294 rightTertiary
^= 0xc000;
296 rightTertiary
+= 0x4000;
300 return (leftTertiary
< rightTertiary
) ? UCOL_LESS
: UCOL_GREATER
;
302 if(leftTertiary
== Collation::NO_CE_WEIGHT16
) { break; }
304 if(CollationSettings::getStrength(options
) <= UCOL_TERTIARY
) { return UCOL_EQUAL
; }
306 if(!anyVariable
&& (anyQuaternaries
& 0xc0) == 0) {
307 // If there are no "variable" CEs and no non-zero quaternary weights,
308 // then there are no quaternary differences.
315 uint32_t leftQuaternary
;
317 int64_t ce
= left
.getCE(leftIndex
++);
318 leftQuaternary
= (uint32_t)ce
& 0xffff;
319 if(leftQuaternary
== 0) {
320 // Variable primary or completely ignorable.
321 leftQuaternary
= (uint32_t)(ce
>> 32);
322 } else if(leftQuaternary
<= Collation::MERGE_SEPARATOR_WEIGHT16
) {
323 // Leave NO_CE or MERGE_SEPARATOR as is.
325 // Regular CE, not tertiary ignorable.
326 // Preserve the quaternary weight in bits 7..6.
327 leftQuaternary
|= 0xffffff3f;
329 } while(leftQuaternary
== 0);
331 uint32_t rightQuaternary
;
333 int64_t ce
= right
.getCE(rightIndex
++);
334 rightQuaternary
= (uint32_t)ce
& 0xffff;
335 if(rightQuaternary
== 0) {
336 // Variable primary or completely ignorable.
337 rightQuaternary
= (uint32_t)(ce
>> 32);
338 } else if(rightQuaternary
<= Collation::MERGE_SEPARATOR_WEIGHT16
) {
339 // Leave NO_CE or MERGE_SEPARATOR as is.
341 // Regular CE, not tertiary ignorable.
342 // Preserve the quaternary weight in bits 7..6.
343 rightQuaternary
|= 0xffffff3f;
345 } while(rightQuaternary
== 0);
347 if(leftQuaternary
!= rightQuaternary
) {
348 // Return the difference, with script reordering.
349 const uint8_t *reorderTable
= settings
.reorderTable
;
350 if (reorderTable
!= NULL
) {
351 leftQuaternary
= Collation::reorder(reorderTable
, leftQuaternary
);
352 rightQuaternary
= Collation::reorder(reorderTable
, rightQuaternary
);
354 return (leftQuaternary
< rightQuaternary
) ? UCOL_LESS
: UCOL_GREATER
;
356 if(leftQuaternary
== Collation::NO_CE_WEIGHT16
) { break; }
363 #endif // !UCONFIG_NO_COLLATION