/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationfastlatin.cpp
*
* created on: 2013aug18
* created by: Markus W. Scherer
*/
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucol.h"
17 #include "collationdata.h"
18 #include "collationfastlatin.h"
19 #include "collationsettings.h"
25 CollationFastLatin::getOptions(const CollationData
*data
, const CollationSettings
&settings
,
26 uint16_t *primaries
, int32_t capacity
) {
27 const uint16_t *table
= data
->fastLatinTable
;
28 if(table
== NULL
) { return -1; }
29 U_ASSERT(capacity
== LATIN_LIMIT
);
30 if(capacity
!= LATIN_LIMIT
) { return -1; }
33 if((settings
.options
& CollationSettings::ALTERNATE_MASK
) == 0) {
34 // No mini primaries are variable, set a variableTop just below the
35 // lowest long mini primary.
36 miniVarTop
= MIN_LONG
- 1;
38 int32_t headerLength
= *table
& 0xff;
39 int32_t i
= 1 + settings
.getMaxVariable();
40 if(i
>= headerLength
) {
41 return -1; // variableTop >= digits, should not occur
43 miniVarTop
= table
[i
];
46 UBool digitsAreReordered
= FALSE
;
47 if(settings
.hasReordering()) {
48 uint32_t prevStart
= 0;
49 uint32_t beforeDigitStart
= 0;
50 uint32_t digitStart
= 0;
51 uint32_t afterDigitStart
= 0;
52 for(int32_t group
= UCOL_REORDER_CODE_FIRST
;
53 group
< UCOL_REORDER_CODE_FIRST
+ CollationData::MAX_NUM_SPECIAL_REORDER_CODES
;
55 uint32_t start
= data
->getFirstPrimaryForGroup(group
);
56 start
= settings
.reorder(start
);
57 if(group
== UCOL_REORDER_CODE_DIGIT
) {
58 beforeDigitStart
= prevStart
;
60 } else if(start
!= 0) {
61 if(start
< prevStart
) {
62 // The permutation affects the groups up to Latin.
65 // In the future, there might be a special group between digits & Latin.
66 if(digitStart
!= 0 && afterDigitStart
== 0 && prevStart
== beforeDigitStart
) {
67 afterDigitStart
= start
;
72 uint32_t latinStart
= data
->getFirstPrimaryForGroup(USCRIPT_LATIN
);
73 latinStart
= settings
.reorder(latinStart
);
74 if(latinStart
< prevStart
) {
77 if(afterDigitStart
== 0) {
78 afterDigitStart
= latinStart
;
80 if(!(beforeDigitStart
< digitStart
&& digitStart
< afterDigitStart
)) {
81 digitsAreReordered
= TRUE
;
85 table
+= (table
[0] & 0xff); // skip the header
86 for(UChar32 c
= 0; c
< LATIN_LIMIT
; ++c
) {
87 uint32_t p
= table
[c
];
89 p
&= SHORT_PRIMARY_MASK
;
90 } else if(p
> miniVarTop
) {
91 p
&= LONG_PRIMARY_MASK
;
95 primaries
[c
] = (uint16_t)p
;
97 if(digitsAreReordered
|| (settings
.options
& CollationSettings::NUMERIC
) != 0) {
98 // Bail out for digits.
99 for(UChar32 c
= 0x30; c
<= 0x39; ++c
) { primaries
[c
] = 0; }
102 // Shift the miniVarTop above other options.
103 return ((int32_t)miniVarTop
<< 16) | settings
.options
;
107 CollationFastLatin::compareUTF16(const uint16_t *table
, const uint16_t *primaries
, int32_t options
,
108 const UChar
*left
, int32_t leftLength
,
109 const UChar
*right
, int32_t rightLength
) {
110 // This is a modified copy of CollationCompare::compareUpToQuaternary(),
111 // optimized for common Latin text.
112 // Keep them in sync!
113 // Keep compareUTF16() and compareUTF8() in sync very closely!
115 U_ASSERT((table
[0] >> 8) == VERSION
);
116 table
+= (table
[0] & 0xff); // skip the header
117 uint32_t variableTop
= (uint32_t)options
>> 16; // see getOptions()
118 options
&= 0xffff; // needed for CollationSettings::getStrength() to work
120 // Check for supported characters, fetch mini CEs, and compare primaries.
121 int32_t leftIndex
= 0, rightIndex
= 0;
123 * Single mini CE or a pair.
124 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
125 * If there is only one, then it is in the lower bits, and the upper bits are 0.
127 uint32_t leftPair
= 0, rightPair
= 0;
129 // We fetch CEs until we get a non-ignorable primary or reach the end.
130 while(leftPair
== 0) {
131 if(leftIndex
== leftLength
) {
135 UChar32 c
= left
[leftIndex
++];
137 leftPair
= primaries
[c
];
138 if(leftPair
!= 0) { break; }
139 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
140 return BAIL_OUT_RESULT
;
143 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
144 leftPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
146 leftPair
= lookup(table
, c
);
148 if(leftPair
>= MIN_SHORT
) {
149 leftPair
&= SHORT_PRIMARY_MASK
;
151 } else if(leftPair
> variableTop
) {
152 leftPair
&= LONG_PRIMARY_MASK
;
155 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
156 if(leftPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
157 leftPair
= getPrimaries(variableTop
, leftPair
);
161 while(rightPair
== 0) {
162 if(rightIndex
== rightLength
) {
166 UChar32 c
= right
[rightIndex
++];
168 rightPair
= primaries
[c
];
169 if(rightPair
!= 0) { break; }
170 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
171 return BAIL_OUT_RESULT
;
173 rightPair
= table
[c
];
174 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
175 rightPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
177 rightPair
= lookup(table
, c
);
179 if(rightPair
>= MIN_SHORT
) {
180 rightPair
&= SHORT_PRIMARY_MASK
;
182 } else if(rightPair
> variableTop
) {
183 rightPair
&= LONG_PRIMARY_MASK
;
186 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
187 if(rightPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
188 rightPair
= getPrimaries(variableTop
, rightPair
);
192 if(leftPair
== rightPair
) {
193 if(leftPair
== EOS
) { break; }
194 leftPair
= rightPair
= 0;
197 uint32_t leftPrimary
= leftPair
& 0xffff;
198 uint32_t rightPrimary
= rightPair
& 0xffff;
199 if(leftPrimary
!= rightPrimary
) {
200 // Return the primary difference.
201 return (leftPrimary
< rightPrimary
) ? UCOL_LESS
: UCOL_GREATER
;
203 if(leftPair
== EOS
) { break; }
207 // In the following, we need to re-fetch each character because we did not buffer the CEs,
208 // but we know that the string is well-formed and
209 // only contains supported characters and mappings.
211 // We might skip the secondary level but continue with the case level
212 // which is turned on separately.
213 if(CollationSettings::getStrength(options
) >= UCOL_SECONDARY
) {
214 leftIndex
= rightIndex
= 0;
215 leftPair
= rightPair
= 0;
217 while(leftPair
== 0) {
218 if(leftIndex
== leftLength
) {
222 UChar32 c
= left
[leftIndex
++];
225 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
226 leftPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
228 leftPair
= lookup(table
, c
);
230 if(leftPair
>= MIN_SHORT
) {
231 leftPair
= getSecondariesFromOneShortCE(leftPair
);
233 } else if(leftPair
> variableTop
) {
234 leftPair
= COMMON_SEC_PLUS_OFFSET
;
237 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
238 leftPair
= getSecondaries(variableTop
, leftPair
);
242 while(rightPair
== 0) {
243 if(rightIndex
== rightLength
) {
247 UChar32 c
= right
[rightIndex
++];
249 rightPair
= table
[c
];
250 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
251 rightPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
253 rightPair
= lookup(table
, c
);
255 if(rightPair
>= MIN_SHORT
) {
256 rightPair
= getSecondariesFromOneShortCE(rightPair
);
258 } else if(rightPair
> variableTop
) {
259 rightPair
= COMMON_SEC_PLUS_OFFSET
;
262 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
263 rightPair
= getSecondaries(variableTop
, rightPair
);
267 if(leftPair
== rightPair
) {
268 if(leftPair
== EOS
) { break; }
269 leftPair
= rightPair
= 0;
272 uint32_t leftSecondary
= leftPair
& 0xffff;
273 uint32_t rightSecondary
= rightPair
& 0xffff;
274 if(leftSecondary
!= rightSecondary
) {
275 if((options
& CollationSettings::BACKWARD_SECONDARY
) != 0) {
276 // Full support for backwards secondary requires backwards contraction matching
277 // and moving backwards between merge separators.
278 return BAIL_OUT_RESULT
;
280 return (leftSecondary
< rightSecondary
) ? UCOL_LESS
: UCOL_GREATER
;
282 if(leftPair
== EOS
) { break; }
288 if((options
& CollationSettings::CASE_LEVEL
) != 0) {
289 UBool strengthIsPrimary
= CollationSettings::getStrength(options
) == UCOL_PRIMARY
;
290 leftIndex
= rightIndex
= 0;
291 leftPair
= rightPair
= 0;
293 while(leftPair
== 0) {
294 if(leftIndex
== leftLength
) {
298 UChar32 c
= left
[leftIndex
++];
299 leftPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
300 if(leftPair
< MIN_LONG
) {
301 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
303 leftPair
= getCases(variableTop
, strengthIsPrimary
, leftPair
);
306 while(rightPair
== 0) {
307 if(rightIndex
== rightLength
) {
311 UChar32 c
= right
[rightIndex
++];
312 rightPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
313 if(rightPair
< MIN_LONG
) {
314 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
316 rightPair
= getCases(variableTop
, strengthIsPrimary
, rightPair
);
319 if(leftPair
== rightPair
) {
320 if(leftPair
== EOS
) { break; }
321 leftPair
= rightPair
= 0;
324 uint32_t leftCase
= leftPair
& 0xffff;
325 uint32_t rightCase
= rightPair
& 0xffff;
326 if(leftCase
!= rightCase
) {
327 if((options
& CollationSettings::UPPER_FIRST
) == 0) {
328 return (leftCase
< rightCase
) ? UCOL_LESS
: UCOL_GREATER
;
330 return (leftCase
< rightCase
) ? UCOL_GREATER
: UCOL_LESS
;
333 if(leftPair
== EOS
) { break; }
338 if(CollationSettings::getStrength(options
) <= UCOL_SECONDARY
) { return UCOL_EQUAL
; }
340 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
341 UBool withCaseBits
= CollationSettings::isTertiaryWithCaseBits(options
);
343 leftIndex
= rightIndex
= 0;
344 leftPair
= rightPair
= 0;
346 while(leftPair
== 0) {
347 if(leftIndex
== leftLength
) {
351 UChar32 c
= left
[leftIndex
++];
352 leftPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
353 if(leftPair
< MIN_LONG
) {
354 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
356 leftPair
= getTertiaries(variableTop
, withCaseBits
, leftPair
);
359 while(rightPair
== 0) {
360 if(rightIndex
== rightLength
) {
364 UChar32 c
= right
[rightIndex
++];
365 rightPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
366 if(rightPair
< MIN_LONG
) {
367 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
369 rightPair
= getTertiaries(variableTop
, withCaseBits
, rightPair
);
372 if(leftPair
== rightPair
) {
373 if(leftPair
== EOS
) { break; }
374 leftPair
= rightPair
= 0;
377 uint32_t leftTertiary
= leftPair
& 0xffff;
378 uint32_t rightTertiary
= rightPair
& 0xffff;
379 if(leftTertiary
!= rightTertiary
) {
380 if(CollationSettings::sortsTertiaryUpperCaseFirst(options
)) {
381 // Pass through EOS and MERGE_WEIGHT
382 // and keep real tertiary weights larger than the MERGE_WEIGHT.
383 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
384 if(leftTertiary
> MERGE_WEIGHT
) {
385 leftTertiary
^= CASE_MASK
;
387 if(rightTertiary
> MERGE_WEIGHT
) {
388 rightTertiary
^= CASE_MASK
;
391 return (leftTertiary
< rightTertiary
) ? UCOL_LESS
: UCOL_GREATER
;
393 if(leftPair
== EOS
) { break; }
397 if(CollationSettings::getStrength(options
) <= UCOL_TERTIARY
) { return UCOL_EQUAL
; }
399 leftIndex
= rightIndex
= 0;
400 leftPair
= rightPair
= 0;
402 while(leftPair
== 0) {
403 if(leftIndex
== leftLength
) {
407 UChar32 c
= left
[leftIndex
++];
408 leftPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
409 if(leftPair
< MIN_LONG
) {
410 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
412 leftPair
= getQuaternaries(variableTop
, leftPair
);
415 while(rightPair
== 0) {
416 if(rightIndex
== rightLength
) {
420 UChar32 c
= right
[rightIndex
++];
421 rightPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
422 if(rightPair
< MIN_LONG
) {
423 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
425 rightPair
= getQuaternaries(variableTop
, rightPair
);
428 if(leftPair
== rightPair
) {
429 if(leftPair
== EOS
) { break; }
430 leftPair
= rightPair
= 0;
433 uint32_t leftQuaternary
= leftPair
& 0xffff;
434 uint32_t rightQuaternary
= rightPair
& 0xffff;
435 if(leftQuaternary
!= rightQuaternary
) {
436 return (leftQuaternary
< rightQuaternary
) ? UCOL_LESS
: UCOL_GREATER
;
438 if(leftPair
== EOS
) { break; }
446 CollationFastLatin::compareUTF8(const uint16_t *table
, const uint16_t *primaries
, int32_t options
,
447 const uint8_t *left
, int32_t leftLength
,
448 const uint8_t *right
, int32_t rightLength
) {
449 // Keep compareUTF16() and compareUTF8() in sync very closely!
451 U_ASSERT((table
[0] >> 8) == VERSION
);
452 table
+= (table
[0] & 0xff); // skip the header
453 uint32_t variableTop
= (uint32_t)options
>> 16; // see RuleBasedCollator::getFastLatinOptions()
454 options
&= 0xffff; // needed for CollationSettings::getStrength() to work
456 // Check for supported characters, fetch mini CEs, and compare primaries.
457 int32_t leftIndex
= 0, rightIndex
= 0;
459 * Single mini CE or a pair.
460 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
461 * If there is only one, then it is in the lower bits, and the upper bits are 0.
463 uint32_t leftPair
= 0, rightPair
= 0;
464 // Note: There is no need to assemble the code point.
465 // We only need to look up the table entry for the character,
466 // and nextPair() looks for whether c==0.
468 // We fetch CEs until we get a non-ignorable primary or reach the end.
469 while(leftPair
== 0) {
470 if(leftIndex
== leftLength
) {
474 UChar32 c
= left
[leftIndex
++];
477 leftPair
= primaries
[c
];
478 if(leftPair
!= 0) { break; }
479 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
480 return BAIL_OUT_RESULT
;
483 } else if(c
<= LATIN_MAX_UTF8_LEAD
&& 0xc2 <= c
&& leftIndex
!= leftLength
&&
484 0x80 <= (t
= left
[leftIndex
]) && t
<= 0xbf) {
486 c
= ((c
- 0xc2) << 6) + t
;
487 leftPair
= primaries
[c
];
488 if(leftPair
!= 0) { break; }
491 leftPair
= lookupUTF8(table
, c
, left
, leftIndex
, leftLength
);
493 if(leftPair
>= MIN_SHORT
) {
494 leftPair
&= SHORT_PRIMARY_MASK
;
496 } else if(leftPair
> variableTop
) {
497 leftPair
&= LONG_PRIMARY_MASK
;
500 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
501 if(leftPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
502 leftPair
= getPrimaries(variableTop
, leftPair
);
506 while(rightPair
== 0) {
507 if(rightIndex
== rightLength
) {
511 UChar32 c
= right
[rightIndex
++];
514 rightPair
= primaries
[c
];
515 if(rightPair
!= 0) { break; }
516 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
517 return BAIL_OUT_RESULT
;
519 rightPair
= table
[c
];
520 } else if(c
<= LATIN_MAX_UTF8_LEAD
&& 0xc2 <= c
&& rightIndex
!= rightLength
&&
521 0x80 <= (t
= right
[rightIndex
]) && t
<= 0xbf) {
523 c
= ((c
- 0xc2) << 6) + t
;
524 rightPair
= primaries
[c
];
525 if(rightPair
!= 0) { break; }
526 rightPair
= table
[c
];
528 rightPair
= lookupUTF8(table
, c
, right
, rightIndex
, rightLength
);
530 if(rightPair
>= MIN_SHORT
) {
531 rightPair
&= SHORT_PRIMARY_MASK
;
533 } else if(rightPair
> variableTop
) {
534 rightPair
&= LONG_PRIMARY_MASK
;
537 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
538 if(rightPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
539 rightPair
= getPrimaries(variableTop
, rightPair
);
543 if(leftPair
== rightPair
) {
544 if(leftPair
== EOS
) { break; }
545 leftPair
= rightPair
= 0;
548 uint32_t leftPrimary
= leftPair
& 0xffff;
549 uint32_t rightPrimary
= rightPair
& 0xffff;
550 if(leftPrimary
!= rightPrimary
) {
551 // Return the primary difference.
552 return (leftPrimary
< rightPrimary
) ? UCOL_LESS
: UCOL_GREATER
;
554 if(leftPair
== EOS
) { break; }
558 // In the following, we need to re-fetch each character because we did not buffer the CEs,
559 // but we know that the string is well-formed and
560 // only contains supported characters and mappings.
562 // We might skip the secondary level but continue with the case level
563 // which is turned on separately.
564 if(CollationSettings::getStrength(options
) >= UCOL_SECONDARY
) {
565 leftIndex
= rightIndex
= 0;
566 leftPair
= rightPair
= 0;
568 while(leftPair
== 0) {
569 if(leftIndex
== leftLength
) {
573 UChar32 c
= left
[leftIndex
++];
576 } else if(c
<= LATIN_MAX_UTF8_LEAD
) {
577 leftPair
= table
[((c
- 0xc2) << 6) + left
[leftIndex
++]];
579 leftPair
= lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
581 if(leftPair
>= MIN_SHORT
) {
582 leftPair
= getSecondariesFromOneShortCE(leftPair
);
584 } else if(leftPair
> variableTop
) {
585 leftPair
= COMMON_SEC_PLUS_OFFSET
;
588 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
589 leftPair
= getSecondaries(variableTop
, leftPair
);
593 while(rightPair
== 0) {
594 if(rightIndex
== rightLength
) {
598 UChar32 c
= right
[rightIndex
++];
600 rightPair
= table
[c
];
601 } else if(c
<= LATIN_MAX_UTF8_LEAD
) {
602 rightPair
= table
[((c
- 0xc2) << 6) + right
[rightIndex
++]];
604 rightPair
= lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
606 if(rightPair
>= MIN_SHORT
) {
607 rightPair
= getSecondariesFromOneShortCE(rightPair
);
609 } else if(rightPair
> variableTop
) {
610 rightPair
= COMMON_SEC_PLUS_OFFSET
;
613 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
614 rightPair
= getSecondaries(variableTop
, rightPair
);
618 if(leftPair
== rightPair
) {
619 if(leftPair
== EOS
) { break; }
620 leftPair
= rightPair
= 0;
623 uint32_t leftSecondary
= leftPair
& 0xffff;
624 uint32_t rightSecondary
= rightPair
& 0xffff;
625 if(leftSecondary
!= rightSecondary
) {
626 if((options
& CollationSettings::BACKWARD_SECONDARY
) != 0) {
627 // Full support for backwards secondary requires backwards contraction matching
628 // and moving backwards between merge separators.
629 return BAIL_OUT_RESULT
;
631 return (leftSecondary
< rightSecondary
) ? UCOL_LESS
: UCOL_GREATER
;
633 if(leftPair
== EOS
) { break; }
639 if((options
& CollationSettings::CASE_LEVEL
) != 0) {
640 UBool strengthIsPrimary
= CollationSettings::getStrength(options
) == UCOL_PRIMARY
;
641 leftIndex
= rightIndex
= 0;
642 leftPair
= rightPair
= 0;
644 while(leftPair
== 0) {
645 if(leftIndex
== leftLength
) {
649 UChar32 c
= left
[leftIndex
++];
650 leftPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
651 if(leftPair
< MIN_LONG
) {
652 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
654 leftPair
= getCases(variableTop
, strengthIsPrimary
, leftPair
);
657 while(rightPair
== 0) {
658 if(rightIndex
== rightLength
) {
662 UChar32 c
= right
[rightIndex
++];
663 rightPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
664 if(rightPair
< MIN_LONG
) {
665 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
667 rightPair
= getCases(variableTop
, strengthIsPrimary
, rightPair
);
670 if(leftPair
== rightPair
) {
671 if(leftPair
== EOS
) { break; }
672 leftPair
= rightPair
= 0;
675 uint32_t leftCase
= leftPair
& 0xffff;
676 uint32_t rightCase
= rightPair
& 0xffff;
677 if(leftCase
!= rightCase
) {
678 if((options
& CollationSettings::UPPER_FIRST
) == 0) {
679 return (leftCase
< rightCase
) ? UCOL_LESS
: UCOL_GREATER
;
681 return (leftCase
< rightCase
) ? UCOL_GREATER
: UCOL_LESS
;
684 if(leftPair
== EOS
) { break; }
689 if(CollationSettings::getStrength(options
) <= UCOL_SECONDARY
) { return UCOL_EQUAL
; }
691 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
692 UBool withCaseBits
= CollationSettings::isTertiaryWithCaseBits(options
);
694 leftIndex
= rightIndex
= 0;
695 leftPair
= rightPair
= 0;
697 while(leftPair
== 0) {
698 if(leftIndex
== leftLength
) {
702 UChar32 c
= left
[leftIndex
++];
703 leftPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
704 if(leftPair
< MIN_LONG
) {
705 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
707 leftPair
= getTertiaries(variableTop
, withCaseBits
, leftPair
);
710 while(rightPair
== 0) {
711 if(rightIndex
== rightLength
) {
715 UChar32 c
= right
[rightIndex
++];
716 rightPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
717 if(rightPair
< MIN_LONG
) {
718 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
720 rightPair
= getTertiaries(variableTop
, withCaseBits
, rightPair
);
723 if(leftPair
== rightPair
) {
724 if(leftPair
== EOS
) { break; }
725 leftPair
= rightPair
= 0;
728 uint32_t leftTertiary
= leftPair
& 0xffff;
729 uint32_t rightTertiary
= rightPair
& 0xffff;
730 if(leftTertiary
!= rightTertiary
) {
731 if(CollationSettings::sortsTertiaryUpperCaseFirst(options
)) {
732 // Pass through EOS and MERGE_WEIGHT
733 // and keep real tertiary weights larger than the MERGE_WEIGHT.
734 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
735 if(leftTertiary
> MERGE_WEIGHT
) {
736 leftTertiary
^= CASE_MASK
;
738 if(rightTertiary
> MERGE_WEIGHT
) {
739 rightTertiary
^= CASE_MASK
;
742 return (leftTertiary
< rightTertiary
) ? UCOL_LESS
: UCOL_GREATER
;
744 if(leftPair
== EOS
) { break; }
748 if(CollationSettings::getStrength(options
) <= UCOL_TERTIARY
) { return UCOL_EQUAL
; }
750 leftIndex
= rightIndex
= 0;
751 leftPair
= rightPair
= 0;
753 while(leftPair
== 0) {
754 if(leftIndex
== leftLength
) {
758 UChar32 c
= left
[leftIndex
++];
759 leftPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
760 if(leftPair
< MIN_LONG
) {
761 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
763 leftPair
= getQuaternaries(variableTop
, leftPair
);
766 while(rightPair
== 0) {
767 if(rightIndex
== rightLength
) {
771 UChar32 c
= right
[rightIndex
++];
772 rightPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
773 if(rightPair
< MIN_LONG
) {
774 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
776 rightPair
= getQuaternaries(variableTop
, rightPair
);
779 if(leftPair
== rightPair
) {
780 if(leftPair
== EOS
) { break; }
781 leftPair
= rightPair
= 0;
784 uint32_t leftQuaternary
= leftPair
& 0xffff;
785 uint32_t rightQuaternary
= rightPair
& 0xffff;
786 if(leftQuaternary
!= rightQuaternary
) {
787 return (leftQuaternary
< rightQuaternary
) ? UCOL_LESS
: UCOL_GREATER
;
789 if(leftPair
== EOS
) { break; }
797 CollationFastLatin::lookup(const uint16_t *table
, UChar32 c
) {
798 U_ASSERT(c
> LATIN_MAX
);
799 if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
800 return table
[c
- PUNCT_START
+ LATIN_LIMIT
];
801 } else if(c
== 0xfffe) {
803 } else if(c
== 0xffff) {
804 return MAX_SHORT
| COMMON_SEC
| LOWER_CASE
| COMMON_TER
;
811 CollationFastLatin::lookupUTF8(const uint16_t *table
, UChar32 c
,
812 const uint8_t *s8
, int32_t &sIndex
, int32_t sLength
) {
813 // The caller handled ASCII and valid/supported Latin.
815 int32_t i2
= sIndex
+ 1;
816 if(i2
< sLength
|| sLength
< 0) {
817 uint8_t t1
= s8
[sIndex
];
820 if(c
== 0xe2 && t1
== 0x80 && 0x80 <= t2
&& t2
<= 0xbf) {
821 return table
[(LATIN_LIMIT
- 0x80) + t2
]; // 2000..203F -> 0180..01BF
822 } else if(c
== 0xef && t1
== 0xbf) {
824 return MERGE_WEIGHT
; // U+FFFE
825 } else if(t2
== 0xbf) {
826 return MAX_SHORT
| COMMON_SEC
| LOWER_CASE
| COMMON_TER
; // U+FFFF
834 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table
, UChar32 c
,
835 const uint8_t *s8
, int32_t &sIndex
) {
836 // The caller handled ASCII.
837 // The string is well-formed and contains only supported characters.
839 if(c
<= LATIN_MAX_UTF8_LEAD
) {
840 return table
[((c
- 0xc2) << 6) + s8
[sIndex
++]]; // 0080..017F
842 uint8_t t2
= s8
[sIndex
+ 1];
845 return table
[(LATIN_LIMIT
- 0x80) + t2
]; // 2000..203F -> 0180..01BF
846 } else if(t2
== 0xbe) {
847 return MERGE_WEIGHT
; // U+FFFE
849 return MAX_SHORT
| COMMON_SEC
| LOWER_CASE
| COMMON_TER
; // U+FFFF
854 CollationFastLatin::nextPair(const uint16_t *table
, UChar32 c
, uint32_t ce
,
855 const UChar
*s16
, const uint8_t *s8
, int32_t &sIndex
, int32_t &sLength
) {
856 if(ce
>= MIN_LONG
|| ce
< CONTRACTION
) {
857 return ce
; // simple or special mini CE
858 } else if(ce
>= EXPANSION
) {
859 int32_t index
= NUM_FAST_CHARS
+ (ce
& INDEX_MASK
);
860 return ((uint32_t)table
[index
+ 1] << 16) | table
[index
];
861 } else /* ce >= CONTRACTION */ {
862 if(c
== 0 && sLength
< 0) {
863 sLength
= sIndex
- 1;
866 // Contraction list: Default mapping followed by
867 // 0 or more single-character contraction suffix mappings.
868 int32_t index
= NUM_FAST_CHARS
+ (ce
& INDEX_MASK
);
869 if(sIndex
!= sLength
) {
870 // Read the next character.
872 int32_t nextIndex
= sIndex
;
874 c2
= s16
[nextIndex
++];
876 if(PUNCT_START
<= c2
&& c2
< PUNCT_LIMIT
) {
877 c2
= c2
- PUNCT_START
+ LATIN_LIMIT
; // 2000..203F -> 0180..01BF
878 } else if(c2
== 0xfffe || c2
== 0xffff) {
879 c2
= -1; // U+FFFE & U+FFFF cannot occur in contractions.
885 c2
= s8
[nextIndex
++];
888 if(c2
<= 0xc5 && 0xc2 <= c2
&& nextIndex
!= sLength
&&
889 0x80 <= (t
= s8
[nextIndex
]) && t
<= 0xbf) {
890 c2
= ((c2
- 0xc2) << 6) + t
; // 0080..017F
893 int32_t i2
= nextIndex
+ 1;
894 if(i2
< sLength
|| sLength
< 0) {
895 if(c2
== 0xe2 && s8
[nextIndex
] == 0x80 &&
896 0x80 <= (t
= s8
[i2
]) && t
<= 0xbf) {
897 c2
= (LATIN_LIMIT
- 0x80) + t
; // 2000..203F -> 0180..01BF
898 } else if(c2
== 0xef && s8
[nextIndex
] == 0xbf &&
899 ((t
= s8
[i2
]) == 0xbe || t
== 0xbf)) {
900 c2
= -1; // U+FFFE & U+FFFF cannot occur in contractions.
911 if(c2
== 0 && sLength
< 0) {
915 // Look for the next character in the contraction suffix list,
916 // which is in ascending order of single suffix characters.
918 int32_t head
= table
[i
]; // first skip the default mapping
921 i
+= head
>> CONTR_LENGTH_SHIFT
;
923 x
= head
& CONTR_CHAR_MASK
;
930 // Return the CE or CEs for the default or contraction mapping.
931 int32_t length
= table
[index
] >> CONTR_LENGTH_SHIFT
;
935 ce
= table
[index
+ 1];
939 return ((uint32_t)table
[index
+ 2] << 16) | ce
;
945 CollationFastLatin::getSecondaries(uint32_t variableTop
, uint32_t pair
) {
948 if(pair
>= MIN_SHORT
) {
949 pair
= getSecondariesFromOneShortCE(pair
);
950 } else if(pair
> variableTop
) {
951 pair
= COMMON_SEC_PLUS_OFFSET
;
952 } else if(pair
>= MIN_LONG
) {
953 pair
= 0; // variable
955 // else special mini CE
957 uint32_t ce
= pair
& 0xffff;
958 if(ce
>= MIN_SHORT
) {
959 pair
= (pair
& TWO_SECONDARIES_MASK
) + TWO_SEC_OFFSETS
;
960 } else if(ce
> variableTop
) {
961 pair
= TWO_COMMON_SEC_PLUS_OFFSET
;
963 U_ASSERT(ce
>= MIN_LONG
);
964 pair
= 0; // variable
971 CollationFastLatin::getCases(uint32_t variableTop
, UBool strengthIsPrimary
, uint32_t pair
) {
972 // Primary+caseLevel: Ignore case level weights of primary ignorables.
973 // Otherwise: Ignore case level weights of secondary ignorables.
974 // For details see the comments in the CollationCompare class.
975 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
978 if(pair
>= MIN_SHORT
) {
979 // A high secondary weight means we really have two CEs,
980 // a primary CE and a secondary CE.
982 pair
&= CASE_MASK
; // explicit weight of primary CE
983 if(!strengthIsPrimary
&& (ce
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
984 pair
|= LOWER_CASE
<< 16; // implied weight of secondary CE
986 } else if(pair
> variableTop
) {
988 } else if(pair
>= MIN_LONG
) {
989 pair
= 0; // variable
991 // else special mini CE
993 // two mini CEs, same primary groups, neither expands like above
994 uint32_t ce
= pair
& 0xffff;
995 if(ce
>= MIN_SHORT
) {
996 if(strengthIsPrimary
&& (pair
& (SHORT_PRIMARY_MASK
<< 16)) == 0) {
999 pair
&= TWO_CASES_MASK
;
1001 } else if(ce
> variableTop
) {
1002 pair
= TWO_LOWER_CASES
;
1004 U_ASSERT(ce
>= MIN_LONG
);
1005 pair
= 0; // variable
1012 CollationFastLatin::getTertiaries(uint32_t variableTop
, UBool withCaseBits
, uint32_t pair
) {
1013 if(pair
<= 0xffff) {
1015 if(pair
>= MIN_SHORT
) {
1016 // A high secondary weight means we really have two CEs,
1017 // a primary CE and a secondary CE.
1020 pair
= (pair
& CASE_AND_TERTIARY_MASK
) + TER_OFFSET
;
1021 if((ce
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
1022 pair
|= (LOWER_CASE
| COMMON_TER_PLUS_OFFSET
) << 16;
1025 pair
= (pair
& TERTIARY_MASK
) + TER_OFFSET
;
1026 if((ce
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
1027 pair
|= COMMON_TER_PLUS_OFFSET
<< 16;
1030 } else if(pair
> variableTop
) {
1031 pair
= (pair
& TERTIARY_MASK
) + TER_OFFSET
;
1035 } else if(pair
>= MIN_LONG
) {
1036 pair
= 0; // variable
1038 // else special mini CE
1040 // two mini CEs, same primary groups, neither expands like above
1041 uint32_t ce
= pair
& 0xffff;
1042 if(ce
>= MIN_SHORT
) {
1044 pair
&= TWO_CASES_MASK
| TWO_TERTIARIES_MASK
;
1046 pair
&= TWO_TERTIARIES_MASK
;
1048 pair
+= TWO_TER_OFFSETS
;
1049 } else if(ce
> variableTop
) {
1050 pair
= (pair
& TWO_TERTIARIES_MASK
) + TWO_TER_OFFSETS
;
1052 pair
|= TWO_LOWER_CASES
;
1055 U_ASSERT(ce
>= MIN_LONG
);
1056 pair
= 0; // variable
1063 CollationFastLatin::getQuaternaries(uint32_t variableTop
, uint32_t pair
) {
1064 // Return the primary weight of a variable CE,
1065 // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1066 if(pair
<= 0xffff) {
1068 if(pair
>= MIN_SHORT
) {
1069 // A high secondary weight means we really have two CEs,
1070 // a primary CE and a secondary CE.
1071 if((pair
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
1072 pair
= TWO_SHORT_PRIMARIES_MASK
;
1074 pair
= SHORT_PRIMARY_MASK
;
1076 } else if(pair
> variableTop
) {
1077 pair
= SHORT_PRIMARY_MASK
;
1078 } else if(pair
>= MIN_LONG
) {
1079 pair
&= LONG_PRIMARY_MASK
; // variable
1081 // else special mini CE
1083 // two mini CEs, same primary groups, neither expands like above
1084 uint32_t ce
= pair
& 0xffff;
1085 if(ce
> variableTop
) {
1086 pair
= TWO_SHORT_PRIMARIES_MASK
;
1088 U_ASSERT(ce
>= MIN_LONG
);
1089 pair
&= TWO_LONG_PRIMARIES_MASK
; // variable
1097 #endif // !UCONFIG_NO_COLLATION