2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationfastlatin.cpp
8 * created on: 2013aug18
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucol.h"
17 #include "collationdata.h"
18 #include "collationfastlatin.h"
19 #include "collationsettings.h"
20 #include "putilimp.h" // U_ALIGN_CODE
// getOptions(): checks whether the fast-Latin table can be used with these
// settings and fills primaries[] with one masked mini primary per character
// below LATIN_LIMIT.
//   data       - supplies fastLatinTable and the scripts/scriptsLength groups.
//   settings   - options, variableTop and reorderTable are read.
//   primaries  - output array, written for every c in [0, LATIN_LIMIT).
//   capacity   - must equal LATIN_LIMIT (asserted, and checked again at runtime).
// Returns -1 when the table is missing, the capacity is wrong, or variableTop
// lies above the last header entry; otherwise (miniVarTop << 16) | settings.options.
// NOTE(review): this listing looks like a lossy extraction (the embedded line
// numbers skip and the braces do not balance); e.g. the declaration of
// miniVarTop and several else/closing-brace lines are not visible here.
// Confirm against the upstream file before changing any logic.
26 CollationFastLatin::getOptions(const CollationData
*data
, const CollationSettings
&settings
,
27 uint16_t *primaries
, int32_t capacity
) {
28 const uint16_t *table
= data
->fastLatinTable
;
29 if(table
== NULL
) { return -1; }
30 U_ASSERT(capacity
== LATIN_LIMIT
);
31 if(capacity
!= LATIN_LIMIT
) { return -1; }
// Step 1: derive miniVarTop, the threshold that long mini primaries are
// compared against further down (p > miniVarTop keeps its primary bits).
34 if((settings
.options
& CollationSettings::ALTERNATE_MASK
) == 0) {
35 // No mini primaries are variable, set a variableTop just below the
36 // lowest long mini primary.
37 miniVarTop
= MIN_LONG
- 1;
// Otherwise map the lead byte of settings.variableTop to a header entry:
// the low 7 bits of each header unit are compared with v1.
39 uint32_t v1
= settings
.variableTop
>> 24;
40 int32_t headerLength
= *table
& 0xff;
41 int32_t i
= headerLength
- 1;
42 if(i
<= 0 || v1
> (table
[i
] & 0x7f)) {
43 return -1; // variableTop >= digits, should not occur
// Walk down to the lowest header entry whose low 7 bits are still >= v1.
45 while(i
> 1 && v1
<= (table
[i
- 1] & 0x7f)) { --i
; }
46 // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
47 // Shift right to make it comparable with long mini primaries in bits 15..3.
48 miniVarTop
= (table
[i
] & 0xff80) >> 4;
// Step 2: with a reordering table, walk the script groups and compare the
// reordered last bytes of consecutive groups, stopping at the Latin group.
51 const uint8_t *reorderTable
= settings
.reorderTable
;
52 if(reorderTable
!= NULL
) {
53 const uint16_t *scripts
= data
->scripts
;
54 int32_t length
= data
->scriptsLength
;
55 uint32_t prevLastByte
= 0;
56 for(int32_t i
= 0; i
< length
;) {
57 // reordered last byte of the group
58 uint32_t lastByte
= reorderTable
[scripts
[i
] & 0xff];
// NOTE(review): the statement executed when the order is disturbed is not
// visible in this listing — presumably a bail-out; confirm upstream.
59 if(lastByte
< prevLastByte
) {
60 // The permutation affects the groups up to Latin.
63 if(scripts
[i
+ 2] == USCRIPT_LATIN
) { break; }
// Advance to the next group: 2 units plus the count stored in scripts[i + 1].
64 i
= i
+ 2 + scripts
[i
+ 1];
65 prevLastByte
= lastByte
;
// Step 3: copy the per-character mini CEs, reduced to their primary bits.
69 table
+= (table
[0] & 0xff); // skip the header
70 for(UChar32 c
= 0; c
< LATIN_LIMIT
; ++c
) {
71 uint32_t p
= table
[c
];
73 p
&= SHORT_PRIMARY_MASK
;
74 } else if(p
> miniVarTop
) {
75 p
&= LONG_PRIMARY_MASK
;
79 primaries
[c
] = (uint16_t)p
;
// Step 4: with numeric collation, digits get primary 0 so that the compare
// functions reach their explicit digit check and bail out.
81 if((settings
.options
& CollationSettings::NUMERIC
) != 0) {
82 // Bail out for digits.
83 for(UChar32 c
= 0x30; c
<= 0x39; ++c
) { primaries
[c
] = 0; }
86 // Shift the miniVarTop above other options.
87 return ((int32_t)miniVarTop
<< 16) | settings
.options
;
// compareUTF16(): compares two UTF-16 strings level by level using the
// fast-Latin table and the primaries[] array filled in by getOptions().
//   table      - fast-Latin table including its header (skipped via table[0] & 0xff).
//   primaries  - per-character mini primaries, indexed directly by code unit.
//   options    - miniVarTop in the upper 16 bits, option bits in the lower 16.
//   left/right - the strings; nextPair() has a "sLength < 0" case, so a negative
//                length presumably means NUL-terminated — confirm with callers.
// Returns UCOL_LESS or UCOL_GREATER at the first differing weight, UCOL_EQUAL
// when all levels required by the strength agree, or BAIL_OUT_RESULT when this
// fast path gives up (digits with NUMERIC, BAIL_OUT from nextPair(), or a
// secondary difference with BACKWARD_SECONDARY).
// Each level re-reads both strings from index 0 because mini CEs are not buffered.
// NOTE(review): this listing has dropped lines (braces, else branches, some
// conditions); the comments below describe only what is visible.
91 CollationFastLatin::compareUTF16(const uint16_t *table
, const uint16_t *primaries
, int32_t options
,
92 const UChar
*left
, int32_t leftLength
,
93 const UChar
*right
, int32_t rightLength
) {
94 // This is a modified copy of CollationCompare::compareUpToQuaternary(),
95 // optimized for common Latin text.
97 // Keep compareUTF16() and compareUTF8() in sync very closely!
99 U_ASSERT((table
[0] >> 8) == VERSION
);
100 table
+= (table
[0] & 0xff); // skip the header
101 uint32_t variableTop
= (uint32_t)options
>> 16; // see getOptions()
102 options
&= 0xffff; // needed for CollationSettings::getStrength() to work
104 // Check for supported characters, fetch mini CEs, and compare primaries.
106 int32_t leftIndex
= 0, rightIndex
= 0;
108 * Single mini CE or a pair.
109 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
110 * If there is only one, then it is in the lower bits, and the upper bits are 0.
112 uint32_t leftPair
= 0, rightPair
= 0;
// ---- Primary level ----
114 // We fetch CEs until we get a non-ignorable primary or reach the end.
115 while(leftPair
== 0) {
116 if(leftIndex
== leftLength
) {
120 UChar32 c
= left
[leftIndex
++];
// Fast path: a non-zero entry in primaries[] is already the masked primary.
122 leftPair
= primaries
[c
];
123 if(leftPair
!= 0) { break; }
// A zero primary for a digit under NUMERIC means getOptions() cleared it.
124 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
125 return BAIL_OUT_RESULT
;
// Punctuation block: stored in the table right after the LATIN_LIMIT entries.
128 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
129 leftPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
131 leftPair
= lookup(table
, c
);
// Reduce a single short or long mini CE to its primary bits.
133 if(leftPair
>= MIN_SHORT
) {
134 leftPair
&= SHORT_PRIMARY_MASK
;
136 } else if(leftPair
> variableTop
) {
137 leftPair
&= LONG_PRIMARY_MASK
;
// Otherwise let nextPair() resolve the entry (it handles expansions and
// contractions and may report BAIL_OUT).
140 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
141 if(leftPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
142 leftPair
= getPrimaries(variableTop
, leftPair
);
// Same fetch for the right string.
146 while(rightPair
== 0) {
147 if(rightIndex
== rightLength
) {
151 UChar32 c
= right
[rightIndex
++];
153 rightPair
= primaries
[c
];
154 if(rightPair
!= 0) { break; }
155 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
156 return BAIL_OUT_RESULT
;
158 rightPair
= table
[c
];
159 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
160 rightPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
162 rightPair
= lookup(table
, c
);
164 if(rightPair
>= MIN_SHORT
) {
165 rightPair
&= SHORT_PRIMARY_MASK
;
167 } else if(rightPair
> variableTop
) {
168 rightPair
&= LONG_PRIMARY_MASK
;
171 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
172 if(rightPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
173 rightPair
= getPrimaries(variableTop
, rightPair
);
// Equal pairs: stop at EOS, otherwise clear both and fetch the next ones.
177 if(leftPair
== rightPair
) {
178 if(leftPair
== EOS
) { break; }
179 leftPair
= rightPair
= 0;
// Unequal pairs: compare the current (low 16-bit) weights first.
182 uint32_t leftPrimary
= leftPair
& 0xffff;
183 uint32_t rightPrimary
= rightPair
& 0xffff;
184 if(leftPrimary
!= rightPrimary
) {
185 // Return the primary difference.
186 return (leftPrimary
< rightPrimary
) ? UCOL_LESS
: UCOL_GREATER
;
188 if(leftPair
== EOS
) { break; }
192 // In the following, we need to re-fetch each character because we did not buffer the CEs,
193 // but we know that the string is well-formed and
194 // only contains supported characters and mappings.
// ---- Secondary level ----
196 // We might skip the secondary level but continue with the case level
197 // which is turned on separately.
198 if(CollationSettings::getStrength(options
) >= UCOL_SECONDARY
) {
199 leftIndex
= rightIndex
= 0;
200 leftPair
= rightPair
= 0;
202 while(leftPair
== 0) {
203 if(leftIndex
== leftLength
) {
207 UChar32 c
= left
[leftIndex
++];
210 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
211 leftPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
213 leftPair
= lookup(table
, c
);
215 if(leftPair
>= MIN_SHORT
) {
216 leftPair
= getSecondariesFromOneShortCE(leftPair
);
218 } else if(leftPair
> variableTop
) {
219 leftPair
= COMMON_SEC_PLUS_OFFSET
;
// No BAIL_OUT check here: the primary pass already accepted this text.
222 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
223 leftPair
= getSecondaries(variableTop
, leftPair
);
227 while(rightPair
== 0) {
228 if(rightIndex
== rightLength
) {
232 UChar32 c
= right
[rightIndex
++];
234 rightPair
= table
[c
];
235 } else if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
236 rightPair
= table
[c
- PUNCT_START
+ LATIN_LIMIT
];
238 rightPair
= lookup(table
, c
);
240 if(rightPair
>= MIN_SHORT
) {
241 rightPair
= getSecondariesFromOneShortCE(rightPair
);
243 } else if(rightPair
> variableTop
) {
244 rightPair
= COMMON_SEC_PLUS_OFFSET
;
247 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
248 rightPair
= getSecondaries(variableTop
, rightPair
);
252 if(leftPair
== rightPair
) {
253 if(leftPair
== EOS
) { break; }
254 leftPair
= rightPair
= 0;
257 uint32_t leftSecondary
= leftPair
& 0xffff;
258 uint32_t rightSecondary
= rightPair
& 0xffff;
259 if(leftSecondary
!= rightSecondary
) {
260 if((options
& CollationSettings::BACKWARD_SECONDARY
) != 0) {
261 // Full support for backwards secondary requires backwards contraction matching
262 // and moving backwards between merge separators.
263 return BAIL_OUT_RESULT
;
265 return (leftSecondary
< rightSecondary
) ? UCOL_LESS
: UCOL_GREATER
;
267 if(leftPair
== EOS
) { break; }
// ---- Case level: runs only when the CASE_LEVEL option bit is set ----
273 if((options
& CollationSettings::CASE_LEVEL
) != 0) {
274 UBool strengthIsPrimary
= CollationSettings::getStrength(options
) == UCOL_PRIMARY
;
275 leftIndex
= rightIndex
= 0;
276 leftPair
= rightPair
= 0;
278 while(leftPair
== 0) {
279 if(leftIndex
== leftLength
) {
283 UChar32 c
= left
[leftIndex
++];
284 leftPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
// Entries below MIN_LONG are not simple mini CEs; let nextPair() resolve them.
285 if(leftPair
< MIN_LONG
) {
286 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
288 leftPair
= getCases(variableTop
, strengthIsPrimary
, leftPair
);
291 while(rightPair
== 0) {
292 if(rightIndex
== rightLength
) {
296 UChar32 c
= right
[rightIndex
++];
297 rightPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
298 if(rightPair
< MIN_LONG
) {
299 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
301 rightPair
= getCases(variableTop
, strengthIsPrimary
, rightPair
);
304 if(leftPair
== rightPair
) {
305 if(leftPair
== EOS
) { break; }
306 leftPair
= rightPair
= 0;
309 uint32_t leftCase
= leftPair
& 0xffff;
310 uint32_t rightCase
= rightPair
& 0xffff;
311 if(leftCase
!= rightCase
) {
// Without UPPER_FIRST the smaller case weight sorts first; the second
// return below reverses that order.
312 if((options
& CollationSettings::UPPER_FIRST
) == 0) {
313 return (leftCase
< rightCase
) ? UCOL_LESS
: UCOL_GREATER
;
315 return (leftCase
< rightCase
) ? UCOL_GREATER
: UCOL_LESS
;
318 if(leftPair
== EOS
) { break; }
// Done unless the strength asks for the tertiary level.
323 if(CollationSettings::getStrength(options
) <= UCOL_SECONDARY
) { return UCOL_EQUAL
; }
// ---- Tertiary level ----
325 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
326 UBool withCaseBits
= CollationSettings::isTertiaryWithCaseBits(options
);
328 leftIndex
= rightIndex
= 0;
329 leftPair
= rightPair
= 0;
331 while(leftPair
== 0) {
332 if(leftIndex
== leftLength
) {
336 UChar32 c
= left
[leftIndex
++];
337 leftPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
338 if(leftPair
< MIN_LONG
) {
339 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
341 leftPair
= getTertiaries(variableTop
, withCaseBits
, leftPair
);
344 while(rightPair
== 0) {
345 if(rightIndex
== rightLength
) {
349 UChar32 c
= right
[rightIndex
++];
350 rightPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
351 if(rightPair
< MIN_LONG
) {
352 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
354 rightPair
= getTertiaries(variableTop
, withCaseBits
, rightPair
);
357 if(leftPair
== rightPair
) {
358 if(leftPair
== EOS
) { break; }
359 leftPair
= rightPair
= 0;
362 uint32_t leftTertiary
= leftPair
& 0xffff;
363 uint32_t rightTertiary
= rightPair
& 0xffff;
364 if(leftTertiary
!= rightTertiary
) {
365 if(CollationSettings::sortsTertiaryUpperCaseFirst(options
)) {
366 // Pass through EOS and MERGE_WEIGHT
367 // and keep real tertiary weights larger than the MERGE_WEIGHT.
368 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
369 if(leftTertiary
> MERGE_WEIGHT
) {
370 leftTertiary
^= CASE_MASK
;
372 if(rightTertiary
> MERGE_WEIGHT
) {
373 rightTertiary
^= CASE_MASK
;
376 return (leftTertiary
< rightTertiary
) ? UCOL_LESS
: UCOL_GREATER
;
378 if(leftPair
== EOS
) { break; }
// Done unless the strength asks for the quaternary level.
382 if(CollationSettings::getStrength(options
) <= UCOL_TERTIARY
) { return UCOL_EQUAL
; }
// ---- Quaternary level ----
384 leftIndex
= rightIndex
= 0;
385 leftPair
= rightPair
= 0;
387 while(leftPair
== 0) {
388 if(leftIndex
== leftLength
) {
392 UChar32 c
= left
[leftIndex
++];
393 leftPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
394 if(leftPair
< MIN_LONG
) {
395 leftPair
= nextPair(table
, c
, leftPair
, left
, NULL
, leftIndex
, leftLength
);
397 leftPair
= getQuaternaries(variableTop
, leftPair
);
400 while(rightPair
== 0) {
401 if(rightIndex
== rightLength
) {
405 UChar32 c
= right
[rightIndex
++];
406 rightPair
= (c
<= LATIN_MAX
) ? table
[c
] : lookup(table
, c
);
407 if(rightPair
< MIN_LONG
) {
408 rightPair
= nextPair(table
, c
, rightPair
, right
, NULL
, rightIndex
, rightLength
);
410 rightPair
= getQuaternaries(variableTop
, rightPair
);
413 if(leftPair
== rightPair
) {
414 if(leftPair
== EOS
) { break; }
415 leftPair
= rightPair
= 0;
418 uint32_t leftQuaternary
= leftPair
& 0xffff;
419 uint32_t rightQuaternary
= rightPair
& 0xffff;
420 if(leftQuaternary
!= rightQuaternary
) {
421 return (leftQuaternary
< rightQuaternary
) ? UCOL_LESS
: UCOL_GREATER
;
423 if(leftPair
== EOS
) { break; }
// compareUTF8(): UTF-8 counterpart of compareUTF16(). It compares two byte
// strings level by level using the fast-Latin table and the primaries[] array
// filled in by getOptions().
//   table      - fast-Latin table including its header (skipped via table[0] & 0xff).
//   primaries  - per-character mini primaries; indexed by the ASCII byte, or by
//                ((lead - 0xc2) << 6) + trail for two-byte sequences.
//   options    - miniVarTop in the upper 16 bits, option bits in the lower 16.
//   left/right - the byte strings; lookupUTF8() and nextPair() have "sLength < 0"
//                cases, so a negative length presumably means NUL-terminated.
// Returns UCOL_LESS or UCOL_GREATER at the first differing weight, UCOL_EQUAL
// when all levels required by the strength agree, or BAIL_OUT_RESULT when this
// fast path gives up.
// The primary pass validates bytes (lookupUTF8, trail-byte range checks); the
// later passes use lookupUTF8Unsafe() and rely on that validation.
// NOTE(review): this listing has dropped lines (braces, else branches, the
// declaration of t, some conditions); comments describe only what is visible.
431 CollationFastLatin::compareUTF8(const uint16_t *table
, const uint16_t *primaries
, int32_t options
,
432 const uint8_t *left
, int32_t leftLength
,
433 const uint8_t *right
, int32_t rightLength
) {
434 // Keep compareUTF16() and compareUTF8() in sync very closely!
436 U_ASSERT((table
[0] >> 8) == VERSION
);
437 table
+= (table
[0] & 0xff); // skip the header
438 uint32_t variableTop
= (uint32_t)options
>> 16; // see getOptions(), which shifts miniVarTop above the other option bits
439 options
&= 0xffff; // needed for CollationSettings::getStrength() to work
441 // Check for supported characters, fetch mini CEs, and compare primaries.
443 int32_t leftIndex
= 0, rightIndex
= 0;
445 * Single mini CE or a pair.
446 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
447 * If there is only one, then it is in the lower bits, and the upper bits are 0.
449 uint32_t leftPair
= 0, rightPair
= 0;
450 // Note: There is no need to assemble the code point.
451 // We only need to look up the table entry for the character,
452 // and nextPair() looks for whether c==0.
// ---- Primary level ----
454 // We fetch CEs until we get a non-ignorable primary or reach the end.
455 while(leftPair
== 0) {
456 if(leftIndex
== leftLength
) {
460 UChar32 c
= left
[leftIndex
++];
// Single-byte fast path: a non-zero primaries[] entry is the masked primary.
463 leftPair
= primaries
[c
];
464 if(leftPair
!= 0) { break; }
// A zero primary for a digit under NUMERIC means getOptions() cleared it.
465 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
466 return BAIL_OUT_RESULT
;
// Two-byte sequence: lead byte 0xc2..LATIN_MAX_UTF8_LEAD followed by a trail
// byte 0x80..0xbf that is still inside the string.
469 } else if(c
<= LATIN_MAX_UTF8_LEAD
&& 0xc2 <= c
&& leftIndex
!= leftLength
&&
470 0x80 <= (t
= left
[leftIndex
]) && t
<= 0xbf) {
// Fold lead and trail byte into the table index (trail keeps its 0x80 bit).
472 c
= ((c
- 0xc2) << 6) + t
;
473 leftPair
= primaries
[c
];
474 if(leftPair
!= 0) { break; }
// Everything else goes through the bounds-checked three-byte lookup.
477 leftPair
= lookupUTF8(table
, c
, left
, leftIndex
, leftLength
);
479 if(leftPair
>= MIN_SHORT
) {
480 leftPair
&= SHORT_PRIMARY_MASK
;
482 } else if(leftPair
> variableTop
) {
483 leftPair
&= LONG_PRIMARY_MASK
;
// Otherwise let nextPair() resolve the entry; s16 is NULL, the bytes go in s8.
486 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
487 if(leftPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
488 leftPair
= getPrimaries(variableTop
, leftPair
);
// Same fetch for the right string.
492 while(rightPair
== 0) {
493 if(rightIndex
== rightLength
) {
497 UChar32 c
= right
[rightIndex
++];
500 rightPair
= primaries
[c
];
501 if(rightPair
!= 0) { break; }
502 if(c
<= 0x39 && c
>= 0x30 && (options
& CollationSettings::NUMERIC
) != 0) {
503 return BAIL_OUT_RESULT
;
505 rightPair
= table
[c
];
506 } else if(c
<= LATIN_MAX_UTF8_LEAD
&& 0xc2 <= c
&& rightIndex
!= rightLength
&&
507 0x80 <= (t
= right
[rightIndex
]) && t
<= 0xbf) {
509 c
= ((c
- 0xc2) << 6) + t
;
510 rightPair
= primaries
[c
];
511 if(rightPair
!= 0) { break; }
512 rightPair
= table
[c
];
514 rightPair
= lookupUTF8(table
, c
, right
, rightIndex
, rightLength
);
516 if(rightPair
>= MIN_SHORT
) {
517 rightPair
&= SHORT_PRIMARY_MASK
;
519 } else if(rightPair
> variableTop
) {
520 rightPair
&= LONG_PRIMARY_MASK
;
523 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
524 if(rightPair
== BAIL_OUT
) { return BAIL_OUT_RESULT
; }
525 rightPair
= getPrimaries(variableTop
, rightPair
);
// Equal pairs: stop at EOS, otherwise clear both and fetch the next ones.
529 if(leftPair
== rightPair
) {
530 if(leftPair
== EOS
) { break; }
531 leftPair
= rightPair
= 0;
534 uint32_t leftPrimary
= leftPair
& 0xffff;
535 uint32_t rightPrimary
= rightPair
& 0xffff;
536 if(leftPrimary
!= rightPrimary
) {
537 // Return the primary difference.
538 return (leftPrimary
< rightPrimary
) ? UCOL_LESS
: UCOL_GREATER
;
540 if(leftPair
== EOS
) { break; }
544 // In the following, we need to re-fetch each character because we did not buffer the CEs,
545 // but we know that the string is well-formed and
546 // only contains supported characters and mappings.
// ---- Secondary level ----
548 // We might skip the secondary level but continue with the case level
549 // which is turned on separately.
550 if(CollationSettings::getStrength(options
) >= UCOL_SECONDARY
) {
551 leftIndex
= rightIndex
= 0;
552 leftPair
= rightPair
= 0;
554 while(leftPair
== 0) {
555 if(leftIndex
== leftLength
) {
559 UChar32 c
= left
[leftIndex
++];
// Two-byte sequence: the trail byte is consumed without a range check here.
562 } else if(c
<= LATIN_MAX_UTF8_LEAD
) {
563 leftPair
= table
[((c
- 0xc2) << 6) + left
[leftIndex
++]];
565 leftPair
= lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
567 if(leftPair
>= MIN_SHORT
) {
568 leftPair
= getSecondariesFromOneShortCE(leftPair
);
570 } else if(leftPair
> variableTop
) {
571 leftPair
= COMMON_SEC_PLUS_OFFSET
;
574 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
575 leftPair
= getSecondaries(variableTop
, leftPair
);
579 while(rightPair
== 0) {
580 if(rightIndex
== rightLength
) {
584 UChar32 c
= right
[rightIndex
++];
586 rightPair
= table
[c
];
587 } else if(c
<= LATIN_MAX_UTF8_LEAD
) {
588 rightPair
= table
[((c
- 0xc2) << 6) + right
[rightIndex
++]];
590 rightPair
= lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
592 if(rightPair
>= MIN_SHORT
) {
593 rightPair
= getSecondariesFromOneShortCE(rightPair
);
595 } else if(rightPair
> variableTop
) {
596 rightPair
= COMMON_SEC_PLUS_OFFSET
;
599 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
600 rightPair
= getSecondaries(variableTop
, rightPair
);
604 if(leftPair
== rightPair
) {
605 if(leftPair
== EOS
) { break; }
606 leftPair
= rightPair
= 0;
609 uint32_t leftSecondary
= leftPair
& 0xffff;
610 uint32_t rightSecondary
= rightPair
& 0xffff;
611 if(leftSecondary
!= rightSecondary
) {
612 if((options
& CollationSettings::BACKWARD_SECONDARY
) != 0) {
613 // Full support for backwards secondary requires backwards contraction matching
614 // and moving backwards between merge separators.
615 return BAIL_OUT_RESULT
;
617 return (leftSecondary
< rightSecondary
) ? UCOL_LESS
: UCOL_GREATER
;
619 if(leftPair
== EOS
) { break; }
// ---- Case level: runs only when the CASE_LEVEL option bit is set ----
625 if((options
& CollationSettings::CASE_LEVEL
) != 0) {
626 UBool strengthIsPrimary
= CollationSettings::getStrength(options
) == UCOL_PRIMARY
;
627 leftIndex
= rightIndex
= 0;
628 leftPair
= rightPair
= 0;
630 while(leftPair
== 0) {
631 if(leftIndex
== leftLength
) {
635 UChar32 c
= left
[leftIndex
++];
// ASCII bytes index the table directly; lead bytes go through the unchecked lookup.
636 leftPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
637 if(leftPair
< MIN_LONG
) {
638 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
640 leftPair
= getCases(variableTop
, strengthIsPrimary
, leftPair
);
643 while(rightPair
== 0) {
644 if(rightIndex
== rightLength
) {
648 UChar32 c
= right
[rightIndex
++];
649 rightPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
650 if(rightPair
< MIN_LONG
) {
651 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
653 rightPair
= getCases(variableTop
, strengthIsPrimary
, rightPair
);
656 if(leftPair
== rightPair
) {
657 if(leftPair
== EOS
) { break; }
658 leftPair
= rightPair
= 0;
661 uint32_t leftCase
= leftPair
& 0xffff;
662 uint32_t rightCase
= rightPair
& 0xffff;
663 if(leftCase
!= rightCase
) {
// Without UPPER_FIRST the smaller case weight sorts first; the second
// return below reverses that order.
664 if((options
& CollationSettings::UPPER_FIRST
) == 0) {
665 return (leftCase
< rightCase
) ? UCOL_LESS
: UCOL_GREATER
;
667 return (leftCase
< rightCase
) ? UCOL_GREATER
: UCOL_LESS
;
670 if(leftPair
== EOS
) { break; }
// Done unless the strength asks for the tertiary level.
675 if(CollationSettings::getStrength(options
) <= UCOL_SECONDARY
) { return UCOL_EQUAL
; }
// ---- Tertiary level ----
677 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
678 UBool withCaseBits
= CollationSettings::isTertiaryWithCaseBits(options
);
680 leftIndex
= rightIndex
= 0;
681 leftPair
= rightPair
= 0;
683 while(leftPair
== 0) {
684 if(leftIndex
== leftLength
) {
688 UChar32 c
= left
[leftIndex
++];
689 leftPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
690 if(leftPair
< MIN_LONG
) {
691 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
693 leftPair
= getTertiaries(variableTop
, withCaseBits
, leftPair
);
696 while(rightPair
== 0) {
697 if(rightIndex
== rightLength
) {
701 UChar32 c
= right
[rightIndex
++];
702 rightPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
703 if(rightPair
< MIN_LONG
) {
704 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
706 rightPair
= getTertiaries(variableTop
, withCaseBits
, rightPair
);
709 if(leftPair
== rightPair
) {
710 if(leftPair
== EOS
) { break; }
711 leftPair
= rightPair
= 0;
714 uint32_t leftTertiary
= leftPair
& 0xffff;
715 uint32_t rightTertiary
= rightPair
& 0xffff;
716 if(leftTertiary
!= rightTertiary
) {
717 if(CollationSettings::sortsTertiaryUpperCaseFirst(options
)) {
718 // Pass through EOS and MERGE_WEIGHT
719 // and keep real tertiary weights larger than the MERGE_WEIGHT.
720 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
721 if(leftTertiary
> MERGE_WEIGHT
) {
722 leftTertiary
^= CASE_MASK
;
724 if(rightTertiary
> MERGE_WEIGHT
) {
725 rightTertiary
^= CASE_MASK
;
728 return (leftTertiary
< rightTertiary
) ? UCOL_LESS
: UCOL_GREATER
;
730 if(leftPair
== EOS
) { break; }
// Done unless the strength asks for the quaternary level.
734 if(CollationSettings::getStrength(options
) <= UCOL_TERTIARY
) { return UCOL_EQUAL
; }
// ---- Quaternary level ----
736 leftIndex
= rightIndex
= 0;
737 leftPair
= rightPair
= 0;
739 while(leftPair
== 0) {
740 if(leftIndex
== leftLength
) {
744 UChar32 c
= left
[leftIndex
++];
745 leftPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, left
, leftIndex
);
746 if(leftPair
< MIN_LONG
) {
747 leftPair
= nextPair(table
, c
, leftPair
, NULL
, left
, leftIndex
, leftLength
);
749 leftPair
= getQuaternaries(variableTop
, leftPair
);
752 while(rightPair
== 0) {
753 if(rightIndex
== rightLength
) {
757 UChar32 c
= right
[rightIndex
++];
758 rightPair
= (c
<= 0x7f) ? table
[c
] : lookupUTF8Unsafe(table
, c
, right
, rightIndex
);
759 if(rightPair
< MIN_LONG
) {
760 rightPair
= nextPair(table
, c
, rightPair
, NULL
, right
, rightIndex
, rightLength
);
762 rightPair
= getQuaternaries(variableTop
, rightPair
);
765 if(leftPair
== rightPair
) {
766 if(leftPair
== EOS
) { break; }
767 leftPair
= rightPair
= 0;
770 uint32_t leftQuaternary
= leftPair
& 0xffff;
771 uint32_t rightQuaternary
= rightPair
& 0xffff;
772 if(leftQuaternary
!= rightQuaternary
) {
773 return (leftQuaternary
< rightQuaternary
) ? UCOL_LESS
: UCOL_GREATER
;
775 if(leftPair
== EOS
) { break; }
// lookup(): returns the mini CE for a UTF-16 code unit above LATIN_MAX.
//   table - fast-Latin table with the header already skipped by the caller
//           (the compare functions advance table before calling this).
//   c     - code unit; must be > LATIN_MAX (asserted).
// Characters in [PUNCT_START, PUNCT_LIMIT) are stored right after the
// LATIN_LIMIT entries; U+FFFE and U+FFFF are special-cased.
// NOTE(review): the return statement for U+FFFE and the final fallback are not
// visible in this listing (lookupUTF8() returns MERGE_WEIGHT for U+FFFE, so the
// same is presumably done here) — confirm against upstream.
783 CollationFastLatin::lookup(const uint16_t *table
, UChar32 c
) {
784 U_ASSERT(c
> LATIN_MAX
);
785 if(PUNCT_START
<= c
&& c
< PUNCT_LIMIT
) {
786 return table
[c
- PUNCT_START
+ LATIN_LIMIT
];
787 } else if(c
== 0xfffe) {
789 } else if(c
== 0xffff) {
// U+FFFF gets the maximum short primary with common/lower-case weights.
790 return MAX_SHORT
| COMMON_SEC
| LOWER_CASE
| COMMON_TER
;
// lookupUTF8(): bounds-checked mini-CE lookup for a three-byte UTF-8 sequence
// whose lead byte c was not handled by the caller's ASCII/Latin fast paths.
//   table   - fast-Latin table with the header already skipped.
//   c       - the lead byte that was just read.
//   s8      - the byte string; sIndex points at the byte after the lead byte.
//   sIndex  - passed by reference; NOTE(review): the lines that advance it are
//             not visible in this listing — confirm against upstream.
//   sLength - string length; a negative value skips the bounds check below.
// Recognized sequences: E2 80 xx (U+2000..U+203F), EF BF BE (U+FFFE) and
// EF BF BF (U+FFFF). The declaration of t2 and the fallback return are also
// missing from this listing.
797 CollationFastLatin::lookupUTF8(const uint16_t *table
, UChar32 c
,
798 const uint8_t *s8
, int32_t &sIndex
, int32_t sLength
) {
799 // The caller handled ASCII and valid/supported Latin.
// Two more bytes are needed: i2 is the index of the second trail byte.
801 int32_t i2
= sIndex
+ 1;
802 if(i2
< sLength
|| sLength
< 0) {
803 uint8_t t1
= s8
[sIndex
];
806 if(c
== 0xe2 && t1
== 0x80 && 0x80 <= t2
&& t2
<= 0xbf) {
807 return table
[(LATIN_LIMIT
- 0x80) + t2
]; // 2000..203F -> 0180..01BF
808 } else if(c
== 0xef && t1
== 0xbf) {
810 return MERGE_WEIGHT
; // U+FFFE
811 } else if(t2
== 0xbf) {
812 return MAX_SHORT
| COMMON_SEC
| LOWER_CASE
| COMMON_TER
; // U+FFFF
// lookupUTF8Unsafe(): mini-CE lookup for a non-ASCII UTF-8 sequence without any
// bounds or trail-byte validation. It takes no length parameter and relies on
// the string having been validated by an earlier pass (see the comment below).
//   table  - fast-Latin table with the header already skipped.
//   c      - the lead byte that was just read.
//   s8     - the byte string; sIndex points at the byte after the lead byte.
//   sIndex - passed by reference; the two-byte branch advances it by one.
// NOTE(review): the three-byte branch is incomplete in this listing (the lines
// that advance sIndex and the condition selecting the U+2000..U+203F case are
// missing) — confirm against upstream.
820 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table
, UChar32 c
,
821 const uint8_t *s8
, int32_t &sIndex
) {
822 // The caller handled ASCII.
823 // The string is well-formed and contains only supported characters.
// Two-byte sequence: fold lead and trail byte into the table index.
825 if(c
<= LATIN_MAX_UTF8_LEAD
) {
826 return table
[((c
- 0xc2) << 6) + s8
[sIndex
++]]; // 0080..017F
// Three-byte sequence: the second trail byte selects the result.
828 uint8_t t2
= s8
[sIndex
+ 1];
831 return table
[(LATIN_LIMIT
- 0x80) + t2
]; // 2000..203F -> 0180..01BF
832 } else if(t2
== 0xbe) {
833 return MERGE_WEIGHT
; // U+FFFE
835 return MAX_SHORT
| COMMON_SEC
| LOWER_CASE
| COMMON_TER
; // U+FFFF
// nextPair(): turns a raw table entry into one mini CE or a pair of mini CEs
// (current CE in the low 16 bits, next CE in the high 16 bits).
//   table   - fast-Latin table with the header already skipped.
//   c       - the character (or UTF-8 lead byte) whose entry is ce.
//   ce      - the raw table entry for c.
//   s16/s8  - the string as UTF-16 or as UTF-8; callers pass NULL for the one
//             they do not use.
//   sIndex  - index after c; NOTE(review): presumably advanced past a matched
//             contraction suffix, but that line is not visible here.
//   sLength - string length by reference; set to sIndex - 1 when c is NUL and
//             the length was negative.
// Visible cases: simple/special entries are returned unchanged, expansions
// read two consecutive table units, and contractions look at the next
// character and search the suffix list.
// NOTE(review): large parts of the contraction branch are missing from this
// listing (declarations of c2, t, i and x, the s16/s8 selection, the suffix
// search loop, the length checks before the returns) — confirm upstream.
840 CollationFastLatin::nextPair(const uint16_t *table
, UChar32 c
, uint32_t ce
,
841 const UChar
*s16
, const uint8_t *s8
, int32_t &sIndex
, int32_t &sLength
) {
842 if(ce
>= MIN_LONG
|| ce
< CONTRACTION
) {
843 return ce
; // simple or special mini CE
844 } else if(ce
>= EXPANSION
) {
// Expansion: the two mini CEs sit in consecutive units after the fast chars.
845 int32_t index
= NUM_FAST_CHARS
+ (ce
& INDEX_MASK
);
846 return ((uint32_t)table
[index
+ 1] << 16) | table
[index
];
847 } else /* ce >= CONTRACTION */ {
// A NUL with a negative length: record the length as the index of that NUL.
848 if(c
== 0 && sLength
< 0) {
849 sLength
= sIndex
- 1;
852 // Contraction list: Default mapping followed by
853 // 0 or more single-character contraction suffix mappings.
854 int32_t index
= NUM_FAST_CHARS
+ (ce
& INDEX_MASK
);
855 if(sIndex
!= sLength
) {
856 // Read the next character.
// nextIndex is a look-ahead copy so that sIndex is not advanced while peeking.
858 int32_t nextIndex
= sIndex
;
// UTF-16 input: map the next code unit into the table's index space.
860 c2
= s16
[nextIndex
++];
862 if(PUNCT_START
<= c2
&& c2
< PUNCT_LIMIT
) {
863 c2
= c2
- PUNCT_START
+ LATIN_LIMIT
; // 2000..203F -> 0180..01BF
864 } else if(c2
== 0xfffe || c2
== 0xffff) {
865 c2
= -1; // U+FFFE & U+FFFF cannot occur in contractions.
// UTF-8 input: decode just enough of the next sequence to get a table index.
871 c2
= s8
[nextIndex
++];
874 if(c2
<= 0xc5 && 0xc2 <= c2
&& nextIndex
!= sLength
&&
875 0x80 <= (t
= s8
[nextIndex
]) && t
<= 0xbf) {
876 c2
= ((c2
- 0xc2) << 6) + t
; // 0080..017F
// Three-byte sequences need two more bytes inside the string.
879 int32_t i2
= nextIndex
+ 1;
880 if(i2
< sLength
|| sLength
< 0) {
881 if(c2
== 0xe2 && s8
[nextIndex
] == 0x80 &&
882 0x80 <= (t
= s8
[i2
]) && t
<= 0xbf) {
883 c2
= (LATIN_LIMIT
- 0x80) + t
; // 2000..203F -> 0180..01BF
884 } else if(c2
== 0xef && s8
[nextIndex
] == 0xbf &&
885 ((t
= s8
[i2
]) == 0xbe || t
== 0xbf)) {
886 c2
= -1; // U+FFFE & U+FFFF cannot occur in contractions.
// NOTE(review): the body of this NUL check is not visible in this listing.
897 if(c2
== 0 && sLength
< 0) {
901 // Look for the next character in the contraction suffix list,
902 // which is in ascending order of single suffix characters.
// Each list head packs a length (above CONTR_LENGTH_SHIFT) and a suffix
// character (CONTR_CHAR_MASK); the length is used to step to the next entry.
904 int32_t head
= table
[i
]; // first skip the default mapping
907 i
+= head
>> CONTR_LENGTH_SHIFT
;
909 x
= head
& CONTR_CHAR_MASK
;
916 // Return the CE or CEs for the default or contraction mapping.
917 int32_t length
= table
[index
] >> CONTR_LENGTH_SHIFT
;
921 ce
= table
[index
+ 1];
// Pair result: second mini CE in the high half, first one in the low half.
925 return ((uint32_t)table
[index
+ 2] << 16) | ce
;
// getSecondaries(): converts a mini CE (or a pair of them) into secondary-level
// weights for comparison.
//   variableTop - threshold; long mini CEs at or below it are variable.
//   pair        - current mini CE in the low 16 bits, next one in the high 16.
// Visible mapping for a single CE: short CE -> getSecondariesFromOneShortCE();
// long CE above variableTop -> COMMON_SEC_PLUS_OFFSET; other long CE (variable)
// -> 0; special mini CEs pass through. The two-CE case is the same, using the
// TWO_* constants.
// NOTE(review): the condition separating the single-CE and two-CE cases and
// the final return are not visible in this listing (getQuaternaries() uses
// pair <= 0xffff for that split) — confirm against upstream.
931 CollationFastLatin::getSecondaries(uint32_t variableTop
, uint32_t pair
) {
934 if(pair
>= MIN_SHORT
) {
935 pair
= getSecondariesFromOneShortCE(pair
);
936 } else if(pair
> variableTop
) {
937 pair
= COMMON_SEC_PLUS_OFFSET
;
938 } else if(pair
>= MIN_LONG
) {
939 pair
= 0; // variable
941 // else special mini CE
// Two mini CEs: classify by the first one (low 16 bits).
943 uint32_t ce
= pair
& 0xffff;
944 if(ce
>= MIN_SHORT
) {
945 pair
= (pair
& TWO_SECONDARIES_MASK
) + TWO_SEC_OFFSETS
;
946 } else if(ce
> variableTop
) {
947 pair
= TWO_COMMON_SEC_PLUS_OFFSET
;
949 U_ASSERT(ce
>= MIN_LONG
);
950 pair
= 0; // variable
// getCases(): converts a mini CE (or a pair of them) into case-level weights.
//   variableTop       - threshold; long mini CEs at or below it are variable.
//   strengthIsPrimary - TRUE when the collator strength is primary, which
//                       changes which implied weights are kept (see below).
//   pair              - current mini CE in the low 16 bits, next in the high 16.
// Variable CEs map to 0 and special mini CEs pass through.
// NOTE(review): several lines are missing from this listing (the single/two-CE
// split, the declaration of ce in the first branch, the assignment for long
// CEs above variableTop, the body of the strengthIsPrimary check, the final
// return) — confirm against upstream.
957 CollationFastLatin::getCases(uint32_t variableTop
, UBool strengthIsPrimary
, uint32_t pair
) {
958 // Primary+caseLevel: Ignore case level weights of primary ignorables.
959 // Otherwise: Ignore case level weights of secondary ignorables.
960 // For details see the comments in the CollationCompare class.
961 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
964 if(pair
>= MIN_SHORT
) {
965 // A high secondary weight means we really have two CEs,
966 // a primary CE and a secondary CE.
968 pair
&= CASE_MASK
; // explicit weight of primary CE
// The implied secondary CE contributes a case weight only above primary strength.
969 if(!strengthIsPrimary
&& (ce
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
970 pair
|= LOWER_CASE
<< 16; // implied weight of secondary CE
972 } else if(pair
> variableTop
) {
974 } else if(pair
>= MIN_LONG
) {
975 pair
= 0; // variable
977 // else special mini CE
979 // two mini CEs, same primary groups, neither expands like above
980 uint32_t ce
= pair
& 0xffff;
981 if(ce
>= MIN_SHORT
) {
// At primary strength, a second CE without primary bits is tested separately.
982 if(strengthIsPrimary
&& (pair
& (SHORT_PRIMARY_MASK
<< 16)) == 0) {
985 pair
&= TWO_CASES_MASK
;
987 } else if(ce
> variableTop
) {
988 pair
= TWO_LOWER_CASES
;
990 U_ASSERT(ce
>= MIN_LONG
);
991 pair
= 0; // variable
// getTertiaries(): converts a mini CE (or a pair of them) into tertiary-level
// weights, optionally keeping the case bits.
//   variableTop  - threshold; long mini CEs at or below it are variable.
//   withCaseBits - presumably selects the CASE_AND_TERTIARY_MASK variants over
//                  the TERTIARY_MASK ones; the if/else lines that test it are
//                  not visible in this listing.
//   pair         - current mini CE in the low 16 bits, next in the high 16.
// Kept weights get TER_OFFSET (or TWO_TER_OFFSETS) added; variable CEs map to 0
// and special mini CEs pass through.
// NOTE(review): this listing is missing the single/two-CE split, the
// declaration of ce in the first branch, the withCaseBits tests and the final
// return — confirm against upstream.
998 CollationFastLatin::getTertiaries(uint32_t variableTop
, UBool withCaseBits
, uint32_t pair
) {
1001 if(pair
>= MIN_SHORT
) {
1002 // A high secondary weight means we really have two CEs,
1003 // a primary CE and a secondary CE.
// Variant that keeps the case bits alongside the tertiary weight.
1006 pair
= (pair
& CASE_AND_TERTIARY_MASK
) + TER_OFFSET
;
1007 if((ce
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
1008 pair
|= (LOWER_CASE
| COMMON_TER_PLUS_OFFSET
) << 16;
// Variant that keeps the tertiary weight only.
1011 pair
= (pair
& TERTIARY_MASK
) + TER_OFFSET
;
1012 if((ce
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
1013 pair
|= COMMON_TER_PLUS_OFFSET
<< 16;
1016 } else if(pair
> variableTop
) {
1017 pair
= (pair
& TERTIARY_MASK
) + TER_OFFSET
;
1021 } else if(pair
>= MIN_LONG
) {
1022 pair
= 0; // variable
1024 // else special mini CE
1026 // two mini CEs, same primary groups, neither expands like above
1027 uint32_t ce
= pair
& 0xffff;
1028 if(ce
>= MIN_SHORT
) {
1030 pair
&= TWO_CASES_MASK
| TWO_TERTIARIES_MASK
;
1032 pair
&= TWO_TERTIARIES_MASK
;
1034 pair
+= TWO_TER_OFFSETS
;
1035 } else if(ce
> variableTop
) {
1036 pair
= (pair
& TWO_TERTIARIES_MASK
) + TWO_TER_OFFSETS
;
1038 pair
|= TWO_LOWER_CASES
;
1041 U_ASSERT(ce
>= MIN_LONG
);
1042 pair
= 0; // variable
// getQuaternaries(): converts a mini CE (or a pair of them) into
// quaternary-level weights.
//   variableTop - threshold; long mini CEs at or below it are variable.
//   pair        - current mini CE in the low 16 bits, next in the high 16;
//                 pair <= 0xffff means there is only one mini CE.
// Non-variable CEs map to the maximum primary mask (SHORT_PRIMARY_MASK, or
// TWO_SHORT_PRIMARIES_MASK when two CEs are represented); variable CEs keep
// their long primary bits; special mini CEs pass through.
// NOTE(review): the else lines and the final return are not visible in this
// listing — confirm against upstream.
1049 CollationFastLatin::getQuaternaries(uint32_t variableTop
, uint32_t pair
) {
1050 // Return the primary weight of a variable CE,
1051 // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1052 if(pair
<= 0xffff) {
1054 if(pair
>= MIN_SHORT
) {
1055 // A high secondary weight means we really have two CEs,
1056 // a primary CE and a secondary CE.
1057 if((pair
& SECONDARY_MASK
) >= MIN_SEC_HIGH
) {
1058 pair
= TWO_SHORT_PRIMARIES_MASK
;
1060 pair
= SHORT_PRIMARY_MASK
;
1062 } else if(pair
> variableTop
) {
1063 pair
= SHORT_PRIMARY_MASK
;
1064 } else if(pair
>= MIN_LONG
) {
1065 pair
&= LONG_PRIMARY_MASK
; // variable
1067 // else special mini CE
1069 // two mini CEs, same primary groups, neither expands like above
1070 uint32_t ce
= pair
& 0xffff;
1071 if(ce
> variableTop
) {
1072 pair
= TWO_SHORT_PRIMARIES_MASK
;
1074 U_ASSERT(ce
>= MIN_LONG
);
1075 pair
&= TWO_LONG_PRIMARIES_MASK
; // variable
1083 #endif // !UCONFIG_NO_COLLATION