]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationfastlatin.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / collationfastlatin.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationfastlatin.cpp
7 *
8 * created on: 2013aug18
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/ucol.h"
17 #include "collationdata.h"
18 #include "collationfastlatin.h"
19 #include "collationsettings.h"
20 #include "uassert.h"
21
22 U_NAMESPACE_BEGIN
23
24 int32_t
25 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
26 uint16_t *primaries, int32_t capacity) {
27 const uint16_t *table = data->fastLatinTable;
28 if(table == NULL) { return -1; }
29 U_ASSERT(capacity == LATIN_LIMIT);
30 if(capacity != LATIN_LIMIT) { return -1; }
31
32 uint32_t miniVarTop;
33 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
34 // No mini primaries are variable, set a variableTop just below the
35 // lowest long mini primary.
36 miniVarTop = MIN_LONG - 1;
37 } else {
38 int32_t headerLength = *table & 0xff;
39 int32_t i = 1 + settings.getMaxVariable();
40 if(i >= headerLength) {
41 return -1; // variableTop >= digits, should not occur
42 }
43 miniVarTop = table[i];
44 }
45
46 UBool digitsAreReordered = FALSE;
47 if(settings.hasReordering()) {
48 uint32_t prevStart = 0;
49 uint32_t beforeDigitStart = 0;
50 uint32_t digitStart = 0;
51 uint32_t afterDigitStart = 0;
52 for(int32_t group = UCOL_REORDER_CODE_FIRST;
53 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
54 ++group) {
55 uint32_t start = data->getFirstPrimaryForGroup(group);
56 start = settings.reorder(start);
57 if(group == UCOL_REORDER_CODE_DIGIT) {
58 beforeDigitStart = prevStart;
59 digitStart = start;
60 } else if(start != 0) {
61 if(start < prevStart) {
62 // The permutation affects the groups up to Latin.
63 return -1;
64 }
65 // In the future, there might be a special group between digits & Latin.
66 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
67 afterDigitStart = start;
68 }
69 prevStart = start;
70 }
71 }
72 uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
73 latinStart = settings.reorder(latinStart);
74 if(latinStart < prevStart) {
75 return -1;
76 }
77 if(afterDigitStart == 0) {
78 afterDigitStart = latinStart;
79 }
80 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
81 digitsAreReordered = TRUE;
82 }
83 }
84
85 table += (table[0] & 0xff); // skip the header
86 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
87 uint32_t p = table[c];
88 if(p >= MIN_SHORT) {
89 p &= SHORT_PRIMARY_MASK;
90 } else if(p > miniVarTop) {
91 p &= LONG_PRIMARY_MASK;
92 } else {
93 p = 0;
94 }
95 primaries[c] = (uint16_t)p;
96 }
97 if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
98 // Bail out for digits.
99 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
100 }
101
102 // Shift the miniVarTop above other options.
103 return ((int32_t)miniVarTop << 16) | settings.options;
104 }
105
106 int32_t
107 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
108 const UChar *left, int32_t leftLength,
109 const UChar *right, int32_t rightLength) {
110 // This is a modified copy of CollationCompare::compareUpToQuaternary(),
111 // optimized for common Latin text.
112 // Keep them in sync!
113 // Keep compareUTF16() and compareUTF8() in sync very closely!
114
115 U_ASSERT((table[0] >> 8) == VERSION);
116 table += (table[0] & 0xff); // skip the header
117 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions()
118 options &= 0xffff; // needed for CollationSettings::getStrength() to work
119
120 // Check for supported characters, fetch mini CEs, and compare primaries.
121 int32_t leftIndex = 0, rightIndex = 0;
122 /**
123 * Single mini CE or a pair.
124 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
125 * If there is only one, then it is in the lower bits, and the upper bits are 0.
126 */
127 uint32_t leftPair = 0, rightPair = 0;
128 for(;;) {
129 // We fetch CEs until we get a non-ignorable primary or reach the end.
130 while(leftPair == 0) {
131 if(leftIndex == leftLength) {
132 leftPair = EOS;
133 break;
134 }
135 UChar32 c = left[leftIndex++];
136 if(c <= LATIN_MAX) {
137 leftPair = primaries[c];
138 if(leftPair != 0) { break; }
139 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
140 return BAIL_OUT_RESULT;
141 }
142 leftPair = table[c];
143 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
144 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
145 } else {
146 leftPair = lookup(table, c);
147 }
148 if(leftPair >= MIN_SHORT) {
149 leftPair &= SHORT_PRIMARY_MASK;
150 break;
151 } else if(leftPair > variableTop) {
152 leftPair &= LONG_PRIMARY_MASK;
153 break;
154 } else {
155 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
156 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
157 leftPair = getPrimaries(variableTop, leftPair);
158 }
159 }
160
161 while(rightPair == 0) {
162 if(rightIndex == rightLength) {
163 rightPair = EOS;
164 break;
165 }
166 UChar32 c = right[rightIndex++];
167 if(c <= LATIN_MAX) {
168 rightPair = primaries[c];
169 if(rightPair != 0) { break; }
170 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
171 return BAIL_OUT_RESULT;
172 }
173 rightPair = table[c];
174 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
175 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
176 } else {
177 rightPair = lookup(table, c);
178 }
179 if(rightPair >= MIN_SHORT) {
180 rightPair &= SHORT_PRIMARY_MASK;
181 break;
182 } else if(rightPair > variableTop) {
183 rightPair &= LONG_PRIMARY_MASK;
184 break;
185 } else {
186 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
187 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
188 rightPair = getPrimaries(variableTop, rightPair);
189 }
190 }
191
192 if(leftPair == rightPair) {
193 if(leftPair == EOS) { break; }
194 leftPair = rightPair = 0;
195 continue;
196 }
197 uint32_t leftPrimary = leftPair & 0xffff;
198 uint32_t rightPrimary = rightPair & 0xffff;
199 if(leftPrimary != rightPrimary) {
200 // Return the primary difference.
201 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
202 }
203 if(leftPair == EOS) { break; }
204 leftPair >>= 16;
205 rightPair >>= 16;
206 }
207 // In the following, we need to re-fetch each character because we did not buffer the CEs,
208 // but we know that the string is well-formed and
209 // only contains supported characters and mappings.
210
211 // We might skip the secondary level but continue with the case level
212 // which is turned on separately.
213 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
214 leftIndex = rightIndex = 0;
215 leftPair = rightPair = 0;
216 for(;;) {
217 while(leftPair == 0) {
218 if(leftIndex == leftLength) {
219 leftPair = EOS;
220 break;
221 }
222 UChar32 c = left[leftIndex++];
223 if(c <= LATIN_MAX) {
224 leftPair = table[c];
225 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
226 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
227 } else {
228 leftPair = lookup(table, c);
229 }
230 if(leftPair >= MIN_SHORT) {
231 leftPair = getSecondariesFromOneShortCE(leftPair);
232 break;
233 } else if(leftPair > variableTop) {
234 leftPair = COMMON_SEC_PLUS_OFFSET;
235 break;
236 } else {
237 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
238 leftPair = getSecondaries(variableTop, leftPair);
239 }
240 }
241
242 while(rightPair == 0) {
243 if(rightIndex == rightLength) {
244 rightPair = EOS;
245 break;
246 }
247 UChar32 c = right[rightIndex++];
248 if(c <= LATIN_MAX) {
249 rightPair = table[c];
250 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
251 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
252 } else {
253 rightPair = lookup(table, c);
254 }
255 if(rightPair >= MIN_SHORT) {
256 rightPair = getSecondariesFromOneShortCE(rightPair);
257 break;
258 } else if(rightPair > variableTop) {
259 rightPair = COMMON_SEC_PLUS_OFFSET;
260 break;
261 } else {
262 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
263 rightPair = getSecondaries(variableTop, rightPair);
264 }
265 }
266
267 if(leftPair == rightPair) {
268 if(leftPair == EOS) { break; }
269 leftPair = rightPair = 0;
270 continue;
271 }
272 uint32_t leftSecondary = leftPair & 0xffff;
273 uint32_t rightSecondary = rightPair & 0xffff;
274 if(leftSecondary != rightSecondary) {
275 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
276 // Full support for backwards secondary requires backwards contraction matching
277 // and moving backwards between merge separators.
278 return BAIL_OUT_RESULT;
279 }
280 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
281 }
282 if(leftPair == EOS) { break; }
283 leftPair >>= 16;
284 rightPair >>= 16;
285 }
286 }
287
288 if((options & CollationSettings::CASE_LEVEL) != 0) {
289 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
290 leftIndex = rightIndex = 0;
291 leftPair = rightPair = 0;
292 for(;;) {
293 while(leftPair == 0) {
294 if(leftIndex == leftLength) {
295 leftPair = EOS;
296 break;
297 }
298 UChar32 c = left[leftIndex++];
299 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
300 if(leftPair < MIN_LONG) {
301 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
302 }
303 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
304 }
305
306 while(rightPair == 0) {
307 if(rightIndex == rightLength) {
308 rightPair = EOS;
309 break;
310 }
311 UChar32 c = right[rightIndex++];
312 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
313 if(rightPair < MIN_LONG) {
314 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
315 }
316 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
317 }
318
319 if(leftPair == rightPair) {
320 if(leftPair == EOS) { break; }
321 leftPair = rightPair = 0;
322 continue;
323 }
324 uint32_t leftCase = leftPair & 0xffff;
325 uint32_t rightCase = rightPair & 0xffff;
326 if(leftCase != rightCase) {
327 if((options & CollationSettings::UPPER_FIRST) == 0) {
328 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
329 } else {
330 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
331 }
332 }
333 if(leftPair == EOS) { break; }
334 leftPair >>= 16;
335 rightPair >>= 16;
336 }
337 }
338 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
339
340 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
341 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
342
343 leftIndex = rightIndex = 0;
344 leftPair = rightPair = 0;
345 for(;;) {
346 while(leftPair == 0) {
347 if(leftIndex == leftLength) {
348 leftPair = EOS;
349 break;
350 }
351 UChar32 c = left[leftIndex++];
352 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
353 if(leftPair < MIN_LONG) {
354 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
355 }
356 leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
357 }
358
359 while(rightPair == 0) {
360 if(rightIndex == rightLength) {
361 rightPair = EOS;
362 break;
363 }
364 UChar32 c = right[rightIndex++];
365 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
366 if(rightPair < MIN_LONG) {
367 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
368 }
369 rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
370 }
371
372 if(leftPair == rightPair) {
373 if(leftPair == EOS) { break; }
374 leftPair = rightPair = 0;
375 continue;
376 }
377 uint32_t leftTertiary = leftPair & 0xffff;
378 uint32_t rightTertiary = rightPair & 0xffff;
379 if(leftTertiary != rightTertiary) {
380 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
381 // Pass through EOS and MERGE_WEIGHT
382 // and keep real tertiary weights larger than the MERGE_WEIGHT.
383 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
384 if(leftTertiary > MERGE_WEIGHT) {
385 leftTertiary ^= CASE_MASK;
386 }
387 if(rightTertiary > MERGE_WEIGHT) {
388 rightTertiary ^= CASE_MASK;
389 }
390 }
391 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
392 }
393 if(leftPair == EOS) { break; }
394 leftPair >>= 16;
395 rightPair >>= 16;
396 }
397 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
398
399 leftIndex = rightIndex = 0;
400 leftPair = rightPair = 0;
401 for(;;) {
402 while(leftPair == 0) {
403 if(leftIndex == leftLength) {
404 leftPair = EOS;
405 break;
406 }
407 UChar32 c = left[leftIndex++];
408 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
409 if(leftPair < MIN_LONG) {
410 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
411 }
412 leftPair = getQuaternaries(variableTop, leftPair);
413 }
414
415 while(rightPair == 0) {
416 if(rightIndex == rightLength) {
417 rightPair = EOS;
418 break;
419 }
420 UChar32 c = right[rightIndex++];
421 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
422 if(rightPair < MIN_LONG) {
423 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
424 }
425 rightPair = getQuaternaries(variableTop, rightPair);
426 }
427
428 if(leftPair == rightPair) {
429 if(leftPair == EOS) { break; }
430 leftPair = rightPair = 0;
431 continue;
432 }
433 uint32_t leftQuaternary = leftPair & 0xffff;
434 uint32_t rightQuaternary = rightPair & 0xffff;
435 if(leftQuaternary != rightQuaternary) {
436 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
437 }
438 if(leftPair == EOS) { break; }
439 leftPair >>= 16;
440 rightPair >>= 16;
441 }
442 return UCOL_EQUAL;
443 }
444
445 int32_t
446 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
447 const uint8_t *left, int32_t leftLength,
448 const uint8_t *right, int32_t rightLength) {
449 // Keep compareUTF16() and compareUTF8() in sync very closely!
450
451 U_ASSERT((table[0] >> 8) == VERSION);
452 table += (table[0] & 0xff); // skip the header
453 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions()
454 options &= 0xffff; // needed for CollationSettings::getStrength() to work
455
456 // Check for supported characters, fetch mini CEs, and compare primaries.
457 int32_t leftIndex = 0, rightIndex = 0;
458 /**
459 * Single mini CE or a pair.
460 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
461 * If there is only one, then it is in the lower bits, and the upper bits are 0.
462 */
463 uint32_t leftPair = 0, rightPair = 0;
464 // Note: There is no need to assemble the code point.
465 // We only need to look up the table entry for the character,
466 // and nextPair() looks for whether c==0.
467 for(;;) {
468 // We fetch CEs until we get a non-ignorable primary or reach the end.
469 while(leftPair == 0) {
470 if(leftIndex == leftLength) {
471 leftPair = EOS;
472 break;
473 }
474 UChar32 c = left[leftIndex++];
475 uint8_t t;
476 if(c <= 0x7f) {
477 leftPair = primaries[c];
478 if(leftPair != 0) { break; }
479 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
480 return BAIL_OUT_RESULT;
481 }
482 leftPair = table[c];
483 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
484 0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
485 ++leftIndex;
486 c = ((c - 0xc2) << 6) + t;
487 leftPair = primaries[c];
488 if(leftPair != 0) { break; }
489 leftPair = table[c];
490 } else {
491 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
492 }
493 if(leftPair >= MIN_SHORT) {
494 leftPair &= SHORT_PRIMARY_MASK;
495 break;
496 } else if(leftPair > variableTop) {
497 leftPair &= LONG_PRIMARY_MASK;
498 break;
499 } else {
500 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
501 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
502 leftPair = getPrimaries(variableTop, leftPair);
503 }
504 }
505
506 while(rightPair == 0) {
507 if(rightIndex == rightLength) {
508 rightPair = EOS;
509 break;
510 }
511 UChar32 c = right[rightIndex++];
512 uint8_t t;
513 if(c <= 0x7f) {
514 rightPair = primaries[c];
515 if(rightPair != 0) { break; }
516 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
517 return BAIL_OUT_RESULT;
518 }
519 rightPair = table[c];
520 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
521 0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
522 ++rightIndex;
523 c = ((c - 0xc2) << 6) + t;
524 rightPair = primaries[c];
525 if(rightPair != 0) { break; }
526 rightPair = table[c];
527 } else {
528 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
529 }
530 if(rightPair >= MIN_SHORT) {
531 rightPair &= SHORT_PRIMARY_MASK;
532 break;
533 } else if(rightPair > variableTop) {
534 rightPair &= LONG_PRIMARY_MASK;
535 break;
536 } else {
537 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
538 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
539 rightPair = getPrimaries(variableTop, rightPair);
540 }
541 }
542
543 if(leftPair == rightPair) {
544 if(leftPair == EOS) { break; }
545 leftPair = rightPair = 0;
546 continue;
547 }
548 uint32_t leftPrimary = leftPair & 0xffff;
549 uint32_t rightPrimary = rightPair & 0xffff;
550 if(leftPrimary != rightPrimary) {
551 // Return the primary difference.
552 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
553 }
554 if(leftPair == EOS) { break; }
555 leftPair >>= 16;
556 rightPair >>= 16;
557 }
558 // In the following, we need to re-fetch each character because we did not buffer the CEs,
559 // but we know that the string is well-formed and
560 // only contains supported characters and mappings.
561
562 // We might skip the secondary level but continue with the case level
563 // which is turned on separately.
564 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
565 leftIndex = rightIndex = 0;
566 leftPair = rightPair = 0;
567 for(;;) {
568 while(leftPair == 0) {
569 if(leftIndex == leftLength) {
570 leftPair = EOS;
571 break;
572 }
573 UChar32 c = left[leftIndex++];
574 if(c <= 0x7f) {
575 leftPair = table[c];
576 } else if(c <= LATIN_MAX_UTF8_LEAD) {
577 leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
578 } else {
579 leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
580 }
581 if(leftPair >= MIN_SHORT) {
582 leftPair = getSecondariesFromOneShortCE(leftPair);
583 break;
584 } else if(leftPair > variableTop) {
585 leftPair = COMMON_SEC_PLUS_OFFSET;
586 break;
587 } else {
588 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
589 leftPair = getSecondaries(variableTop, leftPair);
590 }
591 }
592
593 while(rightPair == 0) {
594 if(rightIndex == rightLength) {
595 rightPair = EOS;
596 break;
597 }
598 UChar32 c = right[rightIndex++];
599 if(c <= 0x7f) {
600 rightPair = table[c];
601 } else if(c <= LATIN_MAX_UTF8_LEAD) {
602 rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
603 } else {
604 rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
605 }
606 if(rightPair >= MIN_SHORT) {
607 rightPair = getSecondariesFromOneShortCE(rightPair);
608 break;
609 } else if(rightPair > variableTop) {
610 rightPair = COMMON_SEC_PLUS_OFFSET;
611 break;
612 } else {
613 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
614 rightPair = getSecondaries(variableTop, rightPair);
615 }
616 }
617
618 if(leftPair == rightPair) {
619 if(leftPair == EOS) { break; }
620 leftPair = rightPair = 0;
621 continue;
622 }
623 uint32_t leftSecondary = leftPair & 0xffff;
624 uint32_t rightSecondary = rightPair & 0xffff;
625 if(leftSecondary != rightSecondary) {
626 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
627 // Full support for backwards secondary requires backwards contraction matching
628 // and moving backwards between merge separators.
629 return BAIL_OUT_RESULT;
630 }
631 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
632 }
633 if(leftPair == EOS) { break; }
634 leftPair >>= 16;
635 rightPair >>= 16;
636 }
637 }
638
639 if((options & CollationSettings::CASE_LEVEL) != 0) {
640 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
641 leftIndex = rightIndex = 0;
642 leftPair = rightPair = 0;
643 for(;;) {
644 while(leftPair == 0) {
645 if(leftIndex == leftLength) {
646 leftPair = EOS;
647 break;
648 }
649 UChar32 c = left[leftIndex++];
650 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
651 if(leftPair < MIN_LONG) {
652 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
653 }
654 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
655 }
656
657 while(rightPair == 0) {
658 if(rightIndex == rightLength) {
659 rightPair = EOS;
660 break;
661 }
662 UChar32 c = right[rightIndex++];
663 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
664 if(rightPair < MIN_LONG) {
665 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
666 }
667 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
668 }
669
670 if(leftPair == rightPair) {
671 if(leftPair == EOS) { break; }
672 leftPair = rightPair = 0;
673 continue;
674 }
675 uint32_t leftCase = leftPair & 0xffff;
676 uint32_t rightCase = rightPair & 0xffff;
677 if(leftCase != rightCase) {
678 if((options & CollationSettings::UPPER_FIRST) == 0) {
679 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
680 } else {
681 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
682 }
683 }
684 if(leftPair == EOS) { break; }
685 leftPair >>= 16;
686 rightPair >>= 16;
687 }
688 }
689 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
690
691 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
692 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
693
694 leftIndex = rightIndex = 0;
695 leftPair = rightPair = 0;
696 for(;;) {
697 while(leftPair == 0) {
698 if(leftIndex == leftLength) {
699 leftPair = EOS;
700 break;
701 }
702 UChar32 c = left[leftIndex++];
703 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
704 if(leftPair < MIN_LONG) {
705 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
706 }
707 leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
708 }
709
710 while(rightPair == 0) {
711 if(rightIndex == rightLength) {
712 rightPair = EOS;
713 break;
714 }
715 UChar32 c = right[rightIndex++];
716 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
717 if(rightPair < MIN_LONG) {
718 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
719 }
720 rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
721 }
722
723 if(leftPair == rightPair) {
724 if(leftPair == EOS) { break; }
725 leftPair = rightPair = 0;
726 continue;
727 }
728 uint32_t leftTertiary = leftPair & 0xffff;
729 uint32_t rightTertiary = rightPair & 0xffff;
730 if(leftTertiary != rightTertiary) {
731 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
732 // Pass through EOS and MERGE_WEIGHT
733 // and keep real tertiary weights larger than the MERGE_WEIGHT.
734 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
735 if(leftTertiary > MERGE_WEIGHT) {
736 leftTertiary ^= CASE_MASK;
737 }
738 if(rightTertiary > MERGE_WEIGHT) {
739 rightTertiary ^= CASE_MASK;
740 }
741 }
742 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
743 }
744 if(leftPair == EOS) { break; }
745 leftPair >>= 16;
746 rightPair >>= 16;
747 }
748 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
749
750 leftIndex = rightIndex = 0;
751 leftPair = rightPair = 0;
752 for(;;) {
753 while(leftPair == 0) {
754 if(leftIndex == leftLength) {
755 leftPair = EOS;
756 break;
757 }
758 UChar32 c = left[leftIndex++];
759 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
760 if(leftPair < MIN_LONG) {
761 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
762 }
763 leftPair = getQuaternaries(variableTop, leftPair);
764 }
765
766 while(rightPair == 0) {
767 if(rightIndex == rightLength) {
768 rightPair = EOS;
769 break;
770 }
771 UChar32 c = right[rightIndex++];
772 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
773 if(rightPair < MIN_LONG) {
774 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
775 }
776 rightPair = getQuaternaries(variableTop, rightPair);
777 }
778
779 if(leftPair == rightPair) {
780 if(leftPair == EOS) { break; }
781 leftPair = rightPair = 0;
782 continue;
783 }
784 uint32_t leftQuaternary = leftPair & 0xffff;
785 uint32_t rightQuaternary = rightPair & 0xffff;
786 if(leftQuaternary != rightQuaternary) {
787 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
788 }
789 if(leftPair == EOS) { break; }
790 leftPair >>= 16;
791 rightPair >>= 16;
792 }
793 return UCOL_EQUAL;
794 }
795
796 uint32_t
797 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
798 U_ASSERT(c > LATIN_MAX);
799 if(PUNCT_START <= c && c < PUNCT_LIMIT) {
800 return table[c - PUNCT_START + LATIN_LIMIT];
801 } else if(c == 0xfffe) {
802 return MERGE_WEIGHT;
803 } else if(c == 0xffff) {
804 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
805 } else {
806 return BAIL_OUT;
807 }
808 }
809
810 uint32_t
811 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
812 const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
813 // The caller handled ASCII and valid/supported Latin.
814 U_ASSERT(c > 0x7f);
815 int32_t i2 = sIndex + 1;
816 if(i2 < sLength || sLength < 0) {
817 uint8_t t1 = s8[sIndex];
818 uint8_t t2 = s8[i2];
819 sIndex += 2;
820 if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
821 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
822 } else if(c == 0xef && t1 == 0xbf) {
823 if(t2 == 0xbe) {
824 return MERGE_WEIGHT; // U+FFFE
825 } else if(t2 == 0xbf) {
826 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
827 }
828 }
829 }
830 return BAIL_OUT;
831 }
832
833 uint32_t
834 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
835 const uint8_t *s8, int32_t &sIndex) {
836 // The caller handled ASCII.
837 // The string is well-formed and contains only supported characters.
838 U_ASSERT(c > 0x7f);
839 if(c <= LATIN_MAX_UTF8_LEAD) {
840 return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F
841 }
842 uint8_t t2 = s8[sIndex + 1];
843 sIndex += 2;
844 if(c == 0xe2) {
845 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
846 } else if(t2 == 0xbe) {
847 return MERGE_WEIGHT; // U+FFFE
848 } else {
849 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
850 }
851 }
852
853 uint32_t
854 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
855 const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
856 if(ce >= MIN_LONG || ce < CONTRACTION) {
857 return ce; // simple or special mini CE
858 } else if(ce >= EXPANSION) {
859 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
860 return ((uint32_t)table[index + 1] << 16) | table[index];
861 } else /* ce >= CONTRACTION */ {
862 if(c == 0 && sLength < 0) {
863 sLength = sIndex - 1;
864 return EOS;
865 }
866 // Contraction list: Default mapping followed by
867 // 0 or more single-character contraction suffix mappings.
868 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
869 if(sIndex != sLength) {
870 // Read the next character.
871 int32_t c2;
872 int32_t nextIndex = sIndex;
873 if(s16 != NULL) {
874 c2 = s16[nextIndex++];
875 if(c2 > LATIN_MAX) {
876 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
877 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF
878 } else if(c2 == 0xfffe || c2 == 0xffff) {
879 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
880 } else {
881 return BAIL_OUT;
882 }
883 }
884 } else {
885 c2 = s8[nextIndex++];
886 if(c2 > 0x7f) {
887 uint8_t t;
888 if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
889 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
890 c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F
891 ++nextIndex;
892 } else {
893 int32_t i2 = nextIndex + 1;
894 if(i2 < sLength || sLength < 0) {
895 if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
896 0x80 <= (t = s8[i2]) && t <= 0xbf) {
897 c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF
898 } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
899 ((t = s8[i2]) == 0xbe || t == 0xbf)) {
900 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
901 } else {
902 return BAIL_OUT;
903 }
904 } else {
905 return BAIL_OUT;
906 }
907 nextIndex += 2;
908 }
909 }
910 }
911 if(c2 == 0 && sLength < 0) {
912 sLength = sIndex;
913 c2 = -1;
914 }
915 // Look for the next character in the contraction suffix list,
916 // which is in ascending order of single suffix characters.
917 int32_t i = index;
918 int32_t head = table[i]; // first skip the default mapping
919 int32_t x;
920 do {
921 i += head >> CONTR_LENGTH_SHIFT;
922 head = table[i];
923 x = head & CONTR_CHAR_MASK;
924 } while(x < c2);
925 if(x == c2) {
926 index = i;
927 sIndex = nextIndex;
928 }
929 }
930 // Return the CE or CEs for the default or contraction mapping.
931 int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
932 if(length == 1) {
933 return BAIL_OUT;
934 }
935 ce = table[index + 1];
936 if(length == 2) {
937 return ce;
938 } else {
939 return ((uint32_t)table[index + 2] << 16) | ce;
940 }
941 }
942 }
943
944 uint32_t
945 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
946 if(pair <= 0xffff) {
947 // one mini CE
948 if(pair >= MIN_SHORT) {
949 pair = getSecondariesFromOneShortCE(pair);
950 } else if(pair > variableTop) {
951 pair = COMMON_SEC_PLUS_OFFSET;
952 } else if(pair >= MIN_LONG) {
953 pair = 0; // variable
954 }
955 // else special mini CE
956 } else {
957 uint32_t ce = pair & 0xffff;
958 if(ce >= MIN_SHORT) {
959 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
960 } else if(ce > variableTop) {
961 pair = TWO_COMMON_SEC_PLUS_OFFSET;
962 } else {
963 U_ASSERT(ce >= MIN_LONG);
964 pair = 0; // variable
965 }
966 }
967 return pair;
968 }
969
970 uint32_t
971 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
972 // Primary+caseLevel: Ignore case level weights of primary ignorables.
973 // Otherwise: Ignore case level weights of secondary ignorables.
974 // For details see the comments in the CollationCompare class.
975 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
976 if(pair <= 0xffff) {
977 // one mini CE
978 if(pair >= MIN_SHORT) {
979 // A high secondary weight means we really have two CEs,
980 // a primary CE and a secondary CE.
981 uint32_t ce = pair;
982 pair &= CASE_MASK; // explicit weight of primary CE
983 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
984 pair |= LOWER_CASE << 16; // implied weight of secondary CE
985 }
986 } else if(pair > variableTop) {
987 pair = LOWER_CASE;
988 } else if(pair >= MIN_LONG) {
989 pair = 0; // variable
990 }
991 // else special mini CE
992 } else {
993 // two mini CEs, same primary groups, neither expands like above
994 uint32_t ce = pair & 0xffff;
995 if(ce >= MIN_SHORT) {
996 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
997 pair &= CASE_MASK;
998 } else {
999 pair &= TWO_CASES_MASK;
1000 }
1001 } else if(ce > variableTop) {
1002 pair = TWO_LOWER_CASES;
1003 } else {
1004 U_ASSERT(ce >= MIN_LONG);
1005 pair = 0; // variable
1006 }
1007 }
1008 return pair;
1009 }
1010
1011 uint32_t
1012 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
1013 if(pair <= 0xffff) {
1014 // one mini CE
1015 if(pair >= MIN_SHORT) {
1016 // A high secondary weight means we really have two CEs,
1017 // a primary CE and a secondary CE.
1018 uint32_t ce = pair;
1019 if(withCaseBits) {
1020 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1021 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1022 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1023 }
1024 } else {
1025 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1026 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1027 pair |= COMMON_TER_PLUS_OFFSET << 16;
1028 }
1029 }
1030 } else if(pair > variableTop) {
1031 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1032 if(withCaseBits) {
1033 pair |= LOWER_CASE;
1034 }
1035 } else if(pair >= MIN_LONG) {
1036 pair = 0; // variable
1037 }
1038 // else special mini CE
1039 } else {
1040 // two mini CEs, same primary groups, neither expands like above
1041 uint32_t ce = pair & 0xffff;
1042 if(ce >= MIN_SHORT) {
1043 if(withCaseBits) {
1044 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1045 } else {
1046 pair &= TWO_TERTIARIES_MASK;
1047 }
1048 pair += TWO_TER_OFFSETS;
1049 } else if(ce > variableTop) {
1050 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1051 if(withCaseBits) {
1052 pair |= TWO_LOWER_CASES;
1053 }
1054 } else {
1055 U_ASSERT(ce >= MIN_LONG);
1056 pair = 0; // variable
1057 }
1058 }
1059 return pair;
1060 }
1061
1062 uint32_t
1063 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1064 // Return the primary weight of a variable CE,
1065 // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1066 if(pair <= 0xffff) {
1067 // one mini CE
1068 if(pair >= MIN_SHORT) {
1069 // A high secondary weight means we really have two CEs,
1070 // a primary CE and a secondary CE.
1071 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1072 pair = TWO_SHORT_PRIMARIES_MASK;
1073 } else {
1074 pair = SHORT_PRIMARY_MASK;
1075 }
1076 } else if(pair > variableTop) {
1077 pair = SHORT_PRIMARY_MASK;
1078 } else if(pair >= MIN_LONG) {
1079 pair &= LONG_PRIMARY_MASK; // variable
1080 }
1081 // else special mini CE
1082 } else {
1083 // two mini CEs, same primary groups, neither expands like above
1084 uint32_t ce = pair & 0xffff;
1085 if(ce > variableTop) {
1086 pair = TWO_SHORT_PRIMARIES_MASK;
1087 } else {
1088 U_ASSERT(ce >= MIN_LONG);
1089 pair &= TWO_LONG_PRIMARIES_MASK; // variable
1090 }
1091 }
1092 return pair;
1093 }
1094
1095 U_NAMESPACE_END
1096
1097 #endif // !UCONFIG_NO_COLLATION