]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ******************************************************************************* | |
5 | * Copyright (C) 2013-2015, International Business Machines | |
6 | * Corporation and others. All Rights Reserved. | |
7 | ******************************************************************************* | |
8 | * collationfastlatin.cpp | |
9 | * | |
10 | * created on: 2013aug18 | |
11 | * created by: Markus W. Scherer | |
12 | */ | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | ||
16 | #if !UCONFIG_NO_COLLATION | |
17 | ||
18 | #include "unicode/ucol.h" | |
19 | #include "collationdata.h" | |
20 | #include "collationfastlatin.h" | |
21 | #include "collationsettings.h" | |
22 | #include "uassert.h" | |
23 | ||
24 | U_NAMESPACE_BEGIN | |
25 | ||
26 | int32_t | |
27 | CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, | |
28 | uint16_t *primaries, int32_t capacity) { | |
29 | const uint16_t *table = data->fastLatinTable; | |
30 | if(table == NULL) { return -1; } | |
31 | U_ASSERT(capacity == LATIN_LIMIT); | |
32 | if(capacity != LATIN_LIMIT) { return -1; } | |
33 | ||
34 | uint32_t miniVarTop; | |
35 | if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { | |
36 | // No mini primaries are variable, set a variableTop just below the | |
37 | // lowest long mini primary. | |
38 | miniVarTop = MIN_LONG - 1; | |
39 | } else { | |
40 | int32_t headerLength = *table & 0xff; | |
41 | int32_t i = 1 + settings.getMaxVariable(); | |
42 | if(i >= headerLength) { | |
43 | return -1; // variableTop >= digits, should not occur | |
44 | } | |
45 | miniVarTop = table[i]; | |
46 | } | |
47 | ||
48 | UBool digitsAreReordered = FALSE; | |
49 | if(settings.hasReordering()) { | |
50 | uint32_t prevStart = 0; | |
51 | uint32_t beforeDigitStart = 0; | |
52 | uint32_t digitStart = 0; | |
53 | uint32_t afterDigitStart = 0; | |
54 | for(int32_t group = UCOL_REORDER_CODE_FIRST; | |
55 | group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; | |
56 | ++group) { | |
57 | uint32_t start = data->getFirstPrimaryForGroup(group); | |
58 | start = settings.reorder(start); | |
59 | if(group == UCOL_REORDER_CODE_DIGIT) { | |
60 | beforeDigitStart = prevStart; | |
61 | digitStart = start; | |
62 | } else if(start != 0) { | |
63 | if(start < prevStart) { | |
64 | // The permutation affects the groups up to Latin. | |
65 | return -1; | |
66 | } | |
67 | // In the future, there might be a special group between digits & Latin. | |
68 | if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { | |
69 | afterDigitStart = start; | |
70 | } | |
71 | prevStart = start; | |
72 | } | |
73 | } | |
74 | uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); | |
75 | latinStart = settings.reorder(latinStart); | |
76 | if(latinStart < prevStart) { | |
77 | return -1; | |
78 | } | |
79 | if(afterDigitStart == 0) { | |
80 | afterDigitStart = latinStart; | |
81 | } | |
82 | if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { | |
83 | digitsAreReordered = TRUE; | |
84 | } | |
85 | } | |
86 | ||
87 | table += (table[0] & 0xff); // skip the header | |
88 | for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { | |
89 | uint32_t p = table[c]; | |
90 | if(p >= MIN_SHORT) { | |
91 | p &= SHORT_PRIMARY_MASK; | |
92 | } else if(p > miniVarTop) { | |
93 | p &= LONG_PRIMARY_MASK; | |
94 | } else { | |
95 | p = 0; | |
96 | } | |
97 | primaries[c] = (uint16_t)p; | |
98 | } | |
99 | if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { | |
100 | // Bail out for digits. | |
101 | for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } | |
102 | } | |
103 | ||
104 | // Shift the miniVarTop above other options. | |
105 | return ((int32_t)miniVarTop << 16) | settings.options; | |
106 | } | |
107 | ||
// Compares two UTF-16 strings level by level using the fast Latin mini-CE table.
//
// table:     fast Latin table including its header; (table[0] >> 8) is asserted to be
//            VERSION and (table[0] & 0xff) header units are skipped.
// primaries: per-character primaries for c <= LATIN_MAX, as filled in by getOptions();
//            a 0 entry makes the primary loop fall back to table[c].
// options:   value returned by getOptions(): mini variableTop in the upper 16 bits,
//            collation settings options in the lower 16 bits.
// left, leftLength, right, rightLength: the two strings to compare.
//            NOTE(review): a negative length appears to mean NUL-terminated -- nextPair()
//            takes the length by reference and checks sLength < 0; confirm with the caller.
//
// Returns UCOL_LESS / UCOL_EQUAL / UCOL_GREATER, or BAIL_OUT_RESULT when
// - a digit is seen while CollationSettings::NUMERIC is set,
// - nextPair() reports BAIL_OUT during the primary pass, or
// - a secondary difference is found while BACKWARD_SECONDARY is set.
108 | int32_t | |
109 | CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, | |
110 | const UChar *left, int32_t leftLength, | |
111 | const UChar *right, int32_t rightLength) { | |
112 | // This is a modified copy of CollationCompare::compareUpToQuaternary(), | |
113 | // optimized for common Latin text. | |
114 | // Keep them in sync! | |
115 | // Keep compareUTF16() and compareUTF8() in sync very closely! | |
116 | ||
117 | U_ASSERT((table[0] >> 8) == VERSION); | |
118 | table += (table[0] & 0xff); // skip the header | |
119 | uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() | |
120 | options &= 0xffff; // needed for CollationSettings::getStrength() to work | |
121 | ||
122 | // Check for supported characters, fetch mini CEs, and compare primaries. | |
123 | int32_t leftIndex = 0, rightIndex = 0; | |
124 | /** | |
125 | * Single mini CE or a pair. | |
126 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | |
127 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | |
128 | */ | |
129 | uint32_t leftPair = 0, rightPair = 0; | |
// Primary level. This is the only pass that checks for BAIL_OUT;
// the later passes re-read the same text and do not repeat that check.
130 | for(;;) { | |
131 | // We fetch CEs until we get a non-ignorable primary or reach the end. | |
132 | while(leftPair == 0) { | |
133 | if(leftIndex == leftLength) { | |
134 | leftPair = EOS; | |
135 | break; | |
136 | } | |
137 | UChar32 c = left[leftIndex++]; | |
138 | if(c <= LATIN_MAX) { | |
// Fast path: a non-zero precomputed primary needs no further table lookup.
139 | leftPair = primaries[c]; | |
140 | if(leftPair != 0) { break; } | |
141 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
142 | return BAIL_OUT_RESULT; | |
143 | } | |
144 | leftPair = table[c]; | |
145 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
146 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
147 | } else { | |
148 | leftPair = lookup(table, c); | |
149 | } | |
150 | if(leftPair >= MIN_SHORT) { | |
151 | leftPair &= SHORT_PRIMARY_MASK; | |
152 | break; | |
153 | } else if(leftPair > variableTop) { | |
154 | leftPair &= LONG_PRIMARY_MASK; | |
155 | break; | |
156 | } else { | |
// Neither a short nor a long primary above variableTop: let nextPair() resolve the
// mini CE (it may advance leftIndex and may set leftLength), then keep only primaries.
157 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
158 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
159 | leftPair = getPrimaries(variableTop, leftPair); | |
160 | } | |
161 | } | |
162 | ||
// Same fetch logic for the right string.
163 | while(rightPair == 0) { | |
164 | if(rightIndex == rightLength) { | |
165 | rightPair = EOS; | |
166 | break; | |
167 | } | |
168 | UChar32 c = right[rightIndex++]; | |
169 | if(c <= LATIN_MAX) { | |
170 | rightPair = primaries[c]; | |
171 | if(rightPair != 0) { break; } | |
172 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
173 | return BAIL_OUT_RESULT; | |
174 | } | |
175 | rightPair = table[c]; | |
176 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
177 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
178 | } else { | |
179 | rightPair = lookup(table, c); | |
180 | } | |
181 | if(rightPair >= MIN_SHORT) { | |
182 | rightPair &= SHORT_PRIMARY_MASK; | |
183 | break; | |
184 | } else if(rightPair > variableTop) { | |
185 | rightPair &= LONG_PRIMARY_MASK; | |
186 | break; | |
187 | } else { | |
188 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
189 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
190 | rightPair = getPrimaries(variableTop, rightPair); | |
191 | } | |
192 | } | |
193 | ||
// Identical pairs: finished at EOS, otherwise reset both so that the next pairs are fetched.
194 | if(leftPair == rightPair) { | |
195 | if(leftPair == EOS) { break; } | |
196 | leftPair = rightPair = 0; | |
197 | continue; | |
198 | } | |
// Compare the current (lower 16 bits) weights; if they are equal,
// shift the second weight of each pair down and loop again.
199 | uint32_t leftPrimary = leftPair & 0xffff; | |
200 | uint32_t rightPrimary = rightPair & 0xffff; | |
201 | if(leftPrimary != rightPrimary) { | |
202 | // Return the primary difference. | |
203 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | |
204 | } | |
205 | if(leftPair == EOS) { break; } | |
206 | leftPair >>= 16; | |
207 | rightPair >>= 16; | |
208 | } | |
209 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | |
210 | // but we know that the string is well-formed and | |
211 | // only contains supported characters and mappings. | |
212 | ||
213 | // We might skip the secondary level but continue with the case level | |
214 | // which is turned on separately. | |
215 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | |
// Secondary level: restart both strings from the beginning.
216 | leftIndex = rightIndex = 0; | |
217 | leftPair = rightPair = 0; | |
218 | for(;;) { | |
219 | while(leftPair == 0) { | |
220 | if(leftIndex == leftLength) { | |
221 | leftPair = EOS; | |
222 | break; | |
223 | } | |
224 | UChar32 c = left[leftIndex++]; | |
225 | if(c <= LATIN_MAX) { | |
226 | leftPair = table[c]; | |
227 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
228 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
229 | } else { | |
230 | leftPair = lookup(table, c); | |
231 | } | |
232 | if(leftPair >= MIN_SHORT) { | |
233 | leftPair = getSecondariesFromOneShortCE(leftPair); | |
234 | break; | |
235 | } else if(leftPair > variableTop) { | |
236 | leftPair = COMMON_SEC_PLUS_OFFSET; | |
237 | break; | |
238 | } else { | |
239 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
240 | leftPair = getSecondaries(variableTop, leftPair); | |
241 | } | |
242 | } | |
243 | ||
244 | while(rightPair == 0) { | |
245 | if(rightIndex == rightLength) { | |
246 | rightPair = EOS; | |
247 | break; | |
248 | } | |
249 | UChar32 c = right[rightIndex++]; | |
250 | if(c <= LATIN_MAX) { | |
251 | rightPair = table[c]; | |
252 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
253 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
254 | } else { | |
255 | rightPair = lookup(table, c); | |
256 | } | |
257 | if(rightPair >= MIN_SHORT) { | |
258 | rightPair = getSecondariesFromOneShortCE(rightPair); | |
259 | break; | |
260 | } else if(rightPair > variableTop) { | |
261 | rightPair = COMMON_SEC_PLUS_OFFSET; | |
262 | break; | |
263 | } else { | |
264 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
265 | rightPair = getSecondaries(variableTop, rightPair); | |
266 | } | |
267 | } | |
268 | ||
269 | if(leftPair == rightPair) { | |
270 | if(leftPair == EOS) { break; } | |
271 | leftPair = rightPair = 0; | |
272 | continue; | |
273 | } | |
274 | uint32_t leftSecondary = leftPair & 0xffff; | |
275 | uint32_t rightSecondary = rightPair & 0xffff; | |
276 | if(leftSecondary != rightSecondary) { | |
277 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
278 | // Full support for backwards secondary requires backwards contraction matching | |
279 | // and moving backwards between merge separators. | |
280 | return BAIL_OUT_RESULT; | |
281 | } | |
282 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
283 | } | |
284 | if(leftPair == EOS) { break; } | |
285 | leftPair >>= 16; | |
286 | rightPair >>= 16; | |
287 | } | |
288 | } | |
289 | ||
// Case level: runs only when CASE_LEVEL is set, independent of the strength.
290 | if((options & CollationSettings::CASE_LEVEL) != 0) { | |
291 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | |
292 | leftIndex = rightIndex = 0; | |
293 | leftPair = rightPair = 0; | |
294 | for(;;) { | |
295 | while(leftPair == 0) { | |
296 | if(leftIndex == leftLength) { | |
297 | leftPair = EOS; | |
298 | break; | |
299 | } | |
300 | UChar32 c = left[leftIndex++]; | |
301 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
302 | if(leftPair < MIN_LONG) { | |
303 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
304 | } | |
305 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | |
306 | } | |
307 | ||
308 | while(rightPair == 0) { | |
309 | if(rightIndex == rightLength) { | |
310 | rightPair = EOS; | |
311 | break; | |
312 | } | |
313 | UChar32 c = right[rightIndex++]; | |
314 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
315 | if(rightPair < MIN_LONG) { | |
316 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
317 | } | |
318 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | |
319 | } | |
320 | ||
321 | if(leftPair == rightPair) { | |
322 | if(leftPair == EOS) { break; } | |
323 | leftPair = rightPair = 0; | |
324 | continue; | |
325 | } | |
326 | uint32_t leftCase = leftPair & 0xffff; | |
327 | uint32_t rightCase = rightPair & 0xffff; | |
328 | if(leftCase != rightCase) { | |
// UPPER_FIRST inverts the result of the case comparison.
329 | if((options & CollationSettings::UPPER_FIRST) == 0) { | |
330 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | |
331 | } else { | |
332 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | |
333 | } | |
334 | } | |
335 | if(leftPair == EOS) { break; } | |
336 | leftPair >>= 16; | |
337 | rightPair >>= 16; | |
338 | } | |
339 | } | |
// Nothing more to compare unless the strength is above secondary.
340 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | |
341 | ||
342 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | |
343 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | |
344 | ||
// Tertiary level.
345 | leftIndex = rightIndex = 0; | |
346 | leftPair = rightPair = 0; | |
347 | for(;;) { | |
348 | while(leftPair == 0) { | |
349 | if(leftIndex == leftLength) { | |
350 | leftPair = EOS; | |
351 | break; | |
352 | } | |
353 | UChar32 c = left[leftIndex++]; | |
354 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
355 | if(leftPair < MIN_LONG) { | |
356 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
357 | } | |
358 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | |
359 | } | |
360 | ||
361 | while(rightPair == 0) { | |
362 | if(rightIndex == rightLength) { | |
363 | rightPair = EOS; | |
364 | break; | |
365 | } | |
366 | UChar32 c = right[rightIndex++]; | |
367 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
368 | if(rightPair < MIN_LONG) { | |
369 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
370 | } | |
371 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | |
372 | } | |
373 | ||
374 | if(leftPair == rightPair) { | |
375 | if(leftPair == EOS) { break; } | |
376 | leftPair = rightPair = 0; | |
377 | continue; | |
378 | } | |
379 | uint32_t leftTertiary = leftPair & 0xffff; | |
380 | uint32_t rightTertiary = rightPair & 0xffff; | |
381 | if(leftTertiary != rightTertiary) { | |
382 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | |
383 | // Pass through EOS and MERGE_WEIGHT | |
384 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | |
385 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
386 | if(leftTertiary > MERGE_WEIGHT) { | |
387 | leftTertiary ^= CASE_MASK; | |
388 | } | |
389 | if(rightTertiary > MERGE_WEIGHT) { | |
390 | rightTertiary ^= CASE_MASK; | |
391 | } | |
392 | } | |
393 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | |
394 | } | |
395 | if(leftPair == EOS) { break; } | |
396 | leftPair >>= 16; | |
397 | rightPair >>= 16; | |
398 | } | |
// Only a strength above tertiary continues with the quaternary level.
399 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | |
400 | ||
// Quaternary level.
401 | leftIndex = rightIndex = 0; | |
402 | leftPair = rightPair = 0; | |
403 | for(;;) { | |
404 | while(leftPair == 0) { | |
405 | if(leftIndex == leftLength) { | |
406 | leftPair = EOS; | |
407 | break; | |
408 | } | |
409 | UChar32 c = left[leftIndex++]; | |
410 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
411 | if(leftPair < MIN_LONG) { | |
412 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
413 | } | |
414 | leftPair = getQuaternaries(variableTop, leftPair); | |
415 | } | |
416 | ||
417 | while(rightPair == 0) { | |
418 | if(rightIndex == rightLength) { | |
419 | rightPair = EOS; | |
420 | break; | |
421 | } | |
422 | UChar32 c = right[rightIndex++]; | |
423 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
424 | if(rightPair < MIN_LONG) { | |
425 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
426 | } | |
427 | rightPair = getQuaternaries(variableTop, rightPair); | |
428 | } | |
429 | ||
430 | if(leftPair == rightPair) { | |
431 | if(leftPair == EOS) { break; } | |
432 | leftPair = rightPair = 0; | |
433 | continue; | |
434 | } | |
435 | uint32_t leftQuaternary = leftPair & 0xffff; | |
436 | uint32_t rightQuaternary = rightPair & 0xffff; | |
437 | if(leftQuaternary != rightQuaternary) { | |
438 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | |
439 | } | |
440 | if(leftPair == EOS) { break; } | |
441 | leftPair >>= 16; | |
442 | rightPair >>= 16; | |
443 | } | |
444 | return UCOL_EQUAL; | |
445 | } | |
446 | ||
// Compares two UTF-8 strings level by level using the fast Latin mini-CE table.
// UTF-8 counterpart of compareUTF16(); the two are meant to be kept in sync.
//
// table:     fast Latin table including its header; (table[0] >> 8) is asserted to be
//            VERSION and (table[0] & 0xff) header units are skipped.
// primaries: per-character primaries as filled in by getOptions(), indexed by the ASCII
//            byte or by ((lead - 0xc2) << 6) + trail for a two-byte sequence;
//            a 0 entry makes the primary loop fall back to the table.
// options:   value returned by getOptions(): mini variableTop in the upper 16 bits,
//            collation settings options in the lower 16 bits.
// left, leftLength, right, rightLength: the two byte strings to compare.
//            NOTE(review): a negative length appears to mean NUL-terminated -- lookupUTF8()
//            and nextPair() both test sLength < 0; confirm with the caller.
//
// Only the primary pass validates byte sequences (via the trail-byte range check and
// lookupUTF8()); the later passes re-read the same bytes without checks.
//
// Returns UCOL_LESS / UCOL_EQUAL / UCOL_GREATER, or BAIL_OUT_RESULT when
// - an ASCII digit is seen while CollationSettings::NUMERIC is set,
// - nextPair() reports BAIL_OUT during the primary pass, or
// - a secondary difference is found while BACKWARD_SECONDARY is set.
447 | int32_t | |
448 | CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, | |
449 | const uint8_t *left, int32_t leftLength, | |
450 | const uint8_t *right, int32_t rightLength) { | |
451 | // Keep compareUTF16() and compareUTF8() in sync very closely! | |
452 | ||
453 | U_ASSERT((table[0] >> 8) == VERSION); | |
454 | table += (table[0] & 0xff); // skip the header | |
455 | uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() | |
456 | options &= 0xffff; // needed for CollationSettings::getStrength() to work | |
457 | ||
458 | // Check for supported characters, fetch mini CEs, and compare primaries. | |
459 | int32_t leftIndex = 0, rightIndex = 0; | |
460 | /** | |
461 | * Single mini CE or a pair. | |
462 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | |
463 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | |
464 | */ | |
465 | uint32_t leftPair = 0, rightPair = 0; | |
466 | // Note: There is no need to assemble the code point. | |
467 | // We only need to look up the table entry for the character, | |
468 | // and nextPair() looks for whether c==0. | |
// Primary level. This is the only pass that checks for BAIL_OUT.
469 | for(;;) { | |
470 | // We fetch CEs until we get a non-ignorable primary or reach the end. | |
471 | while(leftPair == 0) { | |
472 | if(leftIndex == leftLength) { | |
473 | leftPair = EOS; | |
474 | break; | |
475 | } | |
476 | UChar32 c = left[leftIndex++]; | |
477 | uint8_t t; | |
478 | if(c <= 0x7f) { | |
// ASCII fast path: a non-zero precomputed primary needs no further table lookup.
479 | leftPair = primaries[c]; | |
480 | if(leftPair != 0) { break; } | |
481 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
482 | return BAIL_OUT_RESULT; | |
483 | } | |
484 | leftPair = table[c]; | |
485 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && | |
486 | 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { | |
// Two-byte sequence with a valid trail byte: turn lead and trail into a table index.
487 | ++leftIndex; | |
488 | c = ((c - 0xc2) << 6) + t; | |
489 | leftPair = primaries[c]; | |
490 | if(leftPair != 0) { break; } | |
491 | leftPair = table[c]; | |
492 | } else { | |
// Any other lead byte: lookupUTF8() checks the following bytes itself.
493 | leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); | |
494 | } | |
495 | if(leftPair >= MIN_SHORT) { | |
496 | leftPair &= SHORT_PRIMARY_MASK; | |
497 | break; | |
498 | } else if(leftPair > variableTop) { | |
499 | leftPair &= LONG_PRIMARY_MASK; | |
500 | break; | |
501 | } else { | |
502 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
503 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
504 | leftPair = getPrimaries(variableTop, leftPair); | |
505 | } | |
506 | } | |
507 | ||
// Same fetch logic for the right string.
508 | while(rightPair == 0) { | |
509 | if(rightIndex == rightLength) { | |
510 | rightPair = EOS; | |
511 | break; | |
512 | } | |
513 | UChar32 c = right[rightIndex++]; | |
514 | uint8_t t; | |
515 | if(c <= 0x7f) { | |
516 | rightPair = primaries[c]; | |
517 | if(rightPair != 0) { break; } | |
518 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
519 | return BAIL_OUT_RESULT; | |
520 | } | |
521 | rightPair = table[c]; | |
522 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && | |
523 | 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { | |
524 | ++rightIndex; | |
525 | c = ((c - 0xc2) << 6) + t; | |
526 | rightPair = primaries[c]; | |
527 | if(rightPair != 0) { break; } | |
528 | rightPair = table[c]; | |
529 | } else { | |
530 | rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); | |
531 | } | |
532 | if(rightPair >= MIN_SHORT) { | |
533 | rightPair &= SHORT_PRIMARY_MASK; | |
534 | break; | |
535 | } else if(rightPair > variableTop) { | |
536 | rightPair &= LONG_PRIMARY_MASK; | |
537 | break; | |
538 | } else { | |
539 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
540 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
541 | rightPair = getPrimaries(variableTop, rightPair); | |
542 | } | |
543 | } | |
544 | ||
// Identical pairs: finished at EOS, otherwise reset both so that the next pairs are fetched.
545 | if(leftPair == rightPair) { | |
546 | if(leftPair == EOS) { break; } | |
547 | leftPair = rightPair = 0; | |
548 | continue; | |
549 | } | |
// Compare the current (lower 16 bits) weights; if they are equal,
// shift the second weight of each pair down and loop again.
550 | uint32_t leftPrimary = leftPair & 0xffff; | |
551 | uint32_t rightPrimary = rightPair & 0xffff; | |
552 | if(leftPrimary != rightPrimary) { | |
553 | // Return the primary difference. | |
554 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | |
555 | } | |
556 | if(leftPair == EOS) { break; } | |
557 | leftPair >>= 16; | |
558 | rightPair >>= 16; | |
559 | } | |
560 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | |
561 | // but we know that the string is well-formed and | |
562 | // only contains supported characters and mappings. | |
563 | ||
564 | // We might skip the secondary level but continue with the case level | |
565 | // which is turned on separately. | |
566 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | |
// Secondary level: restart both strings from the beginning.
567 | leftIndex = rightIndex = 0; | |
568 | leftPair = rightPair = 0; | |
569 | for(;;) { | |
570 | while(leftPair == 0) { | |
571 | if(leftIndex == leftLength) { | |
572 | leftPair = EOS; | |
573 | break; | |
574 | } | |
575 | UChar32 c = left[leftIndex++]; | |
576 | if(c <= 0x7f) { | |
577 | leftPair = table[c]; | |
578 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | |
// The trail byte is consumed without a range check here.
579 | leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; | |
580 | } else { | |
581 | leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); | |
582 | } | |
583 | if(leftPair >= MIN_SHORT) { | |
584 | leftPair = getSecondariesFromOneShortCE(leftPair); | |
585 | break; | |
586 | } else if(leftPair > variableTop) { | |
587 | leftPair = COMMON_SEC_PLUS_OFFSET; | |
588 | break; | |
589 | } else { | |
590 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
591 | leftPair = getSecondaries(variableTop, leftPair); | |
592 | } | |
593 | } | |
594 | ||
595 | while(rightPair == 0) { | |
596 | if(rightIndex == rightLength) { | |
597 | rightPair = EOS; | |
598 | break; | |
599 | } | |
600 | UChar32 c = right[rightIndex++]; | |
601 | if(c <= 0x7f) { | |
602 | rightPair = table[c]; | |
603 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | |
604 | rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; | |
605 | } else { | |
606 | rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); | |
607 | } | |
608 | if(rightPair >= MIN_SHORT) { | |
609 | rightPair = getSecondariesFromOneShortCE(rightPair); | |
610 | break; | |
611 | } else if(rightPair > variableTop) { | |
612 | rightPair = COMMON_SEC_PLUS_OFFSET; | |
613 | break; | |
614 | } else { | |
615 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
616 | rightPair = getSecondaries(variableTop, rightPair); | |
617 | } | |
618 | } | |
619 | ||
620 | if(leftPair == rightPair) { | |
621 | if(leftPair == EOS) { break; } | |
622 | leftPair = rightPair = 0; | |
623 | continue; | |
624 | } | |
625 | uint32_t leftSecondary = leftPair & 0xffff; | |
626 | uint32_t rightSecondary = rightPair & 0xffff; | |
627 | if(leftSecondary != rightSecondary) { | |
628 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
629 | // Full support for backwards secondary requires backwards contraction matching | |
630 | // and moving backwards between merge separators. | |
631 | return BAIL_OUT_RESULT; | |
632 | } | |
633 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
634 | } | |
635 | if(leftPair == EOS) { break; } | |
636 | leftPair >>= 16; | |
637 | rightPair >>= 16; | |
638 | } | |
639 | } | |
640 | ||
// Case level: runs only when CASE_LEVEL is set, independent of the strength.
641 | if((options & CollationSettings::CASE_LEVEL) != 0) { | |
642 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | |
643 | leftIndex = rightIndex = 0; | |
644 | leftPair = rightPair = 0; | |
645 | for(;;) { | |
646 | while(leftPair == 0) { | |
647 | if(leftIndex == leftLength) { | |
648 | leftPair = EOS; | |
649 | break; | |
650 | } | |
651 | UChar32 c = left[leftIndex++]; | |
652 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
653 | if(leftPair < MIN_LONG) { | |
654 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
655 | } | |
656 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | |
657 | } | |
658 | ||
659 | while(rightPair == 0) { | |
660 | if(rightIndex == rightLength) { | |
661 | rightPair = EOS; | |
662 | break; | |
663 | } | |
664 | UChar32 c = right[rightIndex++]; | |
665 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
666 | if(rightPair < MIN_LONG) { | |
667 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
668 | } | |
669 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | |
670 | } | |
671 | ||
672 | if(leftPair == rightPair) { | |
673 | if(leftPair == EOS) { break; } | |
674 | leftPair = rightPair = 0; | |
675 | continue; | |
676 | } | |
677 | uint32_t leftCase = leftPair & 0xffff; | |
678 | uint32_t rightCase = rightPair & 0xffff; | |
679 | if(leftCase != rightCase) { | |
// UPPER_FIRST inverts the result of the case comparison.
680 | if((options & CollationSettings::UPPER_FIRST) == 0) { | |
681 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | |
682 | } else { | |
683 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | |
684 | } | |
685 | } | |
686 | if(leftPair == EOS) { break; } | |
687 | leftPair >>= 16; | |
688 | rightPair >>= 16; | |
689 | } | |
690 | } | |
// Nothing more to compare unless the strength is above secondary.
691 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | |
692 | ||
693 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | |
694 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | |
695 | ||
// Tertiary level.
696 | leftIndex = rightIndex = 0; | |
697 | leftPair = rightPair = 0; | |
698 | for(;;) { | |
699 | while(leftPair == 0) { | |
700 | if(leftIndex == leftLength) { | |
701 | leftPair = EOS; | |
702 | break; | |
703 | } | |
704 | UChar32 c = left[leftIndex++]; | |
705 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
706 | if(leftPair < MIN_LONG) { | |
707 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
708 | } | |
709 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | |
710 | } | |
711 | ||
712 | while(rightPair == 0) { | |
713 | if(rightIndex == rightLength) { | |
714 | rightPair = EOS; | |
715 | break; | |
716 | } | |
717 | UChar32 c = right[rightIndex++]; | |
718 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
719 | if(rightPair < MIN_LONG) { | |
720 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
721 | } | |
722 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | |
723 | } | |
724 | ||
725 | if(leftPair == rightPair) { | |
726 | if(leftPair == EOS) { break; } | |
727 | leftPair = rightPair = 0; | |
728 | continue; | |
729 | } | |
730 | uint32_t leftTertiary = leftPair & 0xffff; | |
731 | uint32_t rightTertiary = rightPair & 0xffff; | |
732 | if(leftTertiary != rightTertiary) { | |
733 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | |
734 | // Pass through EOS and MERGE_WEIGHT | |
735 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | |
736 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
737 | if(leftTertiary > MERGE_WEIGHT) { | |
738 | leftTertiary ^= CASE_MASK; | |
739 | } | |
740 | if(rightTertiary > MERGE_WEIGHT) { | |
741 | rightTertiary ^= CASE_MASK; | |
742 | } | |
743 | } | |
744 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | |
745 | } | |
746 | if(leftPair == EOS) { break; } | |
747 | leftPair >>= 16; | |
748 | rightPair >>= 16; | |
749 | } | |
// Only a strength above tertiary continues with the quaternary level.
750 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | |
751 | ||
// Quaternary level.
752 | leftIndex = rightIndex = 0; | |
753 | leftPair = rightPair = 0; | |
754 | for(;;) { | |
755 | while(leftPair == 0) { | |
756 | if(leftIndex == leftLength) { | |
757 | leftPair = EOS; | |
758 | break; | |
759 | } | |
760 | UChar32 c = left[leftIndex++]; | |
761 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
762 | if(leftPair < MIN_LONG) { | |
763 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
764 | } | |
765 | leftPair = getQuaternaries(variableTop, leftPair); | |
766 | } | |
767 | ||
768 | while(rightPair == 0) { | |
769 | if(rightIndex == rightLength) { | |
770 | rightPair = EOS; | |
771 | break; | |
772 | } | |
773 | UChar32 c = right[rightIndex++]; | |
774 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
775 | if(rightPair < MIN_LONG) { | |
776 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
777 | } | |
778 | rightPair = getQuaternaries(variableTop, rightPair); | |
779 | } | |
780 | ||
781 | if(leftPair == rightPair) { | |
782 | if(leftPair == EOS) { break; } | |
783 | leftPair = rightPair = 0; | |
784 | continue; | |
785 | } | |
786 | uint32_t leftQuaternary = leftPair & 0xffff; | |
787 | uint32_t rightQuaternary = rightPair & 0xffff; | |
788 | if(leftQuaternary != rightQuaternary) { | |
789 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | |
790 | } | |
791 | if(leftPair == EOS) { break; } | |
792 | leftPair >>= 16; | |
793 | rightPair >>= 16; | |
794 | } | |
795 | return UCOL_EQUAL; | |
796 | } | |
797 | ||
798 | uint32_t | |
799 | CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { | |
800 | U_ASSERT(c > LATIN_MAX); | |
801 | if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
802 | return table[c - PUNCT_START + LATIN_LIMIT]; | |
803 | } else if(c == 0xfffe) { | |
804 | return MERGE_WEIGHT; | |
805 | } else if(c == 0xffff) { | |
806 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; | |
807 | } else { | |
808 | return BAIL_OUT; | |
809 | } | |
810 | } | |
811 | ||
812 | uint32_t | |
813 | CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, | |
814 | const uint8_t *s8, int32_t &sIndex, int32_t sLength) { | |
815 | // The caller handled ASCII and valid/supported Latin. | |
816 | U_ASSERT(c > 0x7f); | |
817 | int32_t i2 = sIndex + 1; | |
818 | if(i2 < sLength || sLength < 0) { | |
819 | uint8_t t1 = s8[sIndex]; | |
820 | uint8_t t2 = s8[i2]; | |
821 | sIndex += 2; | |
822 | if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { | |
823 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF | |
824 | } else if(c == 0xef && t1 == 0xbf) { | |
825 | if(t2 == 0xbe) { | |
826 | return MERGE_WEIGHT; // U+FFFE | |
827 | } else if(t2 == 0xbf) { | |
828 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF | |
829 | } | |
830 | } | |
831 | } | |
832 | return BAIL_OUT; | |
833 | } | |
834 | ||
835 | uint32_t | |
836 | CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, | |
837 | const uint8_t *s8, int32_t &sIndex) { | |
838 | // The caller handled ASCII. | |
839 | // The string is well-formed and contains only supported characters. | |
840 | U_ASSERT(c > 0x7f); | |
841 | if(c <= LATIN_MAX_UTF8_LEAD) { | |
842 | return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F | |
843 | } | |
844 | uint8_t t2 = s8[sIndex + 1]; | |
845 | sIndex += 2; | |
846 | if(c == 0xe2) { | |
847 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF | |
848 | } else if(t2 == 0xbe) { | |
849 | return MERGE_WEIGHT; // U+FFFE | |
850 | } else { | |
851 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF | |
852 | } | |
853 | } | |
854 | ||
855 | uint32_t | |
856 | CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, | |
857 | const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { | |
858 | if(ce >= MIN_LONG || ce < CONTRACTION) { | |
859 | return ce; // simple or special mini CE | |
860 | } else if(ce >= EXPANSION) { | |
861 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | |
862 | return ((uint32_t)table[index + 1] << 16) | table[index]; | |
863 | } else /* ce >= CONTRACTION */ { | |
864 | if(c == 0 && sLength < 0) { | |
865 | sLength = sIndex - 1; | |
866 | return EOS; | |
867 | } | |
868 | // Contraction list: Default mapping followed by | |
869 | // 0 or more single-character contraction suffix mappings. | |
870 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | |
871 | if(sIndex != sLength) { | |
872 | // Read the next character. | |
873 | int32_t c2; | |
874 | int32_t nextIndex = sIndex; | |
875 | if(s16 != NULL) { | |
876 | c2 = s16[nextIndex++]; | |
877 | if(c2 > LATIN_MAX) { | |
878 | if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { | |
879 | c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF | |
880 | } else if(c2 == 0xfffe || c2 == 0xffff) { | |
881 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. | |
882 | } else { | |
883 | return BAIL_OUT; | |
884 | } | |
885 | } | |
886 | } else { | |
887 | c2 = s8[nextIndex++]; | |
888 | if(c2 > 0x7f) { | |
889 | uint8_t t; | |
890 | if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && | |
891 | 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { | |
892 | c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F | |
893 | ++nextIndex; | |
894 | } else { | |
895 | int32_t i2 = nextIndex + 1; | |
896 | if(i2 < sLength || sLength < 0) { | |
897 | if(c2 == 0xe2 && s8[nextIndex] == 0x80 && | |
898 | 0x80 <= (t = s8[i2]) && t <= 0xbf) { | |
899 | c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF | |
900 | } else if(c2 == 0xef && s8[nextIndex] == 0xbf && | |
901 | ((t = s8[i2]) == 0xbe || t == 0xbf)) { | |
902 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. | |
903 | } else { | |
904 | return BAIL_OUT; | |
905 | } | |
906 | } else { | |
907 | return BAIL_OUT; | |
908 | } | |
909 | nextIndex += 2; | |
910 | } | |
911 | } | |
912 | } | |
913 | if(c2 == 0 && sLength < 0) { | |
914 | sLength = sIndex; | |
915 | c2 = -1; | |
916 | } | |
917 | // Look for the next character in the contraction suffix list, | |
918 | // which is in ascending order of single suffix characters. | |
919 | int32_t i = index; | |
920 | int32_t head = table[i]; // first skip the default mapping | |
921 | int32_t x; | |
922 | do { | |
923 | i += head >> CONTR_LENGTH_SHIFT; | |
924 | head = table[i]; | |
925 | x = head & CONTR_CHAR_MASK; | |
926 | } while(x < c2); | |
927 | if(x == c2) { | |
928 | index = i; | |
929 | sIndex = nextIndex; | |
930 | } | |
931 | } | |
932 | // Return the CE or CEs for the default or contraction mapping. | |
933 | int32_t length = table[index] >> CONTR_LENGTH_SHIFT; | |
934 | if(length == 1) { | |
935 | return BAIL_OUT; | |
936 | } | |
937 | ce = table[index + 1]; | |
938 | if(length == 2) { | |
939 | return ce; | |
940 | } else { | |
941 | return ((uint32_t)table[index + 2] << 16) | ce; | |
942 | } | |
943 | } | |
944 | } | |
945 | ||
946 | uint32_t | |
947 | CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { | |
948 | if(pair <= 0xffff) { | |
949 | // one mini CE | |
950 | if(pair >= MIN_SHORT) { | |
951 | pair = getSecondariesFromOneShortCE(pair); | |
952 | } else if(pair > variableTop) { | |
953 | pair = COMMON_SEC_PLUS_OFFSET; | |
954 | } else if(pair >= MIN_LONG) { | |
955 | pair = 0; // variable | |
956 | } | |
957 | // else special mini CE | |
958 | } else { | |
959 | uint32_t ce = pair & 0xffff; | |
960 | if(ce >= MIN_SHORT) { | |
961 | pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; | |
962 | } else if(ce > variableTop) { | |
963 | pair = TWO_COMMON_SEC_PLUS_OFFSET; | |
964 | } else { | |
965 | U_ASSERT(ce >= MIN_LONG); | |
966 | pair = 0; // variable | |
967 | } | |
968 | } | |
969 | return pair; | |
970 | } | |
971 | ||
972 | uint32_t | |
973 | CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { | |
974 | // Primary+caseLevel: Ignore case level weights of primary ignorables. | |
975 | // Otherwise: Ignore case level weights of secondary ignorables. | |
976 | // For details see the comments in the CollationCompare class. | |
977 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
978 | if(pair <= 0xffff) { | |
979 | // one mini CE | |
980 | if(pair >= MIN_SHORT) { | |
981 | // A high secondary weight means we really have two CEs, | |
982 | // a primary CE and a secondary CE. | |
983 | uint32_t ce = pair; | |
984 | pair &= CASE_MASK; // explicit weight of primary CE | |
985 | if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
986 | pair |= LOWER_CASE << 16; // implied weight of secondary CE | |
987 | } | |
988 | } else if(pair > variableTop) { | |
989 | pair = LOWER_CASE; | |
990 | } else if(pair >= MIN_LONG) { | |
991 | pair = 0; // variable | |
992 | } | |
993 | // else special mini CE | |
994 | } else { | |
995 | // two mini CEs, same primary groups, neither expands like above | |
996 | uint32_t ce = pair & 0xffff; | |
997 | if(ce >= MIN_SHORT) { | |
998 | if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { | |
999 | pair &= CASE_MASK; | |
1000 | } else { | |
1001 | pair &= TWO_CASES_MASK; | |
1002 | } | |
1003 | } else if(ce > variableTop) { | |
1004 | pair = TWO_LOWER_CASES; | |
1005 | } else { | |
1006 | U_ASSERT(ce >= MIN_LONG); | |
1007 | pair = 0; // variable | |
1008 | } | |
1009 | } | |
1010 | return pair; | |
1011 | } | |
1012 | ||
1013 | uint32_t | |
1014 | CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { | |
1015 | if(pair <= 0xffff) { | |
1016 | // one mini CE | |
1017 | if(pair >= MIN_SHORT) { | |
1018 | // A high secondary weight means we really have two CEs, | |
1019 | // a primary CE and a secondary CE. | |
1020 | uint32_t ce = pair; | |
1021 | if(withCaseBits) { | |
1022 | pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; | |
1023 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1024 | pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; | |
1025 | } | |
1026 | } else { | |
1027 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | |
1028 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1029 | pair |= COMMON_TER_PLUS_OFFSET << 16; | |
1030 | } | |
1031 | } | |
1032 | } else if(pair > variableTop) { | |
1033 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | |
1034 | if(withCaseBits) { | |
1035 | pair |= LOWER_CASE; | |
1036 | } | |
1037 | } else if(pair >= MIN_LONG) { | |
1038 | pair = 0; // variable | |
1039 | } | |
1040 | // else special mini CE | |
1041 | } else { | |
1042 | // two mini CEs, same primary groups, neither expands like above | |
1043 | uint32_t ce = pair & 0xffff; | |
1044 | if(ce >= MIN_SHORT) { | |
1045 | if(withCaseBits) { | |
1046 | pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; | |
1047 | } else { | |
1048 | pair &= TWO_TERTIARIES_MASK; | |
1049 | } | |
1050 | pair += TWO_TER_OFFSETS; | |
1051 | } else if(ce > variableTop) { | |
1052 | pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; | |
1053 | if(withCaseBits) { | |
1054 | pair |= TWO_LOWER_CASES; | |
1055 | } | |
1056 | } else { | |
1057 | U_ASSERT(ce >= MIN_LONG); | |
1058 | pair = 0; // variable | |
1059 | } | |
1060 | } | |
1061 | return pair; | |
1062 | } | |
1063 | ||
1064 | uint32_t | |
1065 | CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { | |
1066 | // Return the primary weight of a variable CE, | |
1067 | // or the maximum primary weight for a non-variable, not-completely-ignorable CE. | |
1068 | if(pair <= 0xffff) { | |
1069 | // one mini CE | |
1070 | if(pair >= MIN_SHORT) { | |
1071 | // A high secondary weight means we really have two CEs, | |
1072 | // a primary CE and a secondary CE. | |
1073 | if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1074 | pair = TWO_SHORT_PRIMARIES_MASK; | |
1075 | } else { | |
1076 | pair = SHORT_PRIMARY_MASK; | |
1077 | } | |
1078 | } else if(pair > variableTop) { | |
1079 | pair = SHORT_PRIMARY_MASK; | |
1080 | } else if(pair >= MIN_LONG) { | |
1081 | pair &= LONG_PRIMARY_MASK; // variable | |
1082 | } | |
1083 | // else special mini CE | |
1084 | } else { | |
1085 | // two mini CEs, same primary groups, neither expands like above | |
1086 | uint32_t ce = pair & 0xffff; | |
1087 | if(ce > variableTop) { | |
1088 | pair = TWO_SHORT_PRIMARIES_MASK; | |
1089 | } else { | |
1090 | U_ASSERT(ce >= MIN_LONG); | |
1091 | pair &= TWO_LONG_PRIMARIES_MASK; // variable | |
1092 | } | |
1093 | } | |
1094 | return pair; | |
1095 | } | |
1096 | ||
1097 | U_NAMESPACE_END | |
1098 | ||
1099 | #endif // !UCONFIG_NO_COLLATION |