]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
b331163b | 3 | * Copyright (C) 2013-2015, International Business Machines |
57a6839d A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * collationfastlatin.cpp | |
7 | * | |
8 | * created on: 2013aug18 | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | ||
14 | #if !UCONFIG_NO_COLLATION | |
15 | ||
16 | #include "unicode/ucol.h" | |
17 | #include "collationdata.h" | |
18 | #include "collationfastlatin.h" | |
19 | #include "collationsettings.h" | |
20 | #include "putilimp.h" // U_ALIGN_CODE | |
21 | #include "uassert.h" | |
22 | ||
23 | U_NAMESPACE_BEGIN | |
24 | ||
25 | int32_t | |
26 | CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, | |
27 | uint16_t *primaries, int32_t capacity) { | |
28 | const uint16_t *table = data->fastLatinTable; | |
29 | if(table == NULL) { return -1; } | |
30 | U_ASSERT(capacity == LATIN_LIMIT); | |
31 | if(capacity != LATIN_LIMIT) { return -1; } | |
32 | ||
33 | uint32_t miniVarTop; | |
34 | if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { | |
35 | // No mini primaries are variable, set a variableTop just below the | |
36 | // lowest long mini primary. | |
37 | miniVarTop = MIN_LONG - 1; | |
38 | } else { | |
57a6839d | 39 | int32_t headerLength = *table & 0xff; |
b331163b A |
40 | int32_t i = 1 + settings.getMaxVariable(); |
41 | if(i >= headerLength) { | |
57a6839d A |
42 | return -1; // variableTop >= digits, should not occur |
43 | } | |
b331163b | 44 | miniVarTop = table[i]; |
57a6839d A |
45 | } |
46 | ||
b331163b A |
47 | UBool digitsAreReordered = FALSE; |
48 | if(settings.hasReordering()) { | |
49 | uint32_t prevStart = 0; | |
50 | uint32_t beforeDigitStart = 0; | |
51 | uint32_t digitStart = 0; | |
52 | uint32_t afterDigitStart = 0; | |
53 | for(int32_t group = UCOL_REORDER_CODE_FIRST; | |
54 | group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; | |
55 | ++group) { | |
56 | uint32_t start = data->getFirstPrimaryForGroup(group); | |
57 | start = settings.reorder(start); | |
58 | if(group == UCOL_REORDER_CODE_DIGIT) { | |
59 | beforeDigitStart = prevStart; | |
60 | digitStart = start; | |
61 | } else if(start != 0) { | |
62 | if(start < prevStart) { | |
63 | // The permutation affects the groups up to Latin. | |
64 | return -1; | |
65 | } | |
66 | // In the future, there might be a special group between digits & Latin. | |
67 | if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { | |
68 | afterDigitStart = start; | |
69 | } | |
70 | prevStart = start; | |
57a6839d | 71 | } |
b331163b A |
72 | } |
73 | uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); | |
74 | latinStart = settings.reorder(latinStart); | |
75 | if(latinStart < prevStart) { | |
76 | return -1; | |
77 | } | |
78 | if(afterDigitStart == 0) { | |
79 | afterDigitStart = latinStart; | |
80 | } | |
81 | if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { | |
82 | digitsAreReordered = TRUE; | |
57a6839d A |
83 | } |
84 | } | |
85 | ||
86 | table += (table[0] & 0xff); // skip the header | |
87 | for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { | |
88 | uint32_t p = table[c]; | |
89 | if(p >= MIN_SHORT) { | |
90 | p &= SHORT_PRIMARY_MASK; | |
91 | } else if(p > miniVarTop) { | |
92 | p &= LONG_PRIMARY_MASK; | |
93 | } else { | |
94 | p = 0; | |
95 | } | |
96 | primaries[c] = (uint16_t)p; | |
97 | } | |
b331163b | 98 | if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { |
57a6839d A |
99 | // Bail out for digits. |
100 | for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } | |
101 | } | |
102 | ||
103 | // Shift the miniVarTop above other options. | |
104 | return ((int32_t)miniVarTop << 16) | settings.options; | |
105 | } | |
106 | ||
107 | int32_t | |
108 | CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, | |
109 | const UChar *left, int32_t leftLength, | |
110 | const UChar *right, int32_t rightLength) { | |
111 | // This is a modified copy of CollationCompare::compareUpToQuaternary(), | |
112 | // optimized for common Latin text. | |
113 | // Keep them in sync! | |
114 | // Keep compareUTF16() and compareUTF8() in sync very closely! | |
115 | ||
116 | U_ASSERT((table[0] >> 8) == VERSION); | |
117 | table += (table[0] & 0xff); // skip the header | |
118 | uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() | |
119 | options &= 0xffff; // needed for CollationSettings::getStrength() to work | |
120 | ||
121 | // Check for supported characters, fetch mini CEs, and compare primaries. | |
122 | U_ALIGN_CODE(16); | |
123 | int32_t leftIndex = 0, rightIndex = 0; | |
124 | /** | |
125 | * Single mini CE or a pair. | |
126 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | |
127 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | |
128 | */ | |
129 | uint32_t leftPair = 0, rightPair = 0; | |
130 | for(;;) { | |
131 | // We fetch CEs until we get a non-ignorable primary or reach the end. | |
132 | while(leftPair == 0) { | |
133 | if(leftIndex == leftLength) { | |
134 | leftPair = EOS; | |
135 | break; | |
136 | } | |
137 | UChar32 c = left[leftIndex++]; | |
138 | if(c <= LATIN_MAX) { | |
139 | leftPair = primaries[c]; | |
140 | if(leftPair != 0) { break; } | |
141 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
142 | return BAIL_OUT_RESULT; | |
143 | } | |
144 | leftPair = table[c]; | |
145 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
146 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
147 | } else { | |
148 | leftPair = lookup(table, c); | |
149 | } | |
150 | if(leftPair >= MIN_SHORT) { | |
151 | leftPair &= SHORT_PRIMARY_MASK; | |
152 | break; | |
153 | } else if(leftPair > variableTop) { | |
154 | leftPair &= LONG_PRIMARY_MASK; | |
155 | break; | |
156 | } else { | |
157 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
158 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
159 | leftPair = getPrimaries(variableTop, leftPair); | |
160 | } | |
161 | } | |
162 | ||
163 | while(rightPair == 0) { | |
164 | if(rightIndex == rightLength) { | |
165 | rightPair = EOS; | |
166 | break; | |
167 | } | |
168 | UChar32 c = right[rightIndex++]; | |
169 | if(c <= LATIN_MAX) { | |
170 | rightPair = primaries[c]; | |
171 | if(rightPair != 0) { break; } | |
172 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
173 | return BAIL_OUT_RESULT; | |
174 | } | |
175 | rightPair = table[c]; | |
176 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
177 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
178 | } else { | |
179 | rightPair = lookup(table, c); | |
180 | } | |
181 | if(rightPair >= MIN_SHORT) { | |
182 | rightPair &= SHORT_PRIMARY_MASK; | |
183 | break; | |
184 | } else if(rightPair > variableTop) { | |
185 | rightPair &= LONG_PRIMARY_MASK; | |
186 | break; | |
187 | } else { | |
188 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
189 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
190 | rightPair = getPrimaries(variableTop, rightPair); | |
191 | } | |
192 | } | |
193 | ||
194 | if(leftPair == rightPair) { | |
195 | if(leftPair == EOS) { break; } | |
196 | leftPair = rightPair = 0; | |
197 | continue; | |
198 | } | |
199 | uint32_t leftPrimary = leftPair & 0xffff; | |
200 | uint32_t rightPrimary = rightPair & 0xffff; | |
201 | if(leftPrimary != rightPrimary) { | |
202 | // Return the primary difference. | |
203 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | |
204 | } | |
205 | if(leftPair == EOS) { break; } | |
206 | leftPair >>= 16; | |
207 | rightPair >>= 16; | |
208 | } | |
209 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | |
210 | // but we know that the string is well-formed and | |
211 | // only contains supported characters and mappings. | |
212 | ||
213 | // We might skip the secondary level but continue with the case level | |
214 | // which is turned on separately. | |
215 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | |
216 | leftIndex = rightIndex = 0; | |
217 | leftPair = rightPair = 0; | |
218 | for(;;) { | |
219 | while(leftPair == 0) { | |
220 | if(leftIndex == leftLength) { | |
221 | leftPair = EOS; | |
222 | break; | |
223 | } | |
224 | UChar32 c = left[leftIndex++]; | |
225 | if(c <= LATIN_MAX) { | |
226 | leftPair = table[c]; | |
227 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
228 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
229 | } else { | |
230 | leftPair = lookup(table, c); | |
231 | } | |
232 | if(leftPair >= MIN_SHORT) { | |
233 | leftPair = getSecondariesFromOneShortCE(leftPair); | |
234 | break; | |
235 | } else if(leftPair > variableTop) { | |
236 | leftPair = COMMON_SEC_PLUS_OFFSET; | |
237 | break; | |
238 | } else { | |
239 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
240 | leftPair = getSecondaries(variableTop, leftPair); | |
241 | } | |
242 | } | |
243 | ||
244 | while(rightPair == 0) { | |
245 | if(rightIndex == rightLength) { | |
246 | rightPair = EOS; | |
247 | break; | |
248 | } | |
249 | UChar32 c = right[rightIndex++]; | |
250 | if(c <= LATIN_MAX) { | |
251 | rightPair = table[c]; | |
252 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
253 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
254 | } else { | |
255 | rightPair = lookup(table, c); | |
256 | } | |
257 | if(rightPair >= MIN_SHORT) { | |
258 | rightPair = getSecondariesFromOneShortCE(rightPair); | |
259 | break; | |
260 | } else if(rightPair > variableTop) { | |
261 | rightPair = COMMON_SEC_PLUS_OFFSET; | |
262 | break; | |
263 | } else { | |
264 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
265 | rightPair = getSecondaries(variableTop, rightPair); | |
266 | } | |
267 | } | |
268 | ||
269 | if(leftPair == rightPair) { | |
270 | if(leftPair == EOS) { break; } | |
271 | leftPair = rightPair = 0; | |
272 | continue; | |
273 | } | |
274 | uint32_t leftSecondary = leftPair & 0xffff; | |
275 | uint32_t rightSecondary = rightPair & 0xffff; | |
276 | if(leftSecondary != rightSecondary) { | |
277 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
278 | // Full support for backwards secondary requires backwards contraction matching | |
279 | // and moving backwards between merge separators. | |
280 | return BAIL_OUT_RESULT; | |
281 | } | |
282 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
283 | } | |
284 | if(leftPair == EOS) { break; } | |
285 | leftPair >>= 16; | |
286 | rightPair >>= 16; | |
287 | } | |
288 | } | |
289 | ||
290 | if((options & CollationSettings::CASE_LEVEL) != 0) { | |
291 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | |
292 | leftIndex = rightIndex = 0; | |
293 | leftPair = rightPair = 0; | |
294 | for(;;) { | |
295 | while(leftPair == 0) { | |
296 | if(leftIndex == leftLength) { | |
297 | leftPair = EOS; | |
298 | break; | |
299 | } | |
300 | UChar32 c = left[leftIndex++]; | |
301 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
302 | if(leftPair < MIN_LONG) { | |
303 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
304 | } | |
305 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | |
306 | } | |
307 | ||
308 | while(rightPair == 0) { | |
309 | if(rightIndex == rightLength) { | |
310 | rightPair = EOS; | |
311 | break; | |
312 | } | |
313 | UChar32 c = right[rightIndex++]; | |
314 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
315 | if(rightPair < MIN_LONG) { | |
316 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
317 | } | |
318 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | |
319 | } | |
320 | ||
321 | if(leftPair == rightPair) { | |
322 | if(leftPair == EOS) { break; } | |
323 | leftPair = rightPair = 0; | |
324 | continue; | |
325 | } | |
326 | uint32_t leftCase = leftPair & 0xffff; | |
327 | uint32_t rightCase = rightPair & 0xffff; | |
328 | if(leftCase != rightCase) { | |
329 | if((options & CollationSettings::UPPER_FIRST) == 0) { | |
330 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | |
331 | } else { | |
332 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | |
333 | } | |
334 | } | |
335 | if(leftPair == EOS) { break; } | |
336 | leftPair >>= 16; | |
337 | rightPair >>= 16; | |
338 | } | |
339 | } | |
340 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | |
341 | ||
342 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | |
343 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | |
344 | ||
345 | leftIndex = rightIndex = 0; | |
346 | leftPair = rightPair = 0; | |
347 | for(;;) { | |
348 | while(leftPair == 0) { | |
349 | if(leftIndex == leftLength) { | |
350 | leftPair = EOS; | |
351 | break; | |
352 | } | |
353 | UChar32 c = left[leftIndex++]; | |
354 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
355 | if(leftPair < MIN_LONG) { | |
356 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
357 | } | |
358 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | |
359 | } | |
360 | ||
361 | while(rightPair == 0) { | |
362 | if(rightIndex == rightLength) { | |
363 | rightPair = EOS; | |
364 | break; | |
365 | } | |
366 | UChar32 c = right[rightIndex++]; | |
367 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
368 | if(rightPair < MIN_LONG) { | |
369 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
370 | } | |
371 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | |
372 | } | |
373 | ||
374 | if(leftPair == rightPair) { | |
375 | if(leftPair == EOS) { break; } | |
376 | leftPair = rightPair = 0; | |
377 | continue; | |
378 | } | |
379 | uint32_t leftTertiary = leftPair & 0xffff; | |
380 | uint32_t rightTertiary = rightPair & 0xffff; | |
381 | if(leftTertiary != rightTertiary) { | |
382 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | |
383 | // Pass through EOS and MERGE_WEIGHT | |
384 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | |
385 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
386 | if(leftTertiary > MERGE_WEIGHT) { | |
387 | leftTertiary ^= CASE_MASK; | |
388 | } | |
389 | if(rightTertiary > MERGE_WEIGHT) { | |
390 | rightTertiary ^= CASE_MASK; | |
391 | } | |
392 | } | |
393 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | |
394 | } | |
395 | if(leftPair == EOS) { break; } | |
396 | leftPair >>= 16; | |
397 | rightPair >>= 16; | |
398 | } | |
399 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | |
400 | ||
401 | leftIndex = rightIndex = 0; | |
402 | leftPair = rightPair = 0; | |
403 | for(;;) { | |
404 | while(leftPair == 0) { | |
405 | if(leftIndex == leftLength) { | |
406 | leftPair = EOS; | |
407 | break; | |
408 | } | |
409 | UChar32 c = left[leftIndex++]; | |
410 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
411 | if(leftPair < MIN_LONG) { | |
412 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
413 | } | |
414 | leftPair = getQuaternaries(variableTop, leftPair); | |
415 | } | |
416 | ||
417 | while(rightPair == 0) { | |
418 | if(rightIndex == rightLength) { | |
419 | rightPair = EOS; | |
420 | break; | |
421 | } | |
422 | UChar32 c = right[rightIndex++]; | |
423 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
424 | if(rightPair < MIN_LONG) { | |
425 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
426 | } | |
427 | rightPair = getQuaternaries(variableTop, rightPair); | |
428 | } | |
429 | ||
430 | if(leftPair == rightPair) { | |
431 | if(leftPair == EOS) { break; } | |
432 | leftPair = rightPair = 0; | |
433 | continue; | |
434 | } | |
435 | uint32_t leftQuaternary = leftPair & 0xffff; | |
436 | uint32_t rightQuaternary = rightPair & 0xffff; | |
437 | if(leftQuaternary != rightQuaternary) { | |
438 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | |
439 | } | |
440 | if(leftPair == EOS) { break; } | |
441 | leftPair >>= 16; | |
442 | rightPair >>= 16; | |
443 | } | |
444 | return UCOL_EQUAL; | |
445 | } | |
446 | ||
447 | int32_t | |
448 | CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, | |
449 | const uint8_t *left, int32_t leftLength, | |
450 | const uint8_t *right, int32_t rightLength) { | |
451 | // Keep compareUTF16() and compareUTF8() in sync very closely! | |
452 | ||
453 | U_ASSERT((table[0] >> 8) == VERSION); | |
454 | table += (table[0] & 0xff); // skip the header | |
455 | uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() | |
456 | options &= 0xffff; // needed for CollationSettings::getStrength() to work | |
457 | ||
458 | // Check for supported characters, fetch mini CEs, and compare primaries. | |
459 | U_ALIGN_CODE(16); | |
460 | int32_t leftIndex = 0, rightIndex = 0; | |
461 | /** | |
462 | * Single mini CE or a pair. | |
463 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | |
464 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | |
465 | */ | |
466 | uint32_t leftPair = 0, rightPair = 0; | |
467 | // Note: There is no need to assemble the code point. | |
468 | // We only need to look up the table entry for the character, | |
469 | // and nextPair() looks for whether c==0. | |
470 | for(;;) { | |
471 | // We fetch CEs until we get a non-ignorable primary or reach the end. | |
472 | while(leftPair == 0) { | |
473 | if(leftIndex == leftLength) { | |
474 | leftPair = EOS; | |
475 | break; | |
476 | } | |
477 | UChar32 c = left[leftIndex++]; | |
478 | uint8_t t; | |
479 | if(c <= 0x7f) { | |
480 | leftPair = primaries[c]; | |
481 | if(leftPair != 0) { break; } | |
482 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
483 | return BAIL_OUT_RESULT; | |
484 | } | |
485 | leftPair = table[c]; | |
486 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && | |
487 | 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { | |
488 | ++leftIndex; | |
489 | c = ((c - 0xc2) << 6) + t; | |
490 | leftPair = primaries[c]; | |
491 | if(leftPair != 0) { break; } | |
492 | leftPair = table[c]; | |
493 | } else { | |
494 | leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); | |
495 | } | |
496 | if(leftPair >= MIN_SHORT) { | |
497 | leftPair &= SHORT_PRIMARY_MASK; | |
498 | break; | |
499 | } else if(leftPair > variableTop) { | |
500 | leftPair &= LONG_PRIMARY_MASK; | |
501 | break; | |
502 | } else { | |
503 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
504 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
505 | leftPair = getPrimaries(variableTop, leftPair); | |
506 | } | |
507 | } | |
508 | ||
509 | while(rightPair == 0) { | |
510 | if(rightIndex == rightLength) { | |
511 | rightPair = EOS; | |
512 | break; | |
513 | } | |
514 | UChar32 c = right[rightIndex++]; | |
515 | uint8_t t; | |
516 | if(c <= 0x7f) { | |
517 | rightPair = primaries[c]; | |
518 | if(rightPair != 0) { break; } | |
519 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
520 | return BAIL_OUT_RESULT; | |
521 | } | |
522 | rightPair = table[c]; | |
523 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && | |
524 | 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { | |
525 | ++rightIndex; | |
526 | c = ((c - 0xc2) << 6) + t; | |
527 | rightPair = primaries[c]; | |
528 | if(rightPair != 0) { break; } | |
529 | rightPair = table[c]; | |
530 | } else { | |
531 | rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); | |
532 | } | |
533 | if(rightPair >= MIN_SHORT) { | |
534 | rightPair &= SHORT_PRIMARY_MASK; | |
535 | break; | |
536 | } else if(rightPair > variableTop) { | |
537 | rightPair &= LONG_PRIMARY_MASK; | |
538 | break; | |
539 | } else { | |
540 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
541 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
542 | rightPair = getPrimaries(variableTop, rightPair); | |
543 | } | |
544 | } | |
545 | ||
546 | if(leftPair == rightPair) { | |
547 | if(leftPair == EOS) { break; } | |
548 | leftPair = rightPair = 0; | |
549 | continue; | |
550 | } | |
551 | uint32_t leftPrimary = leftPair & 0xffff; | |
552 | uint32_t rightPrimary = rightPair & 0xffff; | |
553 | if(leftPrimary != rightPrimary) { | |
554 | // Return the primary difference. | |
555 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | |
556 | } | |
557 | if(leftPair == EOS) { break; } | |
558 | leftPair >>= 16; | |
559 | rightPair >>= 16; | |
560 | } | |
561 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | |
562 | // but we know that the string is well-formed and | |
563 | // only contains supported characters and mappings. | |
564 | ||
565 | // We might skip the secondary level but continue with the case level | |
566 | // which is turned on separately. | |
567 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | |
568 | leftIndex = rightIndex = 0; | |
569 | leftPair = rightPair = 0; | |
570 | for(;;) { | |
571 | while(leftPair == 0) { | |
572 | if(leftIndex == leftLength) { | |
573 | leftPair = EOS; | |
574 | break; | |
575 | } | |
576 | UChar32 c = left[leftIndex++]; | |
577 | if(c <= 0x7f) { | |
578 | leftPair = table[c]; | |
579 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | |
580 | leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; | |
581 | } else { | |
582 | leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); | |
583 | } | |
584 | if(leftPair >= MIN_SHORT) { | |
585 | leftPair = getSecondariesFromOneShortCE(leftPair); | |
586 | break; | |
587 | } else if(leftPair > variableTop) { | |
588 | leftPair = COMMON_SEC_PLUS_OFFSET; | |
589 | break; | |
590 | } else { | |
591 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
592 | leftPair = getSecondaries(variableTop, leftPair); | |
593 | } | |
594 | } | |
595 | ||
596 | while(rightPair == 0) { | |
597 | if(rightIndex == rightLength) { | |
598 | rightPair = EOS; | |
599 | break; | |
600 | } | |
601 | UChar32 c = right[rightIndex++]; | |
602 | if(c <= 0x7f) { | |
603 | rightPair = table[c]; | |
604 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | |
605 | rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; | |
606 | } else { | |
607 | rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); | |
608 | } | |
609 | if(rightPair >= MIN_SHORT) { | |
610 | rightPair = getSecondariesFromOneShortCE(rightPair); | |
611 | break; | |
612 | } else if(rightPair > variableTop) { | |
613 | rightPair = COMMON_SEC_PLUS_OFFSET; | |
614 | break; | |
615 | } else { | |
616 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
617 | rightPair = getSecondaries(variableTop, rightPair); | |
618 | } | |
619 | } | |
620 | ||
621 | if(leftPair == rightPair) { | |
622 | if(leftPair == EOS) { break; } | |
623 | leftPair = rightPair = 0; | |
624 | continue; | |
625 | } | |
626 | uint32_t leftSecondary = leftPair & 0xffff; | |
627 | uint32_t rightSecondary = rightPair & 0xffff; | |
628 | if(leftSecondary != rightSecondary) { | |
629 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
630 | // Full support for backwards secondary requires backwards contraction matching | |
631 | // and moving backwards between merge separators. | |
632 | return BAIL_OUT_RESULT; | |
633 | } | |
634 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
635 | } | |
636 | if(leftPair == EOS) { break; } | |
637 | leftPair >>= 16; | |
638 | rightPair >>= 16; | |
639 | } | |
640 | } | |
641 | ||
642 | if((options & CollationSettings::CASE_LEVEL) != 0) { | |
643 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | |
644 | leftIndex = rightIndex = 0; | |
645 | leftPair = rightPair = 0; | |
646 | for(;;) { | |
647 | while(leftPair == 0) { | |
648 | if(leftIndex == leftLength) { | |
649 | leftPair = EOS; | |
650 | break; | |
651 | } | |
652 | UChar32 c = left[leftIndex++]; | |
653 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
654 | if(leftPair < MIN_LONG) { | |
655 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
656 | } | |
657 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | |
658 | } | |
659 | ||
660 | while(rightPair == 0) { | |
661 | if(rightIndex == rightLength) { | |
662 | rightPair = EOS; | |
663 | break; | |
664 | } | |
665 | UChar32 c = right[rightIndex++]; | |
666 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
667 | if(rightPair < MIN_LONG) { | |
668 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
669 | } | |
670 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | |
671 | } | |
672 | ||
673 | if(leftPair == rightPair) { | |
674 | if(leftPair == EOS) { break; } | |
675 | leftPair = rightPair = 0; | |
676 | continue; | |
677 | } | |
678 | uint32_t leftCase = leftPair & 0xffff; | |
679 | uint32_t rightCase = rightPair & 0xffff; | |
680 | if(leftCase != rightCase) { | |
681 | if((options & CollationSettings::UPPER_FIRST) == 0) { | |
682 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | |
683 | } else { | |
684 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | |
685 | } | |
686 | } | |
687 | if(leftPair == EOS) { break; } | |
688 | leftPair >>= 16; | |
689 | rightPair >>= 16; | |
690 | } | |
691 | } | |
692 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | |
693 | ||
694 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | |
695 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | |
696 | ||
697 | leftIndex = rightIndex = 0; | |
698 | leftPair = rightPair = 0; | |
699 | for(;;) { | |
700 | while(leftPair == 0) { | |
701 | if(leftIndex == leftLength) { | |
702 | leftPair = EOS; | |
703 | break; | |
704 | } | |
705 | UChar32 c = left[leftIndex++]; | |
706 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
707 | if(leftPair < MIN_LONG) { | |
708 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
709 | } | |
710 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | |
711 | } | |
712 | ||
713 | while(rightPair == 0) { | |
714 | if(rightIndex == rightLength) { | |
715 | rightPair = EOS; | |
716 | break; | |
717 | } | |
718 | UChar32 c = right[rightIndex++]; | |
719 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
720 | if(rightPair < MIN_LONG) { | |
721 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
722 | } | |
723 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | |
724 | } | |
725 | ||
726 | if(leftPair == rightPair) { | |
727 | if(leftPair == EOS) { break; } | |
728 | leftPair = rightPair = 0; | |
729 | continue; | |
730 | } | |
731 | uint32_t leftTertiary = leftPair & 0xffff; | |
732 | uint32_t rightTertiary = rightPair & 0xffff; | |
733 | if(leftTertiary != rightTertiary) { | |
734 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | |
735 | // Pass through EOS and MERGE_WEIGHT | |
736 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | |
737 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
738 | if(leftTertiary > MERGE_WEIGHT) { | |
739 | leftTertiary ^= CASE_MASK; | |
740 | } | |
741 | if(rightTertiary > MERGE_WEIGHT) { | |
742 | rightTertiary ^= CASE_MASK; | |
743 | } | |
744 | } | |
745 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | |
746 | } | |
747 | if(leftPair == EOS) { break; } | |
748 | leftPair >>= 16; | |
749 | rightPair >>= 16; | |
750 | } | |
751 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | |
752 | ||
753 | leftIndex = rightIndex = 0; | |
754 | leftPair = rightPair = 0; | |
755 | for(;;) { | |
756 | while(leftPair == 0) { | |
757 | if(leftIndex == leftLength) { | |
758 | leftPair = EOS; | |
759 | break; | |
760 | } | |
761 | UChar32 c = left[leftIndex++]; | |
762 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
763 | if(leftPair < MIN_LONG) { | |
764 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
765 | } | |
766 | leftPair = getQuaternaries(variableTop, leftPair); | |
767 | } | |
768 | ||
769 | while(rightPair == 0) { | |
770 | if(rightIndex == rightLength) { | |
771 | rightPair = EOS; | |
772 | break; | |
773 | } | |
774 | UChar32 c = right[rightIndex++]; | |
775 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
776 | if(rightPair < MIN_LONG) { | |
777 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
778 | } | |
779 | rightPair = getQuaternaries(variableTop, rightPair); | |
780 | } | |
781 | ||
782 | if(leftPair == rightPair) { | |
783 | if(leftPair == EOS) { break; } | |
784 | leftPair = rightPair = 0; | |
785 | continue; | |
786 | } | |
787 | uint32_t leftQuaternary = leftPair & 0xffff; | |
788 | uint32_t rightQuaternary = rightPair & 0xffff; | |
789 | if(leftQuaternary != rightQuaternary) { | |
790 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | |
791 | } | |
792 | if(leftPair == EOS) { break; } | |
793 | leftPair >>= 16; | |
794 | rightPair >>= 16; | |
795 | } | |
796 | return UCOL_EQUAL; | |
797 | } | |
798 | ||
799 | uint32_t | |
800 | CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { | |
801 | U_ASSERT(c > LATIN_MAX); | |
802 | if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
803 | return table[c - PUNCT_START + LATIN_LIMIT]; | |
804 | } else if(c == 0xfffe) { | |
805 | return MERGE_WEIGHT; | |
806 | } else if(c == 0xffff) { | |
807 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; | |
808 | } else { | |
809 | return BAIL_OUT; | |
810 | } | |
811 | } | |
812 | ||
813 | uint32_t | |
814 | CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, | |
815 | const uint8_t *s8, int32_t &sIndex, int32_t sLength) { | |
816 | // The caller handled ASCII and valid/supported Latin. | |
817 | U_ASSERT(c > 0x7f); | |
818 | int32_t i2 = sIndex + 1; | |
819 | if(i2 < sLength || sLength < 0) { | |
820 | uint8_t t1 = s8[sIndex]; | |
821 | uint8_t t2 = s8[i2]; | |
822 | sIndex += 2; | |
823 | if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { | |
824 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF | |
825 | } else if(c == 0xef && t1 == 0xbf) { | |
826 | if(t2 == 0xbe) { | |
827 | return MERGE_WEIGHT; // U+FFFE | |
828 | } else if(t2 == 0xbf) { | |
829 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF | |
830 | } | |
831 | } | |
832 | } | |
833 | return BAIL_OUT; | |
834 | } | |
835 | ||
836 | uint32_t | |
837 | CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, | |
838 | const uint8_t *s8, int32_t &sIndex) { | |
839 | // The caller handled ASCII. | |
840 | // The string is well-formed and contains only supported characters. | |
841 | U_ASSERT(c > 0x7f); | |
842 | if(c <= LATIN_MAX_UTF8_LEAD) { | |
843 | return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F | |
844 | } | |
845 | uint8_t t2 = s8[sIndex + 1]; | |
846 | sIndex += 2; | |
847 | if(c == 0xe2) { | |
848 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF | |
849 | } else if(t2 == 0xbe) { | |
850 | return MERGE_WEIGHT; // U+FFFE | |
851 | } else { | |
852 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF | |
853 | } | |
854 | } | |
855 | ||
856 | uint32_t | |
857 | CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, | |
858 | const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { | |
859 | if(ce >= MIN_LONG || ce < CONTRACTION) { | |
860 | return ce; // simple or special mini CE | |
861 | } else if(ce >= EXPANSION) { | |
862 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | |
863 | return ((uint32_t)table[index + 1] << 16) | table[index]; | |
864 | } else /* ce >= CONTRACTION */ { | |
865 | if(c == 0 && sLength < 0) { | |
866 | sLength = sIndex - 1; | |
867 | return EOS; | |
868 | } | |
869 | // Contraction list: Default mapping followed by | |
870 | // 0 or more single-character contraction suffix mappings. | |
871 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | |
872 | if(sIndex != sLength) { | |
873 | // Read the next character. | |
874 | int32_t c2; | |
875 | int32_t nextIndex = sIndex; | |
876 | if(s16 != NULL) { | |
877 | c2 = s16[nextIndex++]; | |
878 | if(c2 > LATIN_MAX) { | |
879 | if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { | |
880 | c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF | |
881 | } else if(c2 == 0xfffe || c2 == 0xffff) { | |
882 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. | |
883 | } else { | |
884 | return BAIL_OUT; | |
885 | } | |
886 | } | |
887 | } else { | |
888 | c2 = s8[nextIndex++]; | |
889 | if(c2 > 0x7f) { | |
890 | uint8_t t; | |
891 | if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && | |
892 | 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { | |
893 | c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F | |
894 | ++nextIndex; | |
895 | } else { | |
896 | int32_t i2 = nextIndex + 1; | |
897 | if(i2 < sLength || sLength < 0) { | |
898 | if(c2 == 0xe2 && s8[nextIndex] == 0x80 && | |
899 | 0x80 <= (t = s8[i2]) && t <= 0xbf) { | |
900 | c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF | |
901 | } else if(c2 == 0xef && s8[nextIndex] == 0xbf && | |
902 | ((t = s8[i2]) == 0xbe || t == 0xbf)) { | |
903 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. | |
904 | } else { | |
905 | return BAIL_OUT; | |
906 | } | |
907 | } else { | |
908 | return BAIL_OUT; | |
909 | } | |
910 | nextIndex += 2; | |
911 | } | |
912 | } | |
913 | } | |
914 | if(c2 == 0 && sLength < 0) { | |
915 | sLength = sIndex; | |
916 | c2 = -1; | |
917 | } | |
918 | // Look for the next character in the contraction suffix list, | |
919 | // which is in ascending order of single suffix characters. | |
920 | int32_t i = index; | |
921 | int32_t head = table[i]; // first skip the default mapping | |
922 | int32_t x; | |
923 | do { | |
924 | i += head >> CONTR_LENGTH_SHIFT; | |
925 | head = table[i]; | |
926 | x = head & CONTR_CHAR_MASK; | |
927 | } while(x < c2); | |
928 | if(x == c2) { | |
929 | index = i; | |
930 | sIndex = nextIndex; | |
931 | } | |
932 | } | |
933 | // Return the CE or CEs for the default or contraction mapping. | |
934 | int32_t length = table[index] >> CONTR_LENGTH_SHIFT; | |
935 | if(length == 1) { | |
936 | return BAIL_OUT; | |
937 | } | |
938 | ce = table[index + 1]; | |
939 | if(length == 2) { | |
940 | return ce; | |
941 | } else { | |
942 | return ((uint32_t)table[index + 2] << 16) | ce; | |
943 | } | |
944 | } | |
945 | } | |
946 | ||
947 | uint32_t | |
948 | CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { | |
949 | if(pair <= 0xffff) { | |
950 | // one mini CE | |
951 | if(pair >= MIN_SHORT) { | |
952 | pair = getSecondariesFromOneShortCE(pair); | |
953 | } else if(pair > variableTop) { | |
954 | pair = COMMON_SEC_PLUS_OFFSET; | |
955 | } else if(pair >= MIN_LONG) { | |
956 | pair = 0; // variable | |
957 | } | |
958 | // else special mini CE | |
959 | } else { | |
960 | uint32_t ce = pair & 0xffff; | |
961 | if(ce >= MIN_SHORT) { | |
962 | pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; | |
963 | } else if(ce > variableTop) { | |
964 | pair = TWO_COMMON_SEC_PLUS_OFFSET; | |
965 | } else { | |
966 | U_ASSERT(ce >= MIN_LONG); | |
967 | pair = 0; // variable | |
968 | } | |
969 | } | |
970 | return pair; | |
971 | } | |
972 | ||
973 | uint32_t | |
974 | CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { | |
975 | // Primary+caseLevel: Ignore case level weights of primary ignorables. | |
976 | // Otherwise: Ignore case level weights of secondary ignorables. | |
977 | // For details see the comments in the CollationCompare class. | |
978 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
979 | if(pair <= 0xffff) { | |
980 | // one mini CE | |
981 | if(pair >= MIN_SHORT) { | |
982 | // A high secondary weight means we really have two CEs, | |
983 | // a primary CE and a secondary CE. | |
984 | uint32_t ce = pair; | |
985 | pair &= CASE_MASK; // explicit weight of primary CE | |
986 | if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
987 | pair |= LOWER_CASE << 16; // implied weight of secondary CE | |
988 | } | |
989 | } else if(pair > variableTop) { | |
990 | pair = LOWER_CASE; | |
991 | } else if(pair >= MIN_LONG) { | |
992 | pair = 0; // variable | |
993 | } | |
994 | // else special mini CE | |
995 | } else { | |
996 | // two mini CEs, same primary groups, neither expands like above | |
997 | uint32_t ce = pair & 0xffff; | |
998 | if(ce >= MIN_SHORT) { | |
999 | if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { | |
1000 | pair &= CASE_MASK; | |
1001 | } else { | |
1002 | pair &= TWO_CASES_MASK; | |
1003 | } | |
1004 | } else if(ce > variableTop) { | |
1005 | pair = TWO_LOWER_CASES; | |
1006 | } else { | |
1007 | U_ASSERT(ce >= MIN_LONG); | |
1008 | pair = 0; // variable | |
1009 | } | |
1010 | } | |
1011 | return pair; | |
1012 | } | |
1013 | ||
1014 | uint32_t | |
1015 | CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { | |
1016 | if(pair <= 0xffff) { | |
1017 | // one mini CE | |
1018 | if(pair >= MIN_SHORT) { | |
1019 | // A high secondary weight means we really have two CEs, | |
1020 | // a primary CE and a secondary CE. | |
1021 | uint32_t ce = pair; | |
1022 | if(withCaseBits) { | |
1023 | pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; | |
1024 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1025 | pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; | |
1026 | } | |
1027 | } else { | |
1028 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | |
1029 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1030 | pair |= COMMON_TER_PLUS_OFFSET << 16; | |
1031 | } | |
1032 | } | |
1033 | } else if(pair > variableTop) { | |
1034 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | |
1035 | if(withCaseBits) { | |
1036 | pair |= LOWER_CASE; | |
1037 | } | |
1038 | } else if(pair >= MIN_LONG) { | |
1039 | pair = 0; // variable | |
1040 | } | |
1041 | // else special mini CE | |
1042 | } else { | |
1043 | // two mini CEs, same primary groups, neither expands like above | |
1044 | uint32_t ce = pair & 0xffff; | |
1045 | if(ce >= MIN_SHORT) { | |
1046 | if(withCaseBits) { | |
1047 | pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; | |
1048 | } else { | |
1049 | pair &= TWO_TERTIARIES_MASK; | |
1050 | } | |
1051 | pair += TWO_TER_OFFSETS; | |
1052 | } else if(ce > variableTop) { | |
1053 | pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; | |
1054 | if(withCaseBits) { | |
1055 | pair |= TWO_LOWER_CASES; | |
1056 | } | |
1057 | } else { | |
1058 | U_ASSERT(ce >= MIN_LONG); | |
1059 | pair = 0; // variable | |
1060 | } | |
1061 | } | |
1062 | return pair; | |
1063 | } | |
1064 | ||
1065 | uint32_t | |
1066 | CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { | |
1067 | // Return the primary weight of a variable CE, | |
1068 | // or the maximum primary weight for a non-variable, not-completely-ignorable CE. | |
1069 | if(pair <= 0xffff) { | |
1070 | // one mini CE | |
1071 | if(pair >= MIN_SHORT) { | |
1072 | // A high secondary weight means we really have two CEs, | |
1073 | // a primary CE and a secondary CE. | |
1074 | if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1075 | pair = TWO_SHORT_PRIMARIES_MASK; | |
1076 | } else { | |
1077 | pair = SHORT_PRIMARY_MASK; | |
1078 | } | |
1079 | } else if(pair > variableTop) { | |
1080 | pair = SHORT_PRIMARY_MASK; | |
1081 | } else if(pair >= MIN_LONG) { | |
1082 | pair &= LONG_PRIMARY_MASK; // variable | |
1083 | } | |
1084 | // else special mini CE | |
1085 | } else { | |
1086 | // two mini CEs, same primary groups, neither expands like above | |
1087 | uint32_t ce = pair & 0xffff; | |
1088 | if(ce > variableTop) { | |
1089 | pair = TWO_SHORT_PRIMARIES_MASK; | |
1090 | } else { | |
1091 | U_ASSERT(ce >= MIN_LONG); | |
1092 | pair &= TWO_LONG_PRIMARIES_MASK; // variable | |
1093 | } | |
1094 | } | |
1095 | return pair; | |
1096 | } | |
1097 | ||
1098 | U_NAMESPACE_END | |
1099 | ||
1100 | #endif // !UCONFIG_NO_COLLATION |