]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2013-2014, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ******************************************************************************* | |
6 | * collationfastlatin.cpp | |
7 | * | |
8 | * created on: 2013aug18 | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | ||
14 | #if !UCONFIG_NO_COLLATION | |
15 | ||
16 | #include "unicode/ucol.h" | |
17 | #include "collationdata.h" | |
18 | #include "collationfastlatin.h" | |
19 | #include "collationsettings.h" | |
20 | #include "putilimp.h" // U_ALIGN_CODE | |
21 | #include "uassert.h" | |
22 | ||
23 | U_NAMESPACE_BEGIN | |
24 | ||
25 | int32_t | |
26 | CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, | |
27 | uint16_t *primaries, int32_t capacity) { | |
28 | const uint16_t *table = data->fastLatinTable; | |
29 | if(table == NULL) { return -1; } | |
30 | U_ASSERT(capacity == LATIN_LIMIT); | |
31 | if(capacity != LATIN_LIMIT) { return -1; } | |
32 | ||
33 | uint32_t miniVarTop; | |
34 | if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { | |
35 | // No mini primaries are variable, set a variableTop just below the | |
36 | // lowest long mini primary. | |
37 | miniVarTop = MIN_LONG - 1; | |
38 | } else { | |
39 | uint32_t v1 = settings.variableTop >> 24; | |
40 | int32_t headerLength = *table & 0xff; | |
41 | int32_t i = headerLength - 1; | |
42 | if(i <= 0 || v1 > (table[i] & 0x7f)) { | |
43 | return -1; // variableTop >= digits, should not occur | |
44 | } | |
45 | while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; } | |
46 | // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied. | |
47 | // Shift right to make it comparable with long mini primaries in bits 15..3. | |
48 | miniVarTop = (table[i] & 0xff80) >> 4; | |
49 | } | |
50 | ||
51 | const uint8_t *reorderTable = settings.reorderTable; | |
52 | if(reorderTable != NULL) { | |
53 | const uint16_t *scripts = data->scripts; | |
54 | int32_t length = data->scriptsLength; | |
55 | uint32_t prevLastByte = 0; | |
56 | for(int32_t i = 0; i < length;) { | |
57 | // reordered last byte of the group | |
58 | uint32_t lastByte = reorderTable[scripts[i] & 0xff]; | |
59 | if(lastByte < prevLastByte) { | |
60 | // The permutation affects the groups up to Latin. | |
61 | return -1; | |
62 | } | |
63 | if(scripts[i + 2] == USCRIPT_LATIN) { break; } | |
64 | i = i + 2 + scripts[i + 1]; | |
65 | prevLastByte = lastByte; | |
66 | } | |
67 | } | |
68 | ||
69 | table += (table[0] & 0xff); // skip the header | |
70 | for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { | |
71 | uint32_t p = table[c]; | |
72 | if(p >= MIN_SHORT) { | |
73 | p &= SHORT_PRIMARY_MASK; | |
74 | } else if(p > miniVarTop) { | |
75 | p &= LONG_PRIMARY_MASK; | |
76 | } else { | |
77 | p = 0; | |
78 | } | |
79 | primaries[c] = (uint16_t)p; | |
80 | } | |
81 | if((settings.options & CollationSettings::NUMERIC) != 0) { | |
82 | // Bail out for digits. | |
83 | for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } | |
84 | } | |
85 | ||
86 | // Shift the miniVarTop above other options. | |
87 | return ((int32_t)miniVarTop << 16) | settings.options; | |
88 | } | |
89 | ||
90 | int32_t | |
91 | CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, | |
92 | const UChar *left, int32_t leftLength, | |
93 | const UChar *right, int32_t rightLength) { | |
94 | // This is a modified copy of CollationCompare::compareUpToQuaternary(), | |
95 | // optimized for common Latin text. | |
96 | // Keep them in sync! | |
97 | // Keep compareUTF16() and compareUTF8() in sync very closely! | |
98 | ||
99 | U_ASSERT((table[0] >> 8) == VERSION); | |
100 | table += (table[0] & 0xff); // skip the header | |
101 | uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() | |
102 | options &= 0xffff; // needed for CollationSettings::getStrength() to work | |
103 | ||
104 | // Check for supported characters, fetch mini CEs, and compare primaries. | |
105 | U_ALIGN_CODE(16); | |
106 | int32_t leftIndex = 0, rightIndex = 0; | |
107 | /** | |
108 | * Single mini CE or a pair. | |
109 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | |
110 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | |
111 | */ | |
112 | uint32_t leftPair = 0, rightPair = 0; | |
113 | for(;;) { | |
114 | // We fetch CEs until we get a non-ignorable primary or reach the end. | |
115 | while(leftPair == 0) { | |
116 | if(leftIndex == leftLength) { | |
117 | leftPair = EOS; | |
118 | break; | |
119 | } | |
120 | UChar32 c = left[leftIndex++]; | |
121 | if(c <= LATIN_MAX) { | |
122 | leftPair = primaries[c]; | |
123 | if(leftPair != 0) { break; } | |
124 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
125 | return BAIL_OUT_RESULT; | |
126 | } | |
127 | leftPair = table[c]; | |
128 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
129 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
130 | } else { | |
131 | leftPair = lookup(table, c); | |
132 | } | |
133 | if(leftPair >= MIN_SHORT) { | |
134 | leftPair &= SHORT_PRIMARY_MASK; | |
135 | break; | |
136 | } else if(leftPair > variableTop) { | |
137 | leftPair &= LONG_PRIMARY_MASK; | |
138 | break; | |
139 | } else { | |
140 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
141 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
142 | leftPair = getPrimaries(variableTop, leftPair); | |
143 | } | |
144 | } | |
145 | ||
146 | while(rightPair == 0) { | |
147 | if(rightIndex == rightLength) { | |
148 | rightPair = EOS; | |
149 | break; | |
150 | } | |
151 | UChar32 c = right[rightIndex++]; | |
152 | if(c <= LATIN_MAX) { | |
153 | rightPair = primaries[c]; | |
154 | if(rightPair != 0) { break; } | |
155 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
156 | return BAIL_OUT_RESULT; | |
157 | } | |
158 | rightPair = table[c]; | |
159 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
160 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
161 | } else { | |
162 | rightPair = lookup(table, c); | |
163 | } | |
164 | if(rightPair >= MIN_SHORT) { | |
165 | rightPair &= SHORT_PRIMARY_MASK; | |
166 | break; | |
167 | } else if(rightPair > variableTop) { | |
168 | rightPair &= LONG_PRIMARY_MASK; | |
169 | break; | |
170 | } else { | |
171 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
172 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
173 | rightPair = getPrimaries(variableTop, rightPair); | |
174 | } | |
175 | } | |
176 | ||
177 | if(leftPair == rightPair) { | |
178 | if(leftPair == EOS) { break; } | |
179 | leftPair = rightPair = 0; | |
180 | continue; | |
181 | } | |
182 | uint32_t leftPrimary = leftPair & 0xffff; | |
183 | uint32_t rightPrimary = rightPair & 0xffff; | |
184 | if(leftPrimary != rightPrimary) { | |
185 | // Return the primary difference. | |
186 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | |
187 | } | |
188 | if(leftPair == EOS) { break; } | |
189 | leftPair >>= 16; | |
190 | rightPair >>= 16; | |
191 | } | |
192 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | |
193 | // but we know that the string is well-formed and | |
194 | // only contains supported characters and mappings. | |
195 | ||
196 | // We might skip the secondary level but continue with the case level | |
197 | // which is turned on separately. | |
198 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | |
199 | leftIndex = rightIndex = 0; | |
200 | leftPair = rightPair = 0; | |
201 | for(;;) { | |
202 | while(leftPair == 0) { | |
203 | if(leftIndex == leftLength) { | |
204 | leftPair = EOS; | |
205 | break; | |
206 | } | |
207 | UChar32 c = left[leftIndex++]; | |
208 | if(c <= LATIN_MAX) { | |
209 | leftPair = table[c]; | |
210 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
211 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
212 | } else { | |
213 | leftPair = lookup(table, c); | |
214 | } | |
215 | if(leftPair >= MIN_SHORT) { | |
216 | leftPair = getSecondariesFromOneShortCE(leftPair); | |
217 | break; | |
218 | } else if(leftPair > variableTop) { | |
219 | leftPair = COMMON_SEC_PLUS_OFFSET; | |
220 | break; | |
221 | } else { | |
222 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
223 | leftPair = getSecondaries(variableTop, leftPair); | |
224 | } | |
225 | } | |
226 | ||
227 | while(rightPair == 0) { | |
228 | if(rightIndex == rightLength) { | |
229 | rightPair = EOS; | |
230 | break; | |
231 | } | |
232 | UChar32 c = right[rightIndex++]; | |
233 | if(c <= LATIN_MAX) { | |
234 | rightPair = table[c]; | |
235 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
236 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | |
237 | } else { | |
238 | rightPair = lookup(table, c); | |
239 | } | |
240 | if(rightPair >= MIN_SHORT) { | |
241 | rightPair = getSecondariesFromOneShortCE(rightPair); | |
242 | break; | |
243 | } else if(rightPair > variableTop) { | |
244 | rightPair = COMMON_SEC_PLUS_OFFSET; | |
245 | break; | |
246 | } else { | |
247 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
248 | rightPair = getSecondaries(variableTop, rightPair); | |
249 | } | |
250 | } | |
251 | ||
252 | if(leftPair == rightPair) { | |
253 | if(leftPair == EOS) { break; } | |
254 | leftPair = rightPair = 0; | |
255 | continue; | |
256 | } | |
257 | uint32_t leftSecondary = leftPair & 0xffff; | |
258 | uint32_t rightSecondary = rightPair & 0xffff; | |
259 | if(leftSecondary != rightSecondary) { | |
260 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
261 | // Full support for backwards secondary requires backwards contraction matching | |
262 | // and moving backwards between merge separators. | |
263 | return BAIL_OUT_RESULT; | |
264 | } | |
265 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
266 | } | |
267 | if(leftPair == EOS) { break; } | |
268 | leftPair >>= 16; | |
269 | rightPair >>= 16; | |
270 | } | |
271 | } | |
272 | ||
273 | if((options & CollationSettings::CASE_LEVEL) != 0) { | |
274 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | |
275 | leftIndex = rightIndex = 0; | |
276 | leftPair = rightPair = 0; | |
277 | for(;;) { | |
278 | while(leftPair == 0) { | |
279 | if(leftIndex == leftLength) { | |
280 | leftPair = EOS; | |
281 | break; | |
282 | } | |
283 | UChar32 c = left[leftIndex++]; | |
284 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
285 | if(leftPair < MIN_LONG) { | |
286 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
287 | } | |
288 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | |
289 | } | |
290 | ||
291 | while(rightPair == 0) { | |
292 | if(rightIndex == rightLength) { | |
293 | rightPair = EOS; | |
294 | break; | |
295 | } | |
296 | UChar32 c = right[rightIndex++]; | |
297 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
298 | if(rightPair < MIN_LONG) { | |
299 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
300 | } | |
301 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | |
302 | } | |
303 | ||
304 | if(leftPair == rightPair) { | |
305 | if(leftPair == EOS) { break; } | |
306 | leftPair = rightPair = 0; | |
307 | continue; | |
308 | } | |
309 | uint32_t leftCase = leftPair & 0xffff; | |
310 | uint32_t rightCase = rightPair & 0xffff; | |
311 | if(leftCase != rightCase) { | |
312 | if((options & CollationSettings::UPPER_FIRST) == 0) { | |
313 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | |
314 | } else { | |
315 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | |
316 | } | |
317 | } | |
318 | if(leftPair == EOS) { break; } | |
319 | leftPair >>= 16; | |
320 | rightPair >>= 16; | |
321 | } | |
322 | } | |
323 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | |
324 | ||
325 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | |
326 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | |
327 | ||
328 | leftIndex = rightIndex = 0; | |
329 | leftPair = rightPair = 0; | |
330 | for(;;) { | |
331 | while(leftPair == 0) { | |
332 | if(leftIndex == leftLength) { | |
333 | leftPair = EOS; | |
334 | break; | |
335 | } | |
336 | UChar32 c = left[leftIndex++]; | |
337 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
338 | if(leftPair < MIN_LONG) { | |
339 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
340 | } | |
341 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | |
342 | } | |
343 | ||
344 | while(rightPair == 0) { | |
345 | if(rightIndex == rightLength) { | |
346 | rightPair = EOS; | |
347 | break; | |
348 | } | |
349 | UChar32 c = right[rightIndex++]; | |
350 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
351 | if(rightPair < MIN_LONG) { | |
352 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
353 | } | |
354 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | |
355 | } | |
356 | ||
357 | if(leftPair == rightPair) { | |
358 | if(leftPair == EOS) { break; } | |
359 | leftPair = rightPair = 0; | |
360 | continue; | |
361 | } | |
362 | uint32_t leftTertiary = leftPair & 0xffff; | |
363 | uint32_t rightTertiary = rightPair & 0xffff; | |
364 | if(leftTertiary != rightTertiary) { | |
365 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | |
366 | // Pass through EOS and MERGE_WEIGHT | |
367 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | |
368 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
369 | if(leftTertiary > MERGE_WEIGHT) { | |
370 | leftTertiary ^= CASE_MASK; | |
371 | } | |
372 | if(rightTertiary > MERGE_WEIGHT) { | |
373 | rightTertiary ^= CASE_MASK; | |
374 | } | |
375 | } | |
376 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | |
377 | } | |
378 | if(leftPair == EOS) { break; } | |
379 | leftPair >>= 16; | |
380 | rightPair >>= 16; | |
381 | } | |
382 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | |
383 | ||
384 | leftIndex = rightIndex = 0; | |
385 | leftPair = rightPair = 0; | |
386 | for(;;) { | |
387 | while(leftPair == 0) { | |
388 | if(leftIndex == leftLength) { | |
389 | leftPair = EOS; | |
390 | break; | |
391 | } | |
392 | UChar32 c = left[leftIndex++]; | |
393 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
394 | if(leftPair < MIN_LONG) { | |
395 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | |
396 | } | |
397 | leftPair = getQuaternaries(variableTop, leftPair); | |
398 | } | |
399 | ||
400 | while(rightPair == 0) { | |
401 | if(rightIndex == rightLength) { | |
402 | rightPair = EOS; | |
403 | break; | |
404 | } | |
405 | UChar32 c = right[rightIndex++]; | |
406 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | |
407 | if(rightPair < MIN_LONG) { | |
408 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | |
409 | } | |
410 | rightPair = getQuaternaries(variableTop, rightPair); | |
411 | } | |
412 | ||
413 | if(leftPair == rightPair) { | |
414 | if(leftPair == EOS) { break; } | |
415 | leftPair = rightPair = 0; | |
416 | continue; | |
417 | } | |
418 | uint32_t leftQuaternary = leftPair & 0xffff; | |
419 | uint32_t rightQuaternary = rightPair & 0xffff; | |
420 | if(leftQuaternary != rightQuaternary) { | |
421 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | |
422 | } | |
423 | if(leftPair == EOS) { break; } | |
424 | leftPair >>= 16; | |
425 | rightPair >>= 16; | |
426 | } | |
427 | return UCOL_EQUAL; | |
428 | } | |
429 | ||
430 | int32_t | |
431 | CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, | |
432 | const uint8_t *left, int32_t leftLength, | |
433 | const uint8_t *right, int32_t rightLength) { | |
434 | // Keep compareUTF16() and compareUTF8() in sync very closely! | |
435 | ||
436 | U_ASSERT((table[0] >> 8) == VERSION); | |
437 | table += (table[0] & 0xff); // skip the header | |
438 | uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() | |
439 | options &= 0xffff; // needed for CollationSettings::getStrength() to work | |
440 | ||
441 | // Check for supported characters, fetch mini CEs, and compare primaries. | |
442 | U_ALIGN_CODE(16); | |
443 | int32_t leftIndex = 0, rightIndex = 0; | |
444 | /** | |
445 | * Single mini CE or a pair. | |
446 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | |
447 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | |
448 | */ | |
449 | uint32_t leftPair = 0, rightPair = 0; | |
450 | // Note: There is no need to assemble the code point. | |
451 | // We only need to look up the table entry for the character, | |
452 | // and nextPair() looks for whether c==0. | |
453 | for(;;) { | |
454 | // We fetch CEs until we get a non-ignorable primary or reach the end. | |
455 | while(leftPair == 0) { | |
456 | if(leftIndex == leftLength) { | |
457 | leftPair = EOS; | |
458 | break; | |
459 | } | |
460 | UChar32 c = left[leftIndex++]; | |
461 | uint8_t t; | |
462 | if(c <= 0x7f) { | |
463 | leftPair = primaries[c]; | |
464 | if(leftPair != 0) { break; } | |
465 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
466 | return BAIL_OUT_RESULT; | |
467 | } | |
468 | leftPair = table[c]; | |
469 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && | |
470 | 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { | |
471 | ++leftIndex; | |
472 | c = ((c - 0xc2) << 6) + t; | |
473 | leftPair = primaries[c]; | |
474 | if(leftPair != 0) { break; } | |
475 | leftPair = table[c]; | |
476 | } else { | |
477 | leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); | |
478 | } | |
479 | if(leftPair >= MIN_SHORT) { | |
480 | leftPair &= SHORT_PRIMARY_MASK; | |
481 | break; | |
482 | } else if(leftPair > variableTop) { | |
483 | leftPair &= LONG_PRIMARY_MASK; | |
484 | break; | |
485 | } else { | |
486 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
487 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
488 | leftPair = getPrimaries(variableTop, leftPair); | |
489 | } | |
490 | } | |
491 | ||
492 | while(rightPair == 0) { | |
493 | if(rightIndex == rightLength) { | |
494 | rightPair = EOS; | |
495 | break; | |
496 | } | |
497 | UChar32 c = right[rightIndex++]; | |
498 | uint8_t t; | |
499 | if(c <= 0x7f) { | |
500 | rightPair = primaries[c]; | |
501 | if(rightPair != 0) { break; } | |
502 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | |
503 | return BAIL_OUT_RESULT; | |
504 | } | |
505 | rightPair = table[c]; | |
506 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && | |
507 | 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { | |
508 | ++rightIndex; | |
509 | c = ((c - 0xc2) << 6) + t; | |
510 | rightPair = primaries[c]; | |
511 | if(rightPair != 0) { break; } | |
512 | rightPair = table[c]; | |
513 | } else { | |
514 | rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); | |
515 | } | |
516 | if(rightPair >= MIN_SHORT) { | |
517 | rightPair &= SHORT_PRIMARY_MASK; | |
518 | break; | |
519 | } else if(rightPair > variableTop) { | |
520 | rightPair &= LONG_PRIMARY_MASK; | |
521 | break; | |
522 | } else { | |
523 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
524 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | |
525 | rightPair = getPrimaries(variableTop, rightPair); | |
526 | } | |
527 | } | |
528 | ||
529 | if(leftPair == rightPair) { | |
530 | if(leftPair == EOS) { break; } | |
531 | leftPair = rightPair = 0; | |
532 | continue; | |
533 | } | |
534 | uint32_t leftPrimary = leftPair & 0xffff; | |
535 | uint32_t rightPrimary = rightPair & 0xffff; | |
536 | if(leftPrimary != rightPrimary) { | |
537 | // Return the primary difference. | |
538 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | |
539 | } | |
540 | if(leftPair == EOS) { break; } | |
541 | leftPair >>= 16; | |
542 | rightPair >>= 16; | |
543 | } | |
544 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | |
545 | // but we know that the string is well-formed and | |
546 | // only contains supported characters and mappings. | |
547 | ||
548 | // We might skip the secondary level but continue with the case level | |
549 | // which is turned on separately. | |
550 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | |
551 | leftIndex = rightIndex = 0; | |
552 | leftPair = rightPair = 0; | |
553 | for(;;) { | |
554 | while(leftPair == 0) { | |
555 | if(leftIndex == leftLength) { | |
556 | leftPair = EOS; | |
557 | break; | |
558 | } | |
559 | UChar32 c = left[leftIndex++]; | |
560 | if(c <= 0x7f) { | |
561 | leftPair = table[c]; | |
562 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | |
563 | leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; | |
564 | } else { | |
565 | leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); | |
566 | } | |
567 | if(leftPair >= MIN_SHORT) { | |
568 | leftPair = getSecondariesFromOneShortCE(leftPair); | |
569 | break; | |
570 | } else if(leftPair > variableTop) { | |
571 | leftPair = COMMON_SEC_PLUS_OFFSET; | |
572 | break; | |
573 | } else { | |
574 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
575 | leftPair = getSecondaries(variableTop, leftPair); | |
576 | } | |
577 | } | |
578 | ||
579 | while(rightPair == 0) { | |
580 | if(rightIndex == rightLength) { | |
581 | rightPair = EOS; | |
582 | break; | |
583 | } | |
584 | UChar32 c = right[rightIndex++]; | |
585 | if(c <= 0x7f) { | |
586 | rightPair = table[c]; | |
587 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | |
588 | rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; | |
589 | } else { | |
590 | rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); | |
591 | } | |
592 | if(rightPair >= MIN_SHORT) { | |
593 | rightPair = getSecondariesFromOneShortCE(rightPair); | |
594 | break; | |
595 | } else if(rightPair > variableTop) { | |
596 | rightPair = COMMON_SEC_PLUS_OFFSET; | |
597 | break; | |
598 | } else { | |
599 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
600 | rightPair = getSecondaries(variableTop, rightPair); | |
601 | } | |
602 | } | |
603 | ||
604 | if(leftPair == rightPair) { | |
605 | if(leftPair == EOS) { break; } | |
606 | leftPair = rightPair = 0; | |
607 | continue; | |
608 | } | |
609 | uint32_t leftSecondary = leftPair & 0xffff; | |
610 | uint32_t rightSecondary = rightPair & 0xffff; | |
611 | if(leftSecondary != rightSecondary) { | |
612 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
613 | // Full support for backwards secondary requires backwards contraction matching | |
614 | // and moving backwards between merge separators. | |
615 | return BAIL_OUT_RESULT; | |
616 | } | |
617 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | |
618 | } | |
619 | if(leftPair == EOS) { break; } | |
620 | leftPair >>= 16; | |
621 | rightPair >>= 16; | |
622 | } | |
623 | } | |
624 | ||
625 | if((options & CollationSettings::CASE_LEVEL) != 0) { | |
626 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | |
627 | leftIndex = rightIndex = 0; | |
628 | leftPair = rightPair = 0; | |
629 | for(;;) { | |
630 | while(leftPair == 0) { | |
631 | if(leftIndex == leftLength) { | |
632 | leftPair = EOS; | |
633 | break; | |
634 | } | |
635 | UChar32 c = left[leftIndex++]; | |
636 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
637 | if(leftPair < MIN_LONG) { | |
638 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
639 | } | |
640 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | |
641 | } | |
642 | ||
643 | while(rightPair == 0) { | |
644 | if(rightIndex == rightLength) { | |
645 | rightPair = EOS; | |
646 | break; | |
647 | } | |
648 | UChar32 c = right[rightIndex++]; | |
649 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
650 | if(rightPair < MIN_LONG) { | |
651 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
652 | } | |
653 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | |
654 | } | |
655 | ||
656 | if(leftPair == rightPair) { | |
657 | if(leftPair == EOS) { break; } | |
658 | leftPair = rightPair = 0; | |
659 | continue; | |
660 | } | |
661 | uint32_t leftCase = leftPair & 0xffff; | |
662 | uint32_t rightCase = rightPair & 0xffff; | |
663 | if(leftCase != rightCase) { | |
664 | if((options & CollationSettings::UPPER_FIRST) == 0) { | |
665 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | |
666 | } else { | |
667 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | |
668 | } | |
669 | } | |
670 | if(leftPair == EOS) { break; } | |
671 | leftPair >>= 16; | |
672 | rightPair >>= 16; | |
673 | } | |
674 | } | |
675 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | |
676 | ||
677 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | |
678 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | |
679 | ||
680 | leftIndex = rightIndex = 0; | |
681 | leftPair = rightPair = 0; | |
682 | for(;;) { | |
683 | while(leftPair == 0) { | |
684 | if(leftIndex == leftLength) { | |
685 | leftPair = EOS; | |
686 | break; | |
687 | } | |
688 | UChar32 c = left[leftIndex++]; | |
689 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
690 | if(leftPair < MIN_LONG) { | |
691 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
692 | } | |
693 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | |
694 | } | |
695 | ||
696 | while(rightPair == 0) { | |
697 | if(rightIndex == rightLength) { | |
698 | rightPair = EOS; | |
699 | break; | |
700 | } | |
701 | UChar32 c = right[rightIndex++]; | |
702 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
703 | if(rightPair < MIN_LONG) { | |
704 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
705 | } | |
706 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | |
707 | } | |
708 | ||
709 | if(leftPair == rightPair) { | |
710 | if(leftPair == EOS) { break; } | |
711 | leftPair = rightPair = 0; | |
712 | continue; | |
713 | } | |
714 | uint32_t leftTertiary = leftPair & 0xffff; | |
715 | uint32_t rightTertiary = rightPair & 0xffff; | |
716 | if(leftTertiary != rightTertiary) { | |
717 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | |
718 | // Pass through EOS and MERGE_WEIGHT | |
719 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | |
720 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
721 | if(leftTertiary > MERGE_WEIGHT) { | |
722 | leftTertiary ^= CASE_MASK; | |
723 | } | |
724 | if(rightTertiary > MERGE_WEIGHT) { | |
725 | rightTertiary ^= CASE_MASK; | |
726 | } | |
727 | } | |
728 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | |
729 | } | |
730 | if(leftPair == EOS) { break; } | |
731 | leftPair >>= 16; | |
732 | rightPair >>= 16; | |
733 | } | |
734 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | |
735 | ||
736 | leftIndex = rightIndex = 0; | |
737 | leftPair = rightPair = 0; | |
738 | for(;;) { | |
739 | while(leftPair == 0) { | |
740 | if(leftIndex == leftLength) { | |
741 | leftPair = EOS; | |
742 | break; | |
743 | } | |
744 | UChar32 c = left[leftIndex++]; | |
745 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | |
746 | if(leftPair < MIN_LONG) { | |
747 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | |
748 | } | |
749 | leftPair = getQuaternaries(variableTop, leftPair); | |
750 | } | |
751 | ||
752 | while(rightPair == 0) { | |
753 | if(rightIndex == rightLength) { | |
754 | rightPair = EOS; | |
755 | break; | |
756 | } | |
757 | UChar32 c = right[rightIndex++]; | |
758 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | |
759 | if(rightPair < MIN_LONG) { | |
760 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | |
761 | } | |
762 | rightPair = getQuaternaries(variableTop, rightPair); | |
763 | } | |
764 | ||
765 | if(leftPair == rightPair) { | |
766 | if(leftPair == EOS) { break; } | |
767 | leftPair = rightPair = 0; | |
768 | continue; | |
769 | } | |
770 | uint32_t leftQuaternary = leftPair & 0xffff; | |
771 | uint32_t rightQuaternary = rightPair & 0xffff; | |
772 | if(leftQuaternary != rightQuaternary) { | |
773 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | |
774 | } | |
775 | if(leftPair == EOS) { break; } | |
776 | leftPair >>= 16; | |
777 | rightPair >>= 16; | |
778 | } | |
779 | return UCOL_EQUAL; | |
780 | } | |
781 | ||
782 | uint32_t | |
783 | CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { | |
784 | U_ASSERT(c > LATIN_MAX); | |
785 | if(PUNCT_START <= c && c < PUNCT_LIMIT) { | |
786 | return table[c - PUNCT_START + LATIN_LIMIT]; | |
787 | } else if(c == 0xfffe) { | |
788 | return MERGE_WEIGHT; | |
789 | } else if(c == 0xffff) { | |
790 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; | |
791 | } else { | |
792 | return BAIL_OUT; | |
793 | } | |
794 | } | |
795 | ||
796 | uint32_t | |
797 | CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, | |
798 | const uint8_t *s8, int32_t &sIndex, int32_t sLength) { | |
799 | // The caller handled ASCII and valid/supported Latin. | |
800 | U_ASSERT(c > 0x7f); | |
801 | int32_t i2 = sIndex + 1; | |
802 | if(i2 < sLength || sLength < 0) { | |
803 | uint8_t t1 = s8[sIndex]; | |
804 | uint8_t t2 = s8[i2]; | |
805 | sIndex += 2; | |
806 | if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { | |
807 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF | |
808 | } else if(c == 0xef && t1 == 0xbf) { | |
809 | if(t2 == 0xbe) { | |
810 | return MERGE_WEIGHT; // U+FFFE | |
811 | } else if(t2 == 0xbf) { | |
812 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF | |
813 | } | |
814 | } | |
815 | } | |
816 | return BAIL_OUT; | |
817 | } | |
818 | ||
819 | uint32_t | |
820 | CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, | |
821 | const uint8_t *s8, int32_t &sIndex) { | |
822 | // The caller handled ASCII. | |
823 | // The string is well-formed and contains only supported characters. | |
824 | U_ASSERT(c > 0x7f); | |
825 | if(c <= LATIN_MAX_UTF8_LEAD) { | |
826 | return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F | |
827 | } | |
828 | uint8_t t2 = s8[sIndex + 1]; | |
829 | sIndex += 2; | |
830 | if(c == 0xe2) { | |
831 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF | |
832 | } else if(t2 == 0xbe) { | |
833 | return MERGE_WEIGHT; // U+FFFE | |
834 | } else { | |
835 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF | |
836 | } | |
837 | } | |
838 | ||
839 | uint32_t | |
840 | CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, | |
841 | const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { | |
842 | if(ce >= MIN_LONG || ce < CONTRACTION) { | |
843 | return ce; // simple or special mini CE | |
844 | } else if(ce >= EXPANSION) { | |
845 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | |
846 | return ((uint32_t)table[index + 1] << 16) | table[index]; | |
847 | } else /* ce >= CONTRACTION */ { | |
848 | if(c == 0 && sLength < 0) { | |
849 | sLength = sIndex - 1; | |
850 | return EOS; | |
851 | } | |
852 | // Contraction list: Default mapping followed by | |
853 | // 0 or more single-character contraction suffix mappings. | |
854 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | |
855 | if(sIndex != sLength) { | |
856 | // Read the next character. | |
857 | int32_t c2; | |
858 | int32_t nextIndex = sIndex; | |
859 | if(s16 != NULL) { | |
860 | c2 = s16[nextIndex++]; | |
861 | if(c2 > LATIN_MAX) { | |
862 | if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { | |
863 | c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF | |
864 | } else if(c2 == 0xfffe || c2 == 0xffff) { | |
865 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. | |
866 | } else { | |
867 | return BAIL_OUT; | |
868 | } | |
869 | } | |
870 | } else { | |
871 | c2 = s8[nextIndex++]; | |
872 | if(c2 > 0x7f) { | |
873 | uint8_t t; | |
874 | if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && | |
875 | 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { | |
876 | c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F | |
877 | ++nextIndex; | |
878 | } else { | |
879 | int32_t i2 = nextIndex + 1; | |
880 | if(i2 < sLength || sLength < 0) { | |
881 | if(c2 == 0xe2 && s8[nextIndex] == 0x80 && | |
882 | 0x80 <= (t = s8[i2]) && t <= 0xbf) { | |
883 | c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF | |
884 | } else if(c2 == 0xef && s8[nextIndex] == 0xbf && | |
885 | ((t = s8[i2]) == 0xbe || t == 0xbf)) { | |
886 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. | |
887 | } else { | |
888 | return BAIL_OUT; | |
889 | } | |
890 | } else { | |
891 | return BAIL_OUT; | |
892 | } | |
893 | nextIndex += 2; | |
894 | } | |
895 | } | |
896 | } | |
897 | if(c2 == 0 && sLength < 0) { | |
898 | sLength = sIndex; | |
899 | c2 = -1; | |
900 | } | |
901 | // Look for the next character in the contraction suffix list, | |
902 | // which is in ascending order of single suffix characters. | |
903 | int32_t i = index; | |
904 | int32_t head = table[i]; // first skip the default mapping | |
905 | int32_t x; | |
906 | do { | |
907 | i += head >> CONTR_LENGTH_SHIFT; | |
908 | head = table[i]; | |
909 | x = head & CONTR_CHAR_MASK; | |
910 | } while(x < c2); | |
911 | if(x == c2) { | |
912 | index = i; | |
913 | sIndex = nextIndex; | |
914 | } | |
915 | } | |
916 | // Return the CE or CEs for the default or contraction mapping. | |
917 | int32_t length = table[index] >> CONTR_LENGTH_SHIFT; | |
918 | if(length == 1) { | |
919 | return BAIL_OUT; | |
920 | } | |
921 | ce = table[index + 1]; | |
922 | if(length == 2) { | |
923 | return ce; | |
924 | } else { | |
925 | return ((uint32_t)table[index + 2] << 16) | ce; | |
926 | } | |
927 | } | |
928 | } | |
929 | ||
930 | uint32_t | |
931 | CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { | |
932 | if(pair <= 0xffff) { | |
933 | // one mini CE | |
934 | if(pair >= MIN_SHORT) { | |
935 | pair = getSecondariesFromOneShortCE(pair); | |
936 | } else if(pair > variableTop) { | |
937 | pair = COMMON_SEC_PLUS_OFFSET; | |
938 | } else if(pair >= MIN_LONG) { | |
939 | pair = 0; // variable | |
940 | } | |
941 | // else special mini CE | |
942 | } else { | |
943 | uint32_t ce = pair & 0xffff; | |
944 | if(ce >= MIN_SHORT) { | |
945 | pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; | |
946 | } else if(ce > variableTop) { | |
947 | pair = TWO_COMMON_SEC_PLUS_OFFSET; | |
948 | } else { | |
949 | U_ASSERT(ce >= MIN_LONG); | |
950 | pair = 0; // variable | |
951 | } | |
952 | } | |
953 | return pair; | |
954 | } | |
955 | ||
956 | uint32_t | |
957 | CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { | |
958 | // Primary+caseLevel: Ignore case level weights of primary ignorables. | |
959 | // Otherwise: Ignore case level weights of secondary ignorables. | |
960 | // For details see the comments in the CollationCompare class. | |
961 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | |
962 | if(pair <= 0xffff) { | |
963 | // one mini CE | |
964 | if(pair >= MIN_SHORT) { | |
965 | // A high secondary weight means we really have two CEs, | |
966 | // a primary CE and a secondary CE. | |
967 | uint32_t ce = pair; | |
968 | pair &= CASE_MASK; // explicit weight of primary CE | |
969 | if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
970 | pair |= LOWER_CASE << 16; // implied weight of secondary CE | |
971 | } | |
972 | } else if(pair > variableTop) { | |
973 | pair = LOWER_CASE; | |
974 | } else if(pair >= MIN_LONG) { | |
975 | pair = 0; // variable | |
976 | } | |
977 | // else special mini CE | |
978 | } else { | |
979 | // two mini CEs, same primary groups, neither expands like above | |
980 | uint32_t ce = pair & 0xffff; | |
981 | if(ce >= MIN_SHORT) { | |
982 | if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { | |
983 | pair &= CASE_MASK; | |
984 | } else { | |
985 | pair &= TWO_CASES_MASK; | |
986 | } | |
987 | } else if(ce > variableTop) { | |
988 | pair = TWO_LOWER_CASES; | |
989 | } else { | |
990 | U_ASSERT(ce >= MIN_LONG); | |
991 | pair = 0; // variable | |
992 | } | |
993 | } | |
994 | return pair; | |
995 | } | |
996 | ||
997 | uint32_t | |
998 | CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { | |
999 | if(pair <= 0xffff) { | |
1000 | // one mini CE | |
1001 | if(pair >= MIN_SHORT) { | |
1002 | // A high secondary weight means we really have two CEs, | |
1003 | // a primary CE and a secondary CE. | |
1004 | uint32_t ce = pair; | |
1005 | if(withCaseBits) { | |
1006 | pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; | |
1007 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1008 | pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; | |
1009 | } | |
1010 | } else { | |
1011 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | |
1012 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1013 | pair |= COMMON_TER_PLUS_OFFSET << 16; | |
1014 | } | |
1015 | } | |
1016 | } else if(pair > variableTop) { | |
1017 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | |
1018 | if(withCaseBits) { | |
1019 | pair |= LOWER_CASE; | |
1020 | } | |
1021 | } else if(pair >= MIN_LONG) { | |
1022 | pair = 0; // variable | |
1023 | } | |
1024 | // else special mini CE | |
1025 | } else { | |
1026 | // two mini CEs, same primary groups, neither expands like above | |
1027 | uint32_t ce = pair & 0xffff; | |
1028 | if(ce >= MIN_SHORT) { | |
1029 | if(withCaseBits) { | |
1030 | pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; | |
1031 | } else { | |
1032 | pair &= TWO_TERTIARIES_MASK; | |
1033 | } | |
1034 | pair += TWO_TER_OFFSETS; | |
1035 | } else if(ce > variableTop) { | |
1036 | pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; | |
1037 | if(withCaseBits) { | |
1038 | pair |= TWO_LOWER_CASES; | |
1039 | } | |
1040 | } else { | |
1041 | U_ASSERT(ce >= MIN_LONG); | |
1042 | pair = 0; // variable | |
1043 | } | |
1044 | } | |
1045 | return pair; | |
1046 | } | |
1047 | ||
1048 | uint32_t | |
1049 | CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { | |
1050 | // Return the primary weight of a variable CE, | |
1051 | // or the maximum primary weight for a non-variable, not-completely-ignorable CE. | |
1052 | if(pair <= 0xffff) { | |
1053 | // one mini CE | |
1054 | if(pair >= MIN_SHORT) { | |
1055 | // A high secondary weight means we really have two CEs, | |
1056 | // a primary CE and a secondary CE. | |
1057 | if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { | |
1058 | pair = TWO_SHORT_PRIMARIES_MASK; | |
1059 | } else { | |
1060 | pair = SHORT_PRIMARY_MASK; | |
1061 | } | |
1062 | } else if(pair > variableTop) { | |
1063 | pair = SHORT_PRIMARY_MASK; | |
1064 | } else if(pair >= MIN_LONG) { | |
1065 | pair &= LONG_PRIMARY_MASK; // variable | |
1066 | } | |
1067 | // else special mini CE | |
1068 | } else { | |
1069 | // two mini CEs, same primary groups, neither expands like above | |
1070 | uint32_t ce = pair & 0xffff; | |
1071 | if(ce > variableTop) { | |
1072 | pair = TWO_SHORT_PRIMARIES_MASK; | |
1073 | } else { | |
1074 | U_ASSERT(ce >= MIN_LONG); | |
1075 | pair &= TWO_LONG_PRIMARIES_MASK; // variable | |
1076 | } | |
1077 | } | |
1078 | return pair; | |
1079 | } | |
1080 | ||
1081 | U_NAMESPACE_END | |
1082 | ||
1083 | #endif // !UCONFIG_NO_COLLATION |