1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationfastlatinbuilder.cpp
10 * created on: 2013aug09
11 * created by: Markus W. Scherer
14 #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2
15 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_COLLATION
24 #include "unicode/ucol.h"
25 #include "unicode/ucharstrie.h"
26 #include "unicode/unistr.h"
27 #include "unicode/uobject.h"
28 #include "unicode/uscript.h"
30 #include "collation.h"
31 #include "collationdata.h"
32 #include "collationfastlatin.h"
33 #include "collationfastlatinbuilder.h"
44 * Compare two signed int64_t values as if they were unsigned.
47 compareInt64AsUnsigned(int64_t a
, int64_t b
) {
48 if((uint64_t)a
< (uint64_t)b
) {
50 } else if((uint64_t)a
> (uint64_t)b
) {
57 // TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp
59 * Like Java Collections.binarySearch(List, String, Comparator).
61 * @return the index>=0 where the item was found,
62 * or the index<0 for inserting the string at ~index in sorted order
65 binarySearch(const int64_t list
[], int32_t limit
, int64_t ce
) {
66 if (limit
== 0) { return ~0; }
69 int32_t i
= (start
+ limit
) / 2;
70 int32_t cmp
= compareInt64AsUnsigned(ce
, list
[i
]);
75 return ~start
; // insert ce before i
80 return ~(start
+ 1); // insert ce after i
89 CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode
&errorCode
)
91 contractionCEs(errorCode
), uniqueCEs(errorCode
),
93 firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0),
94 firstShortPrimary(0), shortPrimaryOverflow(FALSE
),
98 CollationFastLatinBuilder::~CollationFastLatinBuilder() {
103 CollationFastLatinBuilder::forData(const CollationData
&data
, UErrorCode
&errorCode
) {
104 if(U_FAILURE(errorCode
)) { return FALSE
; }
105 if(!result
.isEmpty()) { // This builder is not reusable.
106 errorCode
= U_INVALID_STATE_ERROR
;
109 if(!loadGroups(data
, errorCode
)) { return FALSE
; }
111 // Fast handling of digits.
112 firstShortPrimary
= firstDigitPrimary
;
113 getCEs(data
, errorCode
);
114 if(!encodeUniqueCEs(errorCode
)) { return FALSE
; }
115 if(shortPrimaryOverflow
) {
116 // Give digits long mini primaries,
117 // so that there are more short primaries for letters.
118 firstShortPrimary
= firstLatinPrimary
;
120 getCEs(data
, errorCode
);
121 if(!encodeUniqueCEs(errorCode
)) { return FALSE
; }
123 // Note: If we still have a short-primary overflow but not a long-primary overflow,
124 // then we could calculate how many more long primaries would fit,
125 // and set the firstShortPrimary to that many after the current firstShortPrimary,
127 // However, this might only benefit the en_US_POSIX tailoring,
128 // and it is simpler to suppress building fast Latin data for it in genrb,
129 // or by returning FALSE here if shortPrimaryOverflow.
131 UBool ok
= !shortPrimaryOverflow
&&
132 encodeCharCEs(errorCode
) && encodeContractions(errorCode
);
133 contractionCEs
.removeAllElements(); // might reduce heap memory usage
134 uniqueCEs
.removeAllElements();
139 CollationFastLatinBuilder::loadGroups(const CollationData
&data
, UErrorCode
&errorCode
) {
140 if(U_FAILURE(errorCode
)) { return FALSE
; }
141 headerLength
= 1 + NUM_SPECIAL_GROUPS
;
142 uint32_t r0
= (CollationFastLatin::VERSION
<< 8) | headerLength
;
143 result
.append((UChar
)r0
);
144 // The first few reordering groups should be special groups
145 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
146 for(int32_t i
= 0; i
< NUM_SPECIAL_GROUPS
; ++i
) {
147 lastSpecialPrimaries
[i
] = data
.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST
+ i
);
148 if(lastSpecialPrimaries
[i
] == 0) {
152 result
.append((UChar
)0); // reserve a slot for this group
155 firstDigitPrimary
= data
.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT
);
156 firstLatinPrimary
= data
.getFirstPrimaryForGroup(USCRIPT_LATIN
);
157 lastLatinPrimary
= data
.getLastPrimaryForGroup(USCRIPT_LATIN
);
158 if(firstDigitPrimary
== 0 || firstLatinPrimary
== 0) {
166 CollationFastLatinBuilder::inSameGroup(uint32_t p
, uint32_t q
) const {
167 // Both or neither need to be encoded as short primaries,
168 // so that we can test only one and use the same bit mask.
169 if(p
>= firstShortPrimary
) {
170 return q
>= firstShortPrimary
;
171 } else if(q
>= firstShortPrimary
) {
174 // Both or neither must be potentially-variable,
175 // so that we can test only one and determine if both are variable.
176 uint32_t lastVariablePrimary
= lastSpecialPrimaries
[NUM_SPECIAL_GROUPS
- 1];
177 if(p
> lastVariablePrimary
) {
178 return q
> lastVariablePrimary
;
179 } else if(q
> lastVariablePrimary
) {
182 // Both will be encoded with long mini primaries.
183 // They must be in the same special reordering group,
184 // so that we can test only one and determine if both are variable.
185 U_ASSERT(p
!= 0 && q
!= 0);
186 for(int32_t i
= 0;; ++i
) { // will terminate
187 uint32_t lastPrimary
= lastSpecialPrimaries
[i
];
188 if(p
<= lastPrimary
) {
189 return q
<= lastPrimary
;
190 } else if(q
<= lastPrimary
) {
197 CollationFastLatinBuilder::resetCEs() {
198 contractionCEs
.removeAllElements();
199 uniqueCEs
.removeAllElements();
200 shortPrimaryOverflow
= FALSE
;
201 result
.truncate(headerLength
);
205 CollationFastLatinBuilder::getCEs(const CollationData
&data
, UErrorCode
&errorCode
) {
206 if(U_FAILURE(errorCode
)) { return; }
208 for(UChar c
= 0;; ++i
, ++c
) {
209 if(c
== CollationFastLatin::LATIN_LIMIT
) {
210 c
= CollationFastLatin::PUNCT_START
;
211 } else if(c
== CollationFastLatin::PUNCT_LIMIT
) {
214 const CollationData
*d
;
215 uint32_t ce32
= data
.getCE32(c
);
216 if(ce32
== Collation::FALLBACK_CE32
) {
218 ce32
= d
->getCE32(c
);
222 if(getCEsFromCE32(*d
, c
, ce32
, errorCode
)) {
225 addUniqueCE(ce0
, errorCode
);
226 addUniqueCE(ce1
, errorCode
);
229 charCEs
[i
][0] = ce0
= Collation::NO_CE
;
230 charCEs
[i
][1] = ce1
= 0;
232 if(c
== 0 && !isContractionCharCE(ce0
)) {
233 // Always map U+0000 to a contraction.
234 // Write a contraction list with only a default value if there is no real contraction.
235 U_ASSERT(contractionCEs
.isEmpty());
236 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK
, ce0
, ce1
, errorCode
);
237 charCEs
[0][0] = ((int64_t)Collation::NO_CE_PRIMARY
<< 32) | CONTRACTION_FLAG
;
241 // Terminate the last contraction list.
242 contractionCEs
.addElement(CollationFastLatin::CONTR_CHAR_MASK
, errorCode
);
246 CollationFastLatinBuilder::getCEsFromCE32(const CollationData
&data
, UChar32 c
, uint32_t ce32
,
247 UErrorCode
&errorCode
) {
248 if(U_FAILURE(errorCode
)) { return FALSE
; }
249 ce32
= data
.getFinalCE32(ce32
);
251 if(Collation::isSimpleOrLongCE32(ce32
)) {
252 ce0
= Collation::ceFromCE32(ce32
);
254 switch(Collation::tagFromCE32(ce32
)) {
255 case Collation::LATIN_EXPANSION_TAG
:
256 ce0
= Collation::latinCE0FromCE32(ce32
);
257 ce1
= Collation::latinCE1FromCE32(ce32
);
259 case Collation::EXPANSION32_TAG
: {
260 const uint32_t *ce32s
= data
.ce32s
+ Collation::indexFromCE32(ce32
);
261 int32_t length
= Collation::lengthFromCE32(ce32
);
263 ce0
= Collation::ceFromCE32(ce32s
[0]);
265 ce1
= Collation::ceFromCE32(ce32s
[1]);
272 case Collation::EXPANSION_TAG
: {
273 const int64_t *ces
= data
.ces
+ Collation::indexFromCE32(ce32
);
274 int32_t length
= Collation::lengthFromCE32(ce32
);
285 // Note: We could support PREFIX_TAG (assert c>=0)
286 // by recursing on its default CE32 and checking that none of the prefixes starts
287 // with a fast Latin character.
288 // However, currently (2013) there are only the L-before-middle-dot
289 // prefix mappings in the Latin range, and those would be rejected anyway.
290 case Collation::CONTRACTION_TAG
:
292 return getCEsFromContractionCE32(data
, ce32
, errorCode
);
293 case Collation::OFFSET_TAG
:
295 ce0
= data
.getCEFromOffsetCE32(c
, ce32
);
301 // A mapping can be completely ignorable.
302 if(ce0
== 0) { return ce1
== 0; }
303 // We do not support an ignorable ce0 unless it is completely ignorable.
304 uint32_t p0
= (uint32_t)(ce0
>> 32);
305 if(p0
== 0) { return FALSE
; }
306 // We only support primaries up to the Latin script.
307 if(p0
> lastLatinPrimary
) { return FALSE
; }
308 // We support non-common secondary and case weights only together with short primaries.
309 uint32_t lower32_0
= (uint32_t)ce0
;
310 if(p0
< firstShortPrimary
) {
311 uint32_t sc0
= lower32_0
& Collation::SECONDARY_AND_CASE_MASK
;
312 if(sc0
!= Collation::COMMON_SECONDARY_CE
) { return FALSE
; }
314 // No below-common tertiary weights.
315 if((lower32_0
& Collation::ONLY_TERTIARY_MASK
) < Collation::COMMON_WEIGHT16
) { return FALSE
; }
317 // Both primaries must be in the same group,
318 // or both must get short mini primaries,
319 // or a short-primary CE is followed by a secondary CE.
320 // This is so that we can test the first primary and use the same mask for both,
321 // and determine for both whether they are variable.
322 uint32_t p1
= (uint32_t)(ce1
>> 32);
323 if(p1
== 0 ? p0
< firstShortPrimary
: !inSameGroup(p0
, p1
)) { return FALSE
; }
324 uint32_t lower32_1
= (uint32_t)ce1
;
326 if((lower32_1
>> 16) == 0) { return FALSE
; }
327 // We support non-common secondary and case weights
328 // only for secondary CEs or together with short primaries.
329 if(p1
!= 0 && p1
< firstShortPrimary
) {
330 uint32_t sc1
= lower32_1
& Collation::SECONDARY_AND_CASE_MASK
;
331 if(sc1
!= Collation::COMMON_SECONDARY_CE
) { return FALSE
; }
333 // No below-common tertiary weights.
334 if((lower32_1
& Collation::ONLY_TERTIARY_MASK
) < Collation::COMMON_WEIGHT16
) { return FALSE
; }
336 // No quaternary weights.
337 if(((ce0
| ce1
) & Collation::QUATERNARY_MASK
) != 0) { return FALSE
; }
342 CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData
&data
, uint32_t ce32
,
343 UErrorCode
&errorCode
) {
344 if(U_FAILURE(errorCode
)) { return FALSE
; }
345 const UChar
*p
= data
.contexts
+ Collation::indexFromCE32(ce32
);
346 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
347 // Since the original ce32 is not a prefix mapping,
348 // the default ce32 must not be another contraction.
349 U_ASSERT(!Collation::isContractionCE32(ce32
));
350 int32_t contractionIndex
= contractionCEs
.size();
351 if(getCEsFromCE32(data
, U_SENTINEL
, ce32
, errorCode
)) {
352 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK
, ce0
, ce1
, errorCode
);
354 // Bail out for c-without-contraction.
355 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK
, Collation::NO_CE
, 0, errorCode
);
357 // Handle an encodable contraction unless the next contraction is too long
358 // and starts with the same character.
360 UBool addContraction
= FALSE
;
361 UCharsTrie::Iterator
suffixes(p
+ 2, 0, errorCode
);
362 while(suffixes
.next(errorCode
)) {
363 const UnicodeString
&suffix
= suffixes
.getString();
364 int32_t x
= CollationFastLatin::getCharIndex(suffix
.charAt(0));
365 if(x
< 0) { continue; } // ignore anything but fast Latin text
368 // Bail out for all contractions starting with this character.
369 addContractionEntry(x
, Collation::NO_CE
, 0, errorCode
);
370 addContraction
= FALSE
;
375 addContractionEntry(prevX
, ce0
, ce1
, errorCode
);
377 ce32
= (uint32_t)suffixes
.getValue();
378 if(suffix
.length() == 1 && getCEsFromCE32(data
, U_SENTINEL
, ce32
, errorCode
)) {
379 addContraction
= TRUE
;
381 addContractionEntry(x
, Collation::NO_CE
, 0, errorCode
);
382 addContraction
= FALSE
;
387 addContractionEntry(prevX
, ce0
, ce1
, errorCode
);
389 if(U_FAILURE(errorCode
)) { return FALSE
; }
390 // Note: There might not be any fast Latin contractions, but
391 // we need to enter contraction handling anyway so that we can bail out
392 // when there is a non-fast-Latin character following.
393 // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
394 // following umlaut and bail out, rather than return the difference of Y vs. u.
395 ce0
= ((int64_t)Collation::NO_CE_PRIMARY
<< 32) | CONTRACTION_FLAG
| contractionIndex
;
401 CollationFastLatinBuilder::addContractionEntry(int32_t x
, int64_t cce0
, int64_t cce1
,
402 UErrorCode
&errorCode
) {
403 contractionCEs
.addElement(x
, errorCode
);
404 contractionCEs
.addElement(cce0
, errorCode
);
405 contractionCEs
.addElement(cce1
, errorCode
);
406 addUniqueCE(cce0
, errorCode
);
407 addUniqueCE(cce1
, errorCode
);
411 CollationFastLatinBuilder::addUniqueCE(int64_t ce
, UErrorCode
&errorCode
) {
412 if(U_FAILURE(errorCode
)) { return; }
413 if(ce
== 0 || (uint32_t)(ce
>> 32) == Collation::NO_CE_PRIMARY
) { return; }
414 ce
&= ~(int64_t)Collation::CASE_MASK
; // blank out case bits
415 int32_t i
= binarySearch(uniqueCEs
.getBuffer(), uniqueCEs
.size(), ce
);
417 uniqueCEs
.insertElementAt(ce
, ~i
, errorCode
);
422 CollationFastLatinBuilder::getMiniCE(int64_t ce
) const {
423 ce
&= ~(int64_t)Collation::CASE_MASK
; // blank out case bits
424 int32_t index
= binarySearch(uniqueCEs
.getBuffer(), uniqueCEs
.size(), ce
);
425 U_ASSERT(index
>= 0);
426 return miniCEs
[index
];
430 CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode
&errorCode
) {
431 if(U_FAILURE(errorCode
)) { return FALSE
; }
433 miniCEs
= (uint16_t *)uprv_malloc(uniqueCEs
.size() * 2);
434 if(miniCEs
== NULL
) {
435 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
439 uint32_t lastGroupPrimary
= lastSpecialPrimaries
[group
];
440 // The lowest unique CE must be at least a secondary CE.
441 U_ASSERT(((uint32_t)uniqueCEs
.elementAti(0) >> 16) != 0);
442 uint32_t prevPrimary
= 0;
443 uint32_t prevSecondary
= 0;
446 uint32_t ter
= CollationFastLatin::COMMON_TER
;
447 for(int32_t i
= 0; i
< uniqueCEs
.size(); ++i
) {
448 int64_t ce
= uniqueCEs
.elementAti(i
);
449 // Note: At least one of the p/s/t weights changes from one unique CE to the next.
450 // (uniqueCEs does not store case bits.)
451 uint32_t p
= (uint32_t)(ce
>> 32);
452 if(p
!= prevPrimary
) {
453 while(p
> lastGroupPrimary
) {
454 U_ASSERT(pri
<= CollationFastLatin::MAX_LONG
);
455 // Set the group's header entry to the
456 // last "long primary" in or before the group.
457 result
.setCharAt(1 + group
, (UChar
)pri
);
458 if(++group
< NUM_SPECIAL_GROUPS
) {
459 lastGroupPrimary
= lastSpecialPrimaries
[group
];
461 lastGroupPrimary
= 0xffffffff;
465 if(p
< firstShortPrimary
) {
467 pri
= CollationFastLatin::MIN_LONG
;
468 } else if(pri
< CollationFastLatin::MAX_LONG
) {
469 pri
+= CollationFastLatin::LONG_INC
;
471 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
472 printf("long-primary overflow for %08x\n", p
);
474 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
478 if(pri
< CollationFastLatin::MIN_SHORT
) {
479 pri
= CollationFastLatin::MIN_SHORT
;
480 } else if(pri
< (CollationFastLatin::MAX_SHORT
- CollationFastLatin::SHORT_INC
)) {
481 // Reserve the highest primary weight for U+FFFF.
482 pri
+= CollationFastLatin::SHORT_INC
;
484 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
485 printf("short-primary overflow for %08x\n", p
);
487 shortPrimaryOverflow
= TRUE
;
488 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
493 prevSecondary
= Collation::COMMON_WEIGHT16
;
494 sec
= CollationFastLatin::COMMON_SEC
;
495 ter
= CollationFastLatin::COMMON_TER
;
497 uint32_t lower32
= (uint32_t)ce
;
498 uint32_t s
= lower32
>> 16;
499 if(s
!= prevSecondary
) {
502 sec
= CollationFastLatin::MIN_SEC_HIGH
;
503 } else if(sec
< CollationFastLatin::MAX_SEC_HIGH
) {
504 sec
+= CollationFastLatin::SEC_INC
;
506 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
510 ter
= CollationFastLatin::COMMON_TER
;
511 } else if(s
< Collation::COMMON_WEIGHT16
) {
512 if(sec
== CollationFastLatin::COMMON_SEC
) {
513 sec
= CollationFastLatin::MIN_SEC_BEFORE
;
514 } else if(sec
< CollationFastLatin::MAX_SEC_BEFORE
) {
515 sec
+= CollationFastLatin::SEC_INC
;
517 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
520 } else if(s
== Collation::COMMON_WEIGHT16
) {
521 sec
= CollationFastLatin::COMMON_SEC
;
523 if(sec
< CollationFastLatin::MIN_SEC_AFTER
) {
524 sec
= CollationFastLatin::MIN_SEC_AFTER
;
525 } else if(sec
< CollationFastLatin::MAX_SEC_AFTER
) {
526 sec
+= CollationFastLatin::SEC_INC
;
528 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
533 ter
= CollationFastLatin::COMMON_TER
;
535 U_ASSERT((lower32
& Collation::CASE_MASK
) == 0); // blanked out in uniqueCEs
536 uint32_t t
= lower32
& Collation::ONLY_TERTIARY_MASK
;
537 if(t
> Collation::COMMON_WEIGHT16
) {
538 if(ter
< CollationFastLatin::MAX_TER_AFTER
) {
541 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
545 if(CollationFastLatin::MIN_LONG
<= pri
&& pri
<= CollationFastLatin::MAX_LONG
) {
546 U_ASSERT(sec
== CollationFastLatin::COMMON_SEC
);
547 miniCEs
[i
] = (uint16_t)(pri
| ter
);
549 miniCEs
[i
] = (uint16_t)(pri
| sec
| ter
);
552 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
553 printf("last mini primary: %04x\n", pri
);
555 #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
556 for(int32_t i
= 0; i
< uniqueCEs
.size(); ++i
) {
557 int64_t ce
= uniqueCEs
.elementAti(i
);
558 printf("unique CE 0x%016lx -> 0x%04x\n", ce
, miniCEs
[i
]);
561 return U_SUCCESS(errorCode
);
565 CollationFastLatinBuilder::encodeCharCEs(UErrorCode
&errorCode
) {
566 if(U_FAILURE(errorCode
)) { return FALSE
; }
567 int32_t miniCEsStart
= result
.length();
568 for(int32_t i
= 0; i
< CollationFastLatin::NUM_FAST_CHARS
; ++i
) {
569 result
.append((UChar
)0); // initialize to completely ignorable
571 int32_t indexBase
= result
.length();
572 for(int32_t i
= 0; i
< CollationFastLatin::NUM_FAST_CHARS
; ++i
) {
573 int64_t ce
= charCEs
[i
][0];
574 if(isContractionCharCE(ce
)) { continue; } // defer contraction
575 uint32_t miniCE
= encodeTwoCEs(ce
, charCEs
[i
][1]);
576 if(miniCE
> 0xffff) {
577 // Note: There is a chance that this new expansion is the same as a previous one,
578 // and if so, then we could reuse the other expansion.
579 // However, that seems unlikely.
580 int32_t expansionIndex
= result
.length() - indexBase
;
581 if(expansionIndex
> (int32_t)CollationFastLatin::INDEX_MASK
) {
582 miniCE
= CollationFastLatin::BAIL_OUT
;
584 result
.append((UChar
)(miniCE
>> 16)).append((UChar
)miniCE
);
585 miniCE
= CollationFastLatin::EXPANSION
| expansionIndex
;
588 result
.setCharAt(miniCEsStart
+ i
, (UChar
)miniCE
);
590 return U_SUCCESS(errorCode
);
594 CollationFastLatinBuilder::encodeContractions(UErrorCode
&errorCode
) {
595 // We encode all contraction lists so that the first word of a list
596 // terminates the previous list, and we only need one additional terminator at the end.
597 if(U_FAILURE(errorCode
)) { return FALSE
; }
598 int32_t indexBase
= headerLength
+ CollationFastLatin::NUM_FAST_CHARS
;
599 int32_t firstContractionIndex
= result
.length();
600 for(int32_t i
= 0; i
< CollationFastLatin::NUM_FAST_CHARS
; ++i
) {
601 int64_t ce
= charCEs
[i
][0];
602 if(!isContractionCharCE(ce
)) { continue; }
603 int32_t contractionIndex
= result
.length() - indexBase
;
604 if(contractionIndex
> (int32_t)CollationFastLatin::INDEX_MASK
) {
605 result
.setCharAt(headerLength
+ i
, CollationFastLatin::BAIL_OUT
);
608 UBool firstTriple
= TRUE
;
609 for(int32_t index
= (int32_t)ce
& 0x7fffffff;; index
+= 3) {
610 int32_t x
= static_cast<int32_t>(contractionCEs
.elementAti(index
));
611 if((uint32_t)x
== CollationFastLatin::CONTR_CHAR_MASK
&& !firstTriple
) { break; }
612 int64_t cce0
= contractionCEs
.elementAti(index
+ 1);
613 int64_t cce1
= contractionCEs
.elementAti(index
+ 2);
614 uint32_t miniCE
= encodeTwoCEs(cce0
, cce1
);
615 if(miniCE
== CollationFastLatin::BAIL_OUT
) {
616 result
.append((UChar
)(x
| (1 << CollationFastLatin::CONTR_LENGTH_SHIFT
)));
617 } else if(miniCE
<= 0xffff) {
618 result
.append((UChar
)(x
| (2 << CollationFastLatin::CONTR_LENGTH_SHIFT
)));
619 result
.append((UChar
)miniCE
);
621 result
.append((UChar
)(x
| (3 << CollationFastLatin::CONTR_LENGTH_SHIFT
)));
622 result
.append((UChar
)(miniCE
>> 16)).append((UChar
)miniCE
);
626 // Note: There is a chance that this new contraction list is the same as a previous one,
627 // and if so, then we could truncate the result and reuse the other list.
628 // However, that seems unlikely.
629 result
.setCharAt(headerLength
+ i
,
630 (UChar
)(CollationFastLatin::CONTRACTION
| contractionIndex
));
632 if(result
.length() > firstContractionIndex
) {
633 // Terminate the last contraction list.
634 result
.append((UChar
)CollationFastLatin::CONTR_CHAR_MASK
);
636 if(result
.isBogus()) {
637 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
640 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
641 printf("** fast Latin %d * 2 = %d bytes\n", result
.length(), result
.length() * 2);
642 puts(" header & below-digit groups map");
644 for(; i
< headerLength
; ++i
) {
645 printf(" %04x", result
[i
]);
647 printf("\n char mini CEs");
648 U_ASSERT(CollationFastLatin::NUM_FAST_CHARS
% 16 == 0);
649 for(; i
< indexBase
; i
+= 16) {
650 UChar32 c
= i
- headerLength
;
651 if(c
>= CollationFastLatin::LATIN_LIMIT
) {
652 c
= CollationFastLatin::PUNCT_START
+ c
- CollationFastLatin::LATIN_LIMIT
;
654 printf("\n %04x:", c
);
655 for(int32_t j
= 0; j
< 16; ++j
) {
656 printf(" %04x", result
[i
+ j
]);
659 printf("\n expansions & contractions");
660 for(; i
< result
.length(); ++i
) {
661 if((i
- indexBase
) % 16 == 0) { puts(""); }
662 printf(" %04x", result
[i
]);
670 CollationFastLatinBuilder::encodeTwoCEs(int64_t first
, int64_t second
) const {
672 return 0; // completely ignorable
674 if(first
== Collation::NO_CE
) {
675 return CollationFastLatin::BAIL_OUT
;
677 U_ASSERT((uint32_t)(first
>> 32) != Collation::NO_CE_PRIMARY
);
679 uint32_t miniCE
= getMiniCE(first
);
680 if(miniCE
== CollationFastLatin::BAIL_OUT
) { return miniCE
; }
681 if(miniCE
>= CollationFastLatin::MIN_SHORT
) {
682 // Extract & copy the case bits.
683 // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
684 uint32_t c
= (((uint32_t)first
& Collation::CASE_MASK
) >> (14 - 3));
685 // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
686 c
+= CollationFastLatin::LOWER_CASE
;
689 if(second
== 0) { return miniCE
; }
691 uint32_t miniCE1
= getMiniCE(second
);
692 if(miniCE1
== CollationFastLatin::BAIL_OUT
) { return miniCE1
; }
694 uint32_t case1
= (uint32_t)second
& Collation::CASE_MASK
;
695 if(miniCE
>= CollationFastLatin::MIN_SHORT
&&
696 (miniCE
& CollationFastLatin::SECONDARY_MASK
) == CollationFastLatin::COMMON_SEC
) {
697 // Try to combine the two mini CEs into one.
698 uint32_t sec1
= miniCE1
& CollationFastLatin::SECONDARY_MASK
;
699 uint32_t ter1
= miniCE1
& CollationFastLatin::TERTIARY_MASK
;
700 if(sec1
>= CollationFastLatin::MIN_SEC_HIGH
&& case1
== 0 &&
701 ter1
== CollationFastLatin::COMMON_TER
) {
702 // sec1>=sec_high implies pri1==0.
703 return (miniCE
& ~CollationFastLatin::SECONDARY_MASK
) | sec1
;
707 if(miniCE1
<= CollationFastLatin::SECONDARY_MASK
|| CollationFastLatin::MIN_SHORT
<= miniCE1
) {
708 // Secondary CE, or a CE with a short primary, copy the case bits.
709 case1
= (case1
>> (14 - 3)) + CollationFastLatin::LOWER_CASE
;
712 return (miniCE
<< 16) | miniCE1
;
717 #endif // !UCONFIG_NO_COLLATION