/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationfastlatinbuilder.cpp
*
* created on: 2013aug09
* created by: Markus W. Scherer
*/
12 #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2
13 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
18 #include "unicode/utypes.h"
20 #if !UCONFIG_NO_COLLATION
#include "unicode/ucol.h"
#include "unicode/ucharstrie.h"
#include "unicode/unistr.h"
#include "unicode/uobject.h"
#include "unicode/uscript.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationfastlatin.h"
#include "collationfastlatinbuilder.h"
#include "uassert.h"
#include "uvectr64.h"
namespace {

/**
 * Compare two signed int64_t values as if they were unsigned.
 *
 * @param a first value
 * @param b second value
 * @return -1 if a<b, +1 if a>b, 0 if a==b, where both are compared
 *         after casting to uint64_t
 */
int32_t
compareInt64AsUnsigned(int64_t a, int64_t b) {
    if((uint64_t)a < (uint64_t)b) {
        return -1;
    } else if((uint64_t)a > (uint64_t)b) {
        return 1;
    } else {
        return 0;
    }
}

// TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp
/**
 * Like Java Collections.binarySearch(List, String, Comparator).
 * The list must be sorted in ascending order according to compareInt64AsUnsigned().
 *
 * @param list the sorted array to search
 * @param limit the number of elements in list
 * @param ce the value to look for
 * @return the index>=0 where the item was found,
 *         or the index<0 for inserting the string at ~index in sorted order
 */
int32_t
binarySearch(const int64_t list[], int32_t limit, int64_t ce) {
    if (limit == 0) { return ~0; }
    int32_t start = 0;
    for (;;) {
        int32_t i = (start + limit) / 2;
        int32_t cmp = compareInt64AsUnsigned(ce, list[i]);
        if (cmp == 0) {
            return i;
        } else if (cmp < 0) {
            if (i == start) {
                return ~start;  // insert ce before i
            }
            limit = i;
        } else {
            if (i == start) {
                return ~(start + 1);  // insert ce after i
            }
            start = i;
        }
    }
}

}  // namespace
87 CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode
&errorCode
)
89 contractionCEs(errorCode
), uniqueCEs(errorCode
),
91 firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0),
92 firstShortPrimary(0), shortPrimaryOverflow(FALSE
),
96 CollationFastLatinBuilder::~CollationFastLatinBuilder() {
101 CollationFastLatinBuilder::forData(const CollationData
&data
, UErrorCode
&errorCode
) {
102 if(U_FAILURE(errorCode
)) { return FALSE
; }
103 if(!result
.isEmpty()) { // This builder is not reusable.
104 errorCode
= U_INVALID_STATE_ERROR
;
107 if(!loadGroups(data
, errorCode
)) { return FALSE
; }
109 // Fast handling of digits.
110 firstShortPrimary
= firstDigitPrimary
;
111 getCEs(data
, errorCode
);
112 if(!encodeUniqueCEs(errorCode
)) { return FALSE
; }
113 if(shortPrimaryOverflow
) {
114 // Give digits long mini primaries,
115 // so that there are more short primaries for letters.
116 firstShortPrimary
= firstLatinPrimary
;
118 getCEs(data
, errorCode
);
119 if(!encodeUniqueCEs(errorCode
)) { return FALSE
; }
121 // Note: If we still have a short-primary overflow but not a long-primary overflow,
122 // then we could calculate how many more long primaries would fit,
123 // and set the firstShortPrimary to that many after the current firstShortPrimary,
125 // However, this might only benefit the en_US_POSIX tailoring,
126 // and it is simpler to suppress building fast Latin data for it in genrb,
127 // or by returning FALSE here if shortPrimaryOverflow.
129 UBool ok
= !shortPrimaryOverflow
&&
130 encodeCharCEs(errorCode
) && encodeContractions(errorCode
);
131 contractionCEs
.removeAllElements(); // might reduce heap memory usage
132 uniqueCEs
.removeAllElements();
137 CollationFastLatinBuilder::loadGroups(const CollationData
&data
, UErrorCode
&errorCode
) {
138 if(U_FAILURE(errorCode
)) { return FALSE
; }
139 headerLength
= 1 + NUM_SPECIAL_GROUPS
;
140 uint32_t r0
= (CollationFastLatin::VERSION
<< 8) | headerLength
;
141 result
.append((UChar
)r0
);
142 // The first few reordering groups should be special groups
143 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
144 for(int32_t i
= 0; i
< NUM_SPECIAL_GROUPS
; ++i
) {
145 lastSpecialPrimaries
[i
] = data
.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST
+ i
);
146 if(lastSpecialPrimaries
[i
] == 0) {
150 result
.append(0); // reserve a slot for this group
153 firstDigitPrimary
= data
.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT
);
154 firstLatinPrimary
= data
.getFirstPrimaryForGroup(USCRIPT_LATIN
);
155 lastLatinPrimary
= data
.getLastPrimaryForGroup(USCRIPT_LATIN
);
156 if(firstDigitPrimary
== 0 || firstLatinPrimary
== 0) {
164 CollationFastLatinBuilder::inSameGroup(uint32_t p
, uint32_t q
) const {
165 // Both or neither need to be encoded as short primaries,
166 // so that we can test only one and use the same bit mask.
167 if(p
>= firstShortPrimary
) {
168 return q
>= firstShortPrimary
;
169 } else if(q
>= firstShortPrimary
) {
172 // Both or neither must be potentially-variable,
173 // so that we can test only one and determine if both are variable.
174 uint32_t lastVariablePrimary
= lastSpecialPrimaries
[NUM_SPECIAL_GROUPS
- 1];
175 if(p
> lastVariablePrimary
) {
176 return q
> lastVariablePrimary
;
177 } else if(q
> lastVariablePrimary
) {
180 // Both will be encoded with long mini primaries.
181 // They must be in the same special reordering group,
182 // so that we can test only one and determine if both are variable.
183 U_ASSERT(p
!= 0 && q
!= 0);
184 for(int32_t i
= 0;; ++i
) { // will terminate
185 uint32_t lastPrimary
= lastSpecialPrimaries
[i
];
186 if(p
<= lastPrimary
) {
187 return q
<= lastPrimary
;
188 } else if(q
<= lastPrimary
) {
195 CollationFastLatinBuilder::resetCEs() {
196 contractionCEs
.removeAllElements();
197 uniqueCEs
.removeAllElements();
198 shortPrimaryOverflow
= FALSE
;
199 result
.truncate(headerLength
);
203 CollationFastLatinBuilder::getCEs(const CollationData
&data
, UErrorCode
&errorCode
) {
204 if(U_FAILURE(errorCode
)) { return; }
206 for(UChar c
= 0;; ++i
, ++c
) {
207 if(c
== CollationFastLatin::LATIN_LIMIT
) {
208 c
= CollationFastLatin::PUNCT_START
;
209 } else if(c
== CollationFastLatin::PUNCT_LIMIT
) {
212 const CollationData
*d
;
213 uint32_t ce32
= data
.getCE32(c
);
214 if(ce32
== Collation::FALLBACK_CE32
) {
216 ce32
= d
->getCE32(c
);
220 if(getCEsFromCE32(*d
, c
, ce32
, errorCode
)) {
223 addUniqueCE(ce0
, errorCode
);
224 addUniqueCE(ce1
, errorCode
);
227 charCEs
[i
][0] = ce0
= Collation::NO_CE
;
228 charCEs
[i
][1] = ce1
= 0;
230 if(c
== 0 && !isContractionCharCE(ce0
)) {
231 // Always map U+0000 to a contraction.
232 // Write a contraction list with only a default value if there is no real contraction.
233 U_ASSERT(contractionCEs
.isEmpty());
234 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK
, ce0
, ce1
, errorCode
);
235 charCEs
[0][0] = ((int64_t)Collation::NO_CE_PRIMARY
<< 32) | CONTRACTION_FLAG
;
239 // Terminate the last contraction list.
240 contractionCEs
.addElement(CollationFastLatin::CONTR_CHAR_MASK
, errorCode
);
244 CollationFastLatinBuilder::getCEsFromCE32(const CollationData
&data
, UChar32 c
, uint32_t ce32
,
245 UErrorCode
&errorCode
) {
246 if(U_FAILURE(errorCode
)) { return FALSE
; }
247 ce32
= data
.getFinalCE32(ce32
);
249 if(Collation::isSimpleOrLongCE32(ce32
)) {
250 ce0
= Collation::ceFromCE32(ce32
);
252 switch(Collation::tagFromCE32(ce32
)) {
253 case Collation::LATIN_EXPANSION_TAG
:
254 ce0
= Collation::latinCE0FromCE32(ce32
);
255 ce1
= Collation::latinCE1FromCE32(ce32
);
257 case Collation::EXPANSION32_TAG
: {
258 const uint32_t *ce32s
= data
.ce32s
+ Collation::indexFromCE32(ce32
);
259 int32_t length
= Collation::lengthFromCE32(ce32
);
261 ce0
= Collation::ceFromCE32(ce32s
[0]);
263 ce1
= Collation::ceFromCE32(ce32s
[1]);
270 case Collation::EXPANSION_TAG
: {
271 const int64_t *ces
= data
.ces
+ Collation::indexFromCE32(ce32
);
272 int32_t length
= Collation::lengthFromCE32(ce32
);
283 // Note: We could support PREFIX_TAG (assert c>=0)
284 // by recursing on its default CE32 and checking that none of the prefixes starts
285 // with a fast Latin character.
286 // However, currently (2013) there are only the L-before-middle-dot
287 // prefix mappings in the Latin range, and those would be rejected anyway.
288 case Collation::CONTRACTION_TAG
:
290 return getCEsFromContractionCE32(data
, ce32
, errorCode
);
291 case Collation::OFFSET_TAG
:
293 ce0
= data
.getCEFromOffsetCE32(c
, ce32
);
299 // A mapping can be completely ignorable.
300 if(ce0
== 0) { return ce1
== 0; }
301 // We do not support an ignorable ce0 unless it is completely ignorable.
302 uint32_t p0
= (uint32_t)(ce0
>> 32);
303 if(p0
== 0) { return FALSE
; }
304 // We only support primaries up to the Latin script.
305 if(p0
> lastLatinPrimary
) { return FALSE
; }
306 // We support non-common secondary and case weights only together with short primaries.
307 uint32_t lower32_0
= (uint32_t)ce0
;
308 if(p0
< firstShortPrimary
) {
309 uint32_t sc0
= lower32_0
& Collation::SECONDARY_AND_CASE_MASK
;
310 if(sc0
!= Collation::COMMON_SECONDARY_CE
) { return FALSE
; }
312 // No below-common tertiary weights.
313 if((lower32_0
& Collation::ONLY_TERTIARY_MASK
) < Collation::COMMON_WEIGHT16
) { return FALSE
; }
315 // Both primaries must be in the same group,
316 // or both must get short mini primaries,
317 // or a short-primary CE is followed by a secondary CE.
318 // This is so that we can test the first primary and use the same mask for both,
319 // and determine for both whether they are variable.
320 uint32_t p1
= (uint32_t)(ce1
>> 32);
321 if(p1
== 0 ? p0
< firstShortPrimary
: !inSameGroup(p0
, p1
)) { return FALSE
; }
322 uint32_t lower32_1
= (uint32_t)ce1
;
324 if((lower32_1
>> 16) == 0) { return FALSE
; }
325 // We support non-common secondary and case weights
326 // only for secondary CEs or together with short primaries.
327 if(p1
!= 0 && p1
< firstShortPrimary
) {
328 uint32_t sc1
= lower32_1
& Collation::SECONDARY_AND_CASE_MASK
;
329 if(sc1
!= Collation::COMMON_SECONDARY_CE
) { return FALSE
; }
331 // No below-common tertiary weights.
332 if((lower32_1
& Collation::ONLY_TERTIARY_MASK
) < Collation::COMMON_WEIGHT16
) { return FALSE
; }
334 // No quaternary weights.
335 if(((ce0
| ce1
) & Collation::QUATERNARY_MASK
) != 0) { return FALSE
; }
340 CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData
&data
, uint32_t ce32
,
341 UErrorCode
&errorCode
) {
342 if(U_FAILURE(errorCode
)) { return FALSE
; }
343 const UChar
*p
= data
.contexts
+ Collation::indexFromCE32(ce32
);
344 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
345 // Since the original ce32 is not a prefix mapping,
346 // the default ce32 must not be another contraction.
347 U_ASSERT(!Collation::isContractionCE32(ce32
));
348 int32_t contractionIndex
= contractionCEs
.size();
349 if(getCEsFromCE32(data
, U_SENTINEL
, ce32
, errorCode
)) {
350 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK
, ce0
, ce1
, errorCode
);
352 // Bail out for c-without-contraction.
353 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK
, Collation::NO_CE
, 0, errorCode
);
355 // Handle an encodable contraction unless the next contraction is too long
356 // and starts with the same character.
358 UBool addContraction
= FALSE
;
359 UCharsTrie::Iterator
suffixes(p
+ 2, 0, errorCode
);
360 while(suffixes
.next(errorCode
)) {
361 const UnicodeString
&suffix
= suffixes
.getString();
362 int32_t x
= CollationFastLatin::getCharIndex(suffix
.charAt(0));
363 if(x
< 0) { continue; } // ignore anything but fast Latin text
366 // Bail out for all contractions starting with this character.
367 addContractionEntry(x
, Collation::NO_CE
, 0, errorCode
);
368 addContraction
= FALSE
;
373 addContractionEntry(prevX
, ce0
, ce1
, errorCode
);
375 ce32
= (uint32_t)suffixes
.getValue();
376 if(suffix
.length() == 1 && getCEsFromCE32(data
, U_SENTINEL
, ce32
, errorCode
)) {
377 addContraction
= TRUE
;
379 addContractionEntry(x
, Collation::NO_CE
, 0, errorCode
);
380 addContraction
= FALSE
;
385 addContractionEntry(prevX
, ce0
, ce1
, errorCode
);
387 if(U_FAILURE(errorCode
)) { return FALSE
; }
388 // Note: There might not be any fast Latin contractions, but
389 // we need to enter contraction handling anyway so that we can bail out
390 // when there is a non-fast-Latin character following.
391 // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
392 // following umlaut and bail out, rather than return the difference of Y vs. u.
393 ce0
= ((int64_t)Collation::NO_CE_PRIMARY
<< 32) | CONTRACTION_FLAG
| contractionIndex
;
399 CollationFastLatinBuilder::addContractionEntry(int32_t x
, int64_t cce0
, int64_t cce1
,
400 UErrorCode
&errorCode
) {
401 contractionCEs
.addElement(x
, errorCode
);
402 contractionCEs
.addElement(cce0
, errorCode
);
403 contractionCEs
.addElement(cce1
, errorCode
);
404 addUniqueCE(cce0
, errorCode
);
405 addUniqueCE(cce1
, errorCode
);
409 CollationFastLatinBuilder::addUniqueCE(int64_t ce
, UErrorCode
&errorCode
) {
410 if(U_FAILURE(errorCode
)) { return; }
411 if(ce
== 0 || (uint32_t)(ce
>> 32) == Collation::NO_CE_PRIMARY
) { return; }
412 ce
&= ~(int64_t)Collation::CASE_MASK
; // blank out case bits
413 int32_t i
= binarySearch(uniqueCEs
.getBuffer(), uniqueCEs
.size(), ce
);
415 uniqueCEs
.insertElementAt(ce
, ~i
, errorCode
);
420 CollationFastLatinBuilder::getMiniCE(int64_t ce
) const {
421 ce
&= ~(int64_t)Collation::CASE_MASK
; // blank out case bits
422 int32_t index
= binarySearch(uniqueCEs
.getBuffer(), uniqueCEs
.size(), ce
);
423 U_ASSERT(index
>= 0);
424 return miniCEs
[index
];
428 CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode
&errorCode
) {
429 if(U_FAILURE(errorCode
)) { return FALSE
; }
431 miniCEs
= (uint16_t *)uprv_malloc(uniqueCEs
.size() * 2);
432 if(miniCEs
== NULL
) {
433 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
437 uint32_t lastGroupPrimary
= lastSpecialPrimaries
[group
];
438 // The lowest unique CE must be at least a secondary CE.
439 U_ASSERT(((uint32_t)uniqueCEs
.elementAti(0) >> 16) != 0);
440 uint32_t prevPrimary
= 0;
441 uint32_t prevSecondary
= 0;
444 uint32_t ter
= CollationFastLatin::COMMON_TER
;
445 for(int32_t i
= 0; i
< uniqueCEs
.size(); ++i
) {
446 int64_t ce
= uniqueCEs
.elementAti(i
);
447 // Note: At least one of the p/s/t weights changes from one unique CE to the next.
448 // (uniqueCEs does not store case bits.)
449 uint32_t p
= (uint32_t)(ce
>> 32);
450 if(p
!= prevPrimary
) {
451 while(p
> lastGroupPrimary
) {
452 U_ASSERT(pri
<= CollationFastLatin::MAX_LONG
);
453 // Set the group's header entry to the
454 // last "long primary" in or before the group.
455 result
.setCharAt(1 + group
, (UChar
)pri
);
456 if(++group
< NUM_SPECIAL_GROUPS
) {
457 lastGroupPrimary
= lastSpecialPrimaries
[group
];
459 lastGroupPrimary
= 0xffffffff;
463 if(p
< firstShortPrimary
) {
465 pri
= CollationFastLatin::MIN_LONG
;
466 } else if(pri
< CollationFastLatin::MAX_LONG
) {
467 pri
+= CollationFastLatin::LONG_INC
;
469 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
470 printf("long-primary overflow for %08x\n", p
);
472 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
476 if(pri
< CollationFastLatin::MIN_SHORT
) {
477 pri
= CollationFastLatin::MIN_SHORT
;
478 } else if(pri
< (CollationFastLatin::MAX_SHORT
- CollationFastLatin::SHORT_INC
)) {
479 // Reserve the highest primary weight for U+FFFF.
480 pri
+= CollationFastLatin::SHORT_INC
;
482 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
483 printf("short-primary overflow for %08x\n", p
);
485 shortPrimaryOverflow
= TRUE
;
486 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
491 prevSecondary
= Collation::COMMON_WEIGHT16
;
492 sec
= CollationFastLatin::COMMON_SEC
;
493 ter
= CollationFastLatin::COMMON_TER
;
495 uint32_t lower32
= (uint32_t)ce
;
496 uint32_t s
= lower32
>> 16;
497 if(s
!= prevSecondary
) {
500 sec
= CollationFastLatin::MIN_SEC_HIGH
;
501 } else if(sec
< CollationFastLatin::MAX_SEC_HIGH
) {
502 sec
+= CollationFastLatin::SEC_INC
;
504 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
508 ter
= CollationFastLatin::COMMON_TER
;
509 } else if(s
< Collation::COMMON_WEIGHT16
) {
510 if(sec
== CollationFastLatin::COMMON_SEC
) {
511 sec
= CollationFastLatin::MIN_SEC_BEFORE
;
512 } else if(sec
< CollationFastLatin::MAX_SEC_BEFORE
) {
513 sec
+= CollationFastLatin::SEC_INC
;
515 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
518 } else if(s
== Collation::COMMON_WEIGHT16
) {
519 sec
= CollationFastLatin::COMMON_SEC
;
521 if(sec
< CollationFastLatin::MIN_SEC_AFTER
) {
522 sec
= CollationFastLatin::MIN_SEC_AFTER
;
523 } else if(sec
< CollationFastLatin::MAX_SEC_AFTER
) {
524 sec
+= CollationFastLatin::SEC_INC
;
526 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
531 ter
= CollationFastLatin::COMMON_TER
;
533 U_ASSERT((lower32
& Collation::CASE_MASK
) == 0); // blanked out in uniqueCEs
534 uint32_t t
= lower32
& Collation::ONLY_TERTIARY_MASK
;
535 if(t
> Collation::COMMON_WEIGHT16
) {
536 if(ter
< CollationFastLatin::MAX_TER_AFTER
) {
539 miniCEs
[i
] = CollationFastLatin::BAIL_OUT
;
543 if(CollationFastLatin::MIN_LONG
<= pri
&& pri
<= CollationFastLatin::MAX_LONG
) {
544 U_ASSERT(sec
== CollationFastLatin::COMMON_SEC
);
545 miniCEs
[i
] = (uint16_t)(pri
| ter
);
547 miniCEs
[i
] = (uint16_t)(pri
| sec
| ter
);
550 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
551 printf("last mini primary: %04x\n", pri
);
553 #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
554 for(int32_t i
= 0; i
< uniqueCEs
.size(); ++i
) {
555 int64_t ce
= uniqueCEs
.elementAti(i
);
556 printf("unique CE 0x%016lx -> 0x%04x\n", ce
, miniCEs
[i
]);
559 return U_SUCCESS(errorCode
);
563 CollationFastLatinBuilder::encodeCharCEs(UErrorCode
&errorCode
) {
564 if(U_FAILURE(errorCode
)) { return FALSE
; }
565 int32_t miniCEsStart
= result
.length();
566 for(int32_t i
= 0; i
< CollationFastLatin::NUM_FAST_CHARS
; ++i
) {
567 result
.append(0); // initialize to completely ignorable
569 int32_t indexBase
= result
.length();
570 for(int32_t i
= 0; i
< CollationFastLatin::NUM_FAST_CHARS
; ++i
) {
571 int64_t ce
= charCEs
[i
][0];
572 if(isContractionCharCE(ce
)) { continue; } // defer contraction
573 uint32_t miniCE
= encodeTwoCEs(ce
, charCEs
[i
][1]);
574 if(miniCE
> 0xffff) {
575 // Note: There is a chance that this new expansion is the same as a previous one,
576 // and if so, then we could reuse the other expansion.
577 // However, that seems unlikely.
578 int32_t expansionIndex
= result
.length() - indexBase
;
579 if(expansionIndex
> (int32_t)CollationFastLatin::INDEX_MASK
) {
580 miniCE
= CollationFastLatin::BAIL_OUT
;
582 result
.append((UChar
)(miniCE
>> 16)).append((UChar
)miniCE
);
583 miniCE
= CollationFastLatin::EXPANSION
| expansionIndex
;
586 result
.setCharAt(miniCEsStart
+ i
, (UChar
)miniCE
);
588 return U_SUCCESS(errorCode
);
592 CollationFastLatinBuilder::encodeContractions(UErrorCode
&errorCode
) {
593 // We encode all contraction lists so that the first word of a list
594 // terminates the previous list, and we only need one additional terminator at the end.
595 if(U_FAILURE(errorCode
)) { return FALSE
; }
596 int32_t indexBase
= headerLength
+ CollationFastLatin::NUM_FAST_CHARS
;
597 int32_t firstContractionIndex
= result
.length();
598 for(int32_t i
= 0; i
< CollationFastLatin::NUM_FAST_CHARS
; ++i
) {
599 int64_t ce
= charCEs
[i
][0];
600 if(!isContractionCharCE(ce
)) { continue; }
601 int32_t contractionIndex
= result
.length() - indexBase
;
602 if(contractionIndex
> (int32_t)CollationFastLatin::INDEX_MASK
) {
603 result
.setCharAt(headerLength
+ i
, CollationFastLatin::BAIL_OUT
);
606 UBool firstTriple
= TRUE
;
607 for(int32_t index
= (int32_t)ce
& 0x7fffffff;; index
+= 3) {
608 int32_t x
= contractionCEs
.elementAti(index
);
609 if((uint32_t)x
== CollationFastLatin::CONTR_CHAR_MASK
&& !firstTriple
) { break; }
610 int64_t cce0
= contractionCEs
.elementAti(index
+ 1);
611 int64_t cce1
= contractionCEs
.elementAti(index
+ 2);
612 uint32_t miniCE
= encodeTwoCEs(cce0
, cce1
);
613 if(miniCE
== CollationFastLatin::BAIL_OUT
) {
614 result
.append((UChar
)(x
| (1 << CollationFastLatin::CONTR_LENGTH_SHIFT
)));
615 } else if(miniCE
<= 0xffff) {
616 result
.append((UChar
)(x
| (2 << CollationFastLatin::CONTR_LENGTH_SHIFT
)));
617 result
.append((UChar
)miniCE
);
619 result
.append((UChar
)(x
| (3 << CollationFastLatin::CONTR_LENGTH_SHIFT
)));
620 result
.append((UChar
)(miniCE
>> 16)).append((UChar
)miniCE
);
624 // Note: There is a chance that this new contraction list is the same as a previous one,
625 // and if so, then we could truncate the result and reuse the other list.
626 // However, that seems unlikely.
627 result
.setCharAt(headerLength
+ i
,
628 (UChar
)(CollationFastLatin::CONTRACTION
| contractionIndex
));
630 if(result
.length() > firstContractionIndex
) {
631 // Terminate the last contraction list.
632 result
.append((UChar
)CollationFastLatin::CONTR_CHAR_MASK
);
634 if(result
.isBogus()) {
635 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
638 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
639 printf("** fast Latin %d * 2 = %d bytes\n", result
.length(), result
.length() * 2);
640 puts(" header & below-digit groups map");
642 for(; i
< headerLength
; ++i
) {
643 printf(" %04x", result
[i
]);
645 printf("\n char mini CEs");
646 U_ASSERT(CollationFastLatin::NUM_FAST_CHARS
% 16 == 0);
647 for(; i
< indexBase
; i
+= 16) {
648 UChar32 c
= i
- headerLength
;
649 if(c
>= CollationFastLatin::LATIN_LIMIT
) {
650 c
= CollationFastLatin::PUNCT_START
+ c
- CollationFastLatin::LATIN_LIMIT
;
652 printf("\n %04x:", c
);
653 for(int32_t j
= 0; j
< 16; ++j
) {
654 printf(" %04x", result
[i
+ j
]);
657 printf("\n expansions & contractions");
658 for(; i
< result
.length(); ++i
) {
659 if((i
- indexBase
) % 16 == 0) { puts(""); }
660 printf(" %04x", result
[i
]);
668 CollationFastLatinBuilder::encodeTwoCEs(int64_t first
, int64_t second
) const {
670 return 0; // completely ignorable
672 if(first
== Collation::NO_CE
) {
673 return CollationFastLatin::BAIL_OUT
;
675 U_ASSERT((uint32_t)(first
>> 32) != Collation::NO_CE_PRIMARY
);
677 uint32_t miniCE
= getMiniCE(first
);
678 if(miniCE
== CollationFastLatin::BAIL_OUT
) { return miniCE
; }
679 if(miniCE
>= CollationFastLatin::MIN_SHORT
) {
680 // Extract & copy the case bits.
681 // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
682 uint32_t c
= (((uint32_t)first
& Collation::CASE_MASK
) >> (14 - 3));
683 // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
684 c
+= CollationFastLatin::LOWER_CASE
;
687 if(second
== 0) { return miniCE
; }
689 uint32_t miniCE1
= getMiniCE(second
);
690 if(miniCE1
== CollationFastLatin::BAIL_OUT
) { return miniCE1
; }
692 uint32_t case1
= (uint32_t)second
& Collation::CASE_MASK
;
693 if(miniCE
>= CollationFastLatin::MIN_SHORT
&&
694 (miniCE
& CollationFastLatin::SECONDARY_MASK
) == CollationFastLatin::COMMON_SEC
) {
695 // Try to combine the two mini CEs into one.
696 uint32_t sec1
= miniCE1
& CollationFastLatin::SECONDARY_MASK
;
697 uint32_t ter1
= miniCE1
& CollationFastLatin::TERTIARY_MASK
;
698 if(sec1
>= CollationFastLatin::MIN_SEC_HIGH
&& case1
== 0 &&
699 ter1
== CollationFastLatin::COMMON_TER
) {
700 // sec1>=sec_high implies pri1==0.
701 return (miniCE
& ~CollationFastLatin::SECONDARY_MASK
) | sec1
;
705 if(miniCE1
<= CollationFastLatin::SECONDARY_MASK
|| CollationFastLatin::MIN_SHORT
<= miniCE1
) {
706 // Secondary CE, or a CE with a short primary, copy the case bits.
707 case1
= (case1
>> (14 - 3)) + CollationFastLatin::LOWER_CASE
;
710 return (miniCE
<< 16) | miniCE1
;
715 #endif // !UCONFIG_NO_COLLATION