// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatabuilder.cpp
*
* (replaced the former ucol_elm.cpp)
*
* created on: 2012apr01
* created by: Markus W. Scherer
*/
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/localpointer.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/ucharstriebuilder.h"
24 #include "unicode/uniset.h"
25 #include "unicode/unistr.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/utf16.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationdatabuilder.h"
32 #include "collationfastlatinbuilder.h"
33 #include "collationiterator.h"
34 #include "normalizer2impl.h"
42 CollationDataBuilder::CEModifier::~CEModifier() {}
45 * Build-time context and CE32 for a code point.
46 * If a code point has contextual mappings, then the default (no-context) mapping
47 * and all conditional mappings are stored in a singly-linked list
48 * of ConditionalCE32, sorted by context strings.
50 * Context strings sort by prefix length, then by prefix, then by contraction suffix.
51 * Context strings must be unique and in ascending order.
53 struct ConditionalCE32
: public UMemory
{
56 ce32(0), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
),
58 ConditionalCE32(const UnicodeString
&ct
, uint32_t ce
)
60 ce32(ce
), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
),
63 inline UBool
hasContext() const { return context
.length() > 1; }
64 inline int32_t prefixLength() const { return context
.charAt(0); }
67 * "\0" for the first entry for any code point, with its default CE32.
69 * Otherwise one unit with the length of the prefix string,
70 * then the prefix string, then the contraction suffix.
72 UnicodeString context
;
74 * CE32 for the code point and its context.
75 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
79 * Default CE32 for all contexts with this same prefix.
80 * Initially NO_CE32. Set only while building runtime data structures,
81 * and only on one of the nodes of a sub-list with the same prefix.
85 * CE32 for the built contexts.
86 * When fetching CEs from the builder, the contexts are built into their runtime form
87 * so that the normal collation implementation can process them.
88 * The result is cached in the list head. It is reset when the contexts are modified.
92 * Index of the next ConditionalCE32.
93 * Negative for the end of the list.
100 U_CAPI
void U_CALLCONV
101 uprv_deleteConditionalCE32(void *obj
) {
102 delete static_cast<ConditionalCE32
*>(obj
);
108 * Build-time collation element and character iterator.
109 * Uses the runtime CollationIterator for fetching CEs for a string
110 * but reads from the builder's unfinished data structures.
111 * In particular, this class reads from the unfinished trie
112 * and has to avoid CollationIterator::nextCE() and redirect other
113 * calls to data->getCE32() and data->getCE32FromSupplementary().
115 * We do this so that we need not implement the collation algorithm
116 * again for the builder and make it behave exactly like the runtime code.
117 * That would be more difficult to test and maintain than this indirection.
119 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
120 * so the data accesses from those code paths need not be modified.
122 * This class iterates directly over whole code points
123 * so that the CollationIterator does not need the finished trie
124 * for handling the LEAD_SURROGATE_TAG.
126 class DataBuilderCollationIterator
: public CollationIterator
{
128 DataBuilderCollationIterator(CollationDataBuilder
&b
);
130 virtual ~DataBuilderCollationIterator();
132 int32_t fetchCEs(const UnicodeString
&str
, int32_t start
, int64_t ces
[], int32_t cesLength
);
134 virtual void resetToOffset(int32_t newOffset
);
135 virtual int32_t getOffset() const;
137 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
138 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
141 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
142 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
144 virtual uint32_t getDataCE32(UChar32 c
) const;
145 virtual uint32_t getCE32FromBuilderData(uint32_t ce32
, UErrorCode
&errorCode
);
147 CollationDataBuilder
&builder
;
148 CollationData builderData
;
149 uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
];
150 const UnicodeString
*s
;
154 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder
&b
)
155 : CollationIterator(&builderData
, /*numeric=*/ FALSE
),
156 builder(b
), builderData(b
.nfcImpl
),
158 builderData
.base
= builder
.base
;
159 // Set all of the jamoCE32s[] to indirection CE32s.
160 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) { // Count across Jamo types.
161 UChar32 jamo
= CollationDataBuilder::jamoCpFromIndex(j
);
162 jamoCE32s
[j
] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG
, jamo
) |
163 CollationDataBuilder::IS_BUILDER_JAMO_CE32
;
165 builderData
.jamoCE32s
= jamoCE32s
;
168 DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
171 DataBuilderCollationIterator::fetchCEs(const UnicodeString
&str
, int32_t start
,
172 int64_t ces
[], int32_t cesLength
) {
173 // Set the pointers each time, in case they changed due to reallocation.
174 builderData
.ce32s
= reinterpret_cast<const uint32_t *>(builder
.ce32s
.getBuffer());
175 builderData
.ces
= builder
.ce64s
.getBuffer();
176 builderData
.contexts
= builder
.contexts
.getBuffer();
177 // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
181 UErrorCode errorCode
= U_ZERO_ERROR
;
182 while(U_SUCCESS(errorCode
) && pos
< s
->length()) {
183 // No need to keep all CEs in the iterator buffer.
185 UChar32 c
= s
->char32At(pos
);
186 pos
+= U16_LENGTH(c
);
187 uint32_t ce32
= utrie2_get32(builder
.trie
, c
);
188 const CollationData
*d
;
189 if(ce32
== Collation::FALLBACK_CE32
) {
191 ce32
= builder
.base
->getCE32(c
);
195 appendCEsFromCE32(d
, c
, ce32
, /*forward=*/ TRUE
, errorCode
);
196 U_ASSERT(U_SUCCESS(errorCode
));
197 for(int32_t i
= 0; i
< getCEsLength(); ++i
) {
198 int64_t ce
= getCE(i
);
200 if(cesLength
< Collation::MAX_EXPANSION_LENGTH
) {
211 DataBuilderCollationIterator::resetToOffset(int32_t newOffset
) {
217 DataBuilderCollationIterator::getOffset() const {
222 DataBuilderCollationIterator::nextCodePoint(UErrorCode
& /*errorCode*/) {
223 if(pos
== s
->length()) {
226 UChar32 c
= s
->char32At(pos
);
227 pos
+= U16_LENGTH(c
);
232 DataBuilderCollationIterator::previousCodePoint(UErrorCode
& /*errorCode*/) {
236 UChar32 c
= s
->char32At(pos
- 1);
237 pos
-= U16_LENGTH(c
);
242 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
243 pos
= s
->moveIndex32(pos
, num
);
247 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
248 pos
= s
->moveIndex32(pos
, -num
);
252 DataBuilderCollationIterator::getDataCE32(UChar32 c
) const {
253 return utrie2_get32(builder
.trie
, c
);
257 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32
, UErrorCode
&errorCode
) {
258 U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::BUILDER_DATA_TAG
));
259 if((ce32
& CollationDataBuilder::IS_BUILDER_JAMO_CE32
) != 0) {
260 UChar32 jamo
= Collation::indexFromCE32(ce32
);
261 return utrie2_get32(builder
.trie
, jamo
);
263 ConditionalCE32
*cond
= builder
.getConditionalCE32ForCE32(ce32
);
264 if(cond
->builtCE32
== Collation::NO_CE32
) {
265 // Build the context-sensitive mappings into their runtime form and cache the result.
266 cond
->builtCE32
= builder
.buildContext(cond
, errorCode
);
267 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
268 errorCode
= U_ZERO_ERROR
;
269 builder
.clearContexts();
270 cond
->builtCE32
= builder
.buildContext(cond
, errorCode
);
272 builderData
.contexts
= builder
.contexts
.getBuffer();
274 return cond
->builtCE32
;
// ------------------------------------------------------------------------- ***
280 CollationDataBuilder::CollationDataBuilder(UErrorCode
&errorCode
)
281 : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode
)),
282 base(NULL
), baseSettings(NULL
),
284 ce32s(errorCode
), ce64s(errorCode
), conditionalCE32s(errorCode
),
286 fastLatinEnabled(FALSE
), fastLatinBuilder(NULL
),
288 // Reserve the first CE32 for U+0000.
289 ce32s
.addElement(0, errorCode
);
290 conditionalCE32s
.setDeleter(uprv_deleteConditionalCE32
);
293 CollationDataBuilder::~CollationDataBuilder() {
295 delete fastLatinBuilder
;
300 CollationDataBuilder::initForTailoring(const CollationData
*b
, UErrorCode
&errorCode
) {
301 if(U_FAILURE(errorCode
)) { return; }
303 errorCode
= U_INVALID_STATE_ERROR
;
307 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
312 // For a tailoring, the default is to fall back to the base.
313 trie
= utrie2_open(Collation::FALLBACK_CE32
, Collation::FFFD_CE32
, &errorCode
);
315 // Set the Latin-1 letters block so that it is allocated first in the data array,
316 // to try to improve locality of reference when sorting Latin-1 text.
317 // Do not use utrie2_setRange32() since that will not actually allocate blocks
318 // that are filled with the default value.
319 // ASCII (0..7F) is already preallocated anyway.
320 for(UChar32 c
= 0xc0; c
<= 0xff; ++c
) {
321 utrie2_set32(trie
, c
, Collation::FALLBACK_CE32
, &errorCode
);
324 // Hangul syllables are not tailorable (except via tailoring Jamos).
325 // Always set the Hangul tag to help performance.
326 // Do this here, rather than in buildMappings(),
327 // so that we see the HANGUL_TAG in various assertions.
328 uint32_t hangulCE32
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0);
329 utrie2_setRange32(trie
, Hangul::HANGUL_BASE
, Hangul::HANGUL_END
, hangulCE32
, TRUE
, &errorCode
);
331 // Copy the set contents but don't copy/clone the set as a whole because
332 // that would copy the isFrozen state too.
333 unsafeBackwardSet
.addAll(*b
->unsafeBackwardSet
);
335 if(U_FAILURE(errorCode
)) { return; }
339 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start
, UChar32 end
,
340 uint32_t primary
, int32_t step
,
341 UErrorCode
&errorCode
) {
342 if(U_FAILURE(errorCode
)) { return FALSE
; }
343 U_ASSERT(start
<= end
);
344 // TODO: Do we need to check what values are currently set for start..end?
345 // An offset range is worth it only if we can achieve an overlap between
346 // adjacent UTrie2 blocks of 32 code points each.
347 // An offset CE is also a little more expensive to look up and compute
349 // If the range spans at least three UTrie2 block boundaries (> 64 code points),
351 // If the range spans one or two block boundaries and there are
352 // at least 4 code points on either side, then we take it.
353 // (We could additionally require a minimum range length of, say, 16.)
354 int32_t blockDelta
= (end
>> 5) - (start
>> 5);
355 if(2 <= step
&& step
<= 0x7f &&
357 (blockDelta
> 0 && (start
& 0x1f) <= 0x1c && (end
& 0x1f) >= 3))) {
358 int64_t dataCE
= ((int64_t)primary
<< 32) | (start
<< 8) | step
;
359 if(isCompressiblePrimary(primary
)) { dataCE
|= 0x80; }
360 int32_t index
= addCE(dataCE
, errorCode
);
361 if(U_FAILURE(errorCode
)) { return 0; }
362 if(index
> Collation::MAX_INDEX
) {
363 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
366 uint32_t offsetCE32
= Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG
, index
);
367 utrie2_setRange32(trie
, start
, end
, offsetCE32
, TRUE
, &errorCode
);
376 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start
, UChar32 end
,
377 uint32_t primary
, int32_t step
,
378 UErrorCode
&errorCode
) {
379 if(U_FAILURE(errorCode
)) { return 0; }
380 UBool isCompressible
= isCompressiblePrimary(primary
);
381 if(maybeSetPrimaryRange(start
, end
, primary
, step
, errorCode
)) {
382 return Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
,
383 (end
- start
+ 1) * step
);
385 // Short range: Set individual CE32s.
387 utrie2_set32(trie
, start
, Collation::makeLongPrimaryCE32(primary
), &errorCode
);
389 primary
= Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
, step
);
390 if(start
> end
) { return primary
; }
397 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase
, UChar32 c
, uint32_t ce32
) const {
398 int32_t i
= Collation::indexFromCE32(ce32
);
399 int64_t dataCE
= fromBase
? base
->ces
[i
] : ce64s
.elementAti(i
);
400 uint32_t p
= Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
);
401 return Collation::makeLongPrimaryCE32(p
);
405 CollationDataBuilder::isCompressibleLeadByte(uint32_t b
) const {
406 return base
->isCompressibleLeadByte(b
);
410 CollationDataBuilder::isAssigned(UChar32 c
) const {
411 return Collation::isAssignedCE32(utrie2_get32(trie
, c
));
415 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c
) const {
416 uint32_t ce32
= utrie2_get32(trie
, c
);
417 if(Collation::isLongPrimaryCE32(ce32
)) {
418 return Collation::primaryFromLongPrimaryCE32(ce32
);
425 CollationDataBuilder::getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const {
426 if(U_FAILURE(errorCode
)) { return 0; }
427 // Keep parallel with CollationData::getSingleCE().
428 UBool fromBase
= FALSE
;
429 uint32_t ce32
= utrie2_get32(trie
, c
);
430 if(ce32
== Collation::FALLBACK_CE32
) {
432 ce32
= base
->getCE32(c
);
434 while(Collation::isSpecialCE32(ce32
)) {
435 switch(Collation::tagFromCE32(ce32
)) {
436 case Collation::LATIN_EXPANSION_TAG
:
437 case Collation::BUILDER_DATA_TAG
:
438 case Collation::PREFIX_TAG
:
439 case Collation::CONTRACTION_TAG
:
440 case Collation::HANGUL_TAG
:
441 case Collation::LEAD_SURROGATE_TAG
:
442 errorCode
= U_UNSUPPORTED_ERROR
;
444 case Collation::FALLBACK_TAG
:
445 case Collation::RESERVED_TAG_3
:
446 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
448 case Collation::LONG_PRIMARY_TAG
:
449 return Collation::ceFromLongPrimaryCE32(ce32
);
450 case Collation::LONG_SECONDARY_TAG
:
451 return Collation::ceFromLongSecondaryCE32(ce32
);
452 case Collation::EXPANSION32_TAG
:
453 if(Collation::lengthFromCE32(ce32
) == 1) {
454 int32_t i
= Collation::indexFromCE32(ce32
);
455 ce32
= fromBase
? base
->ce32s
[i
] : ce32s
.elementAti(i
);
458 errorCode
= U_UNSUPPORTED_ERROR
;
461 case Collation::EXPANSION_TAG
: {
462 if(Collation::lengthFromCE32(ce32
) == 1) {
463 int32_t i
= Collation::indexFromCE32(ce32
);
464 return fromBase
? base
->ces
[i
] : ce64s
.elementAti(i
);
466 errorCode
= U_UNSUPPORTED_ERROR
;
470 case Collation::DIGIT_TAG
:
471 // Fetch the non-numeric-collation CE32 and continue.
472 ce32
= ce32s
.elementAti(Collation::indexFromCE32(ce32
));
474 case Collation::U0000_TAG
:
476 // Fetch the normal ce32 for U+0000 and continue.
477 ce32
= fromBase
? base
->ce32s
[0] : ce32s
.elementAti(0);
479 case Collation::OFFSET_TAG
:
480 ce32
= getCE32FromOffsetCE32(fromBase
, c
, ce32
);
482 case Collation::IMPLICIT_TAG
:
483 return Collation::unassignedCEFromCodePoint(c
);
486 return Collation::ceFromSimpleCE32(ce32
);
490 CollationDataBuilder::addCE(int64_t ce
, UErrorCode
&errorCode
) {
491 int32_t length
= ce64s
.size();
492 for(int32_t i
= 0; i
< length
; ++i
) {
493 if(ce
== ce64s
.elementAti(i
)) { return i
; }
495 ce64s
.addElement(ce
, errorCode
);
500 CollationDataBuilder::addCE32(uint32_t ce32
, UErrorCode
&errorCode
) {
501 int32_t length
= ce32s
.size();
502 for(int32_t i
= 0; i
< length
; ++i
) {
503 if(ce32
== (uint32_t)ce32s
.elementAti(i
)) { return i
; }
505 ce32s
.addElement((int32_t)ce32
, errorCode
);
510 CollationDataBuilder::addConditionalCE32(const UnicodeString
&context
, uint32_t ce32
,
511 UErrorCode
&errorCode
) {
512 if(U_FAILURE(errorCode
)) { return -1; }
513 U_ASSERT(!context
.isEmpty());
514 int32_t index
= conditionalCE32s
.size();
515 if(index
> Collation::MAX_INDEX
) {
516 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
519 ConditionalCE32
*cond
= new ConditionalCE32(context
, ce32
);
521 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
524 conditionalCE32s
.addElement(cond
, errorCode
);
529 CollationDataBuilder::add(const UnicodeString
&prefix
, const UnicodeString
&s
,
530 const int64_t ces
[], int32_t cesLength
,
531 UErrorCode
&errorCode
) {
532 uint32_t ce32
= encodeCEs(ces
, cesLength
, errorCode
);
533 addCE32(prefix
, s
, ce32
, errorCode
);
537 CollationDataBuilder::addCE32(const UnicodeString
&prefix
, const UnicodeString
&s
,
538 uint32_t ce32
, UErrorCode
&errorCode
) {
539 if(U_FAILURE(errorCode
)) { return; }
541 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
544 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
545 errorCode
= U_INVALID_STATE_ERROR
;
548 UChar32 c
= s
.char32At(0);
549 int32_t cLength
= U16_LENGTH(c
);
550 uint32_t oldCE32
= utrie2_get32(trie
, c
);
551 UBool hasContext
= !prefix
.isEmpty() || s
.length() > cLength
;
552 if(oldCE32
== Collation::FALLBACK_CE32
) {
553 // First tailoring for c.
554 // If c has contextual base mappings or if we add a contextual mapping,
555 // then copy the base mappings.
556 // Otherwise we just override the base mapping.
557 uint32_t baseCE32
= base
->getFinalCE32(base
->getCE32(c
));
558 if(hasContext
|| Collation::ce32HasContext(baseCE32
)) {
559 oldCE32
= copyFromBaseCE32(c
, baseCE32
, TRUE
, errorCode
);
560 utrie2_set32(trie
, c
, oldCE32
, &errorCode
);
561 if(U_FAILURE(errorCode
)) { return; }
565 // No prefix, no contraction.
566 if(!isBuilderContextCE32(oldCE32
)) {
567 utrie2_set32(trie
, c
, ce32
, &errorCode
);
569 ConditionalCE32
*cond
= getConditionalCE32ForCE32(oldCE32
);
570 cond
->builtCE32
= Collation::NO_CE32
;
574 ConditionalCE32
*cond
;
575 if(!isBuilderContextCE32(oldCE32
)) {
576 // Replace the simple oldCE32 with a builder context CE32
577 // pointing to a new ConditionalCE32 list head.
578 int32_t index
= addConditionalCE32(UnicodeString((UChar
)0), oldCE32
, errorCode
);
579 if(U_FAILURE(errorCode
)) { return; }
580 uint32_t contextCE32
= makeBuilderContextCE32(index
);
581 utrie2_set32(trie
, c
, contextCE32
, &errorCode
);
583 cond
= getConditionalCE32(index
);
585 cond
= getConditionalCE32ForCE32(oldCE32
);
586 cond
->builtCE32
= Collation::NO_CE32
;
588 UnicodeString
suffix(s
, cLength
);
589 UnicodeString
context((UChar
)prefix
.length());
590 context
.append(prefix
).append(suffix
);
591 unsafeBackwardSet
.addAll(suffix
);
593 // invariant: context > cond->context
594 int32_t next
= cond
->next
;
596 // Append a new ConditionalCE32 after cond.
597 int32_t index
= addConditionalCE32(context
, ce32
, errorCode
);
598 if(U_FAILURE(errorCode
)) { return; }
602 ConditionalCE32
*nextCond
= getConditionalCE32(next
);
603 int8_t cmp
= context
.compare(nextCond
->context
);
605 // Insert a new ConditionalCE32 between cond and nextCond.
606 int32_t index
= addConditionalCE32(context
, ce32
, errorCode
);
607 if(U_FAILURE(errorCode
)) { return; }
609 getConditionalCE32(index
)->next
= next
;
611 } else if(cmp
== 0) {
612 // Same context as before, overwrite its ce32.
613 nextCond
->ce32
= ce32
;
623 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce
) {
624 uint32_t p
= (uint32_t)(ce
>> 32);
625 uint32_t lower32
= (uint32_t)ce
;
626 uint32_t t
= (uint32_t)(ce
& 0xffff);
627 U_ASSERT((t
& 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s.
628 if((ce
& INT64_C(0xffff00ff00ff)) == 0) {
629 // normal form ppppsstt
630 return p
| (lower32
>> 16) | (t
>> 8);
631 } else if((ce
& INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE
) {
632 // long-primary form ppppppC1
633 return Collation::makeLongPrimaryCE32(p
);
634 } else if(p
== 0 && (t
& 0xff) == 0) {
635 // long-secondary form ssssttC2
636 return Collation::makeLongSecondaryCE32(lower32
);
638 return Collation::NO_CE32
;
642 CollationDataBuilder::encodeOneCE(int64_t ce
, UErrorCode
&errorCode
) {
643 // Try to encode one CE as one CE32.
644 uint32_t ce32
= encodeOneCEAsCE32(ce
);
645 if(ce32
!= Collation::NO_CE32
) { return ce32
; }
646 int32_t index
= addCE(ce
, errorCode
);
647 if(U_FAILURE(errorCode
)) { return 0; }
648 if(index
> Collation::MAX_INDEX
) {
649 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
652 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, index
, 1);
656 CollationDataBuilder::encodeCEs(const int64_t ces
[], int32_t cesLength
,
657 UErrorCode
&errorCode
) {
658 if(U_FAILURE(errorCode
)) { return 0; }
659 if(cesLength
< 0 || cesLength
> Collation::MAX_EXPANSION_LENGTH
) {
660 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
663 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
664 errorCode
= U_INVALID_STATE_ERROR
;
668 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
669 // Do this here so that callers need not do it.
670 return encodeOneCEAsCE32(0);
671 } else if(cesLength
== 1) {
672 return encodeOneCE(ces
[0], errorCode
);
673 } else if(cesLength
== 2) {
674 // Try to encode two CEs as one CE32.
675 int64_t ce0
= ces
[0];
676 int64_t ce1
= ces
[1];
677 uint32_t p0
= (uint32_t)(ce0
>> 32);
678 if((ce0
& INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE
&&
679 (ce1
& INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE
&&
681 // Latin mini expansion
684 (((uint32_t)ce0
& 0xff00u
) << 8) |
685 (uint32_t)(ce1
>> 16) |
686 Collation::SPECIAL_CE32_LOW_BYTE
|
687 Collation::LATIN_EXPANSION_TAG
;
690 // Try to encode two or more CEs as CE32s.
691 int32_t newCE32s
[Collation::MAX_EXPANSION_LENGTH
];
692 for(int32_t i
= 0;; ++i
) {
694 return encodeExpansion32(newCE32s
, cesLength
, errorCode
);
696 uint32_t ce32
= encodeOneCEAsCE32(ces
[i
]);
697 if(ce32
== Collation::NO_CE32
) { break; }
698 newCE32s
[i
] = (int32_t)ce32
;
700 return encodeExpansion(ces
, cesLength
, errorCode
);
704 CollationDataBuilder::encodeExpansion(const int64_t ces
[], int32_t length
, UErrorCode
&errorCode
) {
705 if(U_FAILURE(errorCode
)) { return 0; }
706 // See if this sequence of CEs has already been stored.
707 int64_t first
= ces
[0];
708 int32_t ce64sMax
= ce64s
.size() - length
;
709 for(int32_t i
= 0; i
<= ce64sMax
; ++i
) {
710 if(first
== ce64s
.elementAti(i
)) {
711 if(i
> Collation::MAX_INDEX
) {
712 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
715 for(int32_t j
= 1;; ++j
) {
717 return Collation::makeCE32FromTagIndexAndLength(
718 Collation::EXPANSION_TAG
, i
, length
);
720 if(ce64s
.elementAti(i
+ j
) != ces
[j
]) { break; }
724 // Store the new sequence.
725 int32_t i
= ce64s
.size();
726 if(i
> Collation::MAX_INDEX
) {
727 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
730 for(int32_t j
= 0; j
< length
; ++j
) {
731 ce64s
.addElement(ces
[j
], errorCode
);
733 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, i
, length
);
737 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s
[], int32_t length
,
738 UErrorCode
&errorCode
) {
739 if(U_FAILURE(errorCode
)) { return 0; }
740 // See if this sequence of CE32s has already been stored.
741 int32_t first
= newCE32s
[0];
742 int32_t ce32sMax
= ce32s
.size() - length
;
743 for(int32_t i
= 0; i
<= ce32sMax
; ++i
) {
744 if(first
== ce32s
.elementAti(i
)) {
745 if(i
> Collation::MAX_INDEX
) {
746 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
749 for(int32_t j
= 1;; ++j
) {
751 return Collation::makeCE32FromTagIndexAndLength(
752 Collation::EXPANSION32_TAG
, i
, length
);
754 if(ce32s
.elementAti(i
+ j
) != newCE32s
[j
]) { break; }
758 // Store the new sequence.
759 int32_t i
= ce32s
.size();
760 if(i
> Collation::MAX_INDEX
) {
761 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
764 for(int32_t j
= 0; j
< length
; ++j
) {
765 ce32s
.addElement(newCE32s
[j
], errorCode
);
767 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG
, i
, length
);
771 CollationDataBuilder::copyFromBaseCE32(UChar32 c
, uint32_t ce32
, UBool withContext
,
772 UErrorCode
&errorCode
) {
773 if(U_FAILURE(errorCode
)) { return 0; }
774 if(!Collation::isSpecialCE32(ce32
)) { return ce32
; }
775 switch(Collation::tagFromCE32(ce32
)) {
776 case Collation::LONG_PRIMARY_TAG
:
777 case Collation::LONG_SECONDARY_TAG
:
778 case Collation::LATIN_EXPANSION_TAG
:
781 case Collation::EXPANSION32_TAG
: {
782 const uint32_t *baseCE32s
= base
->ce32s
+ Collation::indexFromCE32(ce32
);
783 int32_t length
= Collation::lengthFromCE32(ce32
);
784 ce32
= encodeExpansion32(
785 reinterpret_cast<const int32_t *>(baseCE32s
), length
, errorCode
);
788 case Collation::EXPANSION_TAG
: {
789 const int64_t *baseCEs
= base
->ces
+ Collation::indexFromCE32(ce32
);
790 int32_t length
= Collation::lengthFromCE32(ce32
);
791 ce32
= encodeExpansion(baseCEs
, length
, errorCode
);
794 case Collation::PREFIX_TAG
: {
795 // Flatten prefixes and nested suffixes (contractions)
796 // into a linear list of ConditionalCE32.
797 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
798 ce32
= CollationData::readCE32(p
); // Default if no prefix match.
800 return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
);
802 ConditionalCE32 head
;
803 UnicodeString
context((UChar
)0);
805 if(Collation::isContractionCE32(ce32
)) {
806 index
= copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
);
808 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
809 head
.next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
811 if(U_FAILURE(errorCode
)) { return 0; }
812 ConditionalCE32
*cond
= getConditionalCE32(index
); // the last ConditionalCE32 so far
813 UCharsTrie::Iterator
prefixes(p
+ 2, 0, errorCode
);
814 while(prefixes
.next(errorCode
)) {
815 context
= prefixes
.getString();
817 context
.insert(0, (UChar
)context
.length());
818 ce32
= (uint32_t)prefixes
.getValue();
819 if(Collation::isContractionCE32(ce32
)) {
820 index
= copyContractionsFromBaseCE32(context
, c
, ce32
, cond
, errorCode
);
822 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
823 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
825 if(U_FAILURE(errorCode
)) { return 0; }
826 cond
= getConditionalCE32(index
);
828 ce32
= makeBuilderContextCE32(head
.next
);
832 case Collation::CONTRACTION_TAG
: {
834 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
835 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
836 return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
);
838 ConditionalCE32 head
;
839 UnicodeString
context((UChar
)0);
840 copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
);
841 ce32
= makeBuilderContextCE32(head
.next
);
845 case Collation::HANGUL_TAG
:
846 errorCode
= U_UNSUPPORTED_ERROR
; // We forbid tailoring of Hangul syllables.
848 case Collation::OFFSET_TAG
:
849 ce32
= getCE32FromOffsetCE32(TRUE
, c
, ce32
);
851 case Collation::IMPLICIT_TAG
:
852 ce32
= encodeOneCE(Collation::unassignedCEFromCodePoint(c
), errorCode
);
855 UPRV_UNREACHABLE
; // require ce32 == base->getFinalCE32(ce32)
861 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString
&context
, UChar32 c
, uint32_t ce32
,
862 ConditionalCE32
*cond
, UErrorCode
&errorCode
) {
863 if(U_FAILURE(errorCode
)) { return 0; }
864 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
866 if((ce32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
867 // No match on the single code point.
868 // We are underneath a prefix, and the default mapping is just
869 // a fallback to the mappings for a shorter prefix.
870 U_ASSERT(context
.length() > 1);
873 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
874 U_ASSERT(!Collation::isContractionCE32(ce32
));
875 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
876 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
877 if(U_FAILURE(errorCode
)) { return 0; }
878 cond
= getConditionalCE32(index
);
881 int32_t suffixStart
= context
.length();
882 UCharsTrie::Iterator
suffixes(p
+ 2, 0, errorCode
);
883 while(suffixes
.next(errorCode
)) {
884 context
.append(suffixes
.getString());
885 ce32
= copyFromBaseCE32(c
, (uint32_t)suffixes
.getValue(), TRUE
, errorCode
);
886 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
887 if(U_FAILURE(errorCode
)) { return 0; }
888 // No need to update the unsafeBackwardSet because the tailoring set
889 // is already a copy of the base set.
890 cond
= getConditionalCE32(index
);
891 context
.truncate(suffixStart
);
893 U_ASSERT(index
>= 0);
899 CopyHelper(const CollationDataBuilder
&s
, CollationDataBuilder
&d
,
900 const CollationDataBuilder::CEModifier
&m
, UErrorCode
&initialErrorCode
)
901 : src(s
), dest(d
), modifier(m
),
902 errorCode(initialErrorCode
) {}
904 UBool
copyRangeCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) {
905 ce32
= copyCE32(ce32
);
906 utrie2_setRange32(dest
.trie
, start
, end
, ce32
, TRUE
, &errorCode
);
907 if(CollationDataBuilder::isBuilderContextCE32(ce32
)) {
908 dest
.contextChars
.add(start
, end
);
910 return U_SUCCESS(errorCode
);
913 uint32_t copyCE32(uint32_t ce32
) {
914 if(!Collation::isSpecialCE32(ce32
)) {
915 int64_t ce
= modifier
.modifyCE32(ce32
);
916 if(ce
!= Collation::NO_CE
) {
917 ce32
= dest
.encodeOneCE(ce
, errorCode
);
920 int32_t tag
= Collation::tagFromCE32(ce32
);
921 if(tag
== Collation::EXPANSION32_TAG
) {
922 const uint32_t *srcCE32s
= reinterpret_cast<uint32_t *>(src
.ce32s
.getBuffer());
923 srcCE32s
+= Collation::indexFromCE32(ce32
);
924 int32_t length
= Collation::lengthFromCE32(ce32
);
925 // Inspect the source CE32s. Just copy them if none are modified.
926 // Otherwise copy to modifiedCEs, with modifications.
927 UBool isModified
= FALSE
;
928 for(int32_t i
= 0; i
< length
; ++i
) {
931 if(Collation::isSpecialCE32(ce32
) ||
932 (ce
= modifier
.modifyCE32(ce32
)) == Collation::NO_CE
) {
934 modifiedCEs
[i
] = Collation::ceFromCE32(ce32
);
938 for(int32_t j
= 0; j
< i
; ++j
) {
939 modifiedCEs
[j
] = Collation::ceFromCE32(srcCE32s
[j
]);
947 ce32
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
);
949 ce32
= dest
.encodeExpansion32(
950 reinterpret_cast<const int32_t *>(srcCE32s
), length
, errorCode
);
952 } else if(tag
== Collation::EXPANSION_TAG
) {
953 const int64_t *srcCEs
= src
.ce64s
.getBuffer();
954 srcCEs
+= Collation::indexFromCE32(ce32
);
955 int32_t length
= Collation::lengthFromCE32(ce32
);
956 // Inspect the source CEs. Just copy them if none are modified.
957 // Otherwise copy to modifiedCEs, with modifications.
958 UBool isModified
= FALSE
;
959 for(int32_t i
= 0; i
< length
; ++i
) {
960 int64_t srcCE
= srcCEs
[i
];
961 int64_t ce
= modifier
.modifyCE(srcCE
);
962 if(ce
== Collation::NO_CE
) {
964 modifiedCEs
[i
] = srcCE
;
968 for(int32_t j
= 0; j
< i
; ++j
) {
969 modifiedCEs
[j
] = srcCEs
[j
];
977 ce32
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
);
979 ce32
= dest
.encodeExpansion(srcCEs
, length
, errorCode
);
981 } else if(tag
== Collation::BUILDER_DATA_TAG
) {
982 // Copy the list of ConditionalCE32.
983 ConditionalCE32
*cond
= src
.getConditionalCE32ForCE32(ce32
);
984 U_ASSERT(!cond
->hasContext());
985 int32_t destIndex
= dest
.addConditionalCE32(
986 cond
->context
, copyCE32(cond
->ce32
), errorCode
);
987 ce32
= CollationDataBuilder::makeBuilderContextCE32(destIndex
);
988 while(cond
->next
>= 0) {
989 cond
= src
.getConditionalCE32(cond
->next
);
990 ConditionalCE32
*prevDestCond
= dest
.getConditionalCE32(destIndex
);
991 destIndex
= dest
.addConditionalCE32(
992 cond
->context
, copyCE32(cond
->ce32
), errorCode
);
993 int32_t suffixStart
= cond
->prefixLength() + 1;
994 dest
.unsafeBackwardSet
.addAll(cond
->context
.tempSubString(suffixStart
));
995 prevDestCond
->next
= destIndex
;
998 // Just copy long CEs and Latin mini expansions (and other expected values) as is,
999 // assuming that the modifier would not modify them.
1000 U_ASSERT(tag
== Collation::LONG_PRIMARY_TAG
||
1001 tag
== Collation::LONG_SECONDARY_TAG
||
1002 tag
== Collation::LATIN_EXPANSION_TAG
||
1003 tag
== Collation::HANGUL_TAG
);
1009 const CollationDataBuilder
&src
;
1010 CollationDataBuilder
&dest
;
1011 const CollationDataBuilder::CEModifier
&modifier
;
1012 int64_t modifiedCEs
[Collation::MAX_EXPANSION_LENGTH
];
1013 UErrorCode errorCode
;
// Range callback handed to utrie2_enum() by CollationDataBuilder::copyFrom().
// context: the CopyHelper that performs the actual copy (cast from const void *).
// start, end, value: one range of code points and the CE32 value stored for it.
// The expression below short-circuits: a range whose value is UNASSIGNED_CE32 or
// FALLBACK_CE32 never reaches copyRangeCE32(); any other range is forwarded to it.
// NOTE(review): presumably the value of this expression is what the callback returns;
// the keyword in front of it is not visible in this view.
1018 static UBool U_CALLCONV
1019 enumRangeForCopy(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
1021 value
== Collation::UNASSIGNED_CE32
|| value
== Collation::FALLBACK_CE32
||
1022 ((CopyHelper
*)context
)->copyRangeCE32(start
, end
, value
);
// Copies the mappings of another builder into this one.
// src: the builder whose trie is enumerated.
// modifier: CEModifier handed to the CopyHelper together with src and *this.
// errorCode: in/out ICU error code. Nothing is done if it already indicates a failure.
//            It is set to U_INVALID_STATE_ERROR when this builder's trie is NULL or frozen,
//            and after the enumeration it receives the helper's errorCode.
1028 CollationDataBuilder::copyFrom(const CollationDataBuilder
&src
, const CEModifier
&modifier
,
1029 UErrorCode
&errorCode
) {
1030 if(U_FAILURE(errorCode
)) { return; }
// This builder must have a trie that can still be modified.
1031 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
1032 errorCode
= U_INVALID_STATE_ERROR
;
// The helper is constructed with the caller's errorCode and tracks its own copy;
// enumRangeForCopy() receives the helper as its context for each range of src.trie.
1035 CopyHelper
helper(src
, *this, modifier
, errorCode
);
1036 utrie2_enum(src
.trie
, NULL
, enumRangeForCopy
, &helper
);
// Report the helper's error state back to the caller.
1037 errorCode
= helper
.errorCode
;
1038 // Update the contextChars and the unsafeBackwardSet while copying,
1039 // in case a character had conditional mappings in the source builder
1040 // and they were removed later.
// This builder counts as modified if the source builder was.
1041 modified
|= src
.modified
;
// Copies base mappings into this builder's trie for the code points of a set.
// set: the code points to process. The loop ends at the first string element
//      (or when the iterator is exhausted), so strings in the set are not processed.
// errorCode: in/out ICU error code. Nothing is done if it already indicates a failure
//            or if the set is empty.
1045 CollationDataBuilder::optimize(const UnicodeSet
&set
, UErrorCode
&errorCode
) {
1046 if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; }
1047 UnicodeSetIterator
iter(set
);
1048 while(iter
.next() && !iter
.isString()) {
1049 UChar32 c
= iter
.getCodepoint();
1050 uint32_t ce32
= utrie2_get32(trie
, c
);
// Only code points that still fall back to the base are touched.
1051 if(ce32
== Collation::FALLBACK_CE32
) {
// Look up the final CE32 in the base, copy it into this builder
// (third argument TRUE: with context), and store the result for c.
1052 ce32
= base
->getFinalCE32(base
->getCE32(c
));
1053 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
1054 utrie2_set32(trie
, c
, ce32
, &errorCode
);
// Removes context-sensitive mappings for the code points of a set.
// set: the code points to process. The loop ends at the first string element
//      (or when the iterator is exhausted), so strings in the set are not processed.
// errorCode: in/out ICU error code. Nothing is done if it already indicates a failure
//            or if the set is empty.
// Two cases are handled per code point:
// - The trie value is FALLBACK_CE32 and the base CE32 has context:
//   the base mapping is copied without its context and stored in the trie.
// - The trie value is a builder-context CE32: it is replaced by the ce32 stored in
//   the ConditionalCE32 that this CE32 refers to, and c is removed from contextChars.
1061 CollationDataBuilder::suppressContractions(const UnicodeSet
&set
, UErrorCode
&errorCode
) {
1062 if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; }
1063 UnicodeSetIterator
iter(set
);
1064 while(iter
.next() && !iter
.isString()) {
1065 UChar32 c
= iter
.getCodepoint();
1066 uint32_t ce32
= utrie2_get32(trie
, c
);
1067 if(ce32
== Collation::FALLBACK_CE32
) {
// c is not tailored here: inspect the base mapping.
1068 ce32
= base
->getFinalCE32(base
->getCE32(c
));
1069 if(Collation::ce32HasContext(ce32
)) {
1070 ce32
= copyFromBaseCE32(c
, ce32
, FALSE
/* without context */, errorCode
);
1071 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1073 } else if(isBuilderContextCE32(ce32
)) {
// c has a list of ConditionalCE32 in this builder: keep only the ce32
// of the entry that the trie value refers to.
1074 ce32
= getConditionalCE32ForCE32(ce32
)->ce32
;
1075 // Simply abandon the list of ConditionalCE32.
1076 // The caller will copy this builder in the end,
1077 // eliminating unreachable data.
1078 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1079 contextChars
.remove(c
);
// Collects one CE32 per conjoining Jamo into jamoCE32s[].
// jamoCE32s: output array; entries 0..CollationData::JAMO_CE32S_LENGTH-1 are written,
//            entry j belonging to the code point jamoCpFromIndex(j).
// errorCode: in/out ICU error code. Returns FALSE immediately if it already indicates
//            a failure; set to U_INTERNAL_PROGRAM_ERROR for CE32 tags that are listed
//            in the last group of case labels below.
// Returns anyJamoAssigned && U_SUCCESS(errorCode). anyJamoAssigned starts out TRUE when
// there is no base (base == NULL) and is OR-ed with Collation::isAssignedCE32() of each
// Jamo's trie value.
// Entries left as FALLBACK_CE32 in the first loop are filled by copyFromBaseCE32()
// (with context) in a second loop, but only if anyJamoAssigned && needToCopyFromBase.
1086 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s
[], UErrorCode
&errorCode
) {
1087 if(U_FAILURE(errorCode
)) { return FALSE
; }
1088 UBool anyJamoAssigned
= base
== NULL
; // always set jamoCE32s in the base data
1089 UBool needToCopyFromBase
= FALSE
;
1090 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) { // Count across Jamo types.
1091 UChar32 jamo
= jamoCpFromIndex(j
);
1092 UBool fromBase
= FALSE
;
1093 uint32_t ce32
= utrie2_get32(trie
, jamo
);
1094 anyJamoAssigned
|= Collation::isAssignedCE32(ce32
);
1095 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
1096 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
// A Jamo that is not mapped in this builder takes its CE32 from the base.
1097 if(ce32
== Collation::FALLBACK_CE32
) {
1099 ce32
= base
->getCE32(jamo
);
// Special CE32s are handled per tag; non-special ones are stored unchanged.
1101 if(Collation::isSpecialCE32(ce32
)) {
1102 switch(Collation::tagFromCE32(ce32
)) {
1103 case Collation::LONG_PRIMARY_TAG
:
1104 case Collation::LONG_SECONDARY_TAG
:
1105 case Collation::LATIN_EXPANSION_TAG
:
1106 // Copy the ce32 as-is.
1108 case Collation::EXPANSION32_TAG
:
1109 case Collation::EXPANSION_TAG
:
1110 case Collation::PREFIX_TAG
:
1111 case Collation::CONTRACTION_TAG
:
// NOTE(review): fromBase is declared above but the condition guarding the two
// assignments below (if any) is not visible in this view -- confirm before relying on it.
1113 // Defer copying until we know if anyJamoAssigned.
1114 ce32
= Collation::FALLBACK_CE32
;
1115 needToCopyFromBase
= TRUE
;
1118 case Collation::IMPLICIT_TAG
:
1119 // An unassigned Jamo should only occur in tests with incomplete bases.
1121 ce32
= Collation::FALLBACK_CE32
;
1122 needToCopyFromBase
= TRUE
;
1124 case Collation::OFFSET_TAG
:
// An offset CE32 is resolved to the CE32 for this particular Jamo.
1125 ce32
= getCE32FromOffsetCE32(fromBase
, jamo
, ce32
);
// The remaining tags are treated as an internal error for a Jamo.
1127 case Collation::FALLBACK_TAG
:
1128 case Collation::RESERVED_TAG_3
:
1129 case Collation::BUILDER_DATA_TAG
:
1130 case Collation::DIGIT_TAG
:
1131 case Collation::U0000_TAG
:
1132 case Collation::HANGUL_TAG
:
1133 case Collation::LEAD_SURROGATE_TAG
:
1134 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
1138 jamoCE32s
[j
] = ce32
;
// Second pass: fill in the deferred entries from the base, with context.
1140 if(anyJamoAssigned
&& needToCopyFromBase
) {
1141 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) {
1142 if(jamoCE32s
[j
] == Collation::FALLBACK_CE32
) {
1143 UChar32 jamo
= jamoCpFromIndex(j
);
1144 jamoCE32s
[j
] = copyFromBaseCE32(jamo
, base
->getCE32(jamo
),
1145 /*withContext=*/ TRUE
, errorCode
);
1149 return anyJamoAssigned
&& U_SUCCESS(errorCode
);
// Re-tags the mappings of decimal digits with Collation::DIGIT_TAG.
// errorCode: in/out ICU error code; set to U_BUFFER_OVERFLOW_ERROR when the index
//            returned by addCE32() exceeds Collation::MAX_INDEX.
// For every code point in [:Nd:] whose trie value is neither FALLBACK_CE32 nor
// UNASSIGNED_CE32, the current CE32 is passed to addCE32() and the trie value is replaced
// by a DIGIT_TAG CE32 built from the returned index and u_charDigitValue(c).
1153 CollationDataBuilder::setDigitTags(UErrorCode
&errorCode
) {
1154 UnicodeSet
digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode
);
1155 if(U_FAILURE(errorCode
)) { return; }
1156 UnicodeSetIterator
iter(digits
);
1157 while(iter
.next()) {
1158 U_ASSERT(!iter
.isString());
1159 UChar32 c
= iter
.getCodepoint();
1160 uint32_t ce32
= utrie2_get32(trie
, c
);
// Digits without their own mapping in this builder are left alone.
1161 if(ce32
!= Collation::FALLBACK_CE32
&& ce32
!= Collation::UNASSIGNED_CE32
) {
1162 int32_t index
= addCE32(ce32
, errorCode
);
1163 if(U_FAILURE(errorCode
)) { return; }
// The index must not exceed Collation::MAX_INDEX.
1164 if(index
> Collation::MAX_INDEX
) {
1165 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
// The new CE32 carries the index of the original CE32 and the digit's numeric value.
1168 ce32
= Collation::makeCE32FromTagIndexAndLength(
1169 Collation::DIGIT_TAG
, index
, u_charDigitValue(c
));
1170 utrie2_set32(trie
, c
, ce32
, &errorCode
);
// Range callback handed to utrie2_enumForLeadSurrogate() by setLeadSurrogates().
// context: points to an int32_t accumulator (*pValue) that summarizes the values seen.
// The start/end parameters are unused (their names are commented out).
// value: the CE32 of the current range. UNASSIGNED_CE32 is mapped to
//        Collation::LEAD_ALL_UNASSIGNED and FALLBACK_CE32 to Collation::LEAD_ALL_FALLBACK.
// The accumulator is set to Collation::LEAD_MIXED when the ranges do not agree.
// NOTE(review): the conditions that select between the three assignments to *pValue
// (and the returned UBool) are only partly visible in this view -- confirm against the
// full function before relying on details.
1177 static UBool U_CALLCONV
1178 enumRangeLeadValue(const void *context
, UChar32
/*start*/, UChar32
/*end*/, uint32_t value
) {
1179 int32_t *pValue
= (int32_t *)context
;
1180 if(value
== Collation::UNASSIGNED_CE32
) {
1181 value
= Collation::LEAD_ALL_UNASSIGNED
;
1182 } else if(value
== Collation::FALLBACK_CE32
) {
1183 value
= Collation::LEAD_ALL_FALLBACK
;
1185 *pValue
= Collation::LEAD_MIXED
;
1189 *pValue
= (int32_t)value
;
// A range whose (mapped) value differs from the recorded one makes the result "mixed".
1190 } else if(*pValue
!= (int32_t)value
) {
1191 *pValue
= Collation::LEAD_MIXED
;
// Sets the trie value of every lead surrogate code unit (U+D800..U+DBFF).
// For each lead surrogate, enumRangeLeadValue() is run over the trie values associated
// with that lead surrogate and accumulates a summary in `value`; the code unit's value
// then becomes a LEAD_SURROGATE_TAG CE32 (index 0) OR-ed with that summary.
// errorCode: in/out ICU error code.
// NOTE(review): the declaration/initialization of `value` and the remaining arguments of
// utrie2_set32ForLeadSurrogateCodeUnit() are not visible in this view.
1200 CollationDataBuilder::setLeadSurrogates(UErrorCode
&errorCode
) {
1201 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
) {
1203 utrie2_enumForLeadSurrogate(trie
, lead
, NULL
, enumRangeLeadValue
, &value
);
1204 utrie2_set32ForLeadSurrogateCodeUnit(
1206 Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG
, 0) | (uint32_t)value
,
// Builds the complete CollationData from this builder.
// data: output; receives the mappings (via buildMappings()), several fields taken over
//       from the base data, and the fast Latin table (via buildFastLatinTable()).
// errorCode: in/out ICU error code, passed through to both helper calls.
// NOTE(review): the fields below are read through `base`; whether that block is guarded
// by a NULL check is not visible in this view -- confirm.
1212 CollationDataBuilder::build(CollationData
&data
, UErrorCode
&errorCode
) {
1213 buildMappings(data
, errorCode
);
// Numeric-primary, compressible-bytes and script data are shared with the base.
1215 data
.numericPrimary
= base
->numericPrimary
;
1216 data
.compressibleBytes
= base
->compressibleBytes
;
1217 data
.numScripts
= base
->numScripts
;
1218 data
.scriptsIndex
= base
->scriptsIndex
;
1219 data
.scriptStarts
= base
->scriptStarts
;
1220 data
.scriptStartsLength
= base
->scriptStartsLength
;
1222 buildFastLatinTable(data
, errorCode
);
// Finalizes the builder's mapping data and exposes it through `data`.
// data: output; receives pointers into this builder's ce32s, ce64s and contexts buffers,
//       their lengths, the Jamo CE32s pointer and a pointer to unsafeBackwardSet.
//       These are pointers into the builder, not copies.
// errorCode: in/out ICU error code. Nothing is done if it already indicates a failure;
//            set to U_INVALID_STATE_ERROR when the trie is NULL or already frozen.
// Steps visible here, in order:
//  1. buildContexts()
//  2. getJamoCE32s(); on success the Jamo CE32s are appended to ce32s and the Hangul
//     syllable blocks get HANGUL_TAG CE32s; there is also a loop that copies the
//     Hangul CE32s from the base block by block
//  3. setDigitTags(), setLeadSurrogates()
//  4. U+0000: its ce32 moves to ce32s[0] and the trie value becomes a U0000_TAG CE32
//  5. the trie is frozen (UTRIE2_32_VALUE_BITS)
//  6. lead surrogates are added to unsafeBackwardSet where needed, then the set is frozen
//  7. the fields of `data` are filled in
// NOTE(review): several branch keywords (else) and closing braces are not visible in this
// view, so the exact nesting of steps 2 and 7 should be confirmed against the full file.
1226 CollationDataBuilder::buildMappings(CollationData
&data
, UErrorCode
&errorCode
) {
1227 if(U_FAILURE(errorCode
)) { return; }
1228 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
1229 errorCode
= U_INVALID_STATE_ERROR
;
1233 buildContexts(errorCode
);
1235 uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
];
// jamoIndex stays -1 unless the Jamo CE32s are stored in this builder's ce32s.
1236 int32_t jamoIndex
= -1;
1237 if(getJamoCE32s(jamoCE32s
, errorCode
)) {
// Append the Jamo CE32s as one contiguous run starting at jamoIndex.
1238 jamoIndex
= ce32s
.size();
1239 for(int32_t i
= 0; i
< CollationData::JAMO_CE32S_LENGTH
; ++i
) {
1240 ce32s
.addElement((int32_t)jamoCE32s
[i
], errorCode
);
1242 // Small optimization: Use a bit in the Hangul ce32
1243 // to indicate that none of the Jamo CE32s are isSpecialCE32()
1244 // (as it should be in the root collator).
1245 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1246 // In order to still have good trie compression and keep this code simple,
1247 // we only set this flag if a whole block of 588 Hangul syllables starting with
1248 // a common leading consonant (Jamo L) has this property.
// The scan starts after the Jamo L entries, i.e. it covers the V and T Jamo.
1249 UBool isAnyJamoVTSpecial
= FALSE
;
1250 for(int32_t i
= Hangul::JAMO_L_COUNT
; i
< CollationData::JAMO_CE32S_LENGTH
; ++i
) {
1251 if(Collation::isSpecialCE32(jamoCE32s
[i
])) {
1252 isAnyJamoVTSpecial
= TRUE
;
1256 uint32_t hangulCE32
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0);
1257 UChar32 c
= Hangul::HANGUL_BASE
;
1258 for(int32_t i
= 0; i
< Hangul::JAMO_L_COUNT
; ++i
) { // iterate over the Jamo L
1259 uint32_t ce32
= hangulCE32
;
// The flag is set per block only if no V/T Jamo and not this L Jamo is special.
1260 if(!isAnyJamoVTSpecial
&& !Collation::isSpecialCE32(jamoCE32s
[i
])) {
1261 ce32
|= Collation::HANGUL_NO_SPECIAL_JAMO
;
// One block covers JAMO_VT_COUNT syllables: [c, limit - 1].
1263 UChar32 limit
= c
+ Hangul::JAMO_VT_COUNT
;
1264 utrie2_setRange32(trie
, c
, limit
- 1, ce32
, TRUE
, &errorCode
);
1268 // Copy the Hangul CE32s from the base in blocks per Jamo L,
1269 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1270 for(UChar32 c
= Hangul::HANGUL_BASE
; c
< Hangul::HANGUL_LIMIT
;) {
1271 uint32_t ce32
= base
->getCE32(c
);
1272 U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::HANGUL_TAG
));
1273 UChar32 limit
= c
+ Hangul::JAMO_VT_COUNT
;
1274 utrie2_setRange32(trie
, c
, limit
- 1, ce32
, TRUE
, &errorCode
);
1279 setDigitTags(errorCode
);
1280 setLeadSurrogates(errorCode
);
1282 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1283 ce32s
.setElementAt((int32_t)utrie2_get32(trie
, 0), 0);
1284 utrie2_set32(trie
, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG
, 0), &errorCode
);
// From here on the trie can no longer be modified.
1286 utrie2_freeze(trie
, UTRIE2_32_VALUE_BITS
, &errorCode
);
1287 if(U_FAILURE(errorCode
)) { return; }
1289 // Mark each lead surrogate as "unsafe"
1290 // if any of its 1024 associated supplementary code points is "unsafe".
// c tracks the first supplementary code point of the current lead surrogate's block.
1291 UChar32 c
= 0x10000;
1292 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
, c
+= 0x400) {
1293 if(unsafeBackwardSet
.containsSome(c
, c
+ 0x3ff)) {
1294 unsafeBackwardSet
.add(lead
);
1297 unsafeBackwardSet
.freeze();
// Expose the builder's buffers through data (no copies are made here).
1300 data
.ce32s
= reinterpret_cast<const uint32_t *>(ce32s
.getBuffer());
1301 data
.ces
= ce64s
.getBuffer();
1302 data
.contexts
= contexts
.getBuffer();
1304 data
.ce32sLength
= ce32s
.size();
1305 data
.cesLength
= ce64s
.size();
1306 data
.contextsLength
= contexts
.length();
// Jamo CE32s: either the run appended above, or the base's array.
1309 if(jamoIndex
>= 0) {
1310 data
.jamoCE32s
= data
.ce32s
+ jamoIndex
;
1312 data
.jamoCE32s
= base
->jamoCE32s
;
1314 data
.unsafeBackwardSet
= &unsafeBackwardSet
;
// Resets the cached builtCE32 of every character that has context-sensitive mappings.
// For each code point in contextChars, the trie value must be a builder-context CE32
// (asserted); the builtCE32 field of the ConditionalCE32 it refers to is set back to
// Collation::NO_CE32.
1318 CollationDataBuilder::clearContexts() {
1320 UnicodeSetIterator
iter(contextChars
);
1321 while(iter
.next()) {
// contextChars is expected to contain single code points only.
1322 U_ASSERT(!iter
.isString());
1323 uint32_t ce32
= utrie2_get32(trie
, iter
.getCodepoint());
1324 U_ASSERT(isBuilderContextCE32(ce32
));
1325 getConditionalCE32ForCE32(ce32
)->builtCE32
= Collation::NO_CE32
;
// Builds the runtime context data for every character in contextChars.
// errorCode: in/out ICU error code. Nothing is done if it already indicates a failure;
//            set to U_INTERNAL_PROGRAM_ERROR if a character in contextChars does not have
//            a builder-context CE32 in the trie. The loop stops once errorCode is a failure.
// For each such character, buildContext() turns its list of ConditionalCE32 into a CE32,
// which then replaces the builder-context CE32 in the trie.
1330 CollationDataBuilder::buildContexts(UErrorCode
&errorCode
) {
1331 if(U_FAILURE(errorCode
)) { return; }
1332 // Ignore abandoned lists and the cached builtCE32,
1333 // and build all contexts from scratch.
1335 UnicodeSetIterator
iter(contextChars
);
1336 while(U_SUCCESS(errorCode
) && iter
.next()) {
1337 U_ASSERT(!iter
.isString());
1338 UChar32 c
= iter
.getCodepoint();
1339 uint32_t ce32
= utrie2_get32(trie
, c
);
1340 if(!isBuilderContextCE32(ce32
)) {
1341 // Impossible: No context data for c in contextChars.
1342 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
// Replace the build-time list reference with the CE32 built from the list.
1345 ConditionalCE32
*cond
= getConditionalCE32ForCE32(ce32
);
1346 ce32
= buildContext(cond
, errorCode
);
1347 utrie2_set32(trie
, c
, ce32
, &errorCode
);
// Builds the prefix and contraction data for one character's list of ConditionalCE32.
// head: first entry of the list. It must have no context (asserted) and must be followed
//       by at least one more entry (asserted).
// errorCode: in/out ICU error code. Returns 0 if it already indicates a failure or if
//            addContextTrie() fails; set to U_BUFFER_OVERFLOW_ERROR when an index returned
//            by addContextTrie() exceeds Collation::MAX_INDEX.
// Returns (at the end of the function) a PREFIX_TAG CE32 whose index comes from
// addContextTrie(head->defaultCE32, prefixBuilder, ...).
// Outline of the outer loop: entries are grouped by prefix (the leading prefixLength + 1
// units of context, the first unit being the length). For each group either the single
// entry's ce32 is used, or a contraction trie is built from the suffixes and referenced by
// a CONTRACTION_TAG CE32 with flags. The group's CE32 is stored in firstCond->defaultCE32
// and, for non-empty prefixes, added to prefixBuilder under the prefix without its
// length unit.
// NOTE(review): declarations of `ce32` and `flags`, several else branches, the loop around
// contractionBuilder.add() and an apparent early exit for the "only contractions" case are
// not visible in this view -- confirm the exact control flow against the full file.
1352 CollationDataBuilder::buildContext(ConditionalCE32
*head
, UErrorCode
&errorCode
) {
1353 if(U_FAILURE(errorCode
)) { return 0; }
1354 // The list head must have no context.
1355 U_ASSERT(!head
->hasContext());
1356 // The list head must be followed by one or more nodes that all do have context.
1357 U_ASSERT(head
->next
>= 0);
1358 UCharsTrieBuilder
prefixBuilder(errorCode
);
1359 UCharsTrieBuilder
contractionBuilder(errorCode
);
1360 for(ConditionalCE32
*cond
= head
;; cond
= getConditionalCE32(cond
->next
)) {
1361 // After the list head, the prefix or suffix can be empty, but not both.
1362 U_ASSERT(cond
== head
|| cond
->hasContext());
// prefix = the length unit plus the prefix characters of this entry's context.
1363 int32_t prefixLength
= cond
->prefixLength();
1364 UnicodeString
prefix(cond
->context
, 0, prefixLength
+ 1);
1365 // Collect all contraction suffixes for one prefix.
1366 ConditionalCE32
*firstCond
= cond
;
1367 ConditionalCE32
*lastCond
= cond
;
// Advance while the following entries share the same prefix.
1368 while(cond
->next
>= 0 &&
1369 (cond
= getConditionalCE32(cond
->next
))->context
.startsWith(prefix
)) {
1373 int32_t suffixStart
= prefixLength
+ 1; // == prefix.length()
1374 if(lastCond
->context
.length() == suffixStart
) {
1375 // One prefix without contraction suffix.
1376 U_ASSERT(firstCond
== lastCond
);
1377 ce32
= lastCond
->ce32
;
1380 // Build the contractions trie.
1381 contractionBuilder
.clear();
1382 // Entry for an empty suffix, to be stored before the trie.
1383 uint32_t emptySuffixCE32
= 0;
1385 if(firstCond
->context
.length() == suffixStart
) {
1386 // There is a mapping for the prefix and the single character c. (p|c)
1387 // If no other suffix matches, then we return this value.
1388 emptySuffixCE32
= firstCond
->ce32
;
1389 cond
= getConditionalCE32(firstCond
->next
);
1391 // There is no mapping for the prefix and just the single character.
1392 // (There is no p|c, only p|cd, p|ce etc.)
1393 flags
|= Collation::CONTRACT_SINGLE_CP_NO_MATCH
;
1394 // When the prefix matches but none of the prefix-specific suffixes,
1395 // then we fall back to the mappings with the next-longest prefix,
1396 // and ultimately to mappings with no prefix.
1397 // Each fallback might be another set of contractions.
1398 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1399 // then in text "pch" we find the ch contraction.
// Walk the list from the head up to (not including) entries with this prefix length.
1400 for(cond
= head
;; cond
= getConditionalCE32(cond
->next
)) {
1401 int32_t length
= cond
->prefixLength();
1402 if(length
== prefixLength
) { break; }
// Use the defaultCE32 of an entry whose prefix is empty or a suffix of this prefix.
1403 if(cond
->defaultCE32
!= Collation::NO_CE32
&&
1404 (length
==0 || prefix
.endsWith(cond
->context
, 1, length
))) {
1405 emptySuffixCE32
= cond
->defaultCE32
;
1410 // Optimization: Set a flag when
1411 // the first character of every contraction suffix has lccc!=0.
1412 // Short-circuits contraction matching when a normal letter follows.
1413 flags
|= Collation::CONTRACT_NEXT_CCC
;
1414 // Add all of the non-empty suffixes into the contraction trie.
1416 UnicodeString
suffix(cond
->context
, suffixStart
);
// FCD data of the first suffix code point decides whether CONTRACT_NEXT_CCC stays set.
1417 uint16_t fcd16
= nfcImpl
.getFCD16(suffix
.char32At(0));
1419 flags
&= ~Collation::CONTRACT_NEXT_CCC
;
// FCD data looked up at the last code unit index of the suffix.
1421 fcd16
= nfcImpl
.getFCD16(suffix
.char32At(suffix
.length() - 1));
1423 // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1424 flags
|= Collation::CONTRACT_TRAILING_CCC
;
1426 contractionBuilder
.add(suffix
, (int32_t)cond
->ce32
, errorCode
);
1427 if(cond
== lastCond
) { break; }
1428 cond
= getConditionalCE32(cond
->next
);
// Store the contraction trie (preceded by the empty-suffix CE32) in contexts.
1430 int32_t index
= addContextTrie(emptySuffixCE32
, contractionBuilder
, errorCode
);
1431 if(U_FAILURE(errorCode
)) { return 0; }
1432 if(index
> Collation::MAX_INDEX
) {
1433 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1436 ce32
= Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG
, index
) | flags
;
1438 U_ASSERT(cond
== lastCond
);
// Remember this group's CE32 on its first entry; the fallback search above reads it.
1439 firstCond
->defaultCE32
= ce32
;
1440 if(prefixLength
== 0) {
1441 if(cond
->next
< 0) {
1442 // No non-empty prefixes, only contractions.
1446 prefix
.remove(0, 1); // Remove the length unit.
1448 prefixBuilder
.add(prefix
, (int32_t)ce32
, errorCode
);
1449 if(cond
->next
< 0) { break; }
// Store the prefix trie, preceded by the head's default CE32.
1452 U_ASSERT(head
->defaultCE32
!= Collation::NO_CE32
);
1453 int32_t index
= addContextTrie(head
->defaultCE32
, prefixBuilder
, errorCode
);
1454 if(U_FAILURE(errorCode
)) { return 0; }
1455 if(index
> Collation::MAX_INDEX
) {
1456 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1459 return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG
, index
);
// Serializes a default CE32 plus a built trie and stores the result in `contexts`.
// defaultCE32: written first, as two UChars (upper 16 bits, then lower 16 bits).
// trieBuilder: its trie is built with USTRINGTRIE_BUILD_SMALL and appended after that.
// errorCode: in/out ICU error code; the function returns -1 if it indicates a failure
//            after building.
// `contexts` is searched for an identical string first (indexOf), so equal data can be
// shared; new data is appended at the end, and its index is the previous length of contexts.
// NOTE(review): the test on the indexOf() result and the final return statement are not
// visible in this view; presumably the index is returned.
1463 CollationDataBuilder::addContextTrie(uint32_t defaultCE32
, UCharsTrieBuilder
&trieBuilder
,
1464 UErrorCode
&errorCode
) {
1465 UnicodeString context
;
1466 context
.append((UChar
)(defaultCE32
>> 16)).append((UChar
)defaultCE32
);
1467 UnicodeString trieString
;
1468 context
.append(trieBuilder
.buildUnicodeString(USTRINGTRIE_BUILD_SMALL
, trieString
, errorCode
));
1469 if(U_FAILURE(errorCode
)) { return -1; }
// Look for an existing, identical serialized context.
1470 int32_t index
= contexts
.indexOf(context
);
1472 index
= contexts
.length();
1473 contexts
.append(context
);
// Builds the fast Latin table for `data` and stores it in data.fastLatinTable.
// data: the collation data to build the table for; receives the table pointer and length.
// errorCode: in/out ICU error code. Nothing is done if it already indicates a failure or
//            if fastLatinEnabled is false; set to U_MEMORY_ALLOCATION_ERROR when the
//            CollationFastLatinBuilder cannot be allocated.
// Any previous fastLatinBuilder is deleted first. If the new table has the same length and
// bytes as the base's table, the new builder is deleted and the base's table is used.
// The builder is also deleted (and the member reset to NULL) on a second path.
// NOTE(review): that second path is presumably the branch where forData() fails; the
// branch keyword is not visible in this view.
1479 CollationDataBuilder::buildFastLatinTable(CollationData
&data
, UErrorCode
&errorCode
) {
1480 if(U_FAILURE(errorCode
) || !fastLatinEnabled
) { return; }
1482 delete fastLatinBuilder
;
1483 fastLatinBuilder
= new CollationFastLatinBuilder(errorCode
);
1484 if(fastLatinBuilder
== NULL
) {
1485 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
1488 if(fastLatinBuilder
->forData(data
, errorCode
)) {
1489 const uint16_t *table
= fastLatinBuilder
->getTable();
1490 int32_t length
= fastLatinBuilder
->lengthOfTable();
// length counts uint16_t units, hence the byte count length * 2 for the comparison.
1491 if(base
!= NULL
&& length
== base
->fastLatinTableLength
&&
1492 uprv_memcmp(table
, base
->fastLatinTable
, length
* 2) == 0) {
1493 // Same fast Latin table as in the base, use that one instead.
1494 delete fastLatinBuilder
;
1495 fastLatinBuilder
= NULL
;
1496 table
= base
->fastLatinTable
;
1498 data
.fastLatinTable
= table
;
1499 data
.fastLatinTableLength
= length
;
1501 delete fastLatinBuilder
;
1502 fastLatinBuilder
= NULL
;
// Fetches the CEs for a string without a prefix.
// s: the text. ces: output array. cesLength: passed through unchanged.
// Delegates to getCEs(s, start, ces, cesLength) with start = 0 and returns its result.
1507 CollationDataBuilder::getCEs(const UnicodeString
&s
, int64_t ces
[], int32_t cesLength
) {
1508 return getCEs(s
, 0, ces
, cesLength
);
// Fetches the CEs for a string that is preceded by a prefix.
// prefix: context before s; may be empty. s: the text.
// ces: output array. cesLength: passed through unchanged.
// With an empty prefix this is the same as getCEs(s, 0, ...). Otherwise the concatenation
// prefix + s is processed with start = prefix.length().
1512 CollationDataBuilder::getCEs(const UnicodeString
&prefix
, const UnicodeString
&s
,
1513 int64_t ces
[], int32_t cesLength
) {
1514 int32_t prefixLength
= prefix
.length();
1515 if(prefixLength
== 0) {
1516 return getCEs(s
, 0, ces
, cesLength
);
1518 return getCEs(prefix
+ s
, prefixLength
, ces
, cesLength
);
// Fetches CEs for s beginning at index start, using the builder's own iterator.
// s: the text. start: index passed on to fetchCEs().
// ces: output array. cesLength: passed through unchanged.
// collIter is created on first use (a DataBuilderCollationIterator over this builder)
// and kept for later calls; if the allocation fails the function returns 0.
// Otherwise returns the result of collIter->fetchCEs().
1523 CollationDataBuilder::getCEs(const UnicodeString
&s
, int32_t start
,
1524 int64_t ces
[], int32_t cesLength
) {
1525 if(collIter
== NULL
) {
1526 collIter
= new DataBuilderCollationIterator(*this);
1527 if(collIter
== NULL
) { return 0; }
1529 return collIter
->fetchCEs(s
, start
, ces
, cesLength
);
1534 #endif // !UCONFIG_NO_COLLATION