1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationdatabuilder.cpp
10 * (replaced the former ucol_elm.cpp)
12 * created on: 2012apr01
13 * created by: Markus W. Scherer
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/localpointer.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/ucharstriebuilder.h"
24 #include "unicode/uniset.h"
25 #include "unicode/unistr.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/utf16.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationdatabuilder.h"
32 #include "collationfastlatinbuilder.h"
33 #include "collationiterator.h"
34 #include "normalizer2impl.h"
42 CollationDataBuilder::CEModifier::~CEModifier() {}
45 * Build-time context and CE32 for a code point.
46 * If a code point has contextual mappings, then the default (no-context) mapping
47 * and all conditional mappings are stored in a singly-linked list
48 * of ConditionalCE32, sorted by context strings.
50 * Context strings sort by prefix length, then by prefix, then by contraction suffix.
51 * Context strings must be unique and in ascending order.
53 struct ConditionalCE32
: public UMemory
{
56 ce32(0), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
),
58 ConditionalCE32(const UnicodeString
&ct
, uint32_t ce
)
60 ce32(ce
), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
),
63 inline UBool
hasContext() const { return context
.length() > 1; }
64 inline int32_t prefixLength() const { return context
.charAt(0); }
67 * "\0" for the first entry for any code point, with its default CE32.
69 * Otherwise one unit with the length of the prefix string,
70 * then the prefix string, then the contraction suffix.
72 UnicodeString context
;
74 * CE32 for the code point and its context.
75 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
79 * Default CE32 for all contexts with this same prefix.
80 * Initially NO_CE32. Set only while building runtime data structures,
81 * and only on one of the nodes of a sub-list with the same prefix.
85 * CE32 for the built contexts.
86 * When fetching CEs from the builder, the contexts are built into their runtime form
87 * so that the normal collation implementation can process them.
88 * The result is cached in the list head. It is reset when the contexts are modified.
92 * Index of the next ConditionalCE32.
93 * Negative for the end of the list.
100 U_CAPI
void U_CALLCONV
101 uprv_deleteConditionalCE32(void *obj
) {
102 delete static_cast<ConditionalCE32
*>(obj
);
108 * Build-time collation element and character iterator.
109 * Uses the runtime CollationIterator for fetching CEs for a string
110 * but reads from the builder's unfinished data structures.
111 * In particular, this class reads from the unfinished trie
112 * and has to avoid CollationIterator::nextCE() and redirect other
113 * calls to data->getCE32() and data->getCE32FromSupplementary().
115 * We do this so that we need not implement the collation algorithm
116 * again for the builder and make it behave exactly like the runtime code.
117 * That would be more difficult to test and maintain than this indirection.
119 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
120 * so the data accesses from those code paths need not be modified.
122 * This class iterates directly over whole code points
123 * so that the CollationIterator does not need the finished trie
124 * for handling the LEAD_SURROGATE_TAG.
126 class DataBuilderCollationIterator
: public CollationIterator
{
128 DataBuilderCollationIterator(CollationDataBuilder
&b
);
130 virtual ~DataBuilderCollationIterator();
132 int32_t fetchCEs(const UnicodeString
&str
, int32_t start
, int64_t ces
[], int32_t cesLength
);
134 virtual void resetToOffset(int32_t newOffset
);
135 virtual int32_t getOffset() const;
137 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
138 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
141 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
142 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
144 virtual uint32_t getDataCE32(UChar32 c
) const;
145 virtual uint32_t getCE32FromBuilderData(uint32_t ce32
, UErrorCode
&errorCode
);
147 CollationDataBuilder
&builder
;
148 CollationData builderData
;
149 uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
];
150 const UnicodeString
*s
;
154 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder
&b
)
155 : CollationIterator(&builderData
, /*numeric=*/ FALSE
),
156 builder(b
), builderData(b
.nfcImpl
),
158 builderData
.base
= builder
.base
;
159 // Set all of the jamoCE32s[] to indirection CE32s.
160 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) { // Count across Jamo types.
161 UChar32 jamo
= CollationDataBuilder::jamoCpFromIndex(j
);
162 jamoCE32s
[j
] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG
, jamo
) |
163 CollationDataBuilder::IS_BUILDER_JAMO_CE32
;
165 builderData
.jamoCE32s
= jamoCE32s
;
168 DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
171 DataBuilderCollationIterator::fetchCEs(const UnicodeString
&str
, int32_t start
,
172 int64_t ces
[], int32_t cesLength
) {
173 // Set the pointers each time, in case they changed due to reallocation.
174 builderData
.ce32s
= reinterpret_cast<const uint32_t *>(builder
.ce32s
.getBuffer());
175 builderData
.ces
= builder
.ce64s
.getBuffer();
176 builderData
.contexts
= builder
.contexts
.getBuffer();
177 // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
181 UErrorCode errorCode
= U_ZERO_ERROR
;
182 while(U_SUCCESS(errorCode
) && pos
< s
->length()) {
183 // No need to keep all CEs in the iterator buffer.
185 UChar32 c
= s
->char32At(pos
);
186 pos
+= U16_LENGTH(c
);
187 uint32_t ce32
= utrie2_get32(builder
.trie
, c
);
188 const CollationData
*d
;
189 if(ce32
== Collation::FALLBACK_CE32
) {
191 ce32
= builder
.base
->getCE32(c
);
195 appendCEsFromCE32(d
, c
, ce32
, /*forward=*/ TRUE
, errorCode
);
196 U_ASSERT(U_SUCCESS(errorCode
));
197 for(int32_t i
= 0; i
< getCEsLength(); ++i
) {
198 int64_t ce
= getCE(i
);
200 if(cesLength
< Collation::MAX_EXPANSION_LENGTH
) {
211 DataBuilderCollationIterator::resetToOffset(int32_t newOffset
) {
217 DataBuilderCollationIterator::getOffset() const {
222 DataBuilderCollationIterator::nextCodePoint(UErrorCode
& /*errorCode*/) {
223 if(pos
== s
->length()) {
226 UChar32 c
= s
->char32At(pos
);
227 pos
+= U16_LENGTH(c
);
232 DataBuilderCollationIterator::previousCodePoint(UErrorCode
& /*errorCode*/) {
236 UChar32 c
= s
->char32At(pos
- 1);
237 pos
-= U16_LENGTH(c
);
242 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
243 pos
= s
->moveIndex32(pos
, num
);
247 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
248 pos
= s
->moveIndex32(pos
, -num
);
252 DataBuilderCollationIterator::getDataCE32(UChar32 c
) const {
253 return utrie2_get32(builder
.trie
, c
);
257 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32
, UErrorCode
&errorCode
) {
258 U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::BUILDER_DATA_TAG
));
259 if((ce32
& CollationDataBuilder::IS_BUILDER_JAMO_CE32
) != 0) {
260 UChar32 jamo
= Collation::indexFromCE32(ce32
);
261 return utrie2_get32(builder
.trie
, jamo
);
263 ConditionalCE32
*cond
= builder
.getConditionalCE32ForCE32(ce32
);
264 if(cond
->builtCE32
== Collation::NO_CE32
) {
265 // Build the context-sensitive mappings into their runtime form and cache the result.
266 cond
->builtCE32
= builder
.buildContext(cond
, errorCode
);
267 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
268 errorCode
= U_ZERO_ERROR
;
269 builder
.clearContexts();
270 cond
->builtCE32
= builder
.buildContext(cond
, errorCode
);
272 builderData
.contexts
= builder
.contexts
.getBuffer();
274 return cond
->builtCE32
;
278 // ------------------------------------------------------------------------- ***
280 CollationDataBuilder::CollationDataBuilder(UErrorCode
&errorCode
)
281 : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode
)),
282 base(NULL
), baseSettings(NULL
),
284 ce32s(errorCode
), ce64s(errorCode
), conditionalCE32s(errorCode
),
286 fastLatinEnabled(FALSE
), fastLatinBuilder(NULL
),
288 // Reserve the first CE32 for U+0000.
289 ce32s
.addElement(0, errorCode
);
290 conditionalCE32s
.setDeleter(uprv_deleteConditionalCE32
);
293 CollationDataBuilder::~CollationDataBuilder() {
295 delete fastLatinBuilder
;
300 CollationDataBuilder::initForTailoring(const CollationData
*b
, UErrorCode
&errorCode
) {
301 if(U_FAILURE(errorCode
)) { return; }
303 errorCode
= U_INVALID_STATE_ERROR
;
307 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
312 // For a tailoring, the default is to fall back to the base.
313 trie
= utrie2_open(Collation::FALLBACK_CE32
, Collation::FFFD_CE32
, &errorCode
);
315 // Set the Latin-1 letters block so that it is allocated first in the data array,
316 // to try to improve locality of reference when sorting Latin-1 text.
317 // Do not use utrie2_setRange32() since that will not actually allocate blocks
318 // that are filled with the default value.
319 // ASCII (0..7F) is already preallocated anyway.
320 for(UChar32 c
= 0xc0; c
<= 0xff; ++c
) {
321 utrie2_set32(trie
, c
, Collation::FALLBACK_CE32
, &errorCode
);
324 // Hangul syllables are not tailorable (except via tailoring Jamos).
325 // Always set the Hangul tag to help performance.
326 // Do this here, rather than in buildMappings(),
327 // so that we see the HANGUL_TAG in various assertions.
328 uint32_t hangulCE32
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0);
329 utrie2_setRange32(trie
, Hangul::HANGUL_BASE
, Hangul::HANGUL_END
, hangulCE32
, TRUE
, &errorCode
);
331 // Copy the set contents but don't copy/clone the set as a whole because
332 // that would copy the isFrozen state too.
333 unsafeBackwardSet
.addAll(*b
->unsafeBackwardSet
);
335 if(U_FAILURE(errorCode
)) { return; }
339 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start
, UChar32 end
,
340 uint32_t primary
, int32_t step
,
341 UErrorCode
&errorCode
) {
342 if(U_FAILURE(errorCode
)) { return FALSE
; }
343 U_ASSERT(start
<= end
);
344 // TODO: Do we need to check what values are currently set for start..end?
345 // An offset range is worth it only if we can achieve an overlap between
346 // adjacent UTrie2 blocks of 32 code points each.
347 // An offset CE is also a little more expensive to look up and compute
349 // If the range spans at least three UTrie2 block boundaries (> 64 code points),
351 // If the range spans one or two block boundaries and there are
352 // at least 4 code points on either side, then we take it.
353 // (We could additionally require a minimum range length of, say, 16.)
354 int32_t blockDelta
= (end
>> 5) - (start
>> 5);
355 if(2 <= step
&& step
<= 0x7f &&
357 (blockDelta
> 0 && (start
& 0x1f) <= 0x1c && (end
& 0x1f) >= 3))) {
358 int64_t dataCE
= ((int64_t)primary
<< 32) | (start
<< 8) | step
;
359 if(isCompressiblePrimary(primary
)) { dataCE
|= 0x80; }
360 int32_t index
= addCE(dataCE
, errorCode
);
361 if(U_FAILURE(errorCode
)) { return 0; }
362 if(index
> Collation::MAX_INDEX
) {
363 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
366 uint32_t offsetCE32
= Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG
, index
);
367 utrie2_setRange32(trie
, start
, end
, offsetCE32
, TRUE
, &errorCode
);
376 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start
, UChar32 end
,
377 uint32_t primary
, int32_t step
,
378 UErrorCode
&errorCode
) {
379 if(U_FAILURE(errorCode
)) { return 0; }
380 UBool isCompressible
= isCompressiblePrimary(primary
);
381 if(maybeSetPrimaryRange(start
, end
, primary
, step
, errorCode
)) {
382 return Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
,
383 (end
- start
+ 1) * step
);
385 // Short range: Set individual CE32s.
387 utrie2_set32(trie
, start
, Collation::makeLongPrimaryCE32(primary
), &errorCode
);
389 primary
= Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
, step
);
390 if(start
> end
) { return primary
; }
397 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase
, UChar32 c
, uint32_t ce32
) const {
398 int32_t i
= Collation::indexFromCE32(ce32
);
399 int64_t dataCE
= fromBase
? base
->ces
[i
] : ce64s
.elementAti(i
);
400 uint32_t p
= Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
);
401 return Collation::makeLongPrimaryCE32(p
);
405 CollationDataBuilder::isCompressibleLeadByte(uint32_t b
) const {
406 return base
->isCompressibleLeadByte(b
);
410 CollationDataBuilder::isAssigned(UChar32 c
) const {
411 return Collation::isAssignedCE32(utrie2_get32(trie
, c
));
415 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c
) const {
416 uint32_t ce32
= utrie2_get32(trie
, c
);
417 if(Collation::isLongPrimaryCE32(ce32
)) {
418 return Collation::primaryFromLongPrimaryCE32(ce32
);
425 CollationDataBuilder::getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const {
426 if(U_FAILURE(errorCode
)) { return 0; }
427 // Keep parallel with CollationData::getSingleCE().
428 UBool fromBase
= FALSE
;
429 uint32_t ce32
= utrie2_get32(trie
, c
);
430 if(ce32
== Collation::FALLBACK_CE32
) {
432 ce32
= base
->getCE32(c
);
434 while(Collation::isSpecialCE32(ce32
)) {
435 switch(Collation::tagFromCE32(ce32
)) {
436 case Collation::LATIN_EXPANSION_TAG
:
437 case Collation::BUILDER_DATA_TAG
:
438 case Collation::PREFIX_TAG
:
439 case Collation::CONTRACTION_TAG
:
440 case Collation::HANGUL_TAG
:
441 case Collation::LEAD_SURROGATE_TAG
:
442 errorCode
= U_UNSUPPORTED_ERROR
;
444 case Collation::FALLBACK_TAG
:
445 case Collation::RESERVED_TAG_3
:
446 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
448 case Collation::LONG_PRIMARY_TAG
:
449 return Collation::ceFromLongPrimaryCE32(ce32
);
450 case Collation::LONG_SECONDARY_TAG
:
451 return Collation::ceFromLongSecondaryCE32(ce32
);
452 case Collation::EXPANSION32_TAG
:
453 if(Collation::lengthFromCE32(ce32
) == 1) {
454 int32_t i
= Collation::indexFromCE32(ce32
);
455 ce32
= fromBase
? base
->ce32s
[i
] : ce32s
.elementAti(i
);
458 errorCode
= U_UNSUPPORTED_ERROR
;
461 case Collation::EXPANSION_TAG
: {
462 if(Collation::lengthFromCE32(ce32
) == 1) {
463 int32_t i
= Collation::indexFromCE32(ce32
);
464 return fromBase
? base
->ces
[i
] : ce64s
.elementAti(i
);
466 errorCode
= U_UNSUPPORTED_ERROR
;
470 case Collation::DIGIT_TAG
:
471 // Fetch the non-numeric-collation CE32 and continue.
472 ce32
= ce32s
.elementAti(Collation::indexFromCE32(ce32
));
474 case Collation::U0000_TAG
:
476 // Fetch the normal ce32 for U+0000 and continue.
477 ce32
= fromBase
? base
->ce32s
[0] : ce32s
.elementAti(0);
479 case Collation::OFFSET_TAG
:
480 ce32
= getCE32FromOffsetCE32(fromBase
, c
, ce32
);
482 case Collation::IMPLICIT_TAG
:
483 return Collation::unassignedCEFromCodePoint(c
);
486 return Collation::ceFromSimpleCE32(ce32
);
490 CollationDataBuilder::addCE(int64_t ce
, UErrorCode
&errorCode
) {
491 int32_t length
= ce64s
.size();
492 for(int32_t i
= 0; i
< length
; ++i
) {
493 if(ce
== ce64s
.elementAti(i
)) { return i
; }
495 ce64s
.addElement(ce
, errorCode
);
500 CollationDataBuilder::addCE32(uint32_t ce32
, UErrorCode
&errorCode
) {
501 int32_t length
= ce32s
.size();
502 for(int32_t i
= 0; i
< length
; ++i
) {
503 if(ce32
== (uint32_t)ce32s
.elementAti(i
)) { return i
; }
505 ce32s
.addElement((int32_t)ce32
, errorCode
);
510 CollationDataBuilder::addConditionalCE32(const UnicodeString
&context
, uint32_t ce32
,
511 UErrorCode
&errorCode
) {
512 if(U_FAILURE(errorCode
)) { return -1; }
513 U_ASSERT(!context
.isEmpty());
514 int32_t index
= conditionalCE32s
.size();
515 if(index
> Collation::MAX_INDEX
) {
516 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
519 ConditionalCE32
*cond
= new ConditionalCE32(context
, ce32
);
521 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
524 conditionalCE32s
.addElement(cond
, errorCode
);
529 CollationDataBuilder::add(const UnicodeString
&prefix
, const UnicodeString
&s
,
530 const int64_t ces
[], int32_t cesLength
,
531 UErrorCode
&errorCode
) {
532 uint32_t ce32
= encodeCEs(ces
, cesLength
, errorCode
);
533 addCE32(prefix
, s
, ce32
, errorCode
);
537 CollationDataBuilder::addCE32(const UnicodeString
&prefix
, const UnicodeString
&s
,
538 uint32_t ce32
, UErrorCode
&errorCode
) {
539 if(U_FAILURE(errorCode
)) { return; }
541 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
544 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
545 errorCode
= U_INVALID_STATE_ERROR
;
548 UChar32 c
= s
.char32At(0);
549 int32_t cLength
= U16_LENGTH(c
);
550 uint32_t oldCE32
= utrie2_get32(trie
, c
);
551 UBool hasContext
= !prefix
.isEmpty() || s
.length() > cLength
;
552 if(oldCE32
== Collation::FALLBACK_CE32
) {
553 // First tailoring for c.
554 // If c has contextual base mappings or if we add a contextual mapping,
555 // then copy the base mappings.
556 // Otherwise we just override the base mapping.
557 uint32_t baseCE32
= base
->getFinalCE32(base
->getCE32(c
));
558 if(hasContext
|| Collation::ce32HasContext(baseCE32
)) {
559 oldCE32
= copyFromBaseCE32(c
, baseCE32
, TRUE
, errorCode
);
560 utrie2_set32(trie
, c
, oldCE32
, &errorCode
);
561 if(U_FAILURE(errorCode
)) { return; }
565 // No prefix, no contraction.
566 if(!isBuilderContextCE32(oldCE32
)) {
567 utrie2_set32(trie
, c
, ce32
, &errorCode
);
569 ConditionalCE32
*cond
= getConditionalCE32ForCE32(oldCE32
);
570 cond
->builtCE32
= Collation::NO_CE32
;
574 ConditionalCE32
*cond
;
575 if(!isBuilderContextCE32(oldCE32
)) {
576 // Replace the simple oldCE32 with a builder context CE32
577 // pointing to a new ConditionalCE32 list head.
578 int32_t index
= addConditionalCE32(UnicodeString((UChar
)0), oldCE32
, errorCode
);
579 if(U_FAILURE(errorCode
)) { return; }
580 uint32_t contextCE32
= makeBuilderContextCE32(index
);
581 utrie2_set32(trie
, c
, contextCE32
, &errorCode
);
583 cond
= getConditionalCE32(index
);
585 cond
= getConditionalCE32ForCE32(oldCE32
);
586 cond
->builtCE32
= Collation::NO_CE32
;
588 UnicodeString
suffix(s
, cLength
);
589 UnicodeString
context((UChar
)prefix
.length());
590 context
.append(prefix
).append(suffix
);
591 unsafeBackwardSet
.addAll(suffix
);
593 // invariant: context > cond->context
594 int32_t next
= cond
->next
;
596 // Append a new ConditionalCE32 after cond.
597 int32_t index
= addConditionalCE32(context
, ce32
, errorCode
);
598 if(U_FAILURE(errorCode
)) { return; }
602 ConditionalCE32
*nextCond
= getConditionalCE32(next
);
603 int8_t cmp
= context
.compare(nextCond
->context
);
605 // Insert a new ConditionalCE32 between cond and nextCond.
606 int32_t index
= addConditionalCE32(context
, ce32
, errorCode
);
607 if(U_FAILURE(errorCode
)) { return; }
609 getConditionalCE32(index
)->next
= next
;
611 } else if(cmp
== 0) {
612 // Same context as before, overwrite its ce32.
613 nextCond
->ce32
= ce32
;
623 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce
) {
624 uint32_t p
= (uint32_t)(ce
>> 32);
625 uint32_t lower32
= (uint32_t)ce
;
626 uint32_t t
= (uint32_t)(ce
& 0xffff);
627 U_ASSERT((t
& 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s.
628 if((ce
& INT64_C(0xffff00ff00ff)) == 0) {
629 // normal form ppppsstt
630 return p
| (lower32
>> 16) | (t
>> 8);
631 } else if((ce
& INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE
) {
632 // long-primary form ppppppC1
633 return Collation::makeLongPrimaryCE32(p
);
634 } else if(p
== 0 && (t
& 0xff) == 0) {
635 // long-secondary form ssssttC2
636 return Collation::makeLongSecondaryCE32(lower32
);
638 return Collation::NO_CE32
;
642 CollationDataBuilder::encodeOneCE(int64_t ce
, UErrorCode
&errorCode
) {
643 // Try to encode one CE as one CE32.
644 uint32_t ce32
= encodeOneCEAsCE32(ce
);
645 if(ce32
!= Collation::NO_CE32
) { return ce32
; }
646 int32_t index
= addCE(ce
, errorCode
);
647 if(U_FAILURE(errorCode
)) { return 0; }
648 if(index
> Collation::MAX_INDEX
) {
649 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
652 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, index
, 1);
656 CollationDataBuilder::encodeCEs(const int64_t ces
[], int32_t cesLength
,
657 UErrorCode
&errorCode
) {
658 if(U_FAILURE(errorCode
)) { return 0; }
659 if(cesLength
< 0 || cesLength
> Collation::MAX_EXPANSION_LENGTH
) {
660 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
663 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
664 errorCode
= U_INVALID_STATE_ERROR
;
668 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
669 // Do this here so that callers need not do it.
670 return encodeOneCEAsCE32(0);
671 } else if(cesLength
== 1) {
672 return encodeOneCE(ces
[0], errorCode
);
673 } else if(cesLength
== 2) {
674 // Try to encode two CEs as one CE32.
675 int64_t ce0
= ces
[0];
676 int64_t ce1
= ces
[1];
677 uint32_t p0
= (uint32_t)(ce0
>> 32);
678 if((ce0
& INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE
&&
679 (ce1
& INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE
&&
681 // Latin mini expansion
684 (((uint32_t)ce0
& 0xff00u
) << 8) |
685 (uint32_t)(ce1
>> 16) |
686 Collation::SPECIAL_CE32_LOW_BYTE
|
687 Collation::LATIN_EXPANSION_TAG
;
690 // Try to encode two or more CEs as CE32s.
691 int32_t newCE32s
[Collation::MAX_EXPANSION_LENGTH
];
692 for(int32_t i
= 0;; ++i
) {
694 return encodeExpansion32(newCE32s
, cesLength
, errorCode
);
696 uint32_t ce32
= encodeOneCEAsCE32(ces
[i
]);
697 if(ce32
== Collation::NO_CE32
) { break; }
698 newCE32s
[i
] = (int32_t)ce32
;
700 return encodeExpansion(ces
, cesLength
, errorCode
);
704 CollationDataBuilder::encodeExpansion(const int64_t ces
[], int32_t length
, UErrorCode
&errorCode
) {
705 if(U_FAILURE(errorCode
)) { return 0; }
706 // See if this sequence of CEs has already been stored.
707 int64_t first
= ces
[0];
708 int32_t ce64sMax
= ce64s
.size() - length
;
709 for(int32_t i
= 0; i
<= ce64sMax
; ++i
) {
710 if(first
== ce64s
.elementAti(i
)) {
711 if(i
> Collation::MAX_INDEX
) {
712 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
715 for(int32_t j
= 1;; ++j
) {
717 return Collation::makeCE32FromTagIndexAndLength(
718 Collation::EXPANSION_TAG
, i
, length
);
720 if(ce64s
.elementAti(i
+ j
) != ces
[j
]) { break; }
724 // Store the new sequence.
725 int32_t i
= ce64s
.size();
726 if(i
> Collation::MAX_INDEX
) {
727 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
730 for(int32_t j
= 0; j
< length
; ++j
) {
731 ce64s
.addElement(ces
[j
], errorCode
);
733 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, i
, length
);
737 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s
[], int32_t length
,
738 UErrorCode
&errorCode
) {
739 if(U_FAILURE(errorCode
)) { return 0; }
740 // See if this sequence of CE32s has already been stored.
741 int32_t first
= newCE32s
[0];
742 int32_t ce32sMax
= ce32s
.size() - length
;
743 for(int32_t i
= 0; i
<= ce32sMax
; ++i
) {
744 if(first
== ce32s
.elementAti(i
)) {
745 if(i
> Collation::MAX_INDEX
) {
746 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
749 for(int32_t j
= 1;; ++j
) {
751 return Collation::makeCE32FromTagIndexAndLength(
752 Collation::EXPANSION32_TAG
, i
, length
);
754 if(ce32s
.elementAti(i
+ j
) != newCE32s
[j
]) { break; }
758 // Store the new sequence.
759 int32_t i
= ce32s
.size();
760 if(i
> Collation::MAX_INDEX
) {
761 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
764 for(int32_t j
= 0; j
< length
; ++j
) {
765 ce32s
.addElement(newCE32s
[j
], errorCode
);
767 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG
, i
, length
);
771 CollationDataBuilder::copyFromBaseCE32(UChar32 c
, uint32_t ce32
, UBool withContext
,
772 UErrorCode
&errorCode
) {
773 if(U_FAILURE(errorCode
)) { return 0; }
774 if(!Collation::isSpecialCE32(ce32
)) { return ce32
; }
775 switch(Collation::tagFromCE32(ce32
)) {
776 case Collation::LONG_PRIMARY_TAG
:
777 case Collation::LONG_SECONDARY_TAG
:
778 case Collation::LATIN_EXPANSION_TAG
:
781 case Collation::EXPANSION32_TAG
: {
782 const uint32_t *baseCE32s
= base
->ce32s
+ Collation::indexFromCE32(ce32
);
783 int32_t length
= Collation::lengthFromCE32(ce32
);
784 ce32
= encodeExpansion32(
785 reinterpret_cast<const int32_t *>(baseCE32s
), length
, errorCode
);
788 case Collation::EXPANSION_TAG
: {
789 const int64_t *baseCEs
= base
->ces
+ Collation::indexFromCE32(ce32
);
790 int32_t length
= Collation::lengthFromCE32(ce32
);
791 ce32
= encodeExpansion(baseCEs
, length
, errorCode
);
794 case Collation::PREFIX_TAG
: {
795 // Flatten prefixes and nested suffixes (contractions)
796 // into a linear list of ConditionalCE32.
797 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
798 ce32
= CollationData::readCE32(p
); // Default if no prefix match.
800 return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
);
802 ConditionalCE32 head
;
803 UnicodeString
context((UChar
)0);
805 if(Collation::isContractionCE32(ce32
)) {
806 index
= copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
);
808 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
809 head
.next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
811 if(U_FAILURE(errorCode
)) { return 0; }
812 ConditionalCE32
*cond
= getConditionalCE32(index
); // the last ConditionalCE32 so far
813 UCharsTrie::Iterator
prefixes(p
+ 2, 0, errorCode
);
814 while(prefixes
.next(errorCode
)) {
815 context
= prefixes
.getString();
817 context
.insert(0, (UChar
)context
.length());
818 ce32
= (uint32_t)prefixes
.getValue();
819 if(Collation::isContractionCE32(ce32
)) {
820 index
= copyContractionsFromBaseCE32(context
, c
, ce32
, cond
, errorCode
);
822 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
823 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
825 if(U_FAILURE(errorCode
)) { return 0; }
826 cond
= getConditionalCE32(index
);
828 ce32
= makeBuilderContextCE32(head
.next
);
832 case Collation::CONTRACTION_TAG
: {
834 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
835 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
836 return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
);
838 ConditionalCE32 head
;
839 UnicodeString
context((UChar
)0);
840 copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
);
841 ce32
= makeBuilderContextCE32(head
.next
);
845 case Collation::HANGUL_TAG
:
846 errorCode
= U_UNSUPPORTED_ERROR
; // We forbid tailoring of Hangul syllables.
848 case Collation::OFFSET_TAG
:
849 ce32
= getCE32FromOffsetCE32(TRUE
, c
, ce32
);
851 case Collation::IMPLICIT_TAG
:
852 ce32
= encodeOneCE(Collation::unassignedCEFromCodePoint(c
), errorCode
);
855 U_ASSERT(FALSE
); // require ce32 == base->getFinalCE32(ce32)
862 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString
&context
, UChar32 c
, uint32_t ce32
,
863 ConditionalCE32
*cond
, UErrorCode
&errorCode
) {
864 if(U_FAILURE(errorCode
)) { return 0; }
865 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
867 if((ce32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
868 // No match on the single code point.
869 // We are underneath a prefix, and the default mapping is just
870 // a fallback to the mappings for a shorter prefix.
871 U_ASSERT(context
.length() > 1);
874 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
875 U_ASSERT(!Collation::isContractionCE32(ce32
));
876 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
877 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
878 if(U_FAILURE(errorCode
)) { return 0; }
879 cond
= getConditionalCE32(index
);
882 int32_t suffixStart
= context
.length();
883 UCharsTrie::Iterator
suffixes(p
+ 2, 0, errorCode
);
884 while(suffixes
.next(errorCode
)) {
885 context
.append(suffixes
.getString());
886 ce32
= copyFromBaseCE32(c
, (uint32_t)suffixes
.getValue(), TRUE
, errorCode
);
887 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
888 if(U_FAILURE(errorCode
)) { return 0; }
889 // No need to update the unsafeBackwardSet because the tailoring set
890 // is already a copy of the base set.
891 cond
= getConditionalCE32(index
);
892 context
.truncate(suffixStart
);
894 U_ASSERT(index
>= 0);
900 CopyHelper(const CollationDataBuilder
&s
, CollationDataBuilder
&d
,
901 const CollationDataBuilder::CEModifier
&m
, UErrorCode
&initialErrorCode
)
902 : src(s
), dest(d
), modifier(m
),
903 errorCode(initialErrorCode
) {}
905 UBool
copyRangeCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) {
906 ce32
= copyCE32(ce32
);
907 utrie2_setRange32(dest
.trie
, start
, end
, ce32
, TRUE
, &errorCode
);
908 if(CollationDataBuilder::isBuilderContextCE32(ce32
)) {
909 dest
.contextChars
.add(start
, end
);
911 return U_SUCCESS(errorCode
);
914 uint32_t copyCE32(uint32_t ce32
) {
915 if(!Collation::isSpecialCE32(ce32
)) {
916 int64_t ce
= modifier
.modifyCE32(ce32
);
917 if(ce
!= Collation::NO_CE
) {
918 ce32
= dest
.encodeOneCE(ce
, errorCode
);
921 int32_t tag
= Collation::tagFromCE32(ce32
);
922 if(tag
== Collation::EXPANSION32_TAG
) {
923 const uint32_t *srcCE32s
= reinterpret_cast<uint32_t *>(src
.ce32s
.getBuffer());
924 srcCE32s
+= Collation::indexFromCE32(ce32
);
925 int32_t length
= Collation::lengthFromCE32(ce32
);
926 // Inspect the source CE32s. Just copy them if none are modified.
927 // Otherwise copy to modifiedCEs, with modifications.
928 UBool isModified
= FALSE
;
929 for(int32_t i
= 0; i
< length
; ++i
) {
932 if(Collation::isSpecialCE32(ce32
) ||
933 (ce
= modifier
.modifyCE32(ce32
)) == Collation::NO_CE
) {
935 modifiedCEs
[i
] = Collation::ceFromCE32(ce32
);
939 for(int32_t j
= 0; j
< i
; ++j
) {
940 modifiedCEs
[j
] = Collation::ceFromCE32(srcCE32s
[j
]);
948 ce32
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
);
950 ce32
= dest
.encodeExpansion32(
951 reinterpret_cast<const int32_t *>(srcCE32s
), length
, errorCode
);
953 } else if(tag
== Collation::EXPANSION_TAG
) {
954 const int64_t *srcCEs
= src
.ce64s
.getBuffer();
955 srcCEs
+= Collation::indexFromCE32(ce32
);
956 int32_t length
= Collation::lengthFromCE32(ce32
);
957 // Inspect the source CEs. Just copy them if none are modified.
958 // Otherwise copy to modifiedCEs, with modifications.
959 UBool isModified
= FALSE
;
960 for(int32_t i
= 0; i
< length
; ++i
) {
961 int64_t srcCE
= srcCEs
[i
];
962 int64_t ce
= modifier
.modifyCE(srcCE
);
963 if(ce
== Collation::NO_CE
) {
965 modifiedCEs
[i
] = srcCE
;
969 for(int32_t j
= 0; j
< i
; ++j
) {
970 modifiedCEs
[j
] = srcCEs
[j
];
978 ce32
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
);
980 ce32
= dest
.encodeExpansion(srcCEs
, length
, errorCode
);
982 } else if(tag
== Collation::BUILDER_DATA_TAG
) {
983 // Copy the list of ConditionalCE32.
984 ConditionalCE32
*cond
= src
.getConditionalCE32ForCE32(ce32
);
985 U_ASSERT(!cond
->hasContext());
986 int32_t destIndex
= dest
.addConditionalCE32(
987 cond
->context
, copyCE32(cond
->ce32
), errorCode
);
988 ce32
= CollationDataBuilder::makeBuilderContextCE32(destIndex
);
989 while(cond
->next
>= 0) {
990 cond
= src
.getConditionalCE32(cond
->next
);
991 ConditionalCE32
*prevDestCond
= dest
.getConditionalCE32(destIndex
);
992 destIndex
= dest
.addConditionalCE32(
993 cond
->context
, copyCE32(cond
->ce32
), errorCode
);
994 int32_t suffixStart
= cond
->prefixLength() + 1;
995 dest
.unsafeBackwardSet
.addAll(cond
->context
.tempSubString(suffixStart
));
996 prevDestCond
->next
= destIndex
;
999 // Just copy long CEs and Latin mini expansions (and other expected values) as is,
1000 // assuming that the modifier would not modify them.
1001 U_ASSERT(tag
== Collation::LONG_PRIMARY_TAG
||
1002 tag
== Collation::LONG_SECONDARY_TAG
||
1003 tag
== Collation::LATIN_EXPANSION_TAG
||
1004 tag
== Collation::HANGUL_TAG
);
1010 const CollationDataBuilder
&src
;
1011 CollationDataBuilder
&dest
;
1012 const CollationDataBuilder::CEModifier
&modifier
;
1013 int64_t modifiedCEs
[Collation::MAX_EXPANSION_LENGTH
];
1014 UErrorCode errorCode
;
1019 static UBool U_CALLCONV
1020 enumRangeForCopy(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
1022 value
== Collation::UNASSIGNED_CE32
|| value
== Collation::FALLBACK_CE32
||
1023 ((CopyHelper
*)context
)->copyRangeCE32(start
, end
, value
);
1029 CollationDataBuilder::copyFrom(const CollationDataBuilder
&src
, const CEModifier
&modifier
,
1030 UErrorCode
&errorCode
) {
1031 if(U_FAILURE(errorCode
)) { return; }
1032 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
1033 errorCode
= U_INVALID_STATE_ERROR
;
1036 CopyHelper
helper(src
, *this, modifier
, errorCode
);
1037 utrie2_enum(src
.trie
, NULL
, enumRangeForCopy
, &helper
);
1038 errorCode
= helper
.errorCode
;
1039 // Update the contextChars and the unsafeBackwardSet while copying,
1040 // in case a character had conditional mappings in the source builder
1041 // and they were removed later.
1042 modified
|= src
.modified
;
1046 CollationDataBuilder::optimize(const UnicodeSet
&set
, UErrorCode
&errorCode
) {
1047 if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; }
1048 UnicodeSetIterator
iter(set
);
1049 while(iter
.next() && !iter
.isString()) {
1050 UChar32 c
= iter
.getCodepoint();
1051 uint32_t ce32
= utrie2_get32(trie
, c
);
1052 if(ce32
== Collation::FALLBACK_CE32
) {
1053 ce32
= base
->getFinalCE32(base
->getCE32(c
));
1054 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
1055 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1062 CollationDataBuilder::suppressContractions(const UnicodeSet
&set
, UErrorCode
&errorCode
) {
1063 if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; }
1064 UnicodeSetIterator
iter(set
);
1065 while(iter
.next() && !iter
.isString()) {
1066 UChar32 c
= iter
.getCodepoint();
1067 uint32_t ce32
= utrie2_get32(trie
, c
);
1068 if(ce32
== Collation::FALLBACK_CE32
) {
1069 ce32
= base
->getFinalCE32(base
->getCE32(c
));
1070 if(Collation::ce32HasContext(ce32
)) {
1071 ce32
= copyFromBaseCE32(c
, ce32
, FALSE
/* without context */, errorCode
);
1072 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1074 } else if(isBuilderContextCE32(ce32
)) {
1075 ce32
= getConditionalCE32ForCE32(ce32
)->ce32
;
1076 // Simply abandon the list of ConditionalCE32.
1077 // The caller will copy this builder in the end,
1078 // eliminating unreachable data.
1079 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1080 contextChars
.remove(c
);
1087 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s
[], UErrorCode
&errorCode
) {
1088 if(U_FAILURE(errorCode
)) { return FALSE
; }
1089 UBool anyJamoAssigned
= base
== NULL
; // always set jamoCE32s in the base data
1090 UBool needToCopyFromBase
= FALSE
;
1091 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) { // Count across Jamo types.
1092 UChar32 jamo
= jamoCpFromIndex(j
);
1093 UBool fromBase
= FALSE
;
1094 uint32_t ce32
= utrie2_get32(trie
, jamo
);
1095 anyJamoAssigned
|= Collation::isAssignedCE32(ce32
);
1096 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
1097 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
1098 if(ce32
== Collation::FALLBACK_CE32
) {
1100 ce32
= base
->getCE32(jamo
);
1102 if(Collation::isSpecialCE32(ce32
)) {
1103 switch(Collation::tagFromCE32(ce32
)) {
1104 case Collation::LONG_PRIMARY_TAG
:
1105 case Collation::LONG_SECONDARY_TAG
:
1106 case Collation::LATIN_EXPANSION_TAG
:
1107 // Copy the ce32 as-is.
1109 case Collation::EXPANSION32_TAG
:
1110 case Collation::EXPANSION_TAG
:
1111 case Collation::PREFIX_TAG
:
1112 case Collation::CONTRACTION_TAG
:
1114 // Defer copying until we know if anyJamoAssigned.
1115 ce32
= Collation::FALLBACK_CE32
;
1116 needToCopyFromBase
= TRUE
;
1119 case Collation::IMPLICIT_TAG
:
1120 // An unassigned Jamo should only occur in tests with incomplete bases.
1122 ce32
= Collation::FALLBACK_CE32
;
1123 needToCopyFromBase
= TRUE
;
1125 case Collation::OFFSET_TAG
:
1126 ce32
= getCE32FromOffsetCE32(fromBase
, jamo
, ce32
);
1128 case Collation::FALLBACK_TAG
:
1129 case Collation::RESERVED_TAG_3
:
1130 case Collation::BUILDER_DATA_TAG
:
1131 case Collation::DIGIT_TAG
:
1132 case Collation::U0000_TAG
:
1133 case Collation::HANGUL_TAG
:
1134 case Collation::LEAD_SURROGATE_TAG
:
1135 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
1139 jamoCE32s
[j
] = ce32
;
1141 if(anyJamoAssigned
&& needToCopyFromBase
) {
1142 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) {
1143 if(jamoCE32s
[j
] == Collation::FALLBACK_CE32
) {
1144 UChar32 jamo
= jamoCpFromIndex(j
);
1145 jamoCE32s
[j
] = copyFromBaseCE32(jamo
, base
->getCE32(jamo
),
1146 /*withContext=*/ TRUE
, errorCode
);
1150 return anyJamoAssigned
&& U_SUCCESS(errorCode
);
1154 CollationDataBuilder::setDigitTags(UErrorCode
&errorCode
) {
1155 UnicodeSet
digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode
);
1156 if(U_FAILURE(errorCode
)) { return; }
1157 UnicodeSetIterator
iter(digits
);
1158 while(iter
.next()) {
1159 U_ASSERT(!iter
.isString());
1160 UChar32 c
= iter
.getCodepoint();
1161 uint32_t ce32
= utrie2_get32(trie
, c
);
1162 if(ce32
!= Collation::FALLBACK_CE32
&& ce32
!= Collation::UNASSIGNED_CE32
) {
1163 int32_t index
= addCE32(ce32
, errorCode
);
1164 if(U_FAILURE(errorCode
)) { return; }
1165 if(index
> Collation::MAX_INDEX
) {
1166 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1169 ce32
= Collation::makeCE32FromTagIndexAndLength(
1170 Collation::DIGIT_TAG
, index
, u_charDigitValue(c
));
1171 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1178 static UBool U_CALLCONV
1179 enumRangeLeadValue(const void *context
, UChar32
/*start*/, UChar32
/*end*/, uint32_t value
) {
1180 int32_t *pValue
= (int32_t *)context
;
1181 if(value
== Collation::UNASSIGNED_CE32
) {
1182 value
= Collation::LEAD_ALL_UNASSIGNED
;
1183 } else if(value
== Collation::FALLBACK_CE32
) {
1184 value
= Collation::LEAD_ALL_FALLBACK
;
1186 *pValue
= Collation::LEAD_MIXED
;
1190 *pValue
= (int32_t)value
;
1191 } else if(*pValue
!= (int32_t)value
) {
1192 *pValue
= Collation::LEAD_MIXED
;
1201 CollationDataBuilder::setLeadSurrogates(UErrorCode
&errorCode
) {
1202 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
) {
1204 utrie2_enumForLeadSurrogate(trie
, lead
, NULL
, enumRangeLeadValue
, &value
);
1205 utrie2_set32ForLeadSurrogateCodeUnit(
1207 Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG
, 0) | (uint32_t)value
,
1213 CollationDataBuilder::build(CollationData
&data
, UErrorCode
&errorCode
) {
1214 buildMappings(data
, errorCode
);
1216 data
.numericPrimary
= base
->numericPrimary
;
1217 data
.compressibleBytes
= base
->compressibleBytes
;
1218 data
.numScripts
= base
->numScripts
;
1219 data
.scriptsIndex
= base
->scriptsIndex
;
1220 data
.scriptStarts
= base
->scriptStarts
;
1221 data
.scriptStartsLength
= base
->scriptStartsLength
;
1223 buildFastLatinTable(data
, errorCode
);
1227 CollationDataBuilder::buildMappings(CollationData
&data
, UErrorCode
&errorCode
) {
1228 if(U_FAILURE(errorCode
)) { return; }
1229 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
1230 errorCode
= U_INVALID_STATE_ERROR
;
1234 buildContexts(errorCode
);
1236 uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
];
1237 int32_t jamoIndex
= -1;
1238 if(getJamoCE32s(jamoCE32s
, errorCode
)) {
1239 jamoIndex
= ce32s
.size();
1240 for(int32_t i
= 0; i
< CollationData::JAMO_CE32S_LENGTH
; ++i
) {
1241 ce32s
.addElement((int32_t)jamoCE32s
[i
], errorCode
);
1243 // Small optimization: Use a bit in the Hangul ce32
1244 // to indicate that none of the Jamo CE32s are isSpecialCE32()
1245 // (as it should be in the root collator).
1246 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1247 // In order to still have good trie compression and keep this code simple,
1248 // we only set this flag if a whole block of 588 Hangul syllables starting with
1249 // a common leading consonant (Jamo L) has this property.
1250 UBool isAnyJamoVTSpecial
= FALSE
;
1251 for(int32_t i
= Hangul::JAMO_L_COUNT
; i
< CollationData::JAMO_CE32S_LENGTH
; ++i
) {
1252 if(Collation::isSpecialCE32(jamoCE32s
[i
])) {
1253 isAnyJamoVTSpecial
= TRUE
;
1257 uint32_t hangulCE32
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0);
1258 UChar32 c
= Hangul::HANGUL_BASE
;
1259 for(int32_t i
= 0; i
< Hangul::JAMO_L_COUNT
; ++i
) { // iterate over the Jamo L
1260 uint32_t ce32
= hangulCE32
;
1261 if(!isAnyJamoVTSpecial
&& !Collation::isSpecialCE32(jamoCE32s
[i
])) {
1262 ce32
|= Collation::HANGUL_NO_SPECIAL_JAMO
;
1264 UChar32 limit
= c
+ Hangul::JAMO_VT_COUNT
;
1265 utrie2_setRange32(trie
, c
, limit
- 1, ce32
, TRUE
, &errorCode
);
1269 // Copy the Hangul CE32s from the base in blocks per Jamo L,
1270 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1271 for(UChar32 c
= Hangul::HANGUL_BASE
; c
< Hangul::HANGUL_LIMIT
;) {
1272 uint32_t ce32
= base
->getCE32(c
);
1273 U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::HANGUL_TAG
));
1274 UChar32 limit
= c
+ Hangul::JAMO_VT_COUNT
;
1275 utrie2_setRange32(trie
, c
, limit
- 1, ce32
, TRUE
, &errorCode
);
1280 setDigitTags(errorCode
);
1281 setLeadSurrogates(errorCode
);
1283 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1284 ce32s
.setElementAt((int32_t)utrie2_get32(trie
, 0), 0);
1285 utrie2_set32(trie
, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG
, 0), &errorCode
);
1287 utrie2_freeze(trie
, UTRIE2_32_VALUE_BITS
, &errorCode
);
1288 if(U_FAILURE(errorCode
)) { return; }
1290 // Mark each lead surrogate as "unsafe"
1291 // if any of its 1024 associated supplementary code points is "unsafe".
1292 UChar32 c
= 0x10000;
1293 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
, c
+= 0x400) {
1294 if(unsafeBackwardSet
.containsSome(c
, c
+ 0x3ff)) {
1295 unsafeBackwardSet
.add(lead
);
1298 unsafeBackwardSet
.freeze();
1301 data
.ce32s
= reinterpret_cast<const uint32_t *>(ce32s
.getBuffer());
1302 data
.ces
= ce64s
.getBuffer();
1303 data
.contexts
= contexts
.getBuffer();
1305 data
.ce32sLength
= ce32s
.size();
1306 data
.cesLength
= ce64s
.size();
1307 data
.contextsLength
= contexts
.length();
1310 if(jamoIndex
>= 0) {
1311 data
.jamoCE32s
= data
.ce32s
+ jamoIndex
;
1313 data
.jamoCE32s
= base
->jamoCE32s
;
1315 data
.unsafeBackwardSet
= &unsafeBackwardSet
;
1319 CollationDataBuilder::clearContexts() {
1321 UnicodeSetIterator
iter(contextChars
);
1322 while(iter
.next()) {
1323 U_ASSERT(!iter
.isString());
1324 uint32_t ce32
= utrie2_get32(trie
, iter
.getCodepoint());
1325 U_ASSERT(isBuilderContextCE32(ce32
));
1326 getConditionalCE32ForCE32(ce32
)->builtCE32
= Collation::NO_CE32
;
1331 CollationDataBuilder::buildContexts(UErrorCode
&errorCode
) {
1332 if(U_FAILURE(errorCode
)) { return; }
1333 // Ignore abandoned lists and the cached builtCE32,
1334 // and build all contexts from scratch.
1336 UnicodeSetIterator
iter(contextChars
);
1337 while(U_SUCCESS(errorCode
) && iter
.next()) {
1338 U_ASSERT(!iter
.isString());
1339 UChar32 c
= iter
.getCodepoint();
1340 uint32_t ce32
= utrie2_get32(trie
, c
);
1341 if(!isBuilderContextCE32(ce32
)) {
1342 // Impossible: No context data for c in contextChars.
1343 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
1346 ConditionalCE32
*cond
= getConditionalCE32ForCE32(ce32
);
1347 ce32
= buildContext(cond
, errorCode
);
1348 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1353 CollationDataBuilder::buildContext(ConditionalCE32
*head
, UErrorCode
&errorCode
) {
1354 if(U_FAILURE(errorCode
)) { return 0; }
1355 // The list head must have no context.
1356 U_ASSERT(!head
->hasContext());
1357 // The list head must be followed by one or more nodes that all do have context.
1358 U_ASSERT(head
->next
>= 0);
1359 UCharsTrieBuilder
prefixBuilder(errorCode
);
1360 UCharsTrieBuilder
contractionBuilder(errorCode
);
1361 for(ConditionalCE32
*cond
= head
;; cond
= getConditionalCE32(cond
->next
)) {
1362 // After the list head, the prefix or suffix can be empty, but not both.
1363 U_ASSERT(cond
== head
|| cond
->hasContext());
1364 int32_t prefixLength
= cond
->prefixLength();
1365 UnicodeString
prefix(cond
->context
, 0, prefixLength
+ 1);
1366 // Collect all contraction suffixes for one prefix.
1367 ConditionalCE32
*firstCond
= cond
;
1368 ConditionalCE32
*lastCond
= cond
;
1369 while(cond
->next
>= 0 &&
1370 (cond
= getConditionalCE32(cond
->next
))->context
.startsWith(prefix
)) {
1374 int32_t suffixStart
= prefixLength
+ 1; // == prefix.length()
1375 if(lastCond
->context
.length() == suffixStart
) {
1376 // One prefix without contraction suffix.
1377 U_ASSERT(firstCond
== lastCond
);
1378 ce32
= lastCond
->ce32
;
1381 // Build the contractions trie.
1382 contractionBuilder
.clear();
1383 // Entry for an empty suffix, to be stored before the trie.
1384 uint32_t emptySuffixCE32
= 0;
1386 if(firstCond
->context
.length() == suffixStart
) {
1387 // There is a mapping for the prefix and the single character c. (p|c)
1388 // If no other suffix matches, then we return this value.
1389 emptySuffixCE32
= firstCond
->ce32
;
1390 cond
= getConditionalCE32(firstCond
->next
);
1392 // There is no mapping for the prefix and just the single character.
1393 // (There is no p|c, only p|cd, p|ce etc.)
1394 flags
|= Collation::CONTRACT_SINGLE_CP_NO_MATCH
;
1395 // When the prefix matches but none of the prefix-specific suffixes,
1396 // then we fall back to the mappings with the next-longest prefix,
1397 // and ultimately to mappings with no prefix.
1398 // Each fallback might be another set of contractions.
1399 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1400 // then in text "pch" we find the ch contraction.
1401 for(cond
= head
;; cond
= getConditionalCE32(cond
->next
)) {
1402 int32_t length
= cond
->prefixLength();
1403 if(length
== prefixLength
) { break; }
1404 if(cond
->defaultCE32
!= Collation::NO_CE32
&&
1405 (length
==0 || prefix
.endsWith(cond
->context
, 1, length
))) {
1406 emptySuffixCE32
= cond
->defaultCE32
;
1411 // Optimization: Set a flag when
1412 // the first character of every contraction suffix has lccc!=0.
1413 // Short-circuits contraction matching when a normal letter follows.
1414 flags
|= Collation::CONTRACT_NEXT_CCC
;
1415 // Add all of the non-empty suffixes into the contraction trie.
1417 UnicodeString
suffix(cond
->context
, suffixStart
);
1418 uint16_t fcd16
= nfcImpl
.getFCD16(suffix
.char32At(0));
1420 flags
&= ~Collation::CONTRACT_NEXT_CCC
;
1422 fcd16
= nfcImpl
.getFCD16(suffix
.char32At(suffix
.length() - 1));
1424 // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1425 flags
|= Collation::CONTRACT_TRAILING_CCC
;
1427 contractionBuilder
.add(suffix
, (int32_t)cond
->ce32
, errorCode
);
1428 if(cond
== lastCond
) { break; }
1429 cond
= getConditionalCE32(cond
->next
);
1431 int32_t index
= addContextTrie(emptySuffixCE32
, contractionBuilder
, errorCode
);
1432 if(U_FAILURE(errorCode
)) { return 0; }
1433 if(index
> Collation::MAX_INDEX
) {
1434 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1437 ce32
= Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG
, index
) | flags
;
1439 U_ASSERT(cond
== lastCond
);
1440 firstCond
->defaultCE32
= ce32
;
1441 if(prefixLength
== 0) {
1442 if(cond
->next
< 0) {
1443 // No non-empty prefixes, only contractions.
1447 prefix
.remove(0, 1); // Remove the length unit.
1449 prefixBuilder
.add(prefix
, (int32_t)ce32
, errorCode
);
1450 if(cond
->next
< 0) { break; }
1453 U_ASSERT(head
->defaultCE32
!= Collation::NO_CE32
);
1454 int32_t index
= addContextTrie(head
->defaultCE32
, prefixBuilder
, errorCode
);
1455 if(U_FAILURE(errorCode
)) { return 0; }
1456 if(index
> Collation::MAX_INDEX
) {
1457 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1460 return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG
, index
);
1464 CollationDataBuilder::addContextTrie(uint32_t defaultCE32
, UCharsTrieBuilder
&trieBuilder
,
1465 UErrorCode
&errorCode
) {
1466 UnicodeString context
;
1467 context
.append((UChar
)(defaultCE32
>> 16)).append((UChar
)defaultCE32
);
1468 UnicodeString trieString
;
1469 context
.append(trieBuilder
.buildUnicodeString(USTRINGTRIE_BUILD_SMALL
, trieString
, errorCode
));
1470 if(U_FAILURE(errorCode
)) { return -1; }
1471 int32_t index
= contexts
.indexOf(context
);
1473 index
= contexts
.length();
1474 contexts
.append(context
);
1480 CollationDataBuilder::buildFastLatinTable(CollationData
&data
, UErrorCode
&errorCode
) {
1481 if(U_FAILURE(errorCode
) || !fastLatinEnabled
) { return; }
1483 delete fastLatinBuilder
;
1484 fastLatinBuilder
= new CollationFastLatinBuilder(errorCode
);
1485 if(fastLatinBuilder
== NULL
) {
1486 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
1489 if(fastLatinBuilder
->forData(data
, errorCode
)) {
1490 const uint16_t *table
= fastLatinBuilder
->getTable();
1491 int32_t length
= fastLatinBuilder
->lengthOfTable();
1492 if(base
!= NULL
&& length
== base
->fastLatinTableLength
&&
1493 uprv_memcmp(table
, base
->fastLatinTable
, length
* 2) == 0) {
1494 // Same fast Latin table as in the base, use that one instead.
1495 delete fastLatinBuilder
;
1496 fastLatinBuilder
= NULL
;
1497 table
= base
->fastLatinTable
;
1499 data
.fastLatinTable
= table
;
1500 data
.fastLatinTableLength
= length
;
1502 delete fastLatinBuilder
;
1503 fastLatinBuilder
= NULL
;
1508 CollationDataBuilder::getCEs(const UnicodeString
&s
, int64_t ces
[], int32_t cesLength
) {
1509 return getCEs(s
, 0, ces
, cesLength
);
1513 CollationDataBuilder::getCEs(const UnicodeString
&prefix
, const UnicodeString
&s
,
1514 int64_t ces
[], int32_t cesLength
) {
1515 int32_t prefixLength
= prefix
.length();
1516 if(prefixLength
== 0) {
1517 return getCEs(s
, 0, ces
, cesLength
);
1519 return getCEs(prefix
+ s
, prefixLength
, ces
, cesLength
);
1524 CollationDataBuilder::getCEs(const UnicodeString
&s
, int32_t start
,
1525 int64_t ces
[], int32_t cesLength
) {
1526 if(collIter
== NULL
) {
1527 collIter
= new DataBuilderCollationIterator(*this);
1528 if(collIter
== NULL
) { return 0; }
1530 return collIter
->fetchCEs(s
, start
, ces
, cesLength
);
1535 #endif // !UCONFIG_NO_COLLATION