/*
*******************************************************************************
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatabuilder.cpp
*
* (replaced the former ucol_elm.cpp)
*
* created on: 2012apr01
* created by: Markus W. Scherer
*/
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/localpointer.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ucharstrie.h"
21 #include "unicode/ucharstriebuilder.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/usetiter.h"
25 #include "unicode/utf16.h"
27 #include "collation.h"
28 #include "collationdata.h"
29 #include "collationdatabuilder.h"
30 #include "collationfastlatinbuilder.h"
31 #include "collationiterator.h"
32 #include "normalizer2impl.h"
40 CollationDataBuilder::CEModifier::~CEModifier() {}
/**
 * Build-time context and CE32 for a code point.
 * If a code point has contextual mappings, then the default (no-context) mapping
 * and all conditional mappings are stored in a singly-linked list
 * of ConditionalCE32, sorted by context strings.
 *
 * Context strings sort by prefix length, then by prefix, then by contraction suffix.
 * Context strings must be unique and in ascending order.
 */
51 struct ConditionalCE32
: public UMemory
{
54 ce32(0), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
),
56 ConditionalCE32(const UnicodeString
&ct
, uint32_t ce
)
58 ce32(ce
), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
),
61 inline UBool
hasContext() const { return context
.length() > 1; }
62 inline int32_t prefixLength() const { return context
.charAt(0); }
65 * "\0" for the first entry for any code point, with its default CE32.
67 * Otherwise one unit with the length of the prefix string,
68 * then the prefix string, then the contraction suffix.
70 UnicodeString context
;
72 * CE32 for the code point and its context.
73 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
77 * Default CE32 for all contexts with this same prefix.
78 * Initially NO_CE32. Set only while building runtime data structures,
79 * and only on one of the nodes of a sub-list with the same prefix.
83 * CE32 for the built contexts.
84 * When fetching CEs from the builder, the contexts are built into their runtime form
85 * so that the normal collation implementation can process them.
86 * The result is cached in the list head. It is reset when the contexts are modified.
90 * Index of the next ConditionalCE32.
91 * Negative for the end of the list.
98 U_CAPI
void U_CALLCONV
99 uprv_deleteConditionalCE32(void *obj
) {
100 delete static_cast<ConditionalCE32
*>(obj
);
/**
 * Build-time collation element and character iterator.
 * Uses the runtime CollationIterator for fetching CEs for a string
 * but reads from the builder's unfinished data structures.
 * In particular, this class reads from the unfinished trie
 * and has to avoid CollationIterator::nextCE() and redirect other
 * calls to data->getCE32() and data->getCE32FromSupplementary().
 *
 * We do this so that we need not implement the collation algorithm
 * again for the builder and make it behave exactly like the runtime code.
 * That would be more difficult to test and maintain than this indirection.
 *
 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
 * so the data accesses from those code paths need not be modified.
 *
 * This class iterates directly over whole code points
 * so that the CollationIterator does not need the finished trie
 * for handling the LEAD_SURROGATE_TAG.
 */
124 class DataBuilderCollationIterator
: public CollationIterator
{
126 DataBuilderCollationIterator(CollationDataBuilder
&b
);
128 virtual ~DataBuilderCollationIterator();
130 int32_t fetchCEs(const UnicodeString
&str
, int32_t start
, int64_t ces
[], int32_t cesLength
);
132 virtual void resetToOffset(int32_t newOffset
);
133 virtual int32_t getOffset() const;
135 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
136 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
139 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
140 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
142 virtual uint32_t getDataCE32(UChar32 c
) const;
143 virtual uint32_t getCE32FromBuilderData(uint32_t ce32
, UErrorCode
&errorCode
);
145 CollationDataBuilder
&builder
;
146 CollationData builderData
;
147 uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
];
148 const UnicodeString
*s
;
152 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder
&b
)
153 : CollationIterator(&builderData
, /*numeric=*/ FALSE
),
154 builder(b
), builderData(b
.nfcImpl
),
156 builderData
.base
= builder
.base
;
157 // Set all of the jamoCE32s[] to indirection CE32s.
158 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) { // Count across Jamo types.
159 UChar32 jamo
= CollationDataBuilder::jamoCpFromIndex(j
);
160 jamoCE32s
[j
] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG
, jamo
) |
161 CollationDataBuilder::IS_BUILDER_JAMO_CE32
;
163 builderData
.jamoCE32s
= jamoCE32s
;
166 DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
169 DataBuilderCollationIterator::fetchCEs(const UnicodeString
&str
, int32_t start
,
170 int64_t ces
[], int32_t cesLength
) {
171 // Set the pointers each time, in case they changed due to reallocation.
172 builderData
.ce32s
= reinterpret_cast<const uint32_t *>(builder
.ce32s
.getBuffer());
173 builderData
.ces
= builder
.ce64s
.getBuffer();
174 builderData
.contexts
= builder
.contexts
.getBuffer();
175 // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
179 UErrorCode errorCode
= U_ZERO_ERROR
;
180 while(U_SUCCESS(errorCode
) && pos
< s
->length()) {
181 // No need to keep all CEs in the iterator buffer.
183 UChar32 c
= s
->char32At(pos
);
184 pos
+= U16_LENGTH(c
);
185 uint32_t ce32
= utrie2_get32(builder
.trie
, c
);
186 const CollationData
*d
;
187 if(ce32
== Collation::FALLBACK_CE32
) {
189 ce32
= builder
.base
->getCE32(c
);
193 appendCEsFromCE32(d
, c
, ce32
, /*forward=*/ TRUE
, errorCode
);
194 U_ASSERT(U_SUCCESS(errorCode
));
195 for(int32_t i
= 0; i
< getCEsLength(); ++i
) {
196 int64_t ce
= getCE(i
);
198 if(cesLength
< Collation::MAX_EXPANSION_LENGTH
) {
209 DataBuilderCollationIterator::resetToOffset(int32_t newOffset
) {
215 DataBuilderCollationIterator::getOffset() const {
220 DataBuilderCollationIterator::nextCodePoint(UErrorCode
& /*errorCode*/) {
221 if(pos
== s
->length()) {
224 UChar32 c
= s
->char32At(pos
);
225 pos
+= U16_LENGTH(c
);
230 DataBuilderCollationIterator::previousCodePoint(UErrorCode
& /*errorCode*/) {
234 UChar32 c
= s
->char32At(pos
- 1);
235 pos
-= U16_LENGTH(c
);
240 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
241 pos
= s
->moveIndex32(pos
, num
);
245 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
246 pos
= s
->moveIndex32(pos
, -num
);
250 DataBuilderCollationIterator::getDataCE32(UChar32 c
) const {
251 return utrie2_get32(builder
.trie
, c
);
255 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32
, UErrorCode
&errorCode
) {
256 U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::BUILDER_DATA_TAG
));
257 if((ce32
& CollationDataBuilder::IS_BUILDER_JAMO_CE32
) != 0) {
258 UChar32 jamo
= Collation::indexFromCE32(ce32
);
259 return utrie2_get32(builder
.trie
, jamo
);
261 ConditionalCE32
*cond
= builder
.getConditionalCE32ForCE32(ce32
);
262 if(cond
->builtCE32
== Collation::NO_CE32
) {
263 // Build the context-sensitive mappings into their runtime form and cache the result.
264 cond
->builtCE32
= builder
.buildContext(cond
, errorCode
);
265 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
266 errorCode
= U_ZERO_ERROR
;
267 builder
.clearContexts();
268 cond
->builtCE32
= builder
.buildContext(cond
, errorCode
);
270 builderData
.contexts
= builder
.contexts
.getBuffer();
272 return cond
->builtCE32
;
276 // ------------------------------------------------------------------------- ***
278 CollationDataBuilder::CollationDataBuilder(UErrorCode
&errorCode
)
279 : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode
)),
280 base(NULL
), baseSettings(NULL
),
282 ce32s(errorCode
), ce64s(errorCode
), conditionalCE32s(errorCode
),
284 fastLatinEnabled(FALSE
), fastLatinBuilder(NULL
),
286 // Reserve the first CE32 for U+0000.
287 ce32s
.addElement(0, errorCode
);
288 conditionalCE32s
.setDeleter(uprv_deleteConditionalCE32
);
291 CollationDataBuilder::~CollationDataBuilder() {
293 delete fastLatinBuilder
;
298 CollationDataBuilder::initForTailoring(const CollationData
*b
, UErrorCode
&errorCode
) {
299 if(U_FAILURE(errorCode
)) { return; }
301 errorCode
= U_INVALID_STATE_ERROR
;
305 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
310 // For a tailoring, the default is to fall back to the base.
311 trie
= utrie2_open(Collation::FALLBACK_CE32
, Collation::FFFD_CE32
, &errorCode
);
313 // Set the Latin-1 letters block so that it is allocated first in the data array,
314 // to try to improve locality of reference when sorting Latin-1 text.
315 // Do not use utrie2_setRange32() since that will not actually allocate blocks
316 // that are filled with the default value.
317 // ASCII (0..7F) is already preallocated anyway.
318 for(UChar32 c
= 0xc0; c
<= 0xff; ++c
) {
319 utrie2_set32(trie
, c
, Collation::FALLBACK_CE32
, &errorCode
);
322 // Hangul syllables are not tailorable (except via tailoring Jamos).
323 // Always set the Hangul tag to help performance.
324 // Do this here, rather than in buildMappings(),
325 // so that we see the HANGUL_TAG in various assertions.
326 uint32_t hangulCE32
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0);
327 utrie2_setRange32(trie
, Hangul::HANGUL_BASE
, Hangul::HANGUL_END
, hangulCE32
, TRUE
, &errorCode
);
329 // Copy the set contents but don't copy/clone the set as a whole because
330 // that would copy the isFrozen state too.
331 unsafeBackwardSet
.addAll(*b
->unsafeBackwardSet
);
333 if(U_FAILURE(errorCode
)) { return; }
337 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start
, UChar32 end
,
338 uint32_t primary
, int32_t step
,
339 UErrorCode
&errorCode
) {
340 if(U_FAILURE(errorCode
)) { return FALSE
; }
341 U_ASSERT(start
<= end
);
342 // TODO: Do we need to check what values are currently set for start..end?
343 // An offset range is worth it only if we can achieve an overlap between
344 // adjacent UTrie2 blocks of 32 code points each.
345 // An offset CE is also a little more expensive to look up and compute
347 // If the range spans at least three UTrie2 block boundaries (> 64 code points),
349 // If the range spans one or two block boundaries and there are
350 // at least 4 code points on either side, then we take it.
351 // (We could additionally require a minimum range length of, say, 16.)
352 int32_t blockDelta
= (end
>> 5) - (start
>> 5);
353 if(2 <= step
&& step
<= 0x7f &&
355 (blockDelta
> 0 && (start
& 0x1f) <= 0x1c && (end
& 0x1f) >= 3))) {
356 int64_t dataCE
= ((int64_t)primary
<< 32) | (start
<< 8) | step
;
357 if(isCompressiblePrimary(primary
)) { dataCE
|= 0x80; }
358 int32_t index
= addCE(dataCE
, errorCode
);
359 if(U_FAILURE(errorCode
)) { return 0; }
360 if(index
> Collation::MAX_INDEX
) {
361 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
364 uint32_t offsetCE32
= Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG
, index
);
365 utrie2_setRange32(trie
, start
, end
, offsetCE32
, TRUE
, &errorCode
);
374 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start
, UChar32 end
,
375 uint32_t primary
, int32_t step
,
376 UErrorCode
&errorCode
) {
377 if(U_FAILURE(errorCode
)) { return 0; }
378 UBool isCompressible
= isCompressiblePrimary(primary
);
379 if(maybeSetPrimaryRange(start
, end
, primary
, step
, errorCode
)) {
380 return Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
,
381 (end
- start
+ 1) * step
);
383 // Short range: Set individual CE32s.
385 utrie2_set32(trie
, start
, Collation::makeLongPrimaryCE32(primary
), &errorCode
);
387 primary
= Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
, step
);
388 if(start
> end
) { return primary
; }
395 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase
, UChar32 c
, uint32_t ce32
) const {
396 int32_t i
= Collation::indexFromCE32(ce32
);
397 int64_t dataCE
= fromBase
? base
->ces
[i
] : ce64s
.elementAti(i
);
398 uint32_t p
= Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
);
399 return Collation::makeLongPrimaryCE32(p
);
403 CollationDataBuilder::isCompressibleLeadByte(uint32_t b
) const {
404 return base
->isCompressibleLeadByte(b
);
408 CollationDataBuilder::isAssigned(UChar32 c
) const {
409 return Collation::isAssignedCE32(utrie2_get32(trie
, c
));
413 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c
) const {
414 uint32_t ce32
= utrie2_get32(trie
, c
);
415 if(Collation::isLongPrimaryCE32(ce32
)) {
416 return Collation::primaryFromLongPrimaryCE32(ce32
);
423 CollationDataBuilder::getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const {
424 if(U_FAILURE(errorCode
)) { return 0; }
425 // Keep parallel with CollationData::getSingleCE().
426 UBool fromBase
= FALSE
;
427 uint32_t ce32
= utrie2_get32(trie
, c
);
428 if(ce32
== Collation::FALLBACK_CE32
) {
430 ce32
= base
->getCE32(c
);
432 while(Collation::isSpecialCE32(ce32
)) {
433 switch(Collation::tagFromCE32(ce32
)) {
434 case Collation::LATIN_EXPANSION_TAG
:
435 case Collation::BUILDER_DATA_TAG
:
436 case Collation::PREFIX_TAG
:
437 case Collation::CONTRACTION_TAG
:
438 case Collation::HANGUL_TAG
:
439 case Collation::LEAD_SURROGATE_TAG
:
440 errorCode
= U_UNSUPPORTED_ERROR
;
442 case Collation::FALLBACK_TAG
:
443 case Collation::RESERVED_TAG_3
:
444 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
446 case Collation::LONG_PRIMARY_TAG
:
447 return Collation::ceFromLongPrimaryCE32(ce32
);
448 case Collation::LONG_SECONDARY_TAG
:
449 return Collation::ceFromLongSecondaryCE32(ce32
);
450 case Collation::EXPANSION32_TAG
:
451 if(Collation::lengthFromCE32(ce32
) == 1) {
452 int32_t i
= Collation::indexFromCE32(ce32
);
453 ce32
= fromBase
? base
->ce32s
[i
] : ce32s
.elementAti(i
);
456 errorCode
= U_UNSUPPORTED_ERROR
;
459 case Collation::EXPANSION_TAG
: {
460 if(Collation::lengthFromCE32(ce32
) == 1) {
461 int32_t i
= Collation::indexFromCE32(ce32
);
462 return fromBase
? base
->ces
[i
] : ce64s
.elementAti(i
);
464 errorCode
= U_UNSUPPORTED_ERROR
;
468 case Collation::DIGIT_TAG
:
469 // Fetch the non-numeric-collation CE32 and continue.
470 ce32
= ce32s
.elementAti(Collation::indexFromCE32(ce32
));
472 case Collation::U0000_TAG
:
474 // Fetch the normal ce32 for U+0000 and continue.
475 ce32
= fromBase
? base
->ce32s
[0] : ce32s
.elementAti(0);
477 case Collation::OFFSET_TAG
:
478 ce32
= getCE32FromOffsetCE32(fromBase
, c
, ce32
);
480 case Collation::IMPLICIT_TAG
:
481 return Collation::unassignedCEFromCodePoint(c
);
484 return Collation::ceFromSimpleCE32(ce32
);
488 CollationDataBuilder::addCE(int64_t ce
, UErrorCode
&errorCode
) {
489 int32_t length
= ce64s
.size();
490 for(int32_t i
= 0; i
< length
; ++i
) {
491 if(ce
== ce64s
.elementAti(i
)) { return i
; }
493 ce64s
.addElement(ce
, errorCode
);
498 CollationDataBuilder::addCE32(uint32_t ce32
, UErrorCode
&errorCode
) {
499 int32_t length
= ce32s
.size();
500 for(int32_t i
= 0; i
< length
; ++i
) {
501 if(ce32
== (uint32_t)ce32s
.elementAti(i
)) { return i
; }
503 ce32s
.addElement((int32_t)ce32
, errorCode
);
508 CollationDataBuilder::addConditionalCE32(const UnicodeString
&context
, uint32_t ce32
,
509 UErrorCode
&errorCode
) {
510 if(U_FAILURE(errorCode
)) { return -1; }
511 U_ASSERT(!context
.isEmpty());
512 int32_t index
= conditionalCE32s
.size();
513 if(index
> Collation::MAX_INDEX
) {
514 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
517 ConditionalCE32
*cond
= new ConditionalCE32(context
, ce32
);
519 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
522 conditionalCE32s
.addElement(cond
, errorCode
);
527 CollationDataBuilder::add(const UnicodeString
&prefix
, const UnicodeString
&s
,
528 const int64_t ces
[], int32_t cesLength
,
529 UErrorCode
&errorCode
) {
530 uint32_t ce32
= encodeCEs(ces
, cesLength
, errorCode
);
531 addCE32(prefix
, s
, ce32
, errorCode
);
535 CollationDataBuilder::addCE32(const UnicodeString
&prefix
, const UnicodeString
&s
,
536 uint32_t ce32
, UErrorCode
&errorCode
) {
537 if(U_FAILURE(errorCode
)) { return; }
539 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
542 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
543 errorCode
= U_INVALID_STATE_ERROR
;
546 UChar32 c
= s
.char32At(0);
547 int32_t cLength
= U16_LENGTH(c
);
548 uint32_t oldCE32
= utrie2_get32(trie
, c
);
549 UBool hasContext
= !prefix
.isEmpty() || s
.length() > cLength
;
550 if(oldCE32
== Collation::FALLBACK_CE32
) {
551 // First tailoring for c.
552 // If c has contextual base mappings or if we add a contextual mapping,
553 // then copy the base mappings.
554 // Otherwise we just override the base mapping.
555 uint32_t baseCE32
= base
->getFinalCE32(base
->getCE32(c
));
556 if(hasContext
|| Collation::ce32HasContext(baseCE32
)) {
557 oldCE32
= copyFromBaseCE32(c
, baseCE32
, TRUE
, errorCode
);
558 utrie2_set32(trie
, c
, oldCE32
, &errorCode
);
559 if(U_FAILURE(errorCode
)) { return; }
563 // No prefix, no contraction.
564 if(!isBuilderContextCE32(oldCE32
)) {
565 utrie2_set32(trie
, c
, ce32
, &errorCode
);
567 ConditionalCE32
*cond
= getConditionalCE32ForCE32(oldCE32
);
568 cond
->builtCE32
= Collation::NO_CE32
;
572 ConditionalCE32
*cond
;
573 if(!isBuilderContextCE32(oldCE32
)) {
574 // Replace the simple oldCE32 with a builder context CE32
575 // pointing to a new ConditionalCE32 list head.
576 int32_t index
= addConditionalCE32(UnicodeString((UChar
)0), oldCE32
, errorCode
);
577 if(U_FAILURE(errorCode
)) { return; }
578 uint32_t contextCE32
= makeBuilderContextCE32(index
);
579 utrie2_set32(trie
, c
, contextCE32
, &errorCode
);
581 cond
= getConditionalCE32(index
);
583 cond
= getConditionalCE32ForCE32(oldCE32
);
584 cond
->builtCE32
= Collation::NO_CE32
;
586 UnicodeString
suffix(s
, cLength
);
587 UnicodeString
context((UChar
)prefix
.length());
588 context
.append(prefix
).append(suffix
);
589 unsafeBackwardSet
.addAll(suffix
);
591 // invariant: context > cond->context
592 int32_t next
= cond
->next
;
594 // Append a new ConditionalCE32 after cond.
595 int32_t index
= addConditionalCE32(context
, ce32
, errorCode
);
596 if(U_FAILURE(errorCode
)) { return; }
600 ConditionalCE32
*nextCond
= getConditionalCE32(next
);
601 int8_t cmp
= context
.compare(nextCond
->context
);
603 // Insert a new ConditionalCE32 between cond and nextCond.
604 int32_t index
= addConditionalCE32(context
, ce32
, errorCode
);
605 if(U_FAILURE(errorCode
)) { return; }
607 getConditionalCE32(index
)->next
= next
;
609 } else if(cmp
== 0) {
610 // Same context as before, overwrite its ce32.
611 nextCond
->ce32
= ce32
;
621 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce
) {
622 uint32_t p
= (uint32_t)(ce
>> 32);
623 uint32_t lower32
= (uint32_t)ce
;
624 uint32_t t
= (uint32_t)(ce
& 0xffff);
625 U_ASSERT((t
& 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s.
626 if((ce
& INT64_C(0xffff00ff00ff)) == 0) {
627 // normal form ppppsstt
628 return p
| (lower32
>> 16) | (t
>> 8);
629 } else if((ce
& INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE
) {
630 // long-primary form ppppppC1
631 return Collation::makeLongPrimaryCE32(p
);
632 } else if(p
== 0 && (t
& 0xff) == 0) {
633 // long-secondary form ssssttC2
634 return Collation::makeLongSecondaryCE32(lower32
);
636 return Collation::NO_CE32
;
640 CollationDataBuilder::encodeOneCE(int64_t ce
, UErrorCode
&errorCode
) {
641 // Try to encode one CE as one CE32.
642 uint32_t ce32
= encodeOneCEAsCE32(ce
);
643 if(ce32
!= Collation::NO_CE32
) { return ce32
; }
644 int32_t index
= addCE(ce
, errorCode
);
645 if(U_FAILURE(errorCode
)) { return 0; }
646 if(index
> Collation::MAX_INDEX
) {
647 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
650 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, index
, 1);
654 CollationDataBuilder::encodeCEs(const int64_t ces
[], int32_t cesLength
,
655 UErrorCode
&errorCode
) {
656 if(U_FAILURE(errorCode
)) { return 0; }
657 if(cesLength
< 0 || cesLength
> Collation::MAX_EXPANSION_LENGTH
) {
658 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
661 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
662 errorCode
= U_INVALID_STATE_ERROR
;
666 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
667 // Do this here so that callers need not do it.
668 return encodeOneCEAsCE32(0);
669 } else if(cesLength
== 1) {
670 return encodeOneCE(ces
[0], errorCode
);
671 } else if(cesLength
== 2) {
672 // Try to encode two CEs as one CE32.
673 int64_t ce0
= ces
[0];
674 int64_t ce1
= ces
[1];
675 uint32_t p0
= (uint32_t)(ce0
>> 32);
676 if((ce0
& INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE
&&
677 (ce1
& INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE
&&
679 // Latin mini expansion
682 (((uint32_t)ce0
& 0xff00u
) << 8) |
683 (uint32_t)(ce1
>> 16) |
684 Collation::SPECIAL_CE32_LOW_BYTE
|
685 Collation::LATIN_EXPANSION_TAG
;
688 // Try to encode two or more CEs as CE32s.
689 int32_t newCE32s
[Collation::MAX_EXPANSION_LENGTH
];
690 for(int32_t i
= 0;; ++i
) {
692 return encodeExpansion32(newCE32s
, cesLength
, errorCode
);
694 uint32_t ce32
= encodeOneCEAsCE32(ces
[i
]);
695 if(ce32
== Collation::NO_CE32
) { break; }
696 newCE32s
[i
] = (int32_t)ce32
;
698 return encodeExpansion(ces
, cesLength
, errorCode
);
702 CollationDataBuilder::encodeExpansion(const int64_t ces
[], int32_t length
, UErrorCode
&errorCode
) {
703 if(U_FAILURE(errorCode
)) { return 0; }
704 // See if this sequence of CEs has already been stored.
705 int64_t first
= ces
[0];
706 int32_t ce64sMax
= ce64s
.size() - length
;
707 for(int32_t i
= 0; i
<= ce64sMax
; ++i
) {
708 if(first
== ce64s
.elementAti(i
)) {
709 if(i
> Collation::MAX_INDEX
) {
710 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
713 for(int32_t j
= 1;; ++j
) {
715 return Collation::makeCE32FromTagIndexAndLength(
716 Collation::EXPANSION_TAG
, i
, length
);
718 if(ce64s
.elementAti(i
+ j
) != ces
[j
]) { break; }
722 // Store the new sequence.
723 int32_t i
= ce64s
.size();
724 if(i
> Collation::MAX_INDEX
) {
725 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
728 for(int32_t j
= 0; j
< length
; ++j
) {
729 ce64s
.addElement(ces
[j
], errorCode
);
731 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, i
, length
);
735 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s
[], int32_t length
,
736 UErrorCode
&errorCode
) {
737 if(U_FAILURE(errorCode
)) { return 0; }
738 // See if this sequence of CE32s has already been stored.
739 int32_t first
= newCE32s
[0];
740 int32_t ce32sMax
= ce32s
.size() - length
;
741 for(int32_t i
= 0; i
<= ce32sMax
; ++i
) {
742 if(first
== ce32s
.elementAti(i
)) {
743 if(i
> Collation::MAX_INDEX
) {
744 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
747 for(int32_t j
= 1;; ++j
) {
749 return Collation::makeCE32FromTagIndexAndLength(
750 Collation::EXPANSION32_TAG
, i
, length
);
752 if(ce32s
.elementAti(i
+ j
) != newCE32s
[j
]) { break; }
756 // Store the new sequence.
757 int32_t i
= ce32s
.size();
758 if(i
> Collation::MAX_INDEX
) {
759 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
762 for(int32_t j
= 0; j
< length
; ++j
) {
763 ce32s
.addElement(newCE32s
[j
], errorCode
);
765 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG
, i
, length
);
769 CollationDataBuilder::copyFromBaseCE32(UChar32 c
, uint32_t ce32
, UBool withContext
,
770 UErrorCode
&errorCode
) {
771 if(U_FAILURE(errorCode
)) { return 0; }
772 if(!Collation::isSpecialCE32(ce32
)) { return ce32
; }
773 switch(Collation::tagFromCE32(ce32
)) {
774 case Collation::LONG_PRIMARY_TAG
:
775 case Collation::LONG_SECONDARY_TAG
:
776 case Collation::LATIN_EXPANSION_TAG
:
779 case Collation::EXPANSION32_TAG
: {
780 const uint32_t *baseCE32s
= base
->ce32s
+ Collation::indexFromCE32(ce32
);
781 int32_t length
= Collation::lengthFromCE32(ce32
);
782 ce32
= encodeExpansion32(
783 reinterpret_cast<const int32_t *>(baseCE32s
), length
, errorCode
);
786 case Collation::EXPANSION_TAG
: {
787 const int64_t *baseCEs
= base
->ces
+ Collation::indexFromCE32(ce32
);
788 int32_t length
= Collation::lengthFromCE32(ce32
);
789 ce32
= encodeExpansion(baseCEs
, length
, errorCode
);
792 case Collation::PREFIX_TAG
: {
793 // Flatten prefixes and nested suffixes (contractions)
794 // into a linear list of ConditionalCE32.
795 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
796 ce32
= CollationData::readCE32(p
); // Default if no prefix match.
798 return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
);
800 ConditionalCE32 head
;
801 UnicodeString
context((UChar
)0);
803 if(Collation::isContractionCE32(ce32
)) {
804 index
= copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
);
806 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
807 head
.next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
809 if(U_FAILURE(errorCode
)) { return 0; }
810 ConditionalCE32
*cond
= getConditionalCE32(index
); // the last ConditionalCE32 so far
811 UCharsTrie::Iterator
prefixes(p
+ 2, 0, errorCode
);
812 while(prefixes
.next(errorCode
)) {
813 context
= prefixes
.getString();
815 context
.insert(0, (UChar
)context
.length());
816 ce32
= (uint32_t)prefixes
.getValue();
817 if(Collation::isContractionCE32(ce32
)) {
818 index
= copyContractionsFromBaseCE32(context
, c
, ce32
, cond
, errorCode
);
820 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
821 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
823 if(U_FAILURE(errorCode
)) { return 0; }
824 cond
= getConditionalCE32(index
);
826 ce32
= makeBuilderContextCE32(head
.next
);
830 case Collation::CONTRACTION_TAG
: {
832 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
833 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
834 return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
);
836 ConditionalCE32 head
;
837 UnicodeString
context((UChar
)0);
838 copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
);
839 ce32
= makeBuilderContextCE32(head
.next
);
843 case Collation::HANGUL_TAG
:
844 errorCode
= U_UNSUPPORTED_ERROR
; // We forbid tailoring of Hangul syllables.
846 case Collation::OFFSET_TAG
:
847 ce32
= getCE32FromOffsetCE32(TRUE
, c
, ce32
);
849 case Collation::IMPLICIT_TAG
:
850 ce32
= encodeOneCE(Collation::unassignedCEFromCodePoint(c
), errorCode
);
853 U_ASSERT(FALSE
); // require ce32 == base->getFinalCE32(ce32)
860 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString
&context
, UChar32 c
, uint32_t ce32
,
861 ConditionalCE32
*cond
, UErrorCode
&errorCode
) {
862 if(U_FAILURE(errorCode
)) { return 0; }
863 const UChar
*p
= base
->contexts
+ Collation::indexFromCE32(ce32
);
865 if((ce32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
866 // No match on the single code point.
867 // We are underneath a prefix, and the default mapping is just
868 // a fallback to the mappings for a shorter prefix.
869 U_ASSERT(context
.length() > 1);
872 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
873 U_ASSERT(!Collation::isContractionCE32(ce32
));
874 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
875 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
876 if(U_FAILURE(errorCode
)) { return 0; }
877 cond
= getConditionalCE32(index
);
880 int32_t suffixStart
= context
.length();
881 UCharsTrie::Iterator
suffixes(p
+ 2, 0, errorCode
);
882 while(suffixes
.next(errorCode
)) {
883 context
.append(suffixes
.getString());
884 ce32
= copyFromBaseCE32(c
, (uint32_t)suffixes
.getValue(), TRUE
, errorCode
);
885 cond
->next
= index
= addConditionalCE32(context
, ce32
, errorCode
);
886 if(U_FAILURE(errorCode
)) { return 0; }
887 // No need to update the unsafeBackwardSet because the tailoring set
888 // is already a copy of the base set.
889 cond
= getConditionalCE32(index
);
890 context
.truncate(suffixStart
);
892 U_ASSERT(index
>= 0);
898 CopyHelper(const CollationDataBuilder
&s
, CollationDataBuilder
&d
,
899 const CollationDataBuilder::CEModifier
&m
, UErrorCode
&initialErrorCode
)
900 : src(s
), dest(d
), modifier(m
),
901 errorCode(initialErrorCode
) {}
903 UBool
copyRangeCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) {
904 ce32
= copyCE32(ce32
);
905 utrie2_setRange32(dest
.trie
, start
, end
, ce32
, TRUE
, &errorCode
);
906 if(CollationDataBuilder::isBuilderContextCE32(ce32
)) {
907 dest
.contextChars
.add(start
, end
);
909 return U_SUCCESS(errorCode
);
912 uint32_t copyCE32(uint32_t ce32
) {
913 if(!Collation::isSpecialCE32(ce32
)) {
914 int64_t ce
= modifier
.modifyCE32(ce32
);
915 if(ce
!= Collation::NO_CE
) {
916 ce32
= dest
.encodeOneCE(ce
, errorCode
);
919 int32_t tag
= Collation::tagFromCE32(ce32
);
920 if(tag
== Collation::EXPANSION32_TAG
) {
921 const uint32_t *srcCE32s
= reinterpret_cast<uint32_t *>(src
.ce32s
.getBuffer());
922 srcCE32s
+= Collation::indexFromCE32(ce32
);
923 int32_t length
= Collation::lengthFromCE32(ce32
);
924 // Inspect the source CE32s. Just copy them if none are modified.
925 // Otherwise copy to modifiedCEs, with modifications.
926 UBool isModified
= FALSE
;
927 for(int32_t i
= 0; i
< length
; ++i
) {
930 if(Collation::isSpecialCE32(ce32
) ||
931 (ce
= modifier
.modifyCE32(ce32
)) == Collation::NO_CE
) {
933 modifiedCEs
[i
] = Collation::ceFromCE32(ce32
);
937 for(int32_t j
= 0; j
< i
; ++j
) {
938 modifiedCEs
[j
] = Collation::ceFromCE32(srcCE32s
[j
]);
946 ce32
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
);
948 ce32
= dest
.encodeExpansion32(
949 reinterpret_cast<const int32_t *>(srcCE32s
), length
, errorCode
);
951 } else if(tag
== Collation::EXPANSION_TAG
) {
952 const int64_t *srcCEs
= src
.ce64s
.getBuffer();
953 srcCEs
+= Collation::indexFromCE32(ce32
);
954 int32_t length
= Collation::lengthFromCE32(ce32
);
955 // Inspect the source CEs. Just copy them if none are modified.
956 // Otherwise copy to modifiedCEs, with modifications.
957 UBool isModified
= FALSE
;
958 for(int32_t i
= 0; i
< length
; ++i
) {
959 int64_t srcCE
= srcCEs
[i
];
960 int64_t ce
= modifier
.modifyCE(srcCE
);
961 if(ce
== Collation::NO_CE
) {
963 modifiedCEs
[i
] = srcCE
;
967 for(int32_t j
= 0; j
< i
; ++j
) {
968 modifiedCEs
[j
] = srcCEs
[j
];
976 ce32
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
);
978 ce32
= dest
.encodeExpansion(srcCEs
, length
, errorCode
);
980 } else if(tag
== Collation::BUILDER_DATA_TAG
) {
981 // Copy the list of ConditionalCE32.
982 ConditionalCE32
*cond
= src
.getConditionalCE32ForCE32(ce32
);
983 U_ASSERT(!cond
->hasContext());
984 int32_t destIndex
= dest
.addConditionalCE32(
985 cond
->context
, copyCE32(cond
->ce32
), errorCode
);
986 ce32
= CollationDataBuilder::makeBuilderContextCE32(destIndex
);
987 while(cond
->next
>= 0) {
988 cond
= src
.getConditionalCE32(cond
->next
);
989 ConditionalCE32
*prevDestCond
= dest
.getConditionalCE32(destIndex
);
990 destIndex
= dest
.addConditionalCE32(
991 cond
->context
, copyCE32(cond
->ce32
), errorCode
);
992 int32_t suffixStart
= cond
->prefixLength() + 1;
993 dest
.unsafeBackwardSet
.addAll(cond
->context
.tempSubString(suffixStart
));
994 prevDestCond
->next
= destIndex
;
997 // Just copy long CEs and Latin mini expansions (and other expected values) as is,
998 // assuming that the modifier would not modify them.
999 U_ASSERT(tag
== Collation::LONG_PRIMARY_TAG
||
1000 tag
== Collation::LONG_SECONDARY_TAG
||
1001 tag
== Collation::LATIN_EXPANSION_TAG
||
1002 tag
== Collation::HANGUL_TAG
);
1008 const CollationDataBuilder
&src
;
1009 CollationDataBuilder
&dest
;
1010 const CollationDataBuilder::CEModifier
&modifier
;
1011 int64_t modifiedCEs
[Collation::MAX_EXPANSION_LENGTH
];
1012 UErrorCode errorCode
;
1017 static UBool U_CALLCONV
1018 enumRangeForCopy(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
1020 value
== Collation::UNASSIGNED_CE32
|| value
== Collation::FALLBACK_CE32
||
1021 ((CopyHelper
*)context
)->copyRangeCE32(start
, end
, value
);
1027 CollationDataBuilder::copyFrom(const CollationDataBuilder
&src
, const CEModifier
&modifier
,
1028 UErrorCode
&errorCode
) {
1029 if(U_FAILURE(errorCode
)) { return; }
1030 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
1031 errorCode
= U_INVALID_STATE_ERROR
;
1034 CopyHelper
helper(src
, *this, modifier
, errorCode
);
1035 utrie2_enum(src
.trie
, NULL
, enumRangeForCopy
, &helper
);
1036 errorCode
= helper
.errorCode
;
1037 // Update the contextChars and the unsafeBackwardSet while copying,
1038 // in case a character had conditional mappings in the source builder
1039 // and they were removed later.
1040 modified
|= src
.modified
;
1044 CollationDataBuilder::optimize(const UnicodeSet
&set
, UErrorCode
&errorCode
) {
1045 if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; }
1046 UnicodeSetIterator
iter(set
);
1047 while(iter
.next() && !iter
.isString()) {
1048 UChar32 c
= iter
.getCodepoint();
1049 uint32_t ce32
= utrie2_get32(trie
, c
);
1050 if(ce32
== Collation::FALLBACK_CE32
) {
1051 ce32
= base
->getFinalCE32(base
->getCE32(c
));
1052 ce32
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
);
1053 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1060 CollationDataBuilder::suppressContractions(const UnicodeSet
&set
, UErrorCode
&errorCode
) {
1061 if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; }
1062 UnicodeSetIterator
iter(set
);
1063 while(iter
.next() && !iter
.isString()) {
1064 UChar32 c
= iter
.getCodepoint();
1065 uint32_t ce32
= utrie2_get32(trie
, c
);
1066 if(ce32
== Collation::FALLBACK_CE32
) {
1067 ce32
= base
->getFinalCE32(base
->getCE32(c
));
1068 if(Collation::ce32HasContext(ce32
)) {
1069 ce32
= copyFromBaseCE32(c
, ce32
, FALSE
/* without context */, errorCode
);
1070 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1072 } else if(isBuilderContextCE32(ce32
)) {
1073 ce32
= getConditionalCE32ForCE32(ce32
)->ce32
;
1074 // Simply abandon the list of ConditionalCE32.
1075 // The caller will copy this builder in the end,
1076 // eliminating unreachable data.
1077 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1078 contextChars
.remove(c
);
1085 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s
[], UErrorCode
&errorCode
) {
1086 if(U_FAILURE(errorCode
)) { return FALSE
; }
1087 UBool anyJamoAssigned
= base
== NULL
; // always set jamoCE32s in the base data
1088 UBool needToCopyFromBase
= FALSE
;
1089 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) { // Count across Jamo types.
1090 UChar32 jamo
= jamoCpFromIndex(j
);
1091 UBool fromBase
= FALSE
;
1092 uint32_t ce32
= utrie2_get32(trie
, jamo
);
1093 anyJamoAssigned
|= Collation::isAssignedCE32(ce32
);
1094 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
1095 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
1096 if(ce32
== Collation::FALLBACK_CE32
) {
1098 ce32
= base
->getCE32(jamo
);
1100 if(Collation::isSpecialCE32(ce32
)) {
1101 switch(Collation::tagFromCE32(ce32
)) {
1102 case Collation::LONG_PRIMARY_TAG
:
1103 case Collation::LONG_SECONDARY_TAG
:
1104 case Collation::LATIN_EXPANSION_TAG
:
1105 // Copy the ce32 as-is.
1107 case Collation::EXPANSION32_TAG
:
1108 case Collation::EXPANSION_TAG
:
1109 case Collation::PREFIX_TAG
:
1110 case Collation::CONTRACTION_TAG
:
1112 // Defer copying until we know if anyJamoAssigned.
1113 ce32
= Collation::FALLBACK_CE32
;
1114 needToCopyFromBase
= TRUE
;
1117 case Collation::IMPLICIT_TAG
:
1118 // An unassigned Jamo should only occur in tests with incomplete bases.
1120 ce32
= Collation::FALLBACK_CE32
;
1121 needToCopyFromBase
= TRUE
;
1123 case Collation::OFFSET_TAG
:
1124 ce32
= getCE32FromOffsetCE32(fromBase
, jamo
, ce32
);
1126 case Collation::FALLBACK_TAG
:
1127 case Collation::RESERVED_TAG_3
:
1128 case Collation::BUILDER_DATA_TAG
:
1129 case Collation::DIGIT_TAG
:
1130 case Collation::U0000_TAG
:
1131 case Collation::HANGUL_TAG
:
1132 case Collation::LEAD_SURROGATE_TAG
:
1133 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
1137 jamoCE32s
[j
] = ce32
;
1139 if(anyJamoAssigned
&& needToCopyFromBase
) {
1140 for(int32_t j
= 0; j
< CollationData::JAMO_CE32S_LENGTH
; ++j
) {
1141 if(jamoCE32s
[j
] == Collation::FALLBACK_CE32
) {
1142 UChar32 jamo
= jamoCpFromIndex(j
);
1143 jamoCE32s
[j
] = copyFromBaseCE32(jamo
, base
->getCE32(jamo
),
1144 /*withContext=*/ TRUE
, errorCode
);
1148 return anyJamoAssigned
&& U_SUCCESS(errorCode
);
1152 CollationDataBuilder::setDigitTags(UErrorCode
&errorCode
) {
1153 UnicodeSet
digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode
);
1154 if(U_FAILURE(errorCode
)) { return; }
1155 UnicodeSetIterator
iter(digits
);
1156 while(iter
.next()) {
1157 U_ASSERT(!iter
.isString());
1158 UChar32 c
= iter
.getCodepoint();
1159 uint32_t ce32
= utrie2_get32(trie
, c
);
1160 if(ce32
!= Collation::FALLBACK_CE32
&& ce32
!= Collation::UNASSIGNED_CE32
) {
1161 int32_t index
= addCE32(ce32
, errorCode
);
1162 if(U_FAILURE(errorCode
)) { return; }
1163 if(index
> Collation::MAX_INDEX
) {
1164 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1167 ce32
= Collation::makeCE32FromTagIndexAndLength(
1168 Collation::DIGIT_TAG
, index
, u_charDigitValue(c
));
1169 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1176 static UBool U_CALLCONV
1177 enumRangeLeadValue(const void *context
, UChar32
/*start*/, UChar32
/*end*/, uint32_t value
) {
1178 int32_t *pValue
= (int32_t *)context
;
1179 if(value
== Collation::UNASSIGNED_CE32
) {
1180 value
= Collation::LEAD_ALL_UNASSIGNED
;
1181 } else if(value
== Collation::FALLBACK_CE32
) {
1182 value
= Collation::LEAD_ALL_FALLBACK
;
1184 *pValue
= Collation::LEAD_MIXED
;
1188 *pValue
= (int32_t)value
;
1189 } else if(*pValue
!= (int32_t)value
) {
1190 *pValue
= Collation::LEAD_MIXED
;
1199 CollationDataBuilder::setLeadSurrogates(UErrorCode
&errorCode
) {
1200 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
) {
1202 utrie2_enumForLeadSurrogate(trie
, lead
, NULL
, enumRangeLeadValue
, &value
);
1203 utrie2_set32ForLeadSurrogateCodeUnit(
1205 Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG
, 0) | (uint32_t)value
,
1211 CollationDataBuilder::build(CollationData
&data
, UErrorCode
&errorCode
) {
1212 buildMappings(data
, errorCode
);
1214 data
.numericPrimary
= base
->numericPrimary
;
1215 data
.compressibleBytes
= base
->compressibleBytes
;
1216 data
.numScripts
= base
->numScripts
;
1217 data
.scriptsIndex
= base
->scriptsIndex
;
1218 data
.scriptStarts
= base
->scriptStarts
;
1219 data
.scriptStartsLength
= base
->scriptStartsLength
;
1221 buildFastLatinTable(data
, errorCode
);
1225 CollationDataBuilder::buildMappings(CollationData
&data
, UErrorCode
&errorCode
) {
1226 if(U_FAILURE(errorCode
)) { return; }
1227 if(trie
== NULL
|| utrie2_isFrozen(trie
)) {
1228 errorCode
= U_INVALID_STATE_ERROR
;
1232 buildContexts(errorCode
);
1234 uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
];
1235 int32_t jamoIndex
= -1;
1236 if(getJamoCE32s(jamoCE32s
, errorCode
)) {
1237 jamoIndex
= ce32s
.size();
1238 for(int32_t i
= 0; i
< CollationData::JAMO_CE32S_LENGTH
; ++i
) {
1239 ce32s
.addElement((int32_t)jamoCE32s
[i
], errorCode
);
1241 // Small optimization: Use a bit in the Hangul ce32
1242 // to indicate that none of the Jamo CE32s are isSpecialCE32()
1243 // (as it should be in the root collator).
1244 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1245 // In order to still have good trie compression and keep this code simple,
1246 // we only set this flag if a whole block of 588 Hangul syllables starting with
1247 // a common leading consonant (Jamo L) has this property.
1248 UBool isAnyJamoVTSpecial
= FALSE
;
1249 for(int32_t i
= Hangul::JAMO_L_COUNT
; i
< CollationData::JAMO_CE32S_LENGTH
; ++i
) {
1250 if(Collation::isSpecialCE32(jamoCE32s
[i
])) {
1251 isAnyJamoVTSpecial
= TRUE
;
1255 uint32_t hangulCE32
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0);
1256 UChar32 c
= Hangul::HANGUL_BASE
;
1257 for(int32_t i
= 0; i
< Hangul::JAMO_L_COUNT
; ++i
) { // iterate over the Jamo L
1258 uint32_t ce32
= hangulCE32
;
1259 if(!isAnyJamoVTSpecial
&& !Collation::isSpecialCE32(jamoCE32s
[i
])) {
1260 ce32
|= Collation::HANGUL_NO_SPECIAL_JAMO
;
1262 UChar32 limit
= c
+ Hangul::JAMO_VT_COUNT
;
1263 utrie2_setRange32(trie
, c
, limit
- 1, ce32
, TRUE
, &errorCode
);
1267 // Copy the Hangul CE32s from the base in blocks per Jamo L,
1268 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1269 for(UChar32 c
= Hangul::HANGUL_BASE
; c
< Hangul::HANGUL_LIMIT
;) {
1270 uint32_t ce32
= base
->getCE32(c
);
1271 U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::HANGUL_TAG
));
1272 UChar32 limit
= c
+ Hangul::JAMO_VT_COUNT
;
1273 utrie2_setRange32(trie
, c
, limit
- 1, ce32
, TRUE
, &errorCode
);
1278 setDigitTags(errorCode
);
1279 setLeadSurrogates(errorCode
);
1281 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1282 ce32s
.setElementAt((int32_t)utrie2_get32(trie
, 0), 0);
1283 utrie2_set32(trie
, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG
, 0), &errorCode
);
1285 utrie2_freeze(trie
, UTRIE2_32_VALUE_BITS
, &errorCode
);
1286 if(U_FAILURE(errorCode
)) { return; }
1288 // Mark each lead surrogate as "unsafe"
1289 // if any of its 1024 associated supplementary code points is "unsafe".
1290 UChar32 c
= 0x10000;
1291 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
, c
+= 0x400) {
1292 if(unsafeBackwardSet
.containsSome(c
, c
+ 0x3ff)) {
1293 unsafeBackwardSet
.add(lead
);
1296 unsafeBackwardSet
.freeze();
1299 data
.ce32s
= reinterpret_cast<const uint32_t *>(ce32s
.getBuffer());
1300 data
.ces
= ce64s
.getBuffer();
1301 data
.contexts
= contexts
.getBuffer();
1303 data
.ce32sLength
= ce32s
.size();
1304 data
.cesLength
= ce64s
.size();
1305 data
.contextsLength
= contexts
.length();
1308 if(jamoIndex
>= 0) {
1309 data
.jamoCE32s
= data
.ce32s
+ jamoIndex
;
1311 data
.jamoCE32s
= base
->jamoCE32s
;
1313 data
.unsafeBackwardSet
= &unsafeBackwardSet
;
1317 CollationDataBuilder::clearContexts() {
1319 UnicodeSetIterator
iter(contextChars
);
1320 while(iter
.next()) {
1321 U_ASSERT(!iter
.isString());
1322 uint32_t ce32
= utrie2_get32(trie
, iter
.getCodepoint());
1323 U_ASSERT(isBuilderContextCE32(ce32
));
1324 getConditionalCE32ForCE32(ce32
)->builtCE32
= Collation::NO_CE32
;
1329 CollationDataBuilder::buildContexts(UErrorCode
&errorCode
) {
1330 if(U_FAILURE(errorCode
)) { return; }
1331 // Ignore abandoned lists and the cached builtCE32,
1332 // and build all contexts from scratch.
1334 UnicodeSetIterator
iter(contextChars
);
1335 while(U_SUCCESS(errorCode
) && iter
.next()) {
1336 U_ASSERT(!iter
.isString());
1337 UChar32 c
= iter
.getCodepoint();
1338 uint32_t ce32
= utrie2_get32(trie
, c
);
1339 if(!isBuilderContextCE32(ce32
)) {
1340 // Impossible: No context data for c in contextChars.
1341 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
1344 ConditionalCE32
*cond
= getConditionalCE32ForCE32(ce32
);
1345 ce32
= buildContext(cond
, errorCode
);
1346 utrie2_set32(trie
, c
, ce32
, &errorCode
);
1351 CollationDataBuilder::buildContext(ConditionalCE32
*head
, UErrorCode
&errorCode
) {
1352 if(U_FAILURE(errorCode
)) { return 0; }
1353 // The list head must have no context.
1354 U_ASSERT(!head
->hasContext());
1355 // The list head must be followed by one or more nodes that all do have context.
1356 U_ASSERT(head
->next
>= 0);
1357 UCharsTrieBuilder
prefixBuilder(errorCode
);
1358 UCharsTrieBuilder
contractionBuilder(errorCode
);
1359 for(ConditionalCE32
*cond
= head
;; cond
= getConditionalCE32(cond
->next
)) {
1360 // After the list head, the prefix or suffix can be empty, but not both.
1361 U_ASSERT(cond
== head
|| cond
->hasContext());
1362 int32_t prefixLength
= cond
->prefixLength();
1363 UnicodeString
prefix(cond
->context
, 0, prefixLength
+ 1);
1364 // Collect all contraction suffixes for one prefix.
1365 ConditionalCE32
*firstCond
= cond
;
1366 ConditionalCE32
*lastCond
= cond
;
1367 while(cond
->next
>= 0 &&
1368 (cond
= getConditionalCE32(cond
->next
))->context
.startsWith(prefix
)) {
1372 int32_t suffixStart
= prefixLength
+ 1; // == prefix.length()
1373 if(lastCond
->context
.length() == suffixStart
) {
1374 // One prefix without contraction suffix.
1375 U_ASSERT(firstCond
== lastCond
);
1376 ce32
= lastCond
->ce32
;
1379 // Build the contractions trie.
1380 contractionBuilder
.clear();
1381 // Entry for an empty suffix, to be stored before the trie.
1382 uint32_t emptySuffixCE32
= 0;
1384 if(firstCond
->context
.length() == suffixStart
) {
1385 // There is a mapping for the prefix and the single character c. (p|c)
1386 // If no other suffix matches, then we return this value.
1387 emptySuffixCE32
= firstCond
->ce32
;
1388 cond
= getConditionalCE32(firstCond
->next
);
1390 // There is no mapping for the prefix and just the single character.
1391 // (There is no p|c, only p|cd, p|ce etc.)
1392 flags
|= Collation::CONTRACT_SINGLE_CP_NO_MATCH
;
1393 // When the prefix matches but none of the prefix-specific suffixes,
1394 // then we fall back to the mappings with the next-longest prefix,
1395 // and ultimately to mappings with no prefix.
1396 // Each fallback might be another set of contractions.
1397 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1398 // then in text "pch" we find the ch contraction.
1399 for(cond
= head
;; cond
= getConditionalCE32(cond
->next
)) {
1400 int32_t length
= cond
->prefixLength();
1401 if(length
== prefixLength
) { break; }
1402 if(cond
->defaultCE32
!= Collation::NO_CE32
&&
1403 (length
==0 || prefix
.endsWith(cond
->context
, 1, length
))) {
1404 emptySuffixCE32
= cond
->defaultCE32
;
1409 // Optimization: Set a flag when
1410 // the first character of every contraction suffix has lccc!=0.
1411 // Short-circuits contraction matching when a normal letter follows.
1412 flags
|= Collation::CONTRACT_NEXT_CCC
;
1413 // Add all of the non-empty suffixes into the contraction trie.
1415 UnicodeString
suffix(cond
->context
, suffixStart
);
1416 uint16_t fcd16
= nfcImpl
.getFCD16(suffix
.char32At(0));
1418 flags
&= ~Collation::CONTRACT_NEXT_CCC
;
1420 fcd16
= nfcImpl
.getFCD16(suffix
.char32At(suffix
.length() - 1));
1422 // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1423 flags
|= Collation::CONTRACT_TRAILING_CCC
;
1425 contractionBuilder
.add(suffix
, (int32_t)cond
->ce32
, errorCode
);
1426 if(cond
== lastCond
) { break; }
1427 cond
= getConditionalCE32(cond
->next
);
1429 int32_t index
= addContextTrie(emptySuffixCE32
, contractionBuilder
, errorCode
);
1430 if(U_FAILURE(errorCode
)) { return 0; }
1431 if(index
> Collation::MAX_INDEX
) {
1432 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1435 ce32
= Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG
, index
) | flags
;
1437 U_ASSERT(cond
== lastCond
);
1438 firstCond
->defaultCE32
= ce32
;
1439 if(prefixLength
== 0) {
1440 if(cond
->next
< 0) {
1441 // No non-empty prefixes, only contractions.
1445 prefix
.remove(0, 1); // Remove the length unit.
1447 prefixBuilder
.add(prefix
, (int32_t)ce32
, errorCode
);
1448 if(cond
->next
< 0) { break; }
1451 U_ASSERT(head
->defaultCE32
!= Collation::NO_CE32
);
1452 int32_t index
= addContextTrie(head
->defaultCE32
, prefixBuilder
, errorCode
);
1453 if(U_FAILURE(errorCode
)) { return 0; }
1454 if(index
> Collation::MAX_INDEX
) {
1455 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
1458 return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG
, index
);
1462 CollationDataBuilder::addContextTrie(uint32_t defaultCE32
, UCharsTrieBuilder
&trieBuilder
,
1463 UErrorCode
&errorCode
) {
1464 UnicodeString context
;
1465 context
.append((UChar
)(defaultCE32
>> 16)).append((UChar
)defaultCE32
);
1466 UnicodeString trieString
;
1467 context
.append(trieBuilder
.buildUnicodeString(USTRINGTRIE_BUILD_SMALL
, trieString
, errorCode
));
1468 if(U_FAILURE(errorCode
)) { return -1; }
1469 int32_t index
= contexts
.indexOf(context
);
1471 index
= contexts
.length();
1472 contexts
.append(context
);
1478 CollationDataBuilder::buildFastLatinTable(CollationData
&data
, UErrorCode
&errorCode
) {
1479 if(U_FAILURE(errorCode
) || !fastLatinEnabled
) { return; }
1481 delete fastLatinBuilder
;
1482 fastLatinBuilder
= new CollationFastLatinBuilder(errorCode
);
1483 if(fastLatinBuilder
== NULL
) {
1484 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
1487 if(fastLatinBuilder
->forData(data
, errorCode
)) {
1488 const uint16_t *table
= fastLatinBuilder
->getTable();
1489 int32_t length
= fastLatinBuilder
->lengthOfTable();
1490 if(base
!= NULL
&& length
== base
->fastLatinTableLength
&&
1491 uprv_memcmp(table
, base
->fastLatinTable
, length
* 2) == 0) {
1492 // Same fast Latin table as in the base, use that one instead.
1493 delete fastLatinBuilder
;
1494 fastLatinBuilder
= NULL
;
1495 table
= base
->fastLatinTable
;
1497 data
.fastLatinTable
= table
;
1498 data
.fastLatinTableLength
= length
;
1500 delete fastLatinBuilder
;
1501 fastLatinBuilder
= NULL
;
1506 CollationDataBuilder::getCEs(const UnicodeString
&s
, int64_t ces
[], int32_t cesLength
) {
1507 return getCEs(s
, 0, ces
, cesLength
);
1511 CollationDataBuilder::getCEs(const UnicodeString
&prefix
, const UnicodeString
&s
,
1512 int64_t ces
[], int32_t cesLength
) {
1513 int32_t prefixLength
= prefix
.length();
1514 if(prefixLength
== 0) {
1515 return getCEs(s
, 0, ces
, cesLength
);
1517 return getCEs(prefix
+ s
, prefixLength
, ces
, cesLength
);
1522 CollationDataBuilder::getCEs(const UnicodeString
&s
, int32_t start
,
1523 int64_t ces
[], int32_t cesLength
) {
1524 if(collIter
== NULL
) {
1525 collIter
= new DataBuilderCollationIterator(*this);
1526 if(collIter
== NULL
) { return 0; }
1528 return collIter
->fetchCEs(s
, start
, ces
, cesLength
);
1533 #endif // !UCONFIG_NO_COLLATION