1 // © 2016 and later: Unicode, Inc. and others. 
   2 // License & terms of use: http://www.unicode.org/copyright.html 
   4 ******************************************************************************* 
   5 * Copyright (C) 2012-2015, International Business Machines 
   6 * Corporation and others.  All Rights Reserved. 
   7 ******************************************************************************* 
   8 * collationdatabuilder.cpp 
  10 * (replaced the former ucol_elm.cpp) 
  12 * created on: 2012apr01 
  13 * created by: Markus W. Scherer 
  16 #include "unicode/utypes.h" 
  18 #if !UCONFIG_NO_COLLATION 
  20 #include "unicode/localpointer.h" 
  21 #include "unicode/uchar.h" 
  22 #include "unicode/ucharstrie.h" 
  23 #include "unicode/ucharstriebuilder.h" 
  24 #include "unicode/uniset.h" 
  25 #include "unicode/unistr.h" 
  26 #include "unicode/usetiter.h" 
  27 #include "unicode/utf16.h" 
  29 #include "collation.h" 
  30 #include "collationdata.h" 
  31 #include "collationdatabuilder.h" 
  32 #include "collationfastlatinbuilder.h" 
  33 #include "collationiterator.h" 
  34 #include "normalizer2impl.h" 
  42 CollationDataBuilder::CEModifier::~CEModifier() {} 
  45  * Build-time context and CE32 for a code point. 
  46  * If a code point has contextual mappings, then the default (no-context) mapping 
  47  * and all conditional mappings are stored in a singly-linked list 
  48  * of ConditionalCE32, sorted by context strings. 
  50  * Context strings sort by prefix length, then by prefix, then by contraction suffix. 
  51  * Context strings must be unique and in ascending order. 
  53 struct ConditionalCE32 
: public UMemory 
{ 
  56               ce32(0), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
), 
  58     ConditionalCE32(const UnicodeString 
&ct
, uint32_t ce
) 
  60               ce32(ce
), defaultCE32(Collation::NO_CE32
), builtCE32(Collation::NO_CE32
), 
  63     inline UBool 
hasContext() const { return context
.length() > 1; } 
  64     inline int32_t prefixLength() const { return context
.charAt(0); } 
  67      * "\0" for the first entry for any code point, with its default CE32. 
  69      * Otherwise one unit with the length of the prefix string, 
  70      * then the prefix string, then the contraction suffix. 
  72     UnicodeString context
; 
  74      * CE32 for the code point and its context. 
  75      * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). 
  79      * Default CE32 for all contexts with this same prefix. 
  80      * Initially NO_CE32. Set only while building runtime data structures, 
  81      * and only on one of the nodes of a sub-list with the same prefix. 
  85      * CE32 for the built contexts. 
  86      * When fetching CEs from the builder, the contexts are built into their runtime form 
  87      * so that the normal collation implementation can process them. 
  88      * The result is cached in the list head. It is reset when the contexts are modified. 
  92      * Index of the next ConditionalCE32. 
  93      * Negative for the end of the list. 
 100 U_CAPI 
void U_CALLCONV
 
 101 uprv_deleteConditionalCE32(void *obj
) { 
 102     delete static_cast<ConditionalCE32 
*>(obj
); 
 108  * Build-time collation element and character iterator. 
 109  * Uses the runtime CollationIterator for fetching CEs for a string 
 110  * but reads from the builder's unfinished data structures. 
 111  * In particular, this class reads from the unfinished trie 
 112  * and has to avoid CollationIterator::nextCE() and redirect other 
 113  * calls to data->getCE32() and data->getCE32FromSupplementary(). 
 115  * We do this so that we need not implement the collation algorithm 
 116  * again for the builder and make it behave exactly like the runtime code. 
 117  * That would be more difficult to test and maintain than this indirection. 
 119  * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, 
 120  * so the data accesses from those code paths need not be modified. 
 122  * This class iterates directly over whole code points 
 123  * so that the CollationIterator does not need the finished trie 
 124  * for handling the LEAD_SURROGATE_TAG. 
 126 class DataBuilderCollationIterator 
: public CollationIterator 
{ 
 128     DataBuilderCollationIterator(CollationDataBuilder 
&b
); 
 130     virtual ~DataBuilderCollationIterator(); 
 132     int32_t fetchCEs(const UnicodeString 
&str
, int32_t start
, int64_t ces
[], int32_t cesLength
); 
 134     virtual void resetToOffset(int32_t newOffset
); 
 135     virtual int32_t getOffset() const; 
 137     virtual UChar32 
nextCodePoint(UErrorCode 
&errorCode
); 
 138     virtual UChar32 
previousCodePoint(UErrorCode 
&errorCode
); 
 141     virtual void forwardNumCodePoints(int32_t num
, UErrorCode 
&errorCode
); 
 142     virtual void backwardNumCodePoints(int32_t num
, UErrorCode 
&errorCode
); 
 144     virtual uint32_t getDataCE32(UChar32 c
) const; 
 145     virtual uint32_t getCE32FromBuilderData(uint32_t ce32
, UErrorCode 
&errorCode
); 
 147     CollationDataBuilder 
&builder
; 
 148     CollationData builderData
; 
 149     uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
]; 
 150     const UnicodeString 
*s
; 
 154 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder 
&b
) 
 155         : CollationIterator(&builderData
, /*numeric=*/ FALSE
), 
 156           builder(b
), builderData(b
.nfcImpl
), 
 158     builderData
.base 
= builder
.base
; 
 159     // Set all of the jamoCE32s[] to indirection CE32s. 
 160     for(int32_t j 
= 0; j 
< CollationData::JAMO_CE32S_LENGTH
; ++j
) {  // Count across Jamo types. 
 161         UChar32 jamo 
= CollationDataBuilder::jamoCpFromIndex(j
); 
 162         jamoCE32s
[j
] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG
, jamo
) | 
 163                 CollationDataBuilder::IS_BUILDER_JAMO_CE32
; 
 165     builderData
.jamoCE32s 
= jamoCE32s
; 
 168 DataBuilderCollationIterator::~DataBuilderCollationIterator() {} 
 171 DataBuilderCollationIterator::fetchCEs(const UnicodeString 
&str
, int32_t start
, 
 172                                        int64_t ces
[], int32_t cesLength
) { 
 173     // Set the pointers each time, in case they changed due to reallocation. 
 174     builderData
.ce32s 
= reinterpret_cast<const uint32_t *>(builder
.ce32s
.getBuffer()); 
 175     builderData
.ces 
= builder
.ce64s
.getBuffer(); 
 176     builderData
.contexts 
= builder
.contexts
.getBuffer(); 
 177     // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32(). 
 181     UErrorCode errorCode 
= U_ZERO_ERROR
; 
 182     while(U_SUCCESS(errorCode
) && pos 
< s
->length()) { 
 183         // No need to keep all CEs in the iterator buffer. 
 185         UChar32 c 
= s
->char32At(pos
); 
 186         pos 
+= U16_LENGTH(c
); 
 187         uint32_t ce32 
= utrie2_get32(builder
.trie
, c
); 
 188         const CollationData 
*d
; 
 189         if(ce32 
== Collation::FALLBACK_CE32
) { 
 191             ce32 
= builder
.base
->getCE32(c
); 
 195         appendCEsFromCE32(d
, c
, ce32
, /*forward=*/ TRUE
, errorCode
); 
 196         U_ASSERT(U_SUCCESS(errorCode
)); 
 197         for(int32_t i 
= 0; i 
< getCEsLength(); ++i
) { 
 198             int64_t ce 
= getCE(i
); 
 200                 if(cesLength 
< Collation::MAX_EXPANSION_LENGTH
) { 
 211 DataBuilderCollationIterator::resetToOffset(int32_t newOffset
) { 
 217 DataBuilderCollationIterator::getOffset() const { 
 222 DataBuilderCollationIterator::nextCodePoint(UErrorCode 
& /*errorCode*/) { 
 223     if(pos 
== s
->length()) { 
 226     UChar32 c 
= s
->char32At(pos
); 
 227     pos 
+= U16_LENGTH(c
); 
 232 DataBuilderCollationIterator::previousCodePoint(UErrorCode 
& /*errorCode*/) { 
 236     UChar32 c 
= s
->char32At(pos 
- 1); 
 237     pos 
-= U16_LENGTH(c
); 
 242 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num
, UErrorCode 
& /*errorCode*/) { 
 243     pos 
= s
->moveIndex32(pos
, num
); 
 247 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num
, UErrorCode 
& /*errorCode*/) { 
 248     pos 
= s
->moveIndex32(pos
, -num
); 
 252 DataBuilderCollationIterator::getDataCE32(UChar32 c
) const { 
 253     return utrie2_get32(builder
.trie
, c
); 
 257 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32
, UErrorCode 
&errorCode
) { 
 258     U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::BUILDER_DATA_TAG
)); 
 259     if((ce32 
& CollationDataBuilder::IS_BUILDER_JAMO_CE32
) != 0) { 
 260         UChar32 jamo 
= Collation::indexFromCE32(ce32
); 
 261         return utrie2_get32(builder
.trie
, jamo
); 
 263         ConditionalCE32 
*cond 
= builder
.getConditionalCE32ForCE32(ce32
); 
 264         if(cond
->builtCE32 
== Collation::NO_CE32
) { 
 265             // Build the context-sensitive mappings into their runtime form and cache the result. 
 266             cond
->builtCE32 
= builder
.buildContext(cond
, errorCode
); 
 267             if(errorCode 
== U_BUFFER_OVERFLOW_ERROR
) { 
 268                 errorCode 
= U_ZERO_ERROR
; 
 269                 builder
.clearContexts(); 
 270                 cond
->builtCE32 
= builder
.buildContext(cond
, errorCode
); 
 272             builderData
.contexts 
= builder
.contexts
.getBuffer(); 
 274         return cond
->builtCE32
; 
 278 // ------------------------------------------------------------------------- *** 
 280 CollationDataBuilder::CollationDataBuilder(UErrorCode 
&errorCode
) 
 281         : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode
)), 
 282           base(NULL
), baseSettings(NULL
), 
 284           ce32s(errorCode
), ce64s(errorCode
), conditionalCE32s(errorCode
), 
 286           fastLatinEnabled(FALSE
), fastLatinBuilder(NULL
), 
 288     // Reserve the first CE32 for U+0000. 
 289     ce32s
.addElement(0, errorCode
); 
 290     conditionalCE32s
.setDeleter(uprv_deleteConditionalCE32
); 
 293 CollationDataBuilder::~CollationDataBuilder() { 
 295     delete fastLatinBuilder
; 
 300 CollationDataBuilder::initForTailoring(const CollationData 
*b
, UErrorCode 
&errorCode
) { 
 301     if(U_FAILURE(errorCode
)) { return; } 
 303         errorCode 
= U_INVALID_STATE_ERROR
; 
 307         errorCode 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 312     // For a tailoring, the default is to fall back to the base. 
 313     trie 
= utrie2_open(Collation::FALLBACK_CE32
, Collation::FFFD_CE32
, &errorCode
); 
 315     // Set the Latin-1 letters block so that it is allocated first in the data array, 
 316     // to try to improve locality of reference when sorting Latin-1 text. 
 317     // Do not use utrie2_setRange32() since that will not actually allocate blocks 
 318     // that are filled with the default value. 
 319     // ASCII (0..7F) is already preallocated anyway. 
 320     for(UChar32 c 
= 0xc0; c 
<= 0xff; ++c
) { 
 321         utrie2_set32(trie
, c
, Collation::FALLBACK_CE32
, &errorCode
); 
 324     // Hangul syllables are not tailorable (except via tailoring Jamos). 
 325     // Always set the Hangul tag to help performance. 
 326     // Do this here, rather than in buildMappings(), 
 327     // so that we see the HANGUL_TAG in various assertions. 
 328     uint32_t hangulCE32 
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0); 
 329     utrie2_setRange32(trie
, Hangul::HANGUL_BASE
, Hangul::HANGUL_END
, hangulCE32
, TRUE
, &errorCode
); 
 331     // Copy the set contents but don't copy/clone the set as a whole because 
 332     // that would copy the isFrozen state too. 
 333     unsafeBackwardSet
.addAll(*b
->unsafeBackwardSet
); 
 335     if(U_FAILURE(errorCode
)) { return; } 
 339 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start
, UChar32 end
, 
 340                                            uint32_t primary
, int32_t step
, 
 341                                            UErrorCode 
&errorCode
) { 
 342     if(U_FAILURE(errorCode
)) { return FALSE
; } 
 343     U_ASSERT(start 
<= end
); 
 344     // TODO: Do we need to check what values are currently set for start..end? 
 345     // An offset range is worth it only if we can achieve an overlap between 
 346     // adjacent UTrie2 blocks of 32 code points each. 
 347     // An offset CE is also a little more expensive to look up and compute 
 349     // If the range spans at least three UTrie2 block boundaries (> 64 code points), 
 351     // If the range spans one or two block boundaries and there are 
 352     // at least 4 code points on either side, then we take it. 
 353     // (We could additionally require a minimum range length of, say, 16.) 
 354     int32_t blockDelta 
= (end 
>> 5) - (start 
>> 5); 
 355     if(2 <= step 
&& step 
<= 0x7f && 
 357             (blockDelta 
> 0 && (start 
& 0x1f) <= 0x1c && (end 
& 0x1f) >= 3))) { 
 358         int64_t dataCE 
= ((int64_t)primary 
<< 32) | (start 
<< 8) | step
; 
 359         if(isCompressiblePrimary(primary
)) { dataCE 
|= 0x80; } 
 360         int32_t index 
= addCE(dataCE
, errorCode
); 
 361         if(U_FAILURE(errorCode
)) { return 0; } 
 362         if(index 
> Collation::MAX_INDEX
) { 
 363             errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
 366         uint32_t offsetCE32 
= Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG
, index
); 
 367         utrie2_setRange32(trie
, start
, end
, offsetCE32
, TRUE
, &errorCode
); 
 376 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start
, UChar32 end
, 
 377                                                    uint32_t primary
, int32_t step
, 
 378                                                    UErrorCode 
&errorCode
) { 
 379     if(U_FAILURE(errorCode
)) { return 0; } 
 380     UBool isCompressible 
= isCompressiblePrimary(primary
); 
 381     if(maybeSetPrimaryRange(start
, end
, primary
, step
, errorCode
)) { 
 382         return Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
, 
 383                                                       (end 
- start 
+ 1) * step
); 
 385         // Short range: Set individual CE32s. 
 387             utrie2_set32(trie
, start
, Collation::makeLongPrimaryCE32(primary
), &errorCode
); 
 389             primary 
= Collation::incThreeBytePrimaryByOffset(primary
, isCompressible
, step
); 
 390             if(start 
> end
) { return primary
; } 
 397 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase
, UChar32 c
, uint32_t ce32
) const { 
 398     int32_t i 
= Collation::indexFromCE32(ce32
); 
 399     int64_t dataCE 
= fromBase 
? base
->ces
[i
] : ce64s
.elementAti(i
); 
 400     uint32_t p 
= Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
); 
 401     return Collation::makeLongPrimaryCE32(p
); 
 405 CollationDataBuilder::isCompressibleLeadByte(uint32_t b
) const { 
 406     return base
->isCompressibleLeadByte(b
); 
 410 CollationDataBuilder::isAssigned(UChar32 c
) const { 
 411     return Collation::isAssignedCE32(utrie2_get32(trie
, c
)); 
 415 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c
) const { 
 416     uint32_t ce32 
= utrie2_get32(trie
, c
); 
 417     if(Collation::isLongPrimaryCE32(ce32
)) { 
 418         return Collation::primaryFromLongPrimaryCE32(ce32
); 
 425 CollationDataBuilder::getSingleCE(UChar32 c
, UErrorCode 
&errorCode
) const { 
 426     if(U_FAILURE(errorCode
)) { return 0; } 
 427     // Keep parallel with CollationData::getSingleCE(). 
 428     UBool fromBase 
= FALSE
; 
 429     uint32_t ce32 
= utrie2_get32(trie
, c
); 
 430     if(ce32 
== Collation::FALLBACK_CE32
) { 
 432         ce32 
= base
->getCE32(c
); 
 434     while(Collation::isSpecialCE32(ce32
)) { 
 435         switch(Collation::tagFromCE32(ce32
)) { 
 436         case Collation::LATIN_EXPANSION_TAG
: 
 437         case Collation::BUILDER_DATA_TAG
: 
 438         case Collation::PREFIX_TAG
: 
 439         case Collation::CONTRACTION_TAG
: 
 440         case Collation::HANGUL_TAG
: 
 441         case Collation::LEAD_SURROGATE_TAG
: 
 442             errorCode 
= U_UNSUPPORTED_ERROR
; 
 444         case Collation::FALLBACK_TAG
: 
 445         case Collation::RESERVED_TAG_3
: 
 446             errorCode 
= U_INTERNAL_PROGRAM_ERROR
; 
 448         case Collation::LONG_PRIMARY_TAG
: 
 449             return Collation::ceFromLongPrimaryCE32(ce32
); 
 450         case Collation::LONG_SECONDARY_TAG
: 
 451             return Collation::ceFromLongSecondaryCE32(ce32
); 
 452         case Collation::EXPANSION32_TAG
: 
 453             if(Collation::lengthFromCE32(ce32
) == 1) { 
 454                 int32_t i 
= Collation::indexFromCE32(ce32
); 
 455                 ce32 
= fromBase 
? base
->ce32s
[i
] : ce32s
.elementAti(i
); 
 458                 errorCode 
= U_UNSUPPORTED_ERROR
; 
 461         case Collation::EXPANSION_TAG
: { 
 462             if(Collation::lengthFromCE32(ce32
) == 1) { 
 463                 int32_t i 
= Collation::indexFromCE32(ce32
); 
 464                 return fromBase 
? base
->ces
[i
] : ce64s
.elementAti(i
); 
 466                 errorCode 
= U_UNSUPPORTED_ERROR
; 
 470         case Collation::DIGIT_TAG
: 
 471             // Fetch the non-numeric-collation CE32 and continue. 
 472             ce32 
= ce32s
.elementAti(Collation::indexFromCE32(ce32
)); 
 474         case Collation::U0000_TAG
: 
 476             // Fetch the normal ce32 for U+0000 and continue. 
 477             ce32 
= fromBase 
? base
->ce32s
[0] : ce32s
.elementAti(0); 
 479         case Collation::OFFSET_TAG
: 
 480             ce32 
= getCE32FromOffsetCE32(fromBase
, c
, ce32
); 
 482         case Collation::IMPLICIT_TAG
: 
 483             return Collation::unassignedCEFromCodePoint(c
); 
 486     return Collation::ceFromSimpleCE32(ce32
); 
 490 CollationDataBuilder::addCE(int64_t ce
, UErrorCode 
&errorCode
) { 
 491     int32_t length 
= ce64s
.size(); 
 492     for(int32_t i 
= 0; i 
< length
; ++i
) { 
 493         if(ce 
== ce64s
.elementAti(i
)) { return i
; } 
 495     ce64s
.addElement(ce
, errorCode
); 
 500 CollationDataBuilder::addCE32(uint32_t ce32
, UErrorCode 
&errorCode
) { 
 501     int32_t length 
= ce32s
.size(); 
 502     for(int32_t i 
= 0; i 
< length
; ++i
) { 
 503         if(ce32 
== (uint32_t)ce32s
.elementAti(i
)) { return i
; } 
 505     ce32s
.addElement((int32_t)ce32
, errorCode
);   
 510 CollationDataBuilder::addConditionalCE32(const UnicodeString 
&context
, uint32_t ce32
, 
 511                                          UErrorCode 
&errorCode
) { 
 512     if(U_FAILURE(errorCode
)) { return -1; } 
 513     U_ASSERT(!context
.isEmpty()); 
 514     int32_t index 
= conditionalCE32s
.size(); 
 515     if(index 
> Collation::MAX_INDEX
) { 
 516         errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
 519     ConditionalCE32 
*cond 
= new ConditionalCE32(context
, ce32
); 
 521         errorCode 
= U_MEMORY_ALLOCATION_ERROR
; 
 524     conditionalCE32s
.addElement(cond
, errorCode
); 
 529 CollationDataBuilder::add(const UnicodeString 
&prefix
, const UnicodeString 
&s
, 
 530                           const int64_t ces
[], int32_t cesLength
, 
 531                           UErrorCode 
&errorCode
) { 
 532     uint32_t ce32 
= encodeCEs(ces
, cesLength
, errorCode
); 
 533     addCE32(prefix
, s
, ce32
, errorCode
); 
 537 CollationDataBuilder::addCE32(const UnicodeString 
&prefix
, const UnicodeString 
&s
, 
 538                               uint32_t ce32
, UErrorCode 
&errorCode
) { 
 539     if(U_FAILURE(errorCode
)) { return; } 
 541         errorCode 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 544     if(trie 
== NULL 
|| utrie2_isFrozen(trie
)) { 
 545         errorCode 
= U_INVALID_STATE_ERROR
; 
 548     UChar32 c 
= s
.char32At(0); 
 549     int32_t cLength 
= U16_LENGTH(c
); 
 550     uint32_t oldCE32 
= utrie2_get32(trie
, c
); 
 551     UBool hasContext 
= !prefix
.isEmpty() || s
.length() > cLength
; 
 552     if(oldCE32 
== Collation::FALLBACK_CE32
) { 
 553         // First tailoring for c. 
 554         // If c has contextual base mappings or if we add a contextual mapping, 
 555         // then copy the base mappings. 
 556         // Otherwise we just override the base mapping. 
 557         uint32_t baseCE32 
= base
->getFinalCE32(base
->getCE32(c
)); 
 558         if(hasContext 
|| Collation::ce32HasContext(baseCE32
)) { 
 559             oldCE32 
= copyFromBaseCE32(c
, baseCE32
, TRUE
, errorCode
); 
 560             utrie2_set32(trie
, c
, oldCE32
, &errorCode
); 
 561             if(U_FAILURE(errorCode
)) { return; } 
 565         // No prefix, no contraction. 
 566         if(!isBuilderContextCE32(oldCE32
)) { 
 567             utrie2_set32(trie
, c
, ce32
, &errorCode
); 
 569             ConditionalCE32 
*cond 
= getConditionalCE32ForCE32(oldCE32
); 
 570             cond
->builtCE32 
= Collation::NO_CE32
; 
 574         ConditionalCE32 
*cond
; 
 575         if(!isBuilderContextCE32(oldCE32
)) { 
 576             // Replace the simple oldCE32 with a builder context CE32 
 577             // pointing to a new ConditionalCE32 list head. 
 578             int32_t index 
= addConditionalCE32(UnicodeString((UChar
)0), oldCE32
, errorCode
); 
 579             if(U_FAILURE(errorCode
)) { return; } 
 580             uint32_t contextCE32 
= makeBuilderContextCE32(index
); 
 581             utrie2_set32(trie
, c
, contextCE32
, &errorCode
); 
 583             cond 
= getConditionalCE32(index
); 
 585             cond 
= getConditionalCE32ForCE32(oldCE32
); 
 586             cond
->builtCE32 
= Collation::NO_CE32
; 
 588         UnicodeString 
suffix(s
, cLength
); 
 589         UnicodeString 
context((UChar
)prefix
.length()); 
 590         context
.append(prefix
).append(suffix
); 
 591         unsafeBackwardSet
.addAll(suffix
); 
 593             // invariant: context > cond->context 
 594             int32_t next 
= cond
->next
; 
 596                 // Append a new ConditionalCE32 after cond. 
 597                 int32_t index 
= addConditionalCE32(context
, ce32
, errorCode
); 
 598                 if(U_FAILURE(errorCode
)) { return; } 
 602             ConditionalCE32 
*nextCond 
= getConditionalCE32(next
); 
 603             int8_t cmp 
= context
.compare(nextCond
->context
); 
 605                 // Insert a new ConditionalCE32 between cond and nextCond. 
 606                 int32_t index 
= addConditionalCE32(context
, ce32
, errorCode
); 
 607                 if(U_FAILURE(errorCode
)) { return; } 
 609                 getConditionalCE32(index
)->next 
= next
; 
 611             } else if(cmp 
== 0) { 
 612                 // Same context as before, overwrite its ce32. 
 613                 nextCond
->ce32 
= ce32
; 
 623 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce
) { 
 624     uint32_t p 
= (uint32_t)(ce 
>> 32); 
 625     uint32_t lower32 
= (uint32_t)ce
; 
 626     uint32_t t 
= (uint32_t)(ce 
& 0xffff); 
 627     U_ASSERT((t 
& 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s. 
 628     if((ce 
& INT64_C(0xffff00ff00ff)) == 0) { 
 629         // normal form ppppsstt 
 630         return p 
| (lower32 
>> 16) | (t 
>> 8); 
 631     } else if((ce 
& INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE
) { 
 632         // long-primary form ppppppC1 
 633         return Collation::makeLongPrimaryCE32(p
); 
 634     } else if(p 
== 0 && (t 
& 0xff) == 0) { 
 635         // long-secondary form ssssttC2 
 636         return Collation::makeLongSecondaryCE32(lower32
); 
 638     return Collation::NO_CE32
; 
 642 CollationDataBuilder::encodeOneCE(int64_t ce
, UErrorCode 
&errorCode
) { 
 643     // Try to encode one CE as one CE32. 
 644     uint32_t ce32 
= encodeOneCEAsCE32(ce
); 
 645     if(ce32 
!= Collation::NO_CE32
) { return ce32
; } 
 646     int32_t index 
= addCE(ce
, errorCode
); 
 647     if(U_FAILURE(errorCode
)) { return 0; } 
 648     if(index 
> Collation::MAX_INDEX
) { 
 649         errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
 652     return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, index
, 1); 
 656 CollationDataBuilder::encodeCEs(const int64_t ces
[], int32_t cesLength
, 
 657                                 UErrorCode 
&errorCode
) { 
 658     if(U_FAILURE(errorCode
)) { return 0; } 
 659     if(cesLength 
< 0 || cesLength 
> Collation::MAX_EXPANSION_LENGTH
) { 
 660         errorCode 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 663     if(trie 
== NULL 
|| utrie2_isFrozen(trie
)) { 
 664         errorCode 
= U_INVALID_STATE_ERROR
; 
 668         // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. 
 669         // Do this here so that callers need not do it. 
 670         return encodeOneCEAsCE32(0); 
 671     } else if(cesLength 
== 1) { 
 672         return encodeOneCE(ces
[0], errorCode
); 
 673     } else if(cesLength 
== 2) { 
 674         // Try to encode two CEs as one CE32. 
 675         int64_t ce0 
= ces
[0]; 
 676         int64_t ce1 
= ces
[1]; 
 677         uint32_t p0 
= (uint32_t)(ce0 
>> 32); 
 678         if((ce0 
& INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE 
&& 
 679                 (ce1 
& INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE 
&& 
 681             // Latin mini expansion 
 684                 (((uint32_t)ce0 
& 0xff00u
) << 8) | 
 685                 (uint32_t)(ce1 
>> 16) | 
 686                 Collation::SPECIAL_CE32_LOW_BYTE 
| 
 687                 Collation::LATIN_EXPANSION_TAG
; 
 690     // Try to encode two or more CEs as CE32s. 
 691     int32_t newCE32s
[Collation::MAX_EXPANSION_LENGTH
]; 
 692     for(int32_t i 
= 0;; ++i
) { 
 694             return encodeExpansion32(newCE32s
, cesLength
, errorCode
); 
 696         uint32_t ce32 
= encodeOneCEAsCE32(ces
[i
]); 
 697         if(ce32 
== Collation::NO_CE32
) { break; } 
 698         newCE32s
[i
] = (int32_t)ce32
; 
 700     return encodeExpansion(ces
, cesLength
, errorCode
); 
 704 CollationDataBuilder::encodeExpansion(const int64_t ces
[], int32_t length
, UErrorCode 
&errorCode
) { 
 705     if(U_FAILURE(errorCode
)) { return 0; } 
 706     // See if this sequence of CEs has already been stored. 
 707     int64_t first 
= ces
[0]; 
 708     int32_t ce64sMax 
= ce64s
.size() - length
; 
 709     for(int32_t i 
= 0; i 
<= ce64sMax
; ++i
) { 
 710         if(first 
== ce64s
.elementAti(i
)) { 
 711             if(i 
> Collation::MAX_INDEX
) { 
 712                 errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
 715             for(int32_t j 
= 1;; ++j
) { 
 717                     return Collation::makeCE32FromTagIndexAndLength( 
 718                             Collation::EXPANSION_TAG
, i
, length
); 
 720                 if(ce64s
.elementAti(i 
+ j
) != ces
[j
]) { break; } 
 724     // Store the new sequence. 
 725     int32_t i 
= ce64s
.size(); 
 726     if(i 
> Collation::MAX_INDEX
) { 
 727         errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
 730     for(int32_t j 
= 0; j 
< length
; ++j
) { 
 731         ce64s
.addElement(ces
[j
], errorCode
); 
 733     return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG
, i
, length
); 
 737 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s
[], int32_t length
, 
 738                                         UErrorCode 
&errorCode
) { 
 739     if(U_FAILURE(errorCode
)) { return 0; } 
 740     // See if this sequence of CE32s has already been stored. 
 741     int32_t first 
= newCE32s
[0]; 
 742     int32_t ce32sMax 
= ce32s
.size() - length
; 
 743     for(int32_t i 
= 0; i 
<= ce32sMax
; ++i
) { 
 744         if(first 
== ce32s
.elementAti(i
)) { 
 745             if(i 
> Collation::MAX_INDEX
) { 
 746                 errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
 749             for(int32_t j 
= 1;; ++j
) { 
 751                     return Collation::makeCE32FromTagIndexAndLength( 
 752                             Collation::EXPANSION32_TAG
, i
, length
); 
 754                 if(ce32s
.elementAti(i 
+ j
) != newCE32s
[j
]) { break; } 
 758     // Store the new sequence. 
 759     int32_t i 
= ce32s
.size(); 
 760     if(i 
> Collation::MAX_INDEX
) { 
 761         errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
 764     for(int32_t j 
= 0; j 
< length
; ++j
) { 
 765         ce32s
.addElement(newCE32s
[j
], errorCode
); 
 767     return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG
, i
, length
); 
 771 CollationDataBuilder::copyFromBaseCE32(UChar32 c
, uint32_t ce32
, UBool withContext
, 
 772                                        UErrorCode 
&errorCode
) { 
 773     if(U_FAILURE(errorCode
)) { return 0; } 
 774     if(!Collation::isSpecialCE32(ce32
)) { return ce32
; } 
 775     switch(Collation::tagFromCE32(ce32
)) { 
 776     case Collation::LONG_PRIMARY_TAG
: 
 777     case Collation::LONG_SECONDARY_TAG
: 
 778     case Collation::LATIN_EXPANSION_TAG
: 
 781     case Collation::EXPANSION32_TAG
: { 
 782         const uint32_t *baseCE32s 
= base
->ce32s 
+ Collation::indexFromCE32(ce32
); 
 783         int32_t length 
= Collation::lengthFromCE32(ce32
); 
 784         ce32 
= encodeExpansion32( 
 785             reinterpret_cast<const int32_t *>(baseCE32s
), length
, errorCode
); 
 788     case Collation::EXPANSION_TAG
: { 
 789         const int64_t *baseCEs 
= base
->ces 
+ Collation::indexFromCE32(ce32
); 
 790         int32_t length 
= Collation::lengthFromCE32(ce32
); 
 791         ce32 
= encodeExpansion(baseCEs
, length
, errorCode
); 
 794     case Collation::PREFIX_TAG
: { 
 795         // Flatten prefixes and nested suffixes (contractions) 
 796         // into a linear list of ConditionalCE32. 
 797         const UChar 
*p 
= base
->contexts 
+ Collation::indexFromCE32(ce32
); 
 798         ce32 
= CollationData::readCE32(p
);  // Default if no prefix match. 
 800             return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
); 
 802         ConditionalCE32 head
; 
 803         UnicodeString 
context((UChar
)0); 
 805         if(Collation::isContractionCE32(ce32
)) { 
 806             index 
= copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
); 
 808             ce32 
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
); 
 809             head
.next 
= index 
= addConditionalCE32(context
, ce32
, errorCode
); 
 811         if(U_FAILURE(errorCode
)) { return 0; } 
 812         ConditionalCE32 
*cond 
= getConditionalCE32(index
);  // the last ConditionalCE32 so far 
 813         UCharsTrie::Iterator 
prefixes(p 
+ 2, 0, errorCode
); 
 814         while(prefixes
.next(errorCode
)) { 
 815             context 
= prefixes
.getString(); 
 817             context
.insert(0, (UChar
)context
.length()); 
 818             ce32 
= (uint32_t)prefixes
.getValue(); 
 819             if(Collation::isContractionCE32(ce32
)) { 
 820                 index 
= copyContractionsFromBaseCE32(context
, c
, ce32
, cond
, errorCode
); 
 822                 ce32 
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
); 
 823                 cond
->next 
= index 
= addConditionalCE32(context
, ce32
, errorCode
); 
 825             if(U_FAILURE(errorCode
)) { return 0; } 
 826             cond 
= getConditionalCE32(index
); 
 828         ce32 
= makeBuilderContextCE32(head
.next
); 
 832     case Collation::CONTRACTION_TAG
: { 
 834             const UChar 
*p 
= base
->contexts 
+ Collation::indexFromCE32(ce32
); 
 835             ce32 
= CollationData::readCE32(p
);  // Default if no suffix match. 
 836             return copyFromBaseCE32(c
, ce32
, FALSE
, errorCode
); 
 838         ConditionalCE32 head
; 
 839         UnicodeString 
context((UChar
)0); 
 840         copyContractionsFromBaseCE32(context
, c
, ce32
, &head
, errorCode
); 
 841         ce32 
= makeBuilderContextCE32(head
.next
); 
 845     case Collation::HANGUL_TAG
: 
 846         errorCode 
= U_UNSUPPORTED_ERROR
;  // We forbid tailoring of Hangul syllables. 
 848     case Collation::OFFSET_TAG
: 
 849         ce32 
= getCE32FromOffsetCE32(TRUE
, c
, ce32
); 
 851     case Collation::IMPLICIT_TAG
: 
 852         ce32 
= encodeOneCE(Collation::unassignedCEFromCodePoint(c
), errorCode
); 
 855         UPRV_UNREACHABLE
;  // require ce32 == base->getFinalCE32(ce32) 
 861 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString 
&context
, UChar32 c
, uint32_t ce32
, 
 862                                                    ConditionalCE32 
*cond
, UErrorCode 
&errorCode
) { 
 863     if(U_FAILURE(errorCode
)) { return 0; } 
 864     const UChar 
*p 
= base
->contexts 
+ Collation::indexFromCE32(ce32
); 
 866     if((ce32 
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) { 
 867         // No match on the single code point. 
 868         // We are underneath a prefix, and the default mapping is just 
 869         // a fallback to the mappings for a shorter prefix. 
 870         U_ASSERT(context
.length() > 1); 
 873         ce32 
= CollationData::readCE32(p
);  // Default if no suffix match. 
 874         U_ASSERT(!Collation::isContractionCE32(ce32
)); 
 875         ce32 
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
); 
 876         cond
->next 
= index 
= addConditionalCE32(context
, ce32
, errorCode
); 
 877         if(U_FAILURE(errorCode
)) { return 0; } 
 878         cond 
= getConditionalCE32(index
); 
 881     int32_t suffixStart 
= context
.length(); 
 882     UCharsTrie::Iterator 
suffixes(p 
+ 2, 0, errorCode
); 
 883     while(suffixes
.next(errorCode
)) { 
 884         context
.append(suffixes
.getString()); 
 885         ce32 
= copyFromBaseCE32(c
, (uint32_t)suffixes
.getValue(), TRUE
, errorCode
); 
 886         cond
->next 
= index 
= addConditionalCE32(context
, ce32
, errorCode
); 
 887         if(U_FAILURE(errorCode
)) { return 0; } 
 888         // No need to update the unsafeBackwardSet because the tailoring set 
 889         // is already a copy of the base set. 
 890         cond 
= getConditionalCE32(index
); 
 891         context
.truncate(suffixStart
); 
 893     U_ASSERT(index 
>= 0); 
 899     CopyHelper(const CollationDataBuilder 
&s
, CollationDataBuilder 
&d
, 
 900                const CollationDataBuilder::CEModifier 
&m
, UErrorCode 
&initialErrorCode
) 
 901             : src(s
), dest(d
), modifier(m
), 
 902               errorCode(initialErrorCode
) {} 
 904     UBool 
copyRangeCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) { 
 905         ce32 
= copyCE32(ce32
); 
 906         utrie2_setRange32(dest
.trie
, start
, end
, ce32
, TRUE
, &errorCode
); 
 907         if(CollationDataBuilder::isBuilderContextCE32(ce32
)) { 
 908             dest
.contextChars
.add(start
, end
); 
 910         return U_SUCCESS(errorCode
); 
 913     uint32_t copyCE32(uint32_t ce32
) { 
 914         if(!Collation::isSpecialCE32(ce32
)) { 
 915             int64_t ce 
= modifier
.modifyCE32(ce32
); 
 916             if(ce 
!= Collation::NO_CE
) { 
 917                 ce32 
= dest
.encodeOneCE(ce
, errorCode
); 
 920             int32_t tag 
= Collation::tagFromCE32(ce32
); 
 921             if(tag 
== Collation::EXPANSION32_TAG
) { 
 922                 const uint32_t *srcCE32s 
= reinterpret_cast<uint32_t *>(src
.ce32s
.getBuffer()); 
 923                 srcCE32s 
+= Collation::indexFromCE32(ce32
); 
 924                 int32_t length 
= Collation::lengthFromCE32(ce32
); 
 925                 // Inspect the source CE32s. Just copy them if none are modified. 
 926                 // Otherwise copy to modifiedCEs, with modifications. 
 927                 UBool isModified 
= FALSE
; 
 928                 for(int32_t i 
= 0; i 
< length
; ++i
) { 
 931                     if(Collation::isSpecialCE32(ce32
) || 
 932                             (ce 
= modifier
.modifyCE32(ce32
)) == Collation::NO_CE
) { 
 934                             modifiedCEs
[i
] = Collation::ceFromCE32(ce32
); 
 938                             for(int32_t j 
= 0; j 
< i
; ++j
) { 
 939                                 modifiedCEs
[j
] = Collation::ceFromCE32(srcCE32s
[j
]); 
 947                     ce32 
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
); 
 949                     ce32 
= dest
.encodeExpansion32( 
 950                         reinterpret_cast<const int32_t *>(srcCE32s
), length
, errorCode
); 
 952             } else if(tag 
== Collation::EXPANSION_TAG
) { 
 953                 const int64_t *srcCEs 
= src
.ce64s
.getBuffer(); 
 954                 srcCEs 
+= Collation::indexFromCE32(ce32
); 
 955                 int32_t length 
= Collation::lengthFromCE32(ce32
); 
 956                 // Inspect the source CEs. Just copy them if none are modified. 
 957                 // Otherwise copy to modifiedCEs, with modifications. 
 958                 UBool isModified 
= FALSE
; 
 959                 for(int32_t i 
= 0; i 
< length
; ++i
) { 
 960                     int64_t srcCE 
= srcCEs
[i
]; 
 961                     int64_t ce 
= modifier
.modifyCE(srcCE
); 
 962                     if(ce 
== Collation::NO_CE
) { 
 964                             modifiedCEs
[i
] = srcCE
; 
 968                             for(int32_t j 
= 0; j 
< i
; ++j
) { 
 969                                 modifiedCEs
[j
] = srcCEs
[j
]; 
 977                     ce32 
= dest
.encodeCEs(modifiedCEs
, length
, errorCode
); 
 979                     ce32 
= dest
.encodeExpansion(srcCEs
, length
, errorCode
); 
 981             } else if(tag 
== Collation::BUILDER_DATA_TAG
) { 
 982                 // Copy the list of ConditionalCE32. 
 983                 ConditionalCE32 
*cond 
= src
.getConditionalCE32ForCE32(ce32
); 
 984                 U_ASSERT(!cond
->hasContext()); 
 985                 int32_t destIndex 
= dest
.addConditionalCE32( 
 986                         cond
->context
, copyCE32(cond
->ce32
), errorCode
); 
 987                 ce32 
= CollationDataBuilder::makeBuilderContextCE32(destIndex
); 
 988                 while(cond
->next 
>= 0) { 
 989                     cond 
= src
.getConditionalCE32(cond
->next
); 
 990                     ConditionalCE32 
*prevDestCond 
= dest
.getConditionalCE32(destIndex
); 
 991                     destIndex 
= dest
.addConditionalCE32( 
 992                             cond
->context
, copyCE32(cond
->ce32
), errorCode
); 
 993                     int32_t suffixStart 
= cond
->prefixLength() + 1; 
 994                     dest
.unsafeBackwardSet
.addAll(cond
->context
.tempSubString(suffixStart
)); 
 995                     prevDestCond
->next 
= destIndex
; 
 998                 // Just copy long CEs and Latin mini expansions (and other expected values) as is, 
 999                 // assuming that the modifier would not modify them. 
1000                 U_ASSERT(tag 
== Collation::LONG_PRIMARY_TAG 
|| 
1001                         tag 
== Collation::LONG_SECONDARY_TAG 
|| 
1002                         tag 
== Collation::LATIN_EXPANSION_TAG 
|| 
1003                         tag 
== Collation::HANGUL_TAG
); 
1009     const CollationDataBuilder 
&src
; 
1010     CollationDataBuilder 
&dest
; 
1011     const CollationDataBuilder::CEModifier 
&modifier
; 
1012     int64_t modifiedCEs
[Collation::MAX_EXPANSION_LENGTH
]; 
1013     UErrorCode errorCode
; 
1018 static UBool U_CALLCONV
 
1019 enumRangeForCopy(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) { 
1021         value 
== Collation::UNASSIGNED_CE32 
|| value 
== Collation::FALLBACK_CE32 
|| 
1022         ((CopyHelper 
*)context
)->copyRangeCE32(start
, end
, value
); 
1028 CollationDataBuilder::copyFrom(const CollationDataBuilder 
&src
, const CEModifier 
&modifier
, 
1029                                UErrorCode 
&errorCode
) { 
1030     if(U_FAILURE(errorCode
)) { return; } 
1031     if(trie 
== NULL 
|| utrie2_isFrozen(trie
)) { 
1032         errorCode 
= U_INVALID_STATE_ERROR
; 
1035     CopyHelper 
helper(src
, *this, modifier
, errorCode
); 
1036     utrie2_enum(src
.trie
, NULL
, enumRangeForCopy
, &helper
); 
1037     errorCode 
= helper
.errorCode
; 
1038     // Update the contextChars and the unsafeBackwardSet while copying, 
1039     // in case a character had conditional mappings in the source builder 
1040     // and they were removed later. 
1041     modified 
|= src
.modified
; 
1045 CollationDataBuilder::optimize(const UnicodeSet 
&set
, UErrorCode 
&errorCode
) { 
1046     if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; } 
1047     UnicodeSetIterator 
iter(set
); 
1048     while(iter
.next() && !iter
.isString()) { 
1049         UChar32 c 
= iter
.getCodepoint(); 
1050         uint32_t ce32 
= utrie2_get32(trie
, c
); 
1051         if(ce32 
== Collation::FALLBACK_CE32
) { 
1052             ce32 
= base
->getFinalCE32(base
->getCE32(c
)); 
1053             ce32 
= copyFromBaseCE32(c
, ce32
, TRUE
, errorCode
); 
1054             utrie2_set32(trie
, c
, ce32
, &errorCode
); 
1061 CollationDataBuilder::suppressContractions(const UnicodeSet 
&set
, UErrorCode 
&errorCode
) { 
1062     if(U_FAILURE(errorCode
) || set
.isEmpty()) { return; } 
1063     UnicodeSetIterator 
iter(set
); 
1064     while(iter
.next() && !iter
.isString()) { 
1065         UChar32 c 
= iter
.getCodepoint(); 
1066         uint32_t ce32 
= utrie2_get32(trie
, c
); 
1067         if(ce32 
== Collation::FALLBACK_CE32
) { 
1068             ce32 
= base
->getFinalCE32(base
->getCE32(c
)); 
1069             if(Collation::ce32HasContext(ce32
)) { 
1070                 ce32 
= copyFromBaseCE32(c
, ce32
, FALSE 
/* without context */, errorCode
); 
1071                 utrie2_set32(trie
, c
, ce32
, &errorCode
); 
1073         } else if(isBuilderContextCE32(ce32
)) { 
1074             ce32 
= getConditionalCE32ForCE32(ce32
)->ce32
; 
1075             // Simply abandon the list of ConditionalCE32. 
1076             // The caller will copy this builder in the end, 
1077             // eliminating unreachable data. 
1078             utrie2_set32(trie
, c
, ce32
, &errorCode
); 
1079             contextChars
.remove(c
); 
1086 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s
[], UErrorCode 
&errorCode
) { 
1087     if(U_FAILURE(errorCode
)) { return FALSE
; } 
1088     UBool anyJamoAssigned 
= base 
== NULL
;  // always set jamoCE32s in the base data 
1089     UBool needToCopyFromBase 
= FALSE
; 
1090     for(int32_t j 
= 0; j 
< CollationData::JAMO_CE32S_LENGTH
; ++j
) {  // Count across Jamo types. 
1091         UChar32 jamo 
= jamoCpFromIndex(j
); 
1092         UBool fromBase 
= FALSE
; 
1093         uint32_t ce32 
= utrie2_get32(trie
, jamo
); 
1094         anyJamoAssigned 
|= Collation::isAssignedCE32(ce32
); 
1095         // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. 
1096         // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) 
1097         if(ce32 
== Collation::FALLBACK_CE32
) { 
1099             ce32 
= base
->getCE32(jamo
); 
1101         if(Collation::isSpecialCE32(ce32
)) { 
1102             switch(Collation::tagFromCE32(ce32
)) { 
1103             case Collation::LONG_PRIMARY_TAG
: 
1104             case Collation::LONG_SECONDARY_TAG
: 
1105             case Collation::LATIN_EXPANSION_TAG
: 
1106                 // Copy the ce32 as-is. 
1108             case Collation::EXPANSION32_TAG
: 
1109             case Collation::EXPANSION_TAG
: 
1110             case Collation::PREFIX_TAG
: 
1111             case Collation::CONTRACTION_TAG
: 
1113                     // Defer copying until we know if anyJamoAssigned. 
1114                     ce32 
= Collation::FALLBACK_CE32
; 
1115                     needToCopyFromBase 
= TRUE
; 
1118             case Collation::IMPLICIT_TAG
: 
1119                 // An unassigned Jamo should only occur in tests with incomplete bases. 
1121                 ce32 
= Collation::FALLBACK_CE32
; 
1122                 needToCopyFromBase 
= TRUE
; 
1124             case Collation::OFFSET_TAG
: 
1125                 ce32 
= getCE32FromOffsetCE32(fromBase
, jamo
, ce32
); 
1127             case Collation::FALLBACK_TAG
: 
1128             case Collation::RESERVED_TAG_3
: 
1129             case Collation::BUILDER_DATA_TAG
: 
1130             case Collation::DIGIT_TAG
: 
1131             case Collation::U0000_TAG
: 
1132             case Collation::HANGUL_TAG
: 
1133             case Collation::LEAD_SURROGATE_TAG
: 
1134                 errorCode 
= U_INTERNAL_PROGRAM_ERROR
; 
1138         jamoCE32s
[j
] = ce32
; 
1140     if(anyJamoAssigned 
&& needToCopyFromBase
) { 
1141         for(int32_t j 
= 0; j 
< CollationData::JAMO_CE32S_LENGTH
; ++j
) { 
1142             if(jamoCE32s
[j
] == Collation::FALLBACK_CE32
) { 
1143                 UChar32 jamo 
= jamoCpFromIndex(j
); 
1144                 jamoCE32s
[j
] = copyFromBaseCE32(jamo
, base
->getCE32(jamo
), 
1145                                                 /*withContext=*/ TRUE
, errorCode
); 
1149     return anyJamoAssigned 
&& U_SUCCESS(errorCode
); 
1153 CollationDataBuilder::setDigitTags(UErrorCode 
&errorCode
) { 
1154     UnicodeSet 
digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode
); 
1155     if(U_FAILURE(errorCode
)) { return; } 
1156     UnicodeSetIterator 
iter(digits
); 
1157     while(iter
.next()) { 
1158         U_ASSERT(!iter
.isString()); 
1159         UChar32 c 
= iter
.getCodepoint(); 
1160         uint32_t ce32 
= utrie2_get32(trie
, c
); 
1161         if(ce32 
!= Collation::FALLBACK_CE32 
&& ce32 
!= Collation::UNASSIGNED_CE32
) { 
1162             int32_t index 
= addCE32(ce32
, errorCode
); 
1163             if(U_FAILURE(errorCode
)) { return; } 
1164             if(index 
> Collation::MAX_INDEX
) { 
1165                 errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
1168             ce32 
= Collation::makeCE32FromTagIndexAndLength( 
1169                     Collation::DIGIT_TAG
, index
, u_charDigitValue(c
)); 
1170             utrie2_set32(trie
, c
, ce32
, &errorCode
); 
1177 static UBool U_CALLCONV
 
1178 enumRangeLeadValue(const void *context
, UChar32 
/*start*/, UChar32 
/*end*/, uint32_t value
) { 
1179     int32_t *pValue 
= (int32_t *)context
; 
1180     if(value 
== Collation::UNASSIGNED_CE32
) { 
1181         value 
= Collation::LEAD_ALL_UNASSIGNED
; 
1182     } else if(value 
== Collation::FALLBACK_CE32
) { 
1183         value 
= Collation::LEAD_ALL_FALLBACK
; 
1185         *pValue 
= Collation::LEAD_MIXED
; 
1189         *pValue 
= (int32_t)value
; 
1190     } else if(*pValue 
!= (int32_t)value
) { 
1191         *pValue 
= Collation::LEAD_MIXED
; 
1200 CollationDataBuilder::setLeadSurrogates(UErrorCode 
&errorCode
) { 
1201     for(UChar lead 
= 0xd800; lead 
< 0xdc00; ++lead
) { 
1203         utrie2_enumForLeadSurrogate(trie
, lead
, NULL
, enumRangeLeadValue
, &value
); 
1204         utrie2_set32ForLeadSurrogateCodeUnit( 
1206             Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG
, 0) | (uint32_t)value
, 
1212 CollationDataBuilder::build(CollationData 
&data
, UErrorCode 
&errorCode
) { 
1213     buildMappings(data
, errorCode
); 
1215         data
.numericPrimary 
= base
->numericPrimary
; 
1216         data
.compressibleBytes 
= base
->compressibleBytes
; 
1217         data
.numScripts 
= base
->numScripts
; 
1218         data
.scriptsIndex 
= base
->scriptsIndex
; 
1219         data
.scriptStarts 
= base
->scriptStarts
; 
1220         data
.scriptStartsLength 
= base
->scriptStartsLength
; 
1222     buildFastLatinTable(data
, errorCode
); 
1226 CollationDataBuilder::buildMappings(CollationData 
&data
, UErrorCode 
&errorCode
) { 
1227     if(U_FAILURE(errorCode
)) { return; } 
1228     if(trie 
== NULL 
|| utrie2_isFrozen(trie
)) { 
1229         errorCode 
= U_INVALID_STATE_ERROR
; 
1233     buildContexts(errorCode
); 
1235     uint32_t jamoCE32s
[CollationData::JAMO_CE32S_LENGTH
]; 
1236     int32_t jamoIndex 
= -1; 
1237     if(getJamoCE32s(jamoCE32s
, errorCode
)) { 
1238         jamoIndex 
= ce32s
.size(); 
1239         for(int32_t i 
= 0; i 
< CollationData::JAMO_CE32S_LENGTH
; ++i
) { 
1240             ce32s
.addElement((int32_t)jamoCE32s
[i
], errorCode
); 
1242         // Small optimization: Use a bit in the Hangul ce32 
1243         // to indicate that none of the Jamo CE32s are isSpecialCE32() 
1244         // (as it should be in the root collator). 
1245         // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. 
1246         // In order to still have good trie compression and keep this code simple, 
1247         // we only set this flag if a whole block of 588 Hangul syllables starting with 
1248         // a common leading consonant (Jamo L) has this property. 
1249         UBool isAnyJamoVTSpecial 
= FALSE
; 
1250         for(int32_t i 
= Hangul::JAMO_L_COUNT
; i 
< CollationData::JAMO_CE32S_LENGTH
; ++i
) { 
1251             if(Collation::isSpecialCE32(jamoCE32s
[i
])) { 
1252                 isAnyJamoVTSpecial 
= TRUE
; 
1256         uint32_t hangulCE32 
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0); 
1257         UChar32 c 
= Hangul::HANGUL_BASE
; 
1258         for(int32_t i 
= 0; i 
< Hangul::JAMO_L_COUNT
; ++i
) {  // iterate over the Jamo L 
1259             uint32_t ce32 
= hangulCE32
; 
1260             if(!isAnyJamoVTSpecial 
&& !Collation::isSpecialCE32(jamoCE32s
[i
])) { 
1261                 ce32 
|= Collation::HANGUL_NO_SPECIAL_JAMO
; 
1263             UChar32 limit 
= c 
+ Hangul::JAMO_VT_COUNT
; 
1264             utrie2_setRange32(trie
, c
, limit 
- 1, ce32
, TRUE
, &errorCode
); 
1268         // Copy the Hangul CE32s from the base in blocks per Jamo L, 
1269         // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. 
1270         for(UChar32 c 
= Hangul::HANGUL_BASE
; c 
< Hangul::HANGUL_LIMIT
;) { 
1271             uint32_t ce32 
= base
->getCE32(c
); 
1272             U_ASSERT(Collation::hasCE32Tag(ce32
, Collation::HANGUL_TAG
)); 
1273             UChar32 limit 
= c 
+ Hangul::JAMO_VT_COUNT
; 
1274             utrie2_setRange32(trie
, c
, limit 
- 1, ce32
, TRUE
, &errorCode
); 
1279     setDigitTags(errorCode
); 
1280     setLeadSurrogates(errorCode
); 
1282     // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. 
1283     ce32s
.setElementAt((int32_t)utrie2_get32(trie
, 0), 0); 
1284     utrie2_set32(trie
, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG
, 0), &errorCode
); 
1286     utrie2_freeze(trie
, UTRIE2_32_VALUE_BITS
, &errorCode
); 
1287     if(U_FAILURE(errorCode
)) { return; } 
1289     // Mark each lead surrogate as "unsafe" 
1290     // if any of its 1024 associated supplementary code points is "unsafe". 
1291     UChar32 c 
= 0x10000; 
1292     for(UChar lead 
= 0xd800; lead 
< 0xdc00; ++lead
, c 
+= 0x400) { 
1293         if(unsafeBackwardSet
.containsSome(c
, c 
+ 0x3ff)) { 
1294             unsafeBackwardSet
.add(lead
); 
1297     unsafeBackwardSet
.freeze(); 
1300     data
.ce32s 
= reinterpret_cast<const uint32_t *>(ce32s
.getBuffer()); 
1301     data
.ces 
= ce64s
.getBuffer(); 
1302     data
.contexts 
= contexts
.getBuffer(); 
1304     data
.ce32sLength 
= ce32s
.size(); 
1305     data
.cesLength 
= ce64s
.size(); 
1306     data
.contextsLength 
= contexts
.length(); 
1309     if(jamoIndex 
>= 0) { 
1310         data
.jamoCE32s 
= data
.ce32s 
+ jamoIndex
; 
1312         data
.jamoCE32s 
= base
->jamoCE32s
; 
1314     data
.unsafeBackwardSet 
= &unsafeBackwardSet
; 
1318 CollationDataBuilder::clearContexts() { 
1320     UnicodeSetIterator 
iter(contextChars
); 
1321     while(iter
.next()) { 
1322         U_ASSERT(!iter
.isString()); 
1323         uint32_t ce32 
= utrie2_get32(trie
, iter
.getCodepoint()); 
1324         U_ASSERT(isBuilderContextCE32(ce32
)); 
1325         getConditionalCE32ForCE32(ce32
)->builtCE32 
= Collation::NO_CE32
; 
1330 CollationDataBuilder::buildContexts(UErrorCode 
&errorCode
) { 
1331     if(U_FAILURE(errorCode
)) { return; } 
1332     // Ignore abandoned lists and the cached builtCE32, 
1333     // and build all contexts from scratch. 
1335     UnicodeSetIterator 
iter(contextChars
); 
1336     while(U_SUCCESS(errorCode
) && iter
.next()) { 
1337         U_ASSERT(!iter
.isString()); 
1338         UChar32 c 
= iter
.getCodepoint(); 
1339         uint32_t ce32 
= utrie2_get32(trie
, c
); 
1340         if(!isBuilderContextCE32(ce32
)) { 
1341             // Impossible: No context data for c in contextChars. 
1342             errorCode 
= U_INTERNAL_PROGRAM_ERROR
; 
1345         ConditionalCE32 
*cond 
= getConditionalCE32ForCE32(ce32
); 
1346         ce32 
= buildContext(cond
, errorCode
); 
1347         utrie2_set32(trie
, c
, ce32
, &errorCode
); 
1352 CollationDataBuilder::buildContext(ConditionalCE32 
*head
, UErrorCode 
&errorCode
) { 
1353     if(U_FAILURE(errorCode
)) { return 0; } 
1354     // The list head must have no context. 
1355     U_ASSERT(!head
->hasContext()); 
1356     // The list head must be followed by one or more nodes that all do have context. 
1357     U_ASSERT(head
->next 
>= 0); 
1358     UCharsTrieBuilder 
prefixBuilder(errorCode
); 
1359     UCharsTrieBuilder 
contractionBuilder(errorCode
); 
1360     for(ConditionalCE32 
*cond 
= head
;; cond 
= getConditionalCE32(cond
->next
)) { 
1361         // After the list head, the prefix or suffix can be empty, but not both. 
1362         U_ASSERT(cond 
== head 
|| cond
->hasContext()); 
1363         int32_t prefixLength 
= cond
->prefixLength(); 
1364         UnicodeString 
prefix(cond
->context
, 0, prefixLength 
+ 1); 
1365         // Collect all contraction suffixes for one prefix. 
1366         ConditionalCE32 
*firstCond 
= cond
; 
1367         ConditionalCE32 
*lastCond 
= cond
; 
1368         while(cond
->next 
>= 0 && 
1369                 (cond 
= getConditionalCE32(cond
->next
))->context
.startsWith(prefix
)) { 
1373         int32_t suffixStart 
= prefixLength 
+ 1;  // == prefix.length() 
1374         if(lastCond
->context
.length() == suffixStart
) { 
1375             // One prefix without contraction suffix. 
1376             U_ASSERT(firstCond 
== lastCond
); 
1377             ce32 
= lastCond
->ce32
; 
1380             // Build the contractions trie. 
1381             contractionBuilder
.clear(); 
1382             // Entry for an empty suffix, to be stored before the trie. 
1383             uint32_t emptySuffixCE32 
= 0; 
1385             if(firstCond
->context
.length() == suffixStart
) { 
1386                 // There is a mapping for the prefix and the single character c. (p|c) 
1387                 // If no other suffix matches, then we return this value. 
1388                 emptySuffixCE32 
= firstCond
->ce32
; 
1389                 cond 
= getConditionalCE32(firstCond
->next
); 
1391                 // There is no mapping for the prefix and just the single character. 
1392                 // (There is no p|c, only p|cd, p|ce etc.) 
1393                 flags 
|= Collation::CONTRACT_SINGLE_CP_NO_MATCH
; 
1394                 // When the prefix matches but none of the prefix-specific suffixes, 
1395                 // then we fall back to the mappings with the next-longest prefix, 
1396                 // and ultimately to mappings with no prefix. 
1397                 // Each fallback might be another set of contractions. 
1398                 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, 
1399                 // then in text "pch" we find the ch contraction. 
1400                 for(cond 
= head
;; cond 
= getConditionalCE32(cond
->next
)) { 
1401                     int32_t length 
= cond
->prefixLength(); 
1402                     if(length 
== prefixLength
) { break; } 
1403                     if(cond
->defaultCE32 
!= Collation::NO_CE32 
&& 
1404                             (length
==0 || prefix
.endsWith(cond
->context
, 1, length
))) { 
1405                         emptySuffixCE32 
= cond
->defaultCE32
; 
1410             // Optimization: Set a flag when 
1411             // the first character of every contraction suffix has lccc!=0. 
1412             // Short-circuits contraction matching when a normal letter follows. 
1413             flags 
|= Collation::CONTRACT_NEXT_CCC
; 
1414             // Add all of the non-empty suffixes into the contraction trie. 
1416                 UnicodeString 
suffix(cond
->context
, suffixStart
); 
1417                 uint16_t fcd16 
= nfcImpl
.getFCD16(suffix
.char32At(0)); 
1419                     flags 
&= ~Collation::CONTRACT_NEXT_CCC
; 
1421                 fcd16 
= nfcImpl
.getFCD16(suffix
.char32At(suffix
.length() - 1)); 
1423                     // The last suffix character has lccc!=0, allowing for discontiguous contractions. 
1424                     flags 
|= Collation::CONTRACT_TRAILING_CCC
; 
1426                 contractionBuilder
.add(suffix
, (int32_t)cond
->ce32
, errorCode
); 
1427                 if(cond 
== lastCond
) { break; } 
1428                 cond 
= getConditionalCE32(cond
->next
); 
1430             int32_t index 
= addContextTrie(emptySuffixCE32
, contractionBuilder
, errorCode
); 
1431             if(U_FAILURE(errorCode
)) { return 0; } 
1432             if(index 
> Collation::MAX_INDEX
) { 
1433                 errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
1436             ce32 
= Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG
, index
) | flags
; 
1438         U_ASSERT(cond 
== lastCond
); 
1439         firstCond
->defaultCE32 
= ce32
; 
1440         if(prefixLength 
== 0) { 
1441             if(cond
->next 
< 0) { 
1442                 // No non-empty prefixes, only contractions. 
1446             prefix
.remove(0, 1);  // Remove the length unit. 
1448             prefixBuilder
.add(prefix
, (int32_t)ce32
, errorCode
); 
1449             if(cond
->next 
< 0) { break; } 
1452     U_ASSERT(head
->defaultCE32 
!= Collation::NO_CE32
); 
1453     int32_t index 
= addContextTrie(head
->defaultCE32
, prefixBuilder
, errorCode
); 
1454     if(U_FAILURE(errorCode
)) { return 0; } 
1455     if(index 
> Collation::MAX_INDEX
) { 
1456         errorCode 
= U_BUFFER_OVERFLOW_ERROR
; 
1459     return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG
, index
); 
1463 CollationDataBuilder::addContextTrie(uint32_t defaultCE32
, UCharsTrieBuilder 
&trieBuilder
, 
1464                                      UErrorCode 
&errorCode
) { 
1465     UnicodeString context
; 
1466     context
.append((UChar
)(defaultCE32 
>> 16)).append((UChar
)defaultCE32
); 
1467     UnicodeString trieString
; 
1468     context
.append(trieBuilder
.buildUnicodeString(USTRINGTRIE_BUILD_SMALL
, trieString
, errorCode
)); 
1469     if(U_FAILURE(errorCode
)) { return -1; } 
1470     int32_t index 
= contexts
.indexOf(context
); 
1472         index 
= contexts
.length(); 
1473         contexts
.append(context
); 
1479 CollationDataBuilder::buildFastLatinTable(CollationData 
&data
, UErrorCode 
&errorCode
) { 
1480     if(U_FAILURE(errorCode
) || !fastLatinEnabled
) { return; } 
1482     delete fastLatinBuilder
; 
1483     fastLatinBuilder 
= new CollationFastLatinBuilder(errorCode
); 
1484     if(fastLatinBuilder 
== NULL
) { 
1485         errorCode 
= U_MEMORY_ALLOCATION_ERROR
; 
1488     if(fastLatinBuilder
->forData(data
, errorCode
)) { 
1489         const uint16_t *table 
= fastLatinBuilder
->getTable(); 
1490         int32_t length 
= fastLatinBuilder
->lengthOfTable(); 
1491         if(base 
!= NULL 
&& length 
== base
->fastLatinTableLength 
&& 
1492                 uprv_memcmp(table
, base
->fastLatinTable
, length 
* 2) == 0) { 
1493             // Same fast Latin table as in the base, use that one instead. 
1494             delete fastLatinBuilder
; 
1495             fastLatinBuilder 
= NULL
; 
1496             table 
= base
->fastLatinTable
; 
1498         data
.fastLatinTable 
= table
; 
1499         data
.fastLatinTableLength 
= length
; 
1501         delete fastLatinBuilder
; 
1502         fastLatinBuilder 
= NULL
; 
1507 CollationDataBuilder::getCEs(const UnicodeString 
&s
, int64_t ces
[], int32_t cesLength
) { 
1508     return getCEs(s
, 0, ces
, cesLength
); 
1512 CollationDataBuilder::getCEs(const UnicodeString 
&prefix
, const UnicodeString 
&s
, 
1513                              int64_t ces
[], int32_t cesLength
) { 
1514     int32_t prefixLength 
= prefix
.length(); 
1515     if(prefixLength 
== 0) { 
1516         return getCEs(s
, 0, ces
, cesLength
); 
1518         return getCEs(prefix 
+ s
, prefixLength
, ces
, cesLength
); 
1523 CollationDataBuilder::getCEs(const UnicodeString 
&s
, int32_t start
, 
1524                              int64_t ces
[], int32_t cesLength
) { 
1525     if(collIter 
== NULL
) { 
1526         collIter 
= new DataBuilderCollationIterator(*this); 
1527         if(collIter 
== NULL
) { return 0; } 
1529     return collIter
->fetchCEs(s
, start
, ces
, cesLength
); 
1534 #endif  // !UCONFIG_NO_COLLATION