icuSources/i18n/collationfastlatinbuilder.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2013-2015, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * collationfastlatinbuilder.cpp
   7 *
   8 * created on: 2013aug09
   9 * created by: Markus W. Scherer
  10 */
  11
  12 #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0  // 0 or 1 or 2
  13 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
  14 #include <stdio.h>
  15 #include <string>
  16 #endif
  17
  18 #include "unicode/utypes.h"
  19
  20 #if !UCONFIG_NO_COLLATION
  21
  22 #include "unicode/ucol.h"
  23 #include "unicode/ucharstrie.h"
  24 #include "unicode/unistr.h"
  25 #include "unicode/uobject.h"
  26 #include "unicode/uscript.h"
  27 #include "cmemory.h"
  28 #include "collation.h"
  29 #include "collationdata.h"
  30 #include "collationfastlatin.h"
  31 #include "collationfastlatinbuilder.h"
  32 #include "uassert.h"
  33 #include "uvectr64.h"
  34
  35 U_NAMESPACE_BEGIN
  36
  37 struct CollationData;
  38
  39 namespace {
  40
  41 /**
  42  * Compare two signed int64_t values as if they were unsigned.
  43  */
  44 int32_t
  45 compareInt64AsUnsigned(int64_t a, int64_t b) {
  46     if((uint64_t)a < (uint64_t)b) {
  47         return -1;
  48     } else if((uint64_t)a > (uint64_t)b) {
  49         return 1;
  50     } else {
  51         return 0;
  52     }
  53 }
  54
  55 // TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp
  56 /**
  57  * Like Java Collections.binarySearch(List, String, Comparator).
  58  *
  59  * @return the index>=0 where the item was found,
  60  *         or the index<0 for inserting the string at ~index in sorted order
  61  */
  62 int32_t
  63 binarySearch(const int64_t list[], int32_t limit, int64_t ce) {
  64     if (limit == 0) { return ~0; }
  65     int32_t start = 0;
  66     for (;;) {
  67         int32_t i = (start + limit) / 2;
  68         int32_t cmp = compareInt64AsUnsigned(ce, list[i]);
  69         if (cmp == 0) {
  70             return i;
  71         } else if (cmp < 0) {
  72             if (i == start) {
  73                 return ~start;  // insert ce before i
  74             }
  75             limit = i;
  76         } else {
  77             if (i == start) {
  78                 return ~(start + 1);  // insert ce after i
  79             }
  80             start = i;
  81         }
  82     }
  83 }
  84
  85 }  // namespace
  86
  87 CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode &errorCode)
  88         : ce0(0), ce1(0),
  89           contractionCEs(errorCode), uniqueCEs(errorCode),
  90           miniCEs(NULL),
  91           firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0),
  92           firstShortPrimary(0), shortPrimaryOverflow(FALSE),
  93           headerLength(0) {
  94 }
  95
  96 CollationFastLatinBuilder::~CollationFastLatinBuilder() {
  97     uprv_free(miniCEs);
  98 }
  99
 100 UBool
 101 CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorCode) {
 102     if(U_FAILURE(errorCode)) { return FALSE; }
 103     if(!result.isEmpty()) {  // This builder is not reusable.
 104         errorCode = U_INVALID_STATE_ERROR;
 105         return FALSE;
 106     }
 107     if(!loadGroups(data, errorCode)) { return FALSE; }
 108
 109     // Fast handling of digits.
 110     firstShortPrimary = firstDigitPrimary;
 111     getCEs(data, errorCode);
 112     if(!encodeUniqueCEs(errorCode)) { return FALSE; }
 113     if(shortPrimaryOverflow) {
 114         // Give digits long mini primaries,
 115         // so that there are more short primaries for letters.
 116         firstShortPrimary = firstLatinPrimary;
 117         resetCEs();
 118         getCEs(data, errorCode);
 119         if(!encodeUniqueCEs(errorCode)) { return FALSE; }
 120     }
 121     // Note: If we still have a short-primary overflow but not a long-primary overflow,
 122     // then we could calculate how many more long primaries would fit,
 123     // and set the firstShortPrimary to that many after the current firstShortPrimary,
 124     // and try again.
 125     // However, this might only benefit the en_US_POSIX tailoring,
 126     // and it is simpler to suppress building fast Latin data for it in genrb,
 127     // or by returning FALSE here if shortPrimaryOverflow.
 128
 129     UBool ok = !shortPrimaryOverflow &&
 130             encodeCharCEs(errorCode) && encodeContractions(errorCode);
 131     contractionCEs.removeAllElements();  // might reduce heap memory usage
 132     uniqueCEs.removeAllElements();
 133     return ok;
 134 }
 135
 136 UBool
 137 CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) {
 138     if(U_FAILURE(errorCode)) { return FALSE; }
 139     headerLength = 1 + NUM_SPECIAL_GROUPS;
 140     uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;
 141     result.append((UChar)r0);
 142     // The first few reordering groups should be special groups
 143     // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
 144     for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
 145         lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i);
 146         if(lastSpecialPrimaries[i] == 0) {
 147             // missing data
 148             return FALSE;
 149         }
 150         result.append(0);  // reserve a slot for this group
 151     }
 152
 153     firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT);
 154     firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN);
 155     lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN);
 156     if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
 157         // missing data
 158         return FALSE;
 159     }
 160     return TRUE;
 161 }
 162
 163 UBool
 164 CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const {
 165     // Both or neither need to be encoded as short primaries,
 166     // so that we can test only one and use the same bit mask.
 167     if(p >= firstShortPrimary) {
 168         return q >= firstShortPrimary;
 169     } else if(q >= firstShortPrimary) {
 170         return FALSE;
 171     }
 172     // Both or neither must be potentially-variable,
 173     // so that we can test only one and determine if both are variable.
 174     uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
 175     if(p > lastVariablePrimary) {
 176         return q > lastVariablePrimary;
 177     } else if(q > lastVariablePrimary) {
 178         return FALSE;
 179     }
 180     // Both will be encoded with long mini primaries.
 181     // They must be in the same special reordering group,
 182     // so that we can test only one and determine if both are variable.
 183     U_ASSERT(p != 0 && q != 0);
 184     for(int32_t i = 0;; ++i) {  // will terminate
 185         uint32_t lastPrimary = lastSpecialPrimaries[i];
 186         if(p <= lastPrimary) {
 187             return q <= lastPrimary;
 188         } else if(q <= lastPrimary) {
 189             return FALSE;
 190         }
 191     }
 192 }
 193
 194 void
 195 CollationFastLatinBuilder::resetCEs() {
 196     contractionCEs.removeAllElements();
 197     uniqueCEs.removeAllElements();
 198     shortPrimaryOverflow = FALSE;
 199     result.truncate(headerLength);
 200 }
 201
 202 void
 203 CollationFastLatinBuilder::getCEs(const CollationData &data, UErrorCode &errorCode) {
 204     if(U_FAILURE(errorCode)) { return; }
 205     int32_t i = 0;
 206     for(UChar c = 0;; ++i, ++c) {
 207         if(c == CollationFastLatin::LATIN_LIMIT) {
 208             c = CollationFastLatin::PUNCT_START;
 209         } else if(c == CollationFastLatin::PUNCT_LIMIT) {
 210             break;
 211         }
 212         const CollationData *d;
 213         uint32_t ce32 = data.getCE32(c);
 214         if(ce32 == Collation::FALLBACK_CE32) {
 215             d = data.base;
 216             ce32 = d->getCE32(c);
 217         } else {
 218             d = &data;
 219         }
 220         if(getCEsFromCE32(*d, c, ce32, errorCode)) {
 221             charCEs[i][0] = ce0;
 222             charCEs[i][1] = ce1;
 223             addUniqueCE(ce0, errorCode);
 224             addUniqueCE(ce1, errorCode);
 225         } else {
 226             // bail out for c
 227             charCEs[i][0] = ce0 = Collation::NO_CE;
 228             charCEs[i][1] = ce1 = 0;
 229         }
 230         if(c == 0 && !isContractionCharCE(ce0)) {
 231             // Always map U+0000 to a contraction.
 232             // Write a contraction list with only a default value if there is no real contraction.
 233             U_ASSERT(contractionCEs.isEmpty());
 234             addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
 235             charCEs[0][0] = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
 236             charCEs[0][1] = 0;
 237         }
 238     }
 239     // Terminate the last contraction list.
 240     contractionCEs.addElement(CollationFastLatin::CONTR_CHAR_MASK, errorCode);
 241 }
 242
 243 UBool
 244 CollationFastLatinBuilder::getCEsFromCE32(const CollationData &data, UChar32 c, uint32_t ce32,
 245                                           UErrorCode &errorCode) {
 246     if(U_FAILURE(errorCode)) { return FALSE; }
 247     ce32 = data.getFinalCE32(ce32);
 248     ce1 = 0;
 249     if(Collation::isSimpleOrLongCE32(ce32)) {
 250         ce0 = Collation::ceFromCE32(ce32);
 251     } else {
 252         switch(Collation::tagFromCE32(ce32)) {
 253         case Collation::LATIN_EXPANSION_TAG:
 254             ce0 = Collation::latinCE0FromCE32(ce32);
 255             ce1 = Collation::latinCE1FromCE32(ce32);
 256             break;
 257         case Collation::EXPANSION32_TAG: {
 258             const uint32_t *ce32s = data.ce32s + Collation::indexFromCE32(ce32);
 259             int32_t length = Collation::lengthFromCE32(ce32);
 260             if(length <= 2) {
 261                 ce0 = Collation::ceFromCE32(ce32s[0]);
 262                 if(length == 2) {
 263                     ce1 = Collation::ceFromCE32(ce32s[1]);
 264                 }
 265                 break;
 266             } else {
 267                 return FALSE;
 268             }
 269         }
 270         case Collation::EXPANSION_TAG: {
 271             const int64_t *ces = data.ces + Collation::indexFromCE32(ce32);
 272             int32_t length = Collation::lengthFromCE32(ce32);
 273             if(length <= 2) {
 274                 ce0 = ces[0];
 275                 if(length == 2) {
 276                     ce1 = ces[1];
 277                 }
 278                 break;
 279             } else {
 280                 return FALSE;
 281             }
 282         }
 283         // Note: We could support PREFIX_TAG (assert c>=0)
 284         // by recursing on its default CE32 and checking that none of the prefixes starts
 285         // with a fast Latin character.
 286         // However, currently (2013) there are only the L-before-middle-dot
 287         // prefix mappings in the Latin range, and those would be rejected anyway.
 288         case Collation::CONTRACTION_TAG:
 289             U_ASSERT(c >= 0);
 290             return getCEsFromContractionCE32(data, ce32, errorCode);
 291         case Collation::OFFSET_TAG:
 292             U_ASSERT(c >= 0);
 293             ce0 = data.getCEFromOffsetCE32(c, ce32);
 294             break;
 295         default:
 296             return FALSE;
 297         }
 298     }
 299     // A mapping can be completely ignorable.
 300     if(ce0 == 0) { return ce1 == 0; }
 301     // We do not support an ignorable ce0 unless it is completely ignorable.
 302     uint32_t p0 = (uint32_t)(ce0 >> 32);
 303     if(p0 == 0) { return FALSE; }
 304     // We only support primaries up to the Latin script.
 305     if(p0 > lastLatinPrimary) { return FALSE; }
 306     // We support non-common secondary and case weights only together with short primaries.
 307     uint32_t lower32_0 = (uint32_t)ce0;
 308     if(p0 < firstShortPrimary) {
 309         uint32_t sc0 = lower32_0 & Collation::SECONDARY_AND_CASE_MASK;
 310         if(sc0 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
 311     }
 312     // No below-common tertiary weights.
 313     if((lower32_0 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
 314     if(ce1 != 0) {
 315         // Both primaries must be in the same group,
 316         // or both must get short mini primaries,
 317         // or a short-primary CE is followed by a secondary CE.
 318         // This is so that we can test the first primary and use the same mask for both,
 319         // and determine for both whether they are variable.
 320         uint32_t p1 = (uint32_t)(ce1 >> 32);
 321         if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return FALSE; }
 322         uint32_t lower32_1 = (uint32_t)ce1;
 323         // No tertiary CEs.
 324         if((lower32_1 >> 16) == 0) { return FALSE; }
 325         // We support non-common secondary and case weights
 326         // only for secondary CEs or together with short primaries.
 327         if(p1 != 0 && p1 < firstShortPrimary) {
 328             uint32_t sc1 = lower32_1 & Collation::SECONDARY_AND_CASE_MASK;
 329             if(sc1 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
 330         }
 331         // No below-common tertiary weights.
 332         if((lower32_1 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
 333     }
 334     // No quaternary weights.
 335     if(((ce0 | ce1) & Collation::QUATERNARY_MASK) != 0) { return FALSE; }
 336     return TRUE;
 337 }
 338
 339 UBool
 340 CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData &data, uint32_t ce32,
 341                                                      UErrorCode &errorCode) {
 342     if(U_FAILURE(errorCode)) { return FALSE; }
 343     const UChar *p = data.contexts + Collation::indexFromCE32(ce32);
 344     ce32 = CollationData::readCE32(p);  // Default if no suffix match.
 345     // Since the original ce32 is not a prefix mapping,
 346     // the default ce32 must not be another contraction.
 347     U_ASSERT(!Collation::isContractionCE32(ce32));
 348     int32_t contractionIndex = contractionCEs.size();
 349     if(getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
 350         addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
 351     } else {
 352         // Bail out for c-without-contraction.
 353         addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, Collation::NO_CE, 0, errorCode);
 354     }
 355     // Handle an encodable contraction unless the next contraction is too long
 356     // and starts with the same character.
 357     int32_t prevX = -1;
 358     UBool addContraction = FALSE;
 359     UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
 360     while(suffixes.next(errorCode)) {
 361         const UnicodeString &suffix = suffixes.getString();
 362         int32_t x = CollationFastLatin::getCharIndex(suffix.charAt(0));
 363         if(x < 0) { continue; }  // ignore anything but fast Latin text
 364         if(x == prevX) {
 365             if(addContraction) {
 366                 // Bail out for all contractions starting with this character.
 367                 addContractionEntry(x, Collation::NO_CE, 0, errorCode);
 368                 addContraction = FALSE;
 369             }
 370             continue;
 371         }
 372         if(addContraction) {
 373             addContractionEntry(prevX, ce0, ce1, errorCode);
 374         }
 375         ce32 = (uint32_t)suffixes.getValue();
 376         if(suffix.length() == 1 && getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
 377             addContraction = TRUE;
 378         } else {
 379             addContractionEntry(x, Collation::NO_CE, 0, errorCode);
 380             addContraction = FALSE;
 381         }
 382         prevX = x;
 383     }
 384     if(addContraction) {
 385         addContractionEntry(prevX, ce0, ce1, errorCode);
 386     }
 387     if(U_FAILURE(errorCode)) { return FALSE; }
 388     // Note: There might not be any fast Latin contractions, but
 389     // we need to enter contraction handling anyway so that we can bail out
 390     // when there is a non-fast-Latin character following.
 391     // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
 392     // following umlaut and bail out, rather than return the difference of Y vs. u.
 393     ce0 = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;
 394     ce1 = 0;
 395     return TRUE;
 396 }
 397
 398 void
 399 CollationFastLatinBuilder::addContractionEntry(int32_t x, int64_t cce0, int64_t cce1,
 400                                                UErrorCode &errorCode) {
 401     contractionCEs.addElement(x, errorCode);
 402     contractionCEs.addElement(cce0, errorCode);
 403     contractionCEs.addElement(cce1, errorCode);
 404     addUniqueCE(cce0, errorCode);
 405     addUniqueCE(cce1, errorCode);
 406 }
 407
 408 void
 409 CollationFastLatinBuilder::addUniqueCE(int64_t ce, UErrorCode &errorCode) {
 410     if(U_FAILURE(errorCode)) { return; }
 411     if(ce == 0 || (uint32_t)(ce >> 32) == Collation::NO_CE_PRIMARY) { return; }
 412     ce &= ~(int64_t)Collation::CASE_MASK;  // blank out case bits
 413     int32_t i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
 414     if(i < 0) {
 415         uniqueCEs.insertElementAt(ce, ~i, errorCode);
 416     }
 417 }
 418
 419 uint32_t
 420 CollationFastLatinBuilder::getMiniCE(int64_t ce) const {
 421     ce &= ~(int64_t)Collation::CASE_MASK;  // blank out case bits
 422     int32_t index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
 423     U_ASSERT(index >= 0);
 424     return miniCEs[index];
 425 }
 426
 427 UBool
 428 CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
 429     if(U_FAILURE(errorCode)) { return FALSE; }
 430     uprv_free(miniCEs);
 431     miniCEs = (uint16_t *)uprv_malloc(uniqueCEs.size() * 2);
 432     if(miniCEs == NULL) {
 433         errorCode = U_MEMORY_ALLOCATION_ERROR;
 434         return FALSE;
 435     }
 436     int32_t group = 0;
 437     uint32_t lastGroupPrimary = lastSpecialPrimaries[group];
 438     // The lowest unique CE must be at least a secondary CE.
 439     U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0);
 440     uint32_t prevPrimary = 0;
 441     uint32_t prevSecondary = 0;
 442     uint32_t pri = 0;
 443     uint32_t sec = 0;
 444     uint32_t ter = CollationFastLatin::COMMON_TER;
 445     for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
 446         int64_t ce = uniqueCEs.elementAti(i);
 447         // Note: At least one of the p/s/t weights changes from one unique CE to the next.
 448         // (uniqueCEs does not store case bits.)
 449         uint32_t p = (uint32_t)(ce >> 32);
 450         if(p != prevPrimary) {
 451             while(p > lastGroupPrimary) {
 452                 U_ASSERT(pri <= CollationFastLatin::MAX_LONG);
 453                 // Set the group's header entry to the
 454                 // last "long primary" in or before the group.
 455                 result.setCharAt(1 + group, (UChar)pri);
 456                 if(++group < NUM_SPECIAL_GROUPS) {
 457                     lastGroupPrimary = lastSpecialPrimaries[group];
 458                 } else {
 459                     lastGroupPrimary = 0xffffffff;
 460                     break;
 461                 }
 462             }
 463             if(p < firstShortPrimary) {
 464                 if(pri == 0) {
 465                     pri = CollationFastLatin::MIN_LONG;
 466                 } else if(pri < CollationFastLatin::MAX_LONG) {
 467                     pri += CollationFastLatin::LONG_INC;
 468                 } else {
 469 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
 470                     printf("long-primary overflow for %08x\n", p);
 471 #endif
 472                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
 473                     continue;
 474                 }
 475             } else {
 476                 if(pri < CollationFastLatin::MIN_SHORT) {
 477                     pri = CollationFastLatin::MIN_SHORT;
 478                 } else if(pri < (CollationFastLatin::MAX_SHORT - CollationFastLatin::SHORT_INC)) {
 479                     // Reserve the highest primary weight for U+FFFF.
 480                     pri += CollationFastLatin::SHORT_INC;
 481                 } else {
 482 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
 483                     printf("short-primary overflow for %08x\n", p);
 484 #endif
 485                     shortPrimaryOverflow = TRUE;
 486                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
 487                     continue;
 488                 }
 489             }
 490             prevPrimary = p;
 491             prevSecondary = Collation::COMMON_WEIGHT16;
 492             sec = CollationFastLatin::COMMON_SEC;
 493             ter = CollationFastLatin::COMMON_TER;
 494         }
 495         uint32_t lower32 = (uint32_t)ce;
 496         uint32_t s = lower32 >> 16;
 497         if(s != prevSecondary) {
 498             if(pri == 0) {
 499                 if(sec == 0) {
 500                     sec = CollationFastLatin::MIN_SEC_HIGH;
 501                 } else if(sec < CollationFastLatin::MAX_SEC_HIGH) {
 502                     sec += CollationFastLatin::SEC_INC;
 503                 } else {
 504                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
 505                     continue;
 506                 }
 507                 prevSecondary = s;
 508                 ter = CollationFastLatin::COMMON_TER;
 509             } else if(s < Collation::COMMON_WEIGHT16) {
 510                 if(sec == CollationFastLatin::COMMON_SEC) {
 511                     sec = CollationFastLatin::MIN_SEC_BEFORE;
 512                 } else if(sec < CollationFastLatin::MAX_SEC_BEFORE) {
 513                     sec += CollationFastLatin::SEC_INC;
 514                 } else {
 515                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
 516                     continue;
 517                 }
 518             } else if(s == Collation::COMMON_WEIGHT16) {
 519                 sec = CollationFastLatin::COMMON_SEC;
 520             } else {
 521                 if(sec < CollationFastLatin::MIN_SEC_AFTER) {
 522                     sec = CollationFastLatin::MIN_SEC_AFTER;
 523                 } else if(sec < CollationFastLatin::MAX_SEC_AFTER) {
 524                     sec += CollationFastLatin::SEC_INC;
 525                 } else {
 526                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
 527                     continue;
 528                 }
 529             }
 530             prevSecondary = s;
 531             ter = CollationFastLatin::COMMON_TER;
 532         }
 533         U_ASSERT((lower32 & Collation::CASE_MASK) == 0);  // blanked out in uniqueCEs
 534         uint32_t t = lower32 & Collation::ONLY_TERTIARY_MASK;
 535         if(t > Collation::COMMON_WEIGHT16) {
 536             if(ter < CollationFastLatin::MAX_TER_AFTER) {
 537                 ++ter;
 538             } else {
 539                 miniCEs[i] = CollationFastLatin::BAIL_OUT;
 540                 continue;
 541             }
 542         }
 543         if(CollationFastLatin::MIN_LONG <= pri && pri <= CollationFastLatin::MAX_LONG) {
 544             U_ASSERT(sec == CollationFastLatin::COMMON_SEC);
 545             miniCEs[i] = (uint16_t)(pri | ter);
 546         } else {
 547             miniCEs[i] = (uint16_t)(pri | sec | ter);
 548         }
 549     }
 550 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
 551     printf("last mini primary: %04x\n", pri);
 552 #endif
 553 #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
 554     for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
 555         int64_t ce = uniqueCEs.elementAti(i);
 556         printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
 557     }
 558 #endif
 559     return U_SUCCESS(errorCode);
 560 }
 561
 562 UBool
 563 CollationFastLatinBuilder::encodeCharCEs(UErrorCode &errorCode) {
 564     if(U_FAILURE(errorCode)) { return FALSE; }
 565     int32_t miniCEsStart = result.length();
 566     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
 567         result.append(0);  // initialize to completely ignorable
 568     }
 569     int32_t indexBase = result.length();
 570     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
 571         int64_t ce = charCEs[i][0];
 572         if(isContractionCharCE(ce)) { continue; }  // defer contraction
 573         uint32_t miniCE = encodeTwoCEs(ce, charCEs[i][1]);
 574         if(miniCE > 0xffff) {
 575             // Note: There is a chance that this new expansion is the same as a previous one,
 576             // and if so, then we could reuse the other expansion.
 577             // However, that seems unlikely.
 578             int32_t expansionIndex = result.length() - indexBase;
 579             if(expansionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
 580                 miniCE = CollationFastLatin::BAIL_OUT;
 581             } else {
 582                 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
 583                 miniCE = CollationFastLatin::EXPANSION | expansionIndex;
 584             }
 585         }
 586         result.setCharAt(miniCEsStart + i, (UChar)miniCE);
 587     }
 588     return U_SUCCESS(errorCode);
 589 }
 590
 591 UBool
 592 CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) {
 593     // We encode all contraction lists so that the first word of a list
 594     // terminates the previous list, and we only need one additional terminator at the end.
 595     if(U_FAILURE(errorCode)) { return FALSE; }
 596     int32_t indexBase = headerLength + CollationFastLatin::NUM_FAST_CHARS;
 597     int32_t firstContractionIndex = result.length();
 598     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
 599         int64_t ce = charCEs[i][0];
 600         if(!isContractionCharCE(ce)) { continue; }
 601         int32_t contractionIndex = result.length() - indexBase;
 602         if(contractionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
 603             result.setCharAt(headerLength + i, CollationFastLatin::BAIL_OUT);
 604             continue;
 605         }
 606         UBool firstTriple = TRUE;
 607         for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) {
 608             int32_t x = contractionCEs.elementAti(index);
 609             if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; }
 610             int64_t cce0 = contractionCEs.elementAti(index + 1);
 611             int64_t cce1 = contractionCEs.elementAti(index + 2);
 612             uint32_t miniCE = encodeTwoCEs(cce0, cce1);
 613             if(miniCE == CollationFastLatin::BAIL_OUT) {
 614                 result.append((UChar)(x | (1 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
 615             } else if(miniCE <= 0xffff) {
 616                 result.append((UChar)(x | (2 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
 617                 result.append((UChar)miniCE);
 618             } else {
 619                 result.append((UChar)(x | (3 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
 620                 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
 621             }
 622             firstTriple = FALSE;
 623         }
 624         // Note: There is a chance that this new contraction list is the same as a previous one,
 625         // and if so, then we could truncate the result and reuse the other list.
 626         // However, that seems unlikely.
 627         result.setCharAt(headerLength + i,
 628                          (UChar)(CollationFastLatin::CONTRACTION | contractionIndex));
 629     }
 630     if(result.length() > firstContractionIndex) {
 631         // Terminate the last contraction list.
 632         result.append((UChar)CollationFastLatin::CONTR_CHAR_MASK);
 633     }
 634     if(result.isBogus()) {
 635         errorCode = U_MEMORY_ALLOCATION_ERROR;
 636         return FALSE;
 637     }
 638 #if DEBUG_COLLATION_FAST_LATIN_BUILDER
 639     printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
 640     puts("   header & below-digit groups map");
 641     int32_t i = 0;
 642     for(; i < headerLength; ++i) {
 643         printf(" %04x", result[i]);
 644     }
 645     printf("\n   char mini CEs");
 646     U_ASSERT(CollationFastLatin::NUM_FAST_CHARS % 16 == 0);
 647     for(; i < indexBase; i += 16) {
 648         UChar32 c = i - headerLength;
 649         if(c >= CollationFastLatin::LATIN_LIMIT) {
 650             c = CollationFastLatin::PUNCT_START + c - CollationFastLatin::LATIN_LIMIT;
 651         }
 652         printf("\n %04x:", c);
 653         for(int32_t j = 0; j < 16; ++j) {
 654             printf(" %04x", result[i + j]);
 655         }
 656     }
 657     printf("\n   expansions & contractions");
 658     for(; i < result.length(); ++i) {
 659         if((i - indexBase) % 16 == 0) { puts(""); }
 660         printf(" %04x", result[i]);
 661     }
 662     puts("");
 663 #endif
 664     return TRUE;
 665 }
 666
 667 uint32_t
 668 CollationFastLatinBuilder::encodeTwoCEs(int64_t first, int64_t second) const {
 669     if(first == 0) {
 670         return 0;  // completely ignorable
 671     }
 672     if(first == Collation::NO_CE) {
 673         return CollationFastLatin::BAIL_OUT;
 674     }
 675     U_ASSERT((uint32_t)(first >> 32) != Collation::NO_CE_PRIMARY);
 676
 677     uint32_t miniCE = getMiniCE(first);
 678     if(miniCE == CollationFastLatin::BAIL_OUT) { return miniCE; }
 679     if(miniCE >= CollationFastLatin::MIN_SHORT) {
 680         // Extract & copy the case bits.
 681         // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
 682         uint32_t c = (((uint32_t)first & Collation::CASE_MASK) >> (14 - 3));
 683         // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
 684         c += CollationFastLatin::LOWER_CASE;
 685         miniCE |= c;
 686     }
 687     if(second == 0) { return miniCE; }
 688
 689     uint32_t miniCE1 = getMiniCE(second);
 690     if(miniCE1 == CollationFastLatin::BAIL_OUT) { return miniCE1; }
 691
 692     uint32_t case1 = (uint32_t)second & Collation::CASE_MASK;
 693     if(miniCE >= CollationFastLatin::MIN_SHORT &&
 694             (miniCE & CollationFastLatin::SECONDARY_MASK) == CollationFastLatin::COMMON_SEC) {
 695         // Try to combine the two mini CEs into one.
 696         uint32_t sec1 = miniCE1 & CollationFastLatin::SECONDARY_MASK;
 697         uint32_t ter1 = miniCE1 & CollationFastLatin::TERTIARY_MASK;
 698         if(sec1 >= CollationFastLatin::MIN_SEC_HIGH && case1 == 0 &&
 699                 ter1 == CollationFastLatin::COMMON_TER) {
 700             // sec1>=sec_high implies pri1==0.
 701             return (miniCE & ~CollationFastLatin::SECONDARY_MASK) | sec1;
 702         }
 703     }
 704
 705     if(miniCE1 <= CollationFastLatin::SECONDARY_MASK || CollationFastLatin::MIN_SHORT <= miniCE1) {
 706         // Secondary CE, or a CE with a short primary, copy the case bits.
 707         case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE;
 708         miniCE1 |= case1;
 709     }
 710     return (miniCE << 16) | miniCE1;
 711 }
 712
 713 U_NAMESPACE_END
 714
 715 #endif  // !UCONFIG_NO_COLLATION