icuSources/i18n/collationdata.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 * Copyright (C) 2012-2015, International Business Machines
   6 * Corporation and others.  All Rights Reserved.
   7 *******************************************************************************
   8 * collationdata.cpp
   9 *
  10 * created on: 2012jul28
  11 * created by: Markus W. Scherer
  12 */
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_COLLATION
  17
  18 #include "unicode/ucol.h"
  19 #include "unicode/udata.h"
  20 #include "unicode/uscript.h"
  21 #include "cmemory.h"
  22 #include "collation.h"
  23 #include "collationdata.h"
  24 #include "uassert.h"
  25 #include "utrie2.h"
  26 #include "uvectr32.h"
  27
  28 U_NAMESPACE_BEGIN
  29
  30 uint32_t
  31 CollationData::getIndirectCE32(uint32_t ce32) const {
  32     U_ASSERT(Collation::isSpecialCE32(ce32));
  33     int32_t tag = Collation::tagFromCE32(ce32);
  34     if(tag == Collation::DIGIT_TAG) {
  35         // Fetch the non-numeric-collation CE32.
  36         ce32 = ce32s[Collation::indexFromCE32(ce32)];
  37     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
  38         ce32 = Collation::UNASSIGNED_CE32;
  39     } else if(tag == Collation::U0000_TAG) {
  40         // Fetch the normal ce32 for U+0000.
  41         ce32 = ce32s[0];
  42     }
  43     return ce32;
  44 }
  45
  46 uint32_t
  47 CollationData::getFinalCE32(uint32_t ce32) const {
  48     if(Collation::isSpecialCE32(ce32)) {
  49         ce32 = getIndirectCE32(ce32);
  50     }
  51     return ce32;
  52 }
  53
  54 int64_t
  55 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
  56     if(U_FAILURE(errorCode)) { return 0; }
  57     // Keep parallel with CollationDataBuilder::getSingleCE().
  58     const CollationData *d;
  59     uint32_t ce32 = getCE32(c);
  60     if(ce32 == Collation::FALLBACK_CE32) {
  61         d = base;
  62         ce32 = base->getCE32(c);
  63     } else {
  64         d = this;
  65     }
  66     while(Collation::isSpecialCE32(ce32)) {
  67         switch(Collation::tagFromCE32(ce32)) {
  68         case Collation::LATIN_EXPANSION_TAG:
  69         case Collation::BUILDER_DATA_TAG:
  70         case Collation::PREFIX_TAG:
  71         case Collation::CONTRACTION_TAG:
  72         case Collation::HANGUL_TAG:
  73         case Collation::LEAD_SURROGATE_TAG:
  74             errorCode = U_UNSUPPORTED_ERROR;
  75             return 0;
  76         case Collation::FALLBACK_TAG:
  77         case Collation::RESERVED_TAG_3:
  78             errorCode = U_INTERNAL_PROGRAM_ERROR;
  79             return 0;
  80         case Collation::LONG_PRIMARY_TAG:
  81             return Collation::ceFromLongPrimaryCE32(ce32);
  82         case Collation::LONG_SECONDARY_TAG:
  83             return Collation::ceFromLongSecondaryCE32(ce32);
  84         case Collation::EXPANSION32_TAG:
  85             if(Collation::lengthFromCE32(ce32) == 1) {
  86                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
  87                 break;
  88             } else {
  89                 errorCode = U_UNSUPPORTED_ERROR;
  90                 return 0;
  91             }
  92         case Collation::EXPANSION_TAG: {
  93             if(Collation::lengthFromCE32(ce32) == 1) {
  94                 return d->ces[Collation::indexFromCE32(ce32)];
  95             } else {
  96                 errorCode = U_UNSUPPORTED_ERROR;
  97                 return 0;
  98             }
  99         }
 100         case Collation::DIGIT_TAG:
 101             // Fetch the non-numeric-collation CE32 and continue.
 102             ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
 103             break;
 104         case Collation::U0000_TAG:
 105             U_ASSERT(c == 0);
 106             // Fetch the normal ce32 for U+0000 and continue.
 107             ce32 = d->ce32s[0];
 108             break;
 109         case Collation::OFFSET_TAG:
 110             return d->getCEFromOffsetCE32(c, ce32);
 111         case Collation::IMPLICIT_TAG:
 112             return Collation::unassignedCEFromCodePoint(c);
 113         }
 114     }
 115     return Collation::ceFromSimpleCE32(ce32);
 116 }
 117
 118 uint32_t
 119 CollationData::getFirstPrimaryForGroup(int32_t script) const {
 120     int32_t index = getScriptIndex(script);
 121     return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
 122 }
 123
 124 uint32_t
 125 CollationData::getLastPrimaryForGroup(int32_t script) const {
 126     int32_t index = getScriptIndex(script);
 127     if(index == 0) {
 128         return 0;
 129     }
 130     uint32_t limit = scriptStarts[index + 1];
 131     return (limit << 16) - 1;
 132 }
 133
 134 int32_t
 135 CollationData::getGroupForPrimary(uint32_t p) const {
 136     p >>= 16;
 137     if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
 138         return -1;
 139     }
 140     int32_t index = 1;
 141     while(p >= scriptStarts[index + 1]) { ++index; }
 142     for(int32_t i = 0; i < numScripts; ++i) {
 143         if(scriptsIndex[i] == index) {
 144             return i;
 145         }
 146     }
 147     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
 148         if(scriptsIndex[numScripts + i] == index) {
 149             return UCOL_REORDER_CODE_FIRST + i;
 150         }
 151     }
 152     return -1;
 153 }
 154
 155 int32_t
 156 CollationData::getScriptIndex(int32_t script) const {
 157     if(script < 0) {
 158         return 0;
 159     } else if(script < numScripts) {
 160         return scriptsIndex[script];
 161     } else if(script < UCOL_REORDER_CODE_FIRST) {
 162         return 0;
 163     } else {
 164         script -= UCOL_REORDER_CODE_FIRST;
 165         if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
 166             return scriptsIndex[numScripts + script];
 167         } else {
 168             return 0;
 169         }
 170     }
 171 }
 172
 173 int32_t
 174 CollationData::getEquivalentScripts(int32_t script,
 175                                     int32_t dest[], int32_t capacity,
 176                                     UErrorCode &errorCode) const {
 177     if(U_FAILURE(errorCode)) { return 0; }
 178     int32_t index = getScriptIndex(script);
 179     if(index == 0) { return 0; }
 180     if(script >= UCOL_REORDER_CODE_FIRST) {
 181         // Special groups have no aliases.
 182         if(capacity > 0) {
 183             dest[0] = script;
 184         } else {
 185             errorCode = U_BUFFER_OVERFLOW_ERROR;
 186         }
 187         return 1;
 188     }
 189
 190     int32_t length = 0;
 191     for(int32_t i = 0; i < numScripts; ++i) {
 192         if(scriptsIndex[i] == index) {
 193             if(length < capacity) {
 194                 dest[length] = i;
 195             }
 196             ++length;
 197         }
 198     }
 199     if(length > capacity) {
 200         errorCode = U_BUFFER_OVERFLOW_ERROR;
 201     }
 202     return length;
 203 }
 204
 205 void
 206 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
 207                                  UVector32 &ranges, UErrorCode &errorCode) const {
 208     makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
 209 }
 210
 211 void
 212 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
 213                                  UBool latinMustMove,
 214                                  UVector32 &ranges, UErrorCode &errorCode) const {
 215     if(U_FAILURE(errorCode)) { return; }
 216     ranges.removeAllElements();
 217     if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
 218         return;
 219     }
 220
 221     // Maps each script-or-group range to a new lead byte.
 222     uint8_t table[MAX_NUM_SCRIPT_RANGES];
 223     uprv_memset(table, 0, sizeof(table));
 224
 225     {
 226         // Set "don't care" values for reserved ranges.
 227         int32_t index = scriptsIndex[
 228                 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
 229         if(index != 0) {
 230             table[index] = 0xff;
 231         }
 232         index = scriptsIndex[
 233                 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
 234         if(index != 0) {
 235             table[index] = 0xff;
 236         }
 237     }
 238
 239     // Never reorder special low and high primary lead bytes.
 240     U_ASSERT(scriptStartsLength >= 2);
 241     U_ASSERT(scriptStarts[0] == 0);
 242     int32_t lowStart = scriptStarts[1];
 243     U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
 244     int32_t highLimit = scriptStarts[scriptStartsLength - 1];
 245     U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
 246
 247     // Get the set of special reorder codes in the input list.
 248     // This supports a fixed number of special reorder codes;
 249     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
 250     uint32_t specials = 0;
 251     for(int32_t i = 0; i < length; ++i) {
 252         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
 253         if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
 254             specials |= (uint32_t)1 << reorderCode;
 255         }
 256     }
 257
 258     // Start the reordering with the special low reorder codes that do not occur in the input.
 259     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
 260         int32_t index = scriptsIndex[numScripts + i];
 261         if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
 262             lowStart = addLowScriptRange(table, index, lowStart);
 263         }
 264     }
 265
 266     // Skip the reserved range before Latin if Latin is the first script,
 267     // so that we do not move it unnecessarily.
 268     int32_t skippedReserved = 0;
 269     if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
 270         int32_t index = scriptsIndex[USCRIPT_LATIN];
 271         U_ASSERT(index != 0);
 272         int32_t start = scriptStarts[index];
 273         U_ASSERT(lowStart <= start);
 274         skippedReserved = start - lowStart;
 275         lowStart = start;
 276     }
 277
 278     // Reorder according to the input scripts, continuing from the bottom of the primary range.
 279     int32_t originalLength = length;  // length will be decremented if "others" is in the list.
 280     UBool hasReorderToEnd = FALSE;
 281     for(int32_t i = 0; i < length;) {
 282         int32_t script = reorder[i++];
 283         if(script == USCRIPT_UNKNOWN) {
 284             // Put the remaining scripts at the top.
 285             hasReorderToEnd = TRUE;
 286             while(i < length) {
 287                 script = reorder[--length];
 288                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
 289                         script == UCOL_REORDER_CODE_DEFAULT) {
 290                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
 291                     return;
 292                 }
 293                 int32_t index = getScriptIndex(script);
 294                 if(index == 0) { continue; }
 295                 if(table[index] != 0) {  // Duplicate or equivalent script.
 296                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
 297                     return;
 298                 }
 299                 highLimit = addHighScriptRange(table, index, highLimit);
 300             }
 301             break;
 302         }
 303         if(script == UCOL_REORDER_CODE_DEFAULT) {
 304             // The default code must be the only one in the list, and that is handled by the caller.
 305             // Otherwise it must not be used.
 306             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
 307             return;
 308         }
 309         int32_t index = getScriptIndex(script);
 310         if(index == 0) { continue; }
 311         if(table[index] != 0) {  // Duplicate or equivalent script.
 312             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
 313             return;
 314         }
 315         lowStart = addLowScriptRange(table, index, lowStart);
 316     }
 317
 318     // Put all remaining scripts into the middle.
 319     for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
 320         int32_t leadByte = table[i];
 321         if(leadByte != 0) { continue; }
 322         int32_t start = scriptStarts[i];
 323         if(!hasReorderToEnd && start > lowStart) {
 324             // No need to move this script.
 325             lowStart = start;
 326         }
 327         lowStart = addLowScriptRange(table, i, lowStart);
 328     }
 329     if(lowStart > highLimit) {
 330         if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
 331             // Try not skipping the before-Latin reserved range.
 332             makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
 333             return;
 334         }
 335         // We need more primary lead bytes than available, despite the reserved ranges.
 336         errorCode = U_BUFFER_OVERFLOW_ERROR;
 337         return;
 338     }
 339
 340     // Turn lead bytes into a list of (limit, offset) pairs.
 341     // Encode each pair in one list element:
 342     // Upper 16 bits = limit, lower 16 = signed lead byte offset.
 343     int32_t offset = 0;
 344     for(int32_t i = 1;; ++i) {
 345         int32_t nextOffset = offset;
 346         while(i < scriptStartsLength - 1) {
 347             int32_t newLeadByte = table[i];
 348             if(newLeadByte == 0xff) {
 349                 // "Don't care" lead byte for reserved range, continue with current offset.
 350             } else {
 351                 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
 352                 if(nextOffset != offset) { break; }
 353             }
 354             ++i;
 355         }
 356         if(offset != 0 || i < scriptStartsLength - 1) {
 357             ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
 358         }
 359         if(i == scriptStartsLength - 1) { break; }
 360         offset = nextOffset;
 361     }
 362 }
 363
 364 int32_t
 365 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
 366     int32_t start = scriptStarts[index];
 367     if((start & 0xff) < (lowStart & 0xff)) {
 368         lowStart += 0x100;
 369     }
 370     table[index] = (uint8_t)(lowStart >> 8);
 371     int32_t limit = scriptStarts[index + 1];
 372     lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
 373     return lowStart;
 374 }
 375
 376 int32_t
 377 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
 378     int32_t limit = scriptStarts[index + 1];
 379     if((limit & 0xff) > (highLimit & 0xff)) {
 380         highLimit -= 0x100;
 381     }
 382     int32_t start = scriptStarts[index];
 383     highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
 384     table[index] = (uint8_t)(highLimit >> 8);
 385     return highLimit;
 386 }
 387
 388 U_NAMESPACE_END
 389
 390 #endif  // !UCONFIG_NO_COLLATION