icuSources/i18n/collationdatawriter.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 * Copyright (C) 2013-2015, International Business Machines
   6 * Corporation and others.  All Rights Reserved.
   7 *******************************************************************************
   8 * collationdatawriter.cpp
   9 *
  10 * created on: 2013aug06
  11 * created by: Markus W. Scherer
  12 */
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_COLLATION
  17
  18 #include "unicode/tblcoll.h"
  19 #include "unicode/udata.h"
  20 #include "unicode/uniset.h"
  21 #include "cmemory.h"
  22 #include "collationdata.h"
  23 #include "collationdatabuilder.h"
  24 #include "collationdatareader.h"
  25 #include "collationdatawriter.h"
  26 #include "collationfastlatin.h"
  27 #include "collationsettings.h"
  28 #include "collationtailoring.h"
  29 #include "uassert.h"
  30 #include "ucmndata.h"
  31
  32 U_NAMESPACE_BEGIN
  33
  34 uint8_t *
  35 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
  36     if(U_FAILURE(errorCode)) { return NULL; }
  37     LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
  38     if(buffer.isNull()) {
  39         errorCode = U_MEMORY_ALLOCATION_ERROR;
  40         return NULL;
  41     }
  42     length = cloneBinary(buffer.getAlias(), 20000, errorCode);
  43     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
  44         if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
  45             errorCode = U_MEMORY_ALLOCATION_ERROR;
  46             return NULL;
  47         }
  48         errorCode = U_ZERO_ERROR;
  49         length = cloneBinary(buffer.getAlias(), length, errorCode);
  50     }
  51     if(U_FAILURE(errorCode)) { return NULL; }
  52     return buffer.orphan();
  53 }
  54
  55 int32_t
  56 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
  57     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
  58     return CollationDataWriter::writeTailoring(
  59             *tailoring, *settings, indexes, dest, capacity,
  60             errorCode);
  61 }
  62
  63 static const UDataInfo dataInfo = {
  64     sizeof(UDataInfo),
  65     0,
  66
  67     U_IS_BIG_ENDIAN,
  68     U_CHARSET_FAMILY,
  69     U_SIZEOF_UCHAR,
  70     0,
  71
  72     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
  73     { 5, 0, 0, 0 },                     // formatVersion
  74     { 6, 3, 0, 0 }                      // dataVersion
  75 };
  76
  77 int32_t
  78 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
  79                                const void *rootElements, int32_t rootElementsLength,
  80                                int32_t indexes[], uint8_t *dest, int32_t capacity,
  81                                UErrorCode &errorCode) {
  82     return write(TRUE, NULL,
  83                  data, settings,
  84                  rootElements, rootElementsLength,
  85                  indexes, dest, capacity, errorCode);
  86 }
  87
  88 int32_t
  89 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
  90                                     int32_t indexes[], uint8_t *dest, int32_t capacity,
  91                                     UErrorCode &errorCode) {
  92     return write(FALSE, t.version,
  93                  *t.data, settings,
  94                  NULL, 0,
  95                  indexes, dest, capacity, errorCode);
  96 }
  97
  98 int32_t
  99 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
 100                            const CollationData &data, const CollationSettings &settings,
 101                            const void *rootElements, int32_t rootElementsLength,
 102                            int32_t indexes[], uint8_t *dest, int32_t capacity,
 103                            UErrorCode &errorCode) {
 104     if(U_FAILURE(errorCode)) { return 0; }
 105     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
 106         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
 107         return 0;
 108     }
 109
 110     // Figure out which data items to write before settling on
 111     // the indexes length and writing offsets.
 112     // For any data item, we need to write the start and limit offsets,
 113     // so the indexes length must be at least index-of-start-offset + 2.
 114     int32_t indexesLength;
 115     UBool hasMappings;
 116     UnicodeSet unsafeBackwardSet;
 117     const CollationData *baseData = data.base;
 118
 119     int32_t fastLatinVersion;
 120     if(data.fastLatinTable != NULL) {
 121         fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
 122     } else {
 123         fastLatinVersion = 0;
 124     }
 125     int32_t fastLatinTableLength = 0;
 126
 127     if(isBase) {
 128         // For the root collator, we write an even number of indexes
 129         // so that we start with an 8-aligned offset.
 130         indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
 131         U_ASSERT(settings.reorderCodesLength == 0);
 132         hasMappings = TRUE;
 133         unsafeBackwardSet = *data.unsafeBackwardSet;
 134         fastLatinTableLength = data.fastLatinTableLength;
 135     } else if(baseData == NULL) {
 136         hasMappings = FALSE;
 137         if(settings.reorderCodesLength == 0) {
 138             // only options
 139             indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
 140         } else {
 141             // only options, reorder codes, and the reorder table
 142             indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
 143         }
 144     } else {
 145         hasMappings = TRUE;
 146         // Tailored mappings, and what else?
 147         // Check in ascending order of optional tailoring data items.
 148         indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
 149         if(data.contextsLength != 0) {
 150             indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
 151         }
 152         unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
 153         if(!unsafeBackwardSet.isEmpty()) {
 154             indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
 155         }
 156         if(data.fastLatinTable != baseData->fastLatinTable) {
 157             fastLatinTableLength = data.fastLatinTableLength;
 158             indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
 159         }
 160     }
 161
 162     UVector32 codesAndRanges(errorCode);
 163     const int32_t *reorderCodes = settings.reorderCodes;
 164     int32_t reorderCodesLength = settings.reorderCodesLength;
 165     if(settings.hasReordering() &&
 166             CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
 167         // Rebuild the full list of reorder ranges.
 168         // The list in the settings is truncated for efficiency.
 169         data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
 170         // Write the codes, then the ranges.
 171         for(int32_t i = 0; i < reorderCodesLength; ++i) {
 172             codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
 173         }
 174         if(U_FAILURE(errorCode)) { return 0; }
 175         reorderCodes = codesAndRanges.getBuffer();
 176         reorderCodesLength = codesAndRanges.size();
 177     }
 178
 179     int32_t headerSize;
 180     if(isBase) {
 181         headerSize = 0;  // udata_create() writes the header
 182     } else {
 183         DataHeader header;
 184         header.dataHeader.magic1 = 0xda;
 185         header.dataHeader.magic2 = 0x27;
 186         uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
 187         uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
 188         headerSize = (int32_t)sizeof(header);
 189         U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
 190         if(hasMappings && data.cesLength != 0) {
 191             // Sum of the sizes of the data items which are
 192             // not automatically multiples of 8 bytes and which are placed before the CEs.
 193             int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
 194             if((sum & 7) != 0) {
 195                 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
 196                 // We add to the header size here.
 197                 // Alternatively, we could increment the indexesLength
 198                 // or add a few bytes to the reorderTable.
 199                 headerSize += 4;
 200             }
 201         }
 202         header.dataHeader.headerSize = (uint16_t)headerSize;
 203         if(headerSize <= capacity) {
 204             uprv_memcpy(dest, &header, sizeof(header));
 205             // Write 00 bytes so that the padding is not mistaken for a copyright string.
 206             uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
 207             dest += headerSize;
 208             capacity -= headerSize;
 209         } else {
 210             dest = NULL;
 211             capacity = 0;
 212         }
 213     }
 214
 215     indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
 216     U_ASSERT((settings.options & ~0xffff) == 0);
 217     indexes[CollationDataReader::IX_OPTIONS] =
 218             data.numericPrimary | fastLatinVersion | settings.options;
 219     indexes[CollationDataReader::IX_RESERVED2] = 0;
 220     indexes[CollationDataReader::IX_RESERVED3] = 0;
 221
 222     // Byte offsets of data items all start from the start of the indexes.
 223     // We add the headerSize at the very end.
 224     int32_t totalSize = indexesLength * 4;
 225
 226     if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
 227         indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
 228     } else {
 229         indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
 230     }
 231
 232     indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
 233     totalSize += reorderCodesLength * 4;
 234
 235     indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
 236     if(settings.reorderTable != NULL) {
 237         totalSize += 256;
 238     }
 239
 240     indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
 241     if(hasMappings) {
 242         UErrorCode errorCode2 = U_ZERO_ERROR;
 243         int32_t length;
 244         if(totalSize < capacity) {
 245             length = utrie2_serialize(data.trie, dest + totalSize,
 246                                       capacity - totalSize, &errorCode2);
 247         } else {
 248             length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
 249         }
 250         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
 251             errorCode = errorCode2;
 252             return 0;
 253         }
 254         // The trie size should be a multiple of 8 bytes due to the way
 255         // compactIndex2(UNewTrie2 *trie) currently works.
 256         U_ASSERT((length & 7) == 0);
 257         totalSize += length;
 258     }
 259
 260     indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
 261     indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
 262     if(hasMappings && data.cesLength != 0) {
 263         U_ASSERT(((headerSize + totalSize) & 7) == 0);
 264         totalSize += data.cesLength * 8;
 265     }
 266
 267     indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
 268     indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
 269     if(hasMappings) {
 270         totalSize += data.ce32sLength * 4;
 271     }
 272
 273     indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
 274     totalSize += rootElementsLength * 4;
 275
 276     indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
 277     if(hasMappings) {
 278         totalSize += data.contextsLength * 2;
 279     }
 280
 281     indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
 282     if(hasMappings && !unsafeBackwardSet.isEmpty()) {
 283         UErrorCode errorCode2 = U_ZERO_ERROR;
 284         int32_t length;
 285         if(totalSize < capacity) {
 286             uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
 287             length = unsafeBackwardSet.serialize(
 288                     p, (capacity - totalSize) / 2, errorCode2);
 289         } else {
 290             length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
 291         }
 292         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
 293             errorCode = errorCode2;
 294             return 0;
 295         }
 296         totalSize += length * 2;
 297     }
 298
 299     indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
 300     totalSize += fastLatinTableLength * 2;
 301
 302     UnicodeString scripts;
 303     indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
 304     if(isBase) {
 305         scripts.append((UChar)data.numScripts);
 306         scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
 307         scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
 308         totalSize += scripts.length() * 2;
 309     }
 310
 311     indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
 312     if(isBase) {
 313         totalSize += 256;
 314     }
 315
 316     indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
 317     indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
 318
 319     if(totalSize > capacity) {
 320         errorCode = U_BUFFER_OVERFLOW_ERROR;
 321         return headerSize + totalSize;
 322     }
 323
 324     uprv_memcpy(dest, indexes, indexesLength * 4);
 325     copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
 326     copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
 327     // The trie has already been serialized into the dest buffer.
 328     copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
 329     copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
 330     copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
 331     copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
 332     // The unsafeBackwardSet has already been serialized into the dest buffer.
 333     copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
 334     copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
 335     copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
 336
 337     return headerSize + totalSize;
 338 }
 339
 340 void
 341 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
 342                               const void *src, uint8_t *dest) {
 343     int32_t start = indexes[startIndex];
 344     int32_t limit = indexes[startIndex + 1];
 345     if(start < limit) {
 346         uprv_memcpy(dest + start, src, limit - start);
 347     }
 348 }
 349
 350 U_NAMESPACE_END
 351
 352 #endif  // !UCONFIG_NO_COLLATION