icuSources/i18n/collationdatawriter.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2013-2015, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * collationdatawriter.cpp
   7 *
   8 * created on: 2013aug06
   9 * created by: Markus W. Scherer
  10 */
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_COLLATION
  15
  16 #include "unicode/tblcoll.h"
  17 #include "unicode/udata.h"
  18 #include "unicode/uniset.h"
  19 #include "cmemory.h"
  20 #include "collationdata.h"
  21 #include "collationdatabuilder.h"
  22 #include "collationdatareader.h"
  23 #include "collationdatawriter.h"
  24 #include "collationfastlatin.h"
  25 #include "collationsettings.h"
  26 #include "collationtailoring.h"
  27 #include "uassert.h"
  28 #include "ucmndata.h"
  29
  30 U_NAMESPACE_BEGIN
  31
  32 uint8_t *
  33 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
  34     if(U_FAILURE(errorCode)) { return NULL; }
  35     LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
  36     if(buffer.isNull()) {
  37         errorCode = U_MEMORY_ALLOCATION_ERROR;
  38         return NULL;
  39     }
  40     length = cloneBinary(buffer.getAlias(), 20000, errorCode);
  41     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
  42         if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
  43             errorCode = U_MEMORY_ALLOCATION_ERROR;
  44             return NULL;
  45         }
  46         errorCode = U_ZERO_ERROR;
  47         length = cloneBinary(buffer.getAlias(), length, errorCode);
  48     }
  49     if(U_FAILURE(errorCode)) { return NULL; }
  50     return buffer.orphan();
  51 }
  52
  53 int32_t
  54 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
  55     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
  56     return CollationDataWriter::writeTailoring(
  57             *tailoring, *settings, indexes, dest, capacity,
  58             errorCode);
  59 }
  60
  61 static const UDataInfo dataInfo = {
  62     sizeof(UDataInfo),
  63     0,
  64
  65     U_IS_BIG_ENDIAN,
  66     U_CHARSET_FAMILY,
  67     U_SIZEOF_UCHAR,
  68     0,
  69
  70     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
  71     { 5, 0, 0, 0 },                     // formatVersion
  72     { 6, 3, 0, 0 }                      // dataVersion
  73 };
  74
  75 int32_t
  76 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
  77                                const void *rootElements, int32_t rootElementsLength,
  78                                int32_t indexes[], uint8_t *dest, int32_t capacity,
  79                                UErrorCode &errorCode) {
  80     return write(TRUE, NULL,
  81                  data, settings,
  82                  rootElements, rootElementsLength,
  83                  indexes, dest, capacity, errorCode);
  84 }
  85
  86 int32_t
  87 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
  88                                     int32_t indexes[], uint8_t *dest, int32_t capacity,
  89                                     UErrorCode &errorCode) {
  90     return write(FALSE, t.version,
  91                  *t.data, settings,
  92                  NULL, 0,
  93                  indexes, dest, capacity, errorCode);
  94 }
  95
  96 int32_t
  97 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
  98                            const CollationData &data, const CollationSettings &settings,
  99                            const void *rootElements, int32_t rootElementsLength,
 100                            int32_t indexes[], uint8_t *dest, int32_t capacity,
 101                            UErrorCode &errorCode) {
 102     if(U_FAILURE(errorCode)) { return 0; }
 103     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
 104         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
 105         return 0;
 106     }
 107
 108     // Figure out which data items to write before settling on
 109     // the indexes length and writing offsets.
 110     // For any data item, we need to write the start and limit offsets,
 111     // so the indexes length must be at least index-of-start-offset + 2.
 112     int32_t indexesLength;
 113     UBool hasMappings;
 114     UnicodeSet unsafeBackwardSet;
 115     const CollationData *baseData = data.base;
 116
 117     int32_t fastLatinVersion;
 118     if(data.fastLatinTable != NULL) {
 119         fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
 120     } else {
 121         fastLatinVersion = 0;
 122     }
 123     int32_t fastLatinTableLength = 0;
 124
 125     if(isBase) {
 126         // For the root collator, we write an even number of indexes
 127         // so that we start with an 8-aligned offset.
 128         indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
 129         U_ASSERT(settings.reorderCodesLength == 0);
 130         hasMappings = TRUE;
 131         unsafeBackwardSet = *data.unsafeBackwardSet;
 132         fastLatinTableLength = data.fastLatinTableLength;
 133     } else if(baseData == NULL) {
 134         hasMappings = FALSE;
 135         if(settings.reorderCodesLength == 0) {
 136             // only options
 137             indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
 138         } else {
 139             // only options, reorder codes, and the reorder table
 140             indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
 141         }
 142     } else {
 143         hasMappings = TRUE;
 144         // Tailored mappings, and what else?
 145         // Check in ascending order of optional tailoring data items.
 146         indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
 147         if(data.contextsLength != 0) {
 148             indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
 149         }
 150         unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
 151         if(!unsafeBackwardSet.isEmpty()) {
 152             indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
 153         }
 154         if(data.fastLatinTable != baseData->fastLatinTable) {
 155             fastLatinTableLength = data.fastLatinTableLength;
 156             indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
 157         }
 158     }
 159
 160     UVector32 codesAndRanges(errorCode);
 161     const int32_t *reorderCodes = settings.reorderCodes;
 162     int32_t reorderCodesLength = settings.reorderCodesLength;
 163     if(settings.hasReordering() &&
 164             CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
 165         // Rebuild the full list of reorder ranges.
 166         // The list in the settings is truncated for efficiency.
 167         data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
 168         // Write the codes, then the ranges.
 169         for(int32_t i = 0; i < reorderCodesLength; ++i) {
 170             codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
 171         }
 172         if(U_FAILURE(errorCode)) { return 0; }
 173         reorderCodes = codesAndRanges.getBuffer();
 174         reorderCodesLength = codesAndRanges.size();
 175     }
 176
 177     int32_t headerSize;
 178     if(isBase) {
 179         headerSize = 0;  // udata_create() writes the header
 180     } else {
 181         DataHeader header;
 182         header.dataHeader.magic1 = 0xda;
 183         header.dataHeader.magic2 = 0x27;
 184         uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
 185         uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
 186         headerSize = (int32_t)sizeof(header);
 187         U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
 188         if(hasMappings && data.cesLength != 0) {
 189             // Sum of the sizes of the data items which are
 190             // not automatically multiples of 8 bytes and which are placed before the CEs.
 191             int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
 192             if((sum & 7) != 0) {
 193                 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
 194                 // We add to the header size here.
 195                 // Alternatively, we could increment the indexesLength
 196                 // or add a few bytes to the reorderTable.
 197                 headerSize += 4;
 198             }
 199         }
 200         header.dataHeader.headerSize = (uint16_t)headerSize;
 201         if(headerSize <= capacity) {
 202             uprv_memcpy(dest, &header, sizeof(header));
 203             // Write 00 bytes so that the padding is not mistaken for a copyright string.
 204             uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
 205             dest += headerSize;
 206             capacity -= headerSize;
 207         } else {
 208             dest = NULL;
 209             capacity = 0;
 210         }
 211     }
 212
 213     indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
 214     U_ASSERT((settings.options & ~0xffff) == 0);
 215     indexes[CollationDataReader::IX_OPTIONS] =
 216             data.numericPrimary | fastLatinVersion | settings.options;
 217     indexes[CollationDataReader::IX_RESERVED2] = 0;
 218     indexes[CollationDataReader::IX_RESERVED3] = 0;
 219
 220     // Byte offsets of data items all start from the start of the indexes.
 221     // We add the headerSize at the very end.
 222     int32_t totalSize = indexesLength * 4;
 223
 224     if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
 225         indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
 226     } else {
 227         indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
 228     }
 229
 230     indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
 231     totalSize += reorderCodesLength * 4;
 232
 233     indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
 234     if(settings.reorderTable != NULL) {
 235         totalSize += 256;
 236     }
 237
 238     indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
 239     if(hasMappings) {
 240         UErrorCode errorCode2 = U_ZERO_ERROR;
 241         int32_t length;
 242         if(totalSize < capacity) {
 243             length = utrie2_serialize(data.trie, dest + totalSize,
 244                                       capacity - totalSize, &errorCode2);
 245         } else {
 246             length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
 247         }
 248         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
 249             errorCode = errorCode2;
 250             return 0;
 251         }
 252         // The trie size should be a multiple of 8 bytes due to the way
 253         // compactIndex2(UNewTrie2 *trie) currently works.
 254         U_ASSERT((length & 7) == 0);
 255         totalSize += length;
 256     }
 257
 258     indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
 259     indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
 260     if(hasMappings && data.cesLength != 0) {
 261         U_ASSERT(((headerSize + totalSize) & 7) == 0);
 262         totalSize += data.cesLength * 8;
 263     }
 264
 265     indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
 266     indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
 267     if(hasMappings) {
 268         totalSize += data.ce32sLength * 4;
 269     }
 270
 271     indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
 272     totalSize += rootElementsLength * 4;
 273
 274     indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
 275     if(hasMappings) {
 276         totalSize += data.contextsLength * 2;
 277     }
 278
 279     indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
 280     if(hasMappings && !unsafeBackwardSet.isEmpty()) {
 281         UErrorCode errorCode2 = U_ZERO_ERROR;
 282         int32_t length;
 283         if(totalSize < capacity) {
 284             uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
 285             length = unsafeBackwardSet.serialize(
 286                     p, (capacity - totalSize) / 2, errorCode2);
 287         } else {
 288             length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
 289         }
 290         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
 291             errorCode = errorCode2;
 292             return 0;
 293         }
 294         totalSize += length * 2;
 295     }
 296
 297     indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
 298     totalSize += fastLatinTableLength * 2;
 299
 300     UnicodeString scripts;
 301     indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
 302     if(isBase) {
 303         scripts.append((UChar)data.numScripts);
 304         scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
 305         scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
 306         totalSize += scripts.length() * 2;
 307     }
 308
 309     indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
 310     if(isBase) {
 311         totalSize += 256;
 312     }
 313
 314     indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
 315     indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
 316
 317     if(totalSize > capacity) {
 318         errorCode = U_BUFFER_OVERFLOW_ERROR;
 319         return headerSize + totalSize;
 320     }
 321
 322     uprv_memcpy(dest, indexes, indexesLength * 4);
 323     copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
 324     copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
 325     // The trie has already been serialized into the dest buffer.
 326     copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
 327     copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
 328     copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
 329     copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
 330     // The unsafeBackwardSet has already been serialized into the dest buffer.
 331     copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
 332     copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
 333     copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
 334
 335     return headerSize + totalSize;
 336 }
 337
 338 void
 339 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
 340                               const void *src, uint8_t *dest) {
 341     int32_t start = indexes[startIndex];
 342     int32_t limit = indexes[startIndex + 1];
 343     if(start < limit) {
 344         uprv_memcpy(dest + start, src, limit - start);
 345     }
 346 }
 347
 348 U_NAMESPACE_END
 349
 350 #endif  // !UCONFIG_NO_COLLATION