// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatawriter.cpp
*
* created on: 2013aug06
* created by: Markus W. Scherer
*/
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
#include "unicode/tblcoll.h"
#include "unicode/udata.h"
#include "unicode/uniset.h"
#include "cmemory.h"
#include "collationdata.h"
#include "collationdatabuilder.h"
#include "collationdatareader.h"
#include "collationdatawriter.h"
#include "collationfastlatin.h"
#include "collationsettings.h"
#include "collationtailoring.h"
#include "uassert.h"
#include "ucmndata.h"
35 RuleBasedCollator::cloneRuleData(int32_t &length
, UErrorCode
&errorCode
) const {
36 if(U_FAILURE(errorCode
)) { return NULL
; }
37 LocalMemory
<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
39 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
42 length
= cloneBinary(buffer
.getAlias(), 20000, errorCode
);
43 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
44 if(buffer
.allocateInsteadAndCopy(length
, 0) == NULL
) {
45 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
48 errorCode
= U_ZERO_ERROR
;
49 length
= cloneBinary(buffer
.getAlias(), length
, errorCode
);
51 if(U_FAILURE(errorCode
)) { return NULL
; }
52 return buffer
.orphan();
56 RuleBasedCollator::cloneBinary(uint8_t *dest
, int32_t capacity
, UErrorCode
&errorCode
) const {
57 int32_t indexes
[CollationDataReader::IX_TOTAL_SIZE
+ 1];
58 return CollationDataWriter::writeTailoring(
59 *tailoring
, *settings
, indexes
, dest
, capacity
,
63 static const UDataInfo dataInfo
= {
72 { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
73 { 5, 0, 0, 0 }, // formatVersion
74 { 6, 3, 0, 0 } // dataVersion
78 CollationDataWriter::writeBase(const CollationData
&data
, const CollationSettings
&settings
,
79 const void *rootElements
, int32_t rootElementsLength
,
80 int32_t indexes
[], uint8_t *dest
, int32_t capacity
,
81 UErrorCode
&errorCode
) {
82 return write(TRUE
, NULL
,
84 rootElements
, rootElementsLength
,
85 indexes
, dest
, capacity
, errorCode
);
89 CollationDataWriter::writeTailoring(const CollationTailoring
&t
, const CollationSettings
&settings
,
90 int32_t indexes
[], uint8_t *dest
, int32_t capacity
,
91 UErrorCode
&errorCode
) {
92 return write(FALSE
, t
.version
,
95 indexes
, dest
, capacity
, errorCode
);
99 CollationDataWriter::write(UBool isBase
, const UVersionInfo dataVersion
,
100 const CollationData
&data
, const CollationSettings
&settings
,
101 const void *rootElements
, int32_t rootElementsLength
,
102 int32_t indexes
[], uint8_t *dest
, int32_t capacity
,
103 UErrorCode
&errorCode
) {
104 if(U_FAILURE(errorCode
)) { return 0; }
105 if(capacity
< 0 || (capacity
> 0 && dest
== NULL
)) {
106 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
110 // Figure out which data items to write before settling on
111 // the indexes length and writing offsets.
112 // For any data item, we need to write the start and limit offsets,
113 // so the indexes length must be at least index-of-start-offset + 2.
114 int32_t indexesLength
;
116 UnicodeSet unsafeBackwardSet
;
117 const CollationData
*baseData
= data
.base
;
119 int32_t fastLatinVersion
;
120 if(data
.fastLatinTable
!= NULL
) {
121 fastLatinVersion
= (int32_t)CollationFastLatin::VERSION
<< 16;
123 fastLatinVersion
= 0;
125 int32_t fastLatinTableLength
= 0;
128 // For the root collator, we write an even number of indexes
129 // so that we start with an 8-aligned offset.
130 indexesLength
= CollationDataReader::IX_TOTAL_SIZE
+ 1;
131 U_ASSERT(settings
.reorderCodesLength
== 0);
133 unsafeBackwardSet
= *data
.unsafeBackwardSet
;
134 fastLatinTableLength
= data
.fastLatinTableLength
;
135 } else if(baseData
== NULL
) {
137 if(settings
.reorderCodesLength
== 0) {
139 indexesLength
= CollationDataReader::IX_OPTIONS
+ 1; // no limit offset here
141 // only options, reorder codes, and the reorder table
142 indexesLength
= CollationDataReader::IX_REORDER_TABLE_OFFSET
+ 2;
146 // Tailored mappings, and what else?
147 // Check in ascending order of optional tailoring data items.
148 indexesLength
= CollationDataReader::IX_CE32S_OFFSET
+ 2;
149 if(data
.contextsLength
!= 0) {
150 indexesLength
= CollationDataReader::IX_CONTEXTS_OFFSET
+ 2;
152 unsafeBackwardSet
.addAll(*data
.unsafeBackwardSet
).removeAll(*baseData
->unsafeBackwardSet
);
153 if(!unsafeBackwardSet
.isEmpty()) {
154 indexesLength
= CollationDataReader::IX_UNSAFE_BWD_OFFSET
+ 2;
156 if(data
.fastLatinTable
!= baseData
->fastLatinTable
) {
157 fastLatinTableLength
= data
.fastLatinTableLength
;
158 indexesLength
= CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET
+ 2;
162 UVector32
codesAndRanges(errorCode
);
163 const int32_t *reorderCodes
= settings
.reorderCodes
;
164 int32_t reorderCodesLength
= settings
.reorderCodesLength
;
165 if(settings
.hasReordering() &&
166 CollationSettings::reorderTableHasSplitBytes(settings
.reorderTable
)) {
167 // Rebuild the full list of reorder ranges.
168 // The list in the settings is truncated for efficiency.
169 data
.makeReorderRanges(reorderCodes
, reorderCodesLength
, codesAndRanges
, errorCode
);
170 // Write the codes, then the ranges.
171 for(int32_t i
= 0; i
< reorderCodesLength
; ++i
) {
172 codesAndRanges
.insertElementAt(reorderCodes
[i
], i
, errorCode
);
174 if(U_FAILURE(errorCode
)) { return 0; }
175 reorderCodes
= codesAndRanges
.getBuffer();
176 reorderCodesLength
= codesAndRanges
.size();
181 headerSize
= 0; // udata_create() writes the header
184 header
.dataHeader
.magic1
= 0xda;
185 header
.dataHeader
.magic2
= 0x27;
186 uprv_memcpy(&header
.info
, &dataInfo
, sizeof(UDataInfo
));
187 uprv_memcpy(header
.info
.dataVersion
, dataVersion
, sizeof(UVersionInfo
));
188 headerSize
= (int32_t)sizeof(header
);
189 U_ASSERT((headerSize
& 3) == 0); // multiple of 4 bytes
190 if(hasMappings
&& data
.cesLength
!= 0) {
191 // Sum of the sizes of the data items which are
192 // not automatically multiples of 8 bytes and which are placed before the CEs.
193 int32_t sum
= headerSize
+ (indexesLength
+ reorderCodesLength
) * 4;
195 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
196 // We add to the header size here.
197 // Alternatively, we could increment the indexesLength
198 // or add a few bytes to the reorderTable.
202 header
.dataHeader
.headerSize
= (uint16_t)headerSize
;
203 if(headerSize
<= capacity
) {
204 uprv_memcpy(dest
, &header
, sizeof(header
));
205 // Write 00 bytes so that the padding is not mistaken for a copyright string.
206 uprv_memset(dest
+ sizeof(header
), 0, headerSize
- (int32_t)sizeof(header
));
208 capacity
-= headerSize
;
215 indexes
[CollationDataReader::IX_INDEXES_LENGTH
] = indexesLength
;
216 U_ASSERT((settings
.options
& ~0xffff) == 0);
217 indexes
[CollationDataReader::IX_OPTIONS
] =
218 data
.numericPrimary
| fastLatinVersion
| settings
.options
;
219 indexes
[CollationDataReader::IX_RESERVED2
] = 0;
220 indexes
[CollationDataReader::IX_RESERVED3
] = 0;
222 // Byte offsets of data items all start from the start of the indexes.
223 // We add the headerSize at the very end.
224 int32_t totalSize
= indexesLength
* 4;
226 if(hasMappings
&& (isBase
|| data
.jamoCE32s
!= baseData
->jamoCE32s
)) {
227 indexes
[CollationDataReader::IX_JAMO_CE32S_START
] = data
.jamoCE32s
- data
.ce32s
;
229 indexes
[CollationDataReader::IX_JAMO_CE32S_START
] = -1;
232 indexes
[CollationDataReader::IX_REORDER_CODES_OFFSET
] = totalSize
;
233 totalSize
+= reorderCodesLength
* 4;
235 indexes
[CollationDataReader::IX_REORDER_TABLE_OFFSET
] = totalSize
;
236 if(settings
.reorderTable
!= NULL
) {
240 indexes
[CollationDataReader::IX_TRIE_OFFSET
] = totalSize
;
242 UErrorCode errorCode2
= U_ZERO_ERROR
;
244 if(totalSize
< capacity
) {
245 length
= utrie2_serialize(data
.trie
, dest
+ totalSize
,
246 capacity
- totalSize
, &errorCode2
);
248 length
= utrie2_serialize(data
.trie
, NULL
, 0, &errorCode2
);
250 if(U_FAILURE(errorCode2
) && errorCode2
!= U_BUFFER_OVERFLOW_ERROR
) {
251 errorCode
= errorCode2
;
254 // The trie size should be a multiple of 8 bytes due to the way
255 // compactIndex2(UNewTrie2 *trie) currently works.
256 U_ASSERT((length
& 7) == 0);
260 indexes
[CollationDataReader::IX_RESERVED8_OFFSET
] = totalSize
;
261 indexes
[CollationDataReader::IX_CES_OFFSET
] = totalSize
;
262 if(hasMappings
&& data
.cesLength
!= 0) {
263 U_ASSERT(((headerSize
+ totalSize
) & 7) == 0);
264 totalSize
+= data
.cesLength
* 8;
267 indexes
[CollationDataReader::IX_RESERVED10_OFFSET
] = totalSize
;
268 indexes
[CollationDataReader::IX_CE32S_OFFSET
] = totalSize
;
270 totalSize
+= data
.ce32sLength
* 4;
273 indexes
[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET
] = totalSize
;
274 totalSize
+= rootElementsLength
* 4;
276 indexes
[CollationDataReader::IX_CONTEXTS_OFFSET
] = totalSize
;
278 totalSize
+= data
.contextsLength
* 2;
281 indexes
[CollationDataReader::IX_UNSAFE_BWD_OFFSET
] = totalSize
;
282 if(hasMappings
&& !unsafeBackwardSet
.isEmpty()) {
283 UErrorCode errorCode2
= U_ZERO_ERROR
;
285 if(totalSize
< capacity
) {
286 uint16_t *p
= reinterpret_cast<uint16_t *>(dest
+ totalSize
);
287 length
= unsafeBackwardSet
.serialize(
288 p
, (capacity
- totalSize
) / 2, errorCode2
);
290 length
= unsafeBackwardSet
.serialize(NULL
, 0, errorCode2
);
292 if(U_FAILURE(errorCode2
) && errorCode2
!= U_BUFFER_OVERFLOW_ERROR
) {
293 errorCode
= errorCode2
;
296 totalSize
+= length
* 2;
299 indexes
[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET
] = totalSize
;
300 totalSize
+= fastLatinTableLength
* 2;
302 UnicodeString scripts
;
303 indexes
[CollationDataReader::IX_SCRIPTS_OFFSET
] = totalSize
;
305 scripts
.append((UChar
)data
.numScripts
);
306 scripts
.append(reinterpret_cast<const UChar
*>(data
.scriptsIndex
), data
.numScripts
+ 16);
307 scripts
.append(reinterpret_cast<const UChar
*>(data
.scriptStarts
), data
.scriptStartsLength
);
308 totalSize
+= scripts
.length() * 2;
311 indexes
[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET
] = totalSize
;
316 indexes
[CollationDataReader::IX_RESERVED18_OFFSET
] = totalSize
;
317 indexes
[CollationDataReader::IX_TOTAL_SIZE
] = totalSize
;
319 if(totalSize
> capacity
) {
320 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
321 return headerSize
+ totalSize
;
324 uprv_memcpy(dest
, indexes
, indexesLength
* 4);
325 copyData(indexes
, CollationDataReader::IX_REORDER_CODES_OFFSET
, reorderCodes
, dest
);
326 copyData(indexes
, CollationDataReader::IX_REORDER_TABLE_OFFSET
, settings
.reorderTable
, dest
);
327 // The trie has already been serialized into the dest buffer.
328 copyData(indexes
, CollationDataReader::IX_CES_OFFSET
, data
.ces
, dest
);
329 copyData(indexes
, CollationDataReader::IX_CE32S_OFFSET
, data
.ce32s
, dest
);
330 copyData(indexes
, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET
, rootElements
, dest
);
331 copyData(indexes
, CollationDataReader::IX_CONTEXTS_OFFSET
, data
.contexts
, dest
);
332 // The unsafeBackwardSet has already been serialized into the dest buffer.
333 copyData(indexes
, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET
, data
.fastLatinTable
, dest
);
334 copyData(indexes
, CollationDataReader::IX_SCRIPTS_OFFSET
, scripts
.getBuffer(), dest
);
335 copyData(indexes
, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET
, data
.compressibleBytes
, dest
);
337 return headerSize
+ totalSize
;
341 CollationDataWriter::copyData(const int32_t indexes
[], int32_t startIndex
,
342 const void *src
, uint8_t *dest
) {
343 int32_t start
= indexes
[startIndex
];
344 int32_t limit
= indexes
[startIndex
+ 1];
346 uprv_memcpy(dest
+ start
, src
, limit
- start
);
352 #endif // !UCONFIG_NO_COLLATION