2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdatawriter.cpp
8 * created on: 2013aug06
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/tblcoll.h"
17 #include "unicode/udata.h"
18 #include "unicode/uniset.h"
20 #include "collationdata.h"
21 #include "collationdatabuilder.h"
22 #include "collationdatareader.h"
23 #include "collationdatawriter.h"
24 #include "collationfastlatin.h"
25 #include "collationsettings.h"
26 #include "collationtailoring.h"
// Serializes this collator into a newly allocated byte buffer.
// `length` receives the value returned by cloneBinary(). A 20000-byte buffer
// is tried first; if cloneBinary() reports U_BUFFER_OVERFLOW_ERROR, the buffer
// is reallocated to the reported length, the error code is reset, and
// cloneBinary() is called a second time.
// Returns NULL when errorCode holds a failure on entry or after serializing;
// otherwise returns buffer.orphan().
// NOTE(review): the embedded line numbers skip (32, 36, 38-39, 44-45, 48, 51),
// so the return type and whatever guards the assignments at 37 and 43 are not
// visible in this listing -- confirm against the complete file.
33 RuleBasedCollator::cloneRuleData(int32_t &length
, UErrorCode
&errorCode
) const {
34 if(U_FAILURE(errorCode
)) { return NULL
; }
// First attempt: a fixed-size 20000-byte buffer.
35 LocalMemory
<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
37 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
40 length
= cloneBinary(buffer
.getAlias(), 20000, errorCode
);
// On overflow, `length` (as returned above) becomes the new buffer size.
41 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
42 if(buffer
.allocateInsteadAndCopy(length
, 0) == NULL
) {
43 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
// Clear the overflow status before retrying with the larger buffer.
46 errorCode
= U_ZERO_ERROR
;
47 length
= cloneBinary(buffer
.getAlias(), length
, errorCode
);
49 if(U_FAILURE(errorCode
)) { return NULL
; }
50 return buffer
.orphan();
// Writes this collator's binary form into dest (capacity bytes) by delegating
// to CollationDataWriter::writeTailoring() with *tailoring and *settings.
// The indexes array is a local scratch buffer of IX_TOTAL_SIZE + 1 entries.
// Returns whatever writeTailoring() returns.
// NOTE(review): the return type and the end of the call (remaining
// argument(s), closing parenthesis and brace) are not visible in this listing.
54 RuleBasedCollator::cloneBinary(uint8_t *dest
, int32_t capacity
, UErrorCode
&errorCode
) const {
55 int32_t indexes
[CollationDataReader::IX_TOTAL_SIZE
+ 1];
56 return CollationDataWriter::writeTailoring(
57 *tailoring
, *settings
, indexes
, dest
, capacity
,
// UDataInfo template for collation data. CollationDataWriter::write() copies
// it into header.info and then overwrites info.dataVersion with its own
// dataVersion argument.
// NOTE(review): the initializer entries on lines 62-69 and the closing brace
// are not visible in this listing.
61 static const UDataInfo dataInfo
= {
70 { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
71 { 5, 0, 0, 0 }, // formatVersion
72 { 6, 3, 0, 0 } // dataVersion
// Thin wrapper around write(): passes isBase=TRUE and a NULL dataVersion,
// and forwards rootElements/rootElementsLength, indexes, dest, capacity and
// errorCode unchanged. Returns the result of write().
// NOTE(review): the argument line 81 (presumably data and settings) and the
// return type are not visible in this listing.
76 CollationDataWriter::writeBase(const CollationData
&data
, const CollationSettings
&settings
,
77 const void *rootElements
, int32_t rootElementsLength
,
78 int32_t indexes
[], uint8_t *dest
, int32_t capacity
,
79 UErrorCode
&errorCode
) {
80 return write(TRUE
, NULL
,
82 rootElements
, rootElementsLength
,
83 indexes
, dest
, capacity
, errorCode
);
// Thin wrapper around write(): passes isBase=FALSE and the tailoring's
// version (t.version) as dataVersion, and forwards indexes, dest, capacity and
// errorCode unchanged. Returns the result of write().
// NOTE(review): the argument lines 91-92 (the data/settings and root-elements
// arguments) and the return type are not visible in this listing.
87 CollationDataWriter::writeTailoring(const CollationTailoring
&t
, const CollationSettings
&settings
,
88 int32_t indexes
[], uint8_t *dest
, int32_t capacity
,
89 UErrorCode
&errorCode
) {
90 return write(FALSE
, t
.version
,
93 indexes
, dest
, capacity
, errorCode
);
// Serializes collation data and settings into dest and fills in indexes[].
//
// isBase       TRUE when called from writeBase(), FALSE from writeTailoring().
// dataVersion  copied into header.info.dataVersion.
// data, settings  the collation data and settings to serialize.
// rootElements, rootElementsLength  contribute rootElementsLength * 4 bytes.
// indexes      receives the indexes length, the options word, and the byte
//              offset of each data item (offsets count from the start of the
//              indexes, see the comment at line 220).
// dest, capacity  output buffer; capacity < 0, or capacity > 0 with a NULL
//              dest, sets U_ILLEGAL_ARGUMENT_ERROR.
// errorCode    in/out status; a failure on entry makes the function return 0.
//
// Returns headerSize + totalSize. When totalSize exceeds the remaining
// capacity, errorCode is set to U_BUFFER_OVERFLOW_ERROR and the same sum is
// returned.
//
// NOTE(review): many original lines are missing from this listing (the
// embedded line numbers skip), including `else` branches, closing braces and
// the declarations of hasMappings, header, headerSize and length. The added
// comments describe only what the visible lines show.
97 CollationDataWriter::write(UBool isBase
, const UVersionInfo dataVersion
,
98 const CollationData
&data
, const CollationSettings
&settings
,
99 const void *rootElements
, int32_t rootElementsLength
,
100 int32_t indexes
[], uint8_t *dest
, int32_t capacity
,
101 UErrorCode
&errorCode
) {
102 if(U_FAILURE(errorCode
)) { return 0; }
103 if(capacity
< 0 || (capacity
> 0 && dest
== NULL
)) {
104 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
108 // Figure out which data items to write before settling on
109 // the indexes length and writing offsets.
110 // For any data item, we need to write the start and limit offsets,
111 // so the indexes length must be at least index-of-start-offset + 2.
112 int32_t indexesLength
;
114 UnicodeSet unsafeBackwardSet
;
115 const CollationData
*baseData
= data
.base
;
// With a fast Latin table present, the version is shifted into bits 16 and up;
// it is OR-ed into indexes[IX_OPTIONS] further down.
117 int32_t fastLatinVersion
;
118 if(data
.fastLatinTable
!= NULL
) {
119 fastLatinVersion
= (int32_t)CollationFastLatin::VERSION
<< 16;
121 fastLatinVersion
= 0;
123 int32_t fastLatinTableLength
= 0;
126 // For the root collator, we write an even number of indexes
127 // so that we start with an 8-aligned offset.
128 indexesLength
= CollationDataReader::IX_TOTAL_SIZE
+ 1;
129 U_ASSERT(settings
.reorderCodesLength
== 0);
131 unsafeBackwardSet
= *data
.unsafeBackwardSet
;
132 fastLatinTableLength
= data
.fastLatinTableLength
;
133 } else if(baseData
== NULL
) {
135 if(settings
.reorderCodesLength
== 0) {
137 indexesLength
= CollationDataReader::IX_OPTIONS
+ 1; // no limit offset here
139 // only options, reorder codes, and the reorder table
140 indexesLength
= CollationDataReader::IX_REORDER_TABLE_OFFSET
+ 2;
144 // Tailored mappings, and what else?
145 // Check in ascending order of optional tailoring data items.
146 indexesLength
= CollationDataReader::IX_CE32S_OFFSET
+ 2;
147 if(data
.contextsLength
!= 0) {
148 indexesLength
= CollationDataReader::IX_CONTEXTS_OFFSET
+ 2;
// Start from this data's unsafe-backward set and remove what the base data's
// set already contains.
150 unsafeBackwardSet
.addAll(*data
.unsafeBackwardSet
).removeAll(*baseData
->unsafeBackwardSet
);
151 if(!unsafeBackwardSet
.isEmpty()) {
152 indexesLength
= CollationDataReader::IX_UNSAFE_BWD_OFFSET
+ 2;
// A fast Latin table pointer different from the base's: its length is
// recorded so that the table is counted in totalSize below.
154 if(data
.fastLatinTable
!= baseData
->fastLatinTable
) {
155 fastLatinTableLength
= data
.fastLatinTableLength
;
156 indexesLength
= CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET
+ 2;
160 UVector32
codesAndRanges(errorCode
);
161 const int32_t *reorderCodes
= settings
.reorderCodes
;
162 int32_t reorderCodesLength
= settings
.reorderCodesLength
;
163 if(settings
.hasReordering() &&
164 CollationSettings::reorderTableHasSplitBytes(settings
.reorderTable
)) {
165 // Rebuild the full list of reorder ranges.
166 // The list in the settings is truncated for efficiency.
167 data
.makeReorderRanges(reorderCodes
, reorderCodesLength
, codesAndRanges
, errorCode
);
168 // Write the codes, then the ranges.
169 for(int32_t i
= 0; i
< reorderCodesLength
; ++i
) {
170 codesAndRanges
.insertElementAt(reorderCodes
[i
], i
, errorCode
);
172 if(U_FAILURE(errorCode
)) { return 0; }
// From here on, reorderCodes/reorderCodesLength refer to the combined
// codes-and-ranges list rather than to the settings' own array.
173 reorderCodes
= codesAndRanges
.getBuffer();
174 reorderCodesLength
= codesAndRanges
.size();
179 headerSize
= 0; // udata_create() writes the header
// Fill in the header: magic bytes, the dataInfo template, then this data's
// version over the template's dataVersion.
182 header
.dataHeader
.magic1
= 0xda;
183 header
.dataHeader
.magic2
= 0x27;
184 uprv_memcpy(&header
.info
, &dataInfo
, sizeof(UDataInfo
));
185 uprv_memcpy(header
.info
.dataVersion
, dataVersion
, sizeof(UVersionInfo
));
186 headerSize
= (int32_t)sizeof(header
);
187 U_ASSERT((headerSize
& 3) == 0); // multiple of 4 bytes
188 if(hasMappings
&& data
.cesLength
!= 0) {
189 // Sum of the sizes of the data items which are
190 // not automatically multiples of 8 bytes and which are placed before the CEs.
191 int32_t sum
= headerSize
+ (indexesLength
+ reorderCodesLength
) * 4;
193 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
194 // We add to the header size here.
195 // Alternatively, we could increment the indexesLength
196 // or add a few bytes to the reorderTable.
200 header
.dataHeader
.headerSize
= (uint16_t)headerSize
;
201 if(headerSize
<= capacity
) {
202 uprv_memcpy(dest
, &header
, sizeof(header
));
203 // Write 00 bytes so that the padding is not mistaken for a copyright string.
204 uprv_memset(dest
+ sizeof(header
), 0, headerSize
- (int32_t)sizeof(header
));
// The remaining capacity is what is left for the indexes and data items.
206 capacity
-= headerSize
;
213 indexes
[CollationDataReader::IX_INDEXES_LENGTH
] = indexesLength
;
// The assertion keeps settings.options within the low 16 bits;
// fastLatinVersion was shifted left by 16 above.
214 U_ASSERT((settings
.options
& ~0xffff) == 0);
215 indexes
[CollationDataReader::IX_OPTIONS
] =
216 data
.numericPrimary
| fastLatinVersion
| settings
.options
;
217 indexes
[CollationDataReader::IX_RESERVED2
] = 0;
218 indexes
[CollationDataReader::IX_RESERVED3
] = 0;
220 // Byte offsets of data items all start from the start of the indexes.
221 // We add the headerSize at the very end.
222 int32_t totalSize
= indexesLength
* 4;
// The Jamo CE32s start is stored as an element offset from data.ce32s,
// or -1 when it is not written.
224 if(hasMappings
&& (isBase
|| data
.jamoCE32s
!= baseData
->jamoCE32s
)) {
225 indexes
[CollationDataReader::IX_JAMO_CE32S_START
] = data
.jamoCE32s
- data
.ce32s
;
227 indexes
[CollationDataReader::IX_JAMO_CE32S_START
] = -1;
230 indexes
[CollationDataReader::IX_REORDER_CODES_OFFSET
] = totalSize
;
231 totalSize
+= reorderCodesLength
* 4;
233 indexes
[CollationDataReader::IX_REORDER_TABLE_OFFSET
] = totalSize
;
234 if(settings
.reorderTable
!= NULL
) {
238 indexes
[CollationDataReader::IX_TRIE_OFFSET
] = totalSize
;
// A separate status is used for trie serialization so that
// U_BUFFER_OVERFLOW_ERROR from it is not copied into errorCode here;
// only other failures are propagated.
240 UErrorCode errorCode2
= U_ZERO_ERROR
;
// With room left, serialize straight into dest; otherwise call with NULL/0
// (presumably to obtain the needed length -- confirm against utrie2 docs).
242 if(totalSize
< capacity
) {
243 length
= utrie2_serialize(data
.trie
, dest
+ totalSize
,
244 capacity
- totalSize
, &errorCode2
);
246 length
= utrie2_serialize(data
.trie
, NULL
, 0, &errorCode2
);
248 if(U_FAILURE(errorCode2
) && errorCode2
!= U_BUFFER_OVERFLOW_ERROR
) {
249 errorCode
= errorCode2
;
252 // The trie size should be a multiple of 8 bytes due to the way
253 // compactIndex2(UNewTrie2 *trie) currently works.
254 U_ASSERT((length
& 7) == 0);
258 indexes
[CollationDataReader::IX_RESERVED8_OFFSET
] = totalSize
;
259 indexes
[CollationDataReader::IX_CES_OFFSET
] = totalSize
;
260 if(hasMappings
&& data
.cesLength
!= 0) {
// CEs take 8 bytes each; the assertion checks their absolute offset
// (header included) is 8-aligned.
261 U_ASSERT(((headerSize
+ totalSize
) & 7) == 0);
262 totalSize
+= data
.cesLength
* 8;
265 indexes
[CollationDataReader::IX_RESERVED10_OFFSET
] = totalSize
;
266 indexes
[CollationDataReader::IX_CE32S_OFFSET
] = totalSize
;
268 totalSize
+= data
.ce32sLength
* 4;
271 indexes
[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET
] = totalSize
;
272 totalSize
+= rootElementsLength
* 4;
274 indexes
[CollationDataReader::IX_CONTEXTS_OFFSET
] = totalSize
;
276 totalSize
+= data
.contextsLength
* 2;
279 indexes
[CollationDataReader::IX_UNSAFE_BWD_OFFSET
] = totalSize
;
280 if(hasMappings
&& !unsafeBackwardSet
.isEmpty()) {
281 UErrorCode errorCode2
= U_ZERO_ERROR
;
// Same pattern as for the trie. The set is serialized as 16-bit units:
// the capacity passed is in units of 2 bytes and length * 2 bytes are added.
283 if(totalSize
< capacity
) {
284 uint16_t *p
= reinterpret_cast<uint16_t *>(dest
+ totalSize
);
285 length
= unsafeBackwardSet
.serialize(
286 p
, (capacity
- totalSize
) / 2, errorCode2
);
288 length
= unsafeBackwardSet
.serialize(NULL
, 0, errorCode2
);
290 if(U_FAILURE(errorCode2
) && errorCode2
!= U_BUFFER_OVERFLOW_ERROR
) {
291 errorCode
= errorCode2
;
294 totalSize
+= length
* 2;
297 indexes
[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET
] = totalSize
;
298 totalSize
+= fastLatinTableLength
* 2;
300 UnicodeString scripts
;
301 indexes
[CollationDataReader::IX_SCRIPTS_OFFSET
] = totalSize
;
// The scripts data is assembled as one UChar string: numScripts, then
// numScripts + 16 units of scriptsIndex, then scriptStartsLength units of
// scriptStarts.
303 scripts
.append((UChar
)data
.numScripts
);
304 scripts
.append(reinterpret_cast<const UChar
*>(data
.scriptsIndex
), data
.numScripts
+ 16);
305 scripts
.append(reinterpret_cast<const UChar
*>(data
.scriptStarts
), data
.scriptStartsLength
);
306 totalSize
+= scripts
.length() * 2;
309 indexes
[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET
] = totalSize
;
314 indexes
[CollationDataReader::IX_RESERVED18_OFFSET
] = totalSize
;
315 indexes
[CollationDataReader::IX_TOTAL_SIZE
] = totalSize
;
// Not enough room: report overflow but still return the full size.
317 if(totalSize
> capacity
) {
318 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
319 return headerSize
+ totalSize
;
// Copy the indexes, then each data item to the offset recorded for it in
// indexes[] (copyData() reads the start and limit offsets from there).
322 uprv_memcpy(dest
, indexes
, indexesLength
* 4);
323 copyData(indexes
, CollationDataReader::IX_REORDER_CODES_OFFSET
, reorderCodes
, dest
);
324 copyData(indexes
, CollationDataReader::IX_REORDER_TABLE_OFFSET
, settings
.reorderTable
, dest
);
325 // The trie has already been serialized into the dest buffer.
326 copyData(indexes
, CollationDataReader::IX_CES_OFFSET
, data
.ces
, dest
);
327 copyData(indexes
, CollationDataReader::IX_CE32S_OFFSET
, data
.ce32s
, dest
);
328 copyData(indexes
, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET
, rootElements
, dest
);
329 copyData(indexes
, CollationDataReader::IX_CONTEXTS_OFFSET
, data
.contexts
, dest
);
330 // The unsafeBackwardSet has already been serialized into the dest buffer.
331 copyData(indexes
, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET
, data
.fastLatinTable
, dest
);
332 copyData(indexes
, CollationDataReader::IX_SCRIPTS_OFFSET
, scripts
.getBuffer(), dest
);
333 copyData(indexes
, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET
, data
.compressibleBytes
, dest
);
335 return headerSize
+ totalSize
;
// Copies one data item into the output buffer.
// The item's byte range is read from indexes[]: it starts at
// indexes[startIndex] and ends before indexes[startIndex + 1]. The
// (limit - start) bytes at src are copied to dest + start.
// NOTE(review): line 343 and the return type are missing from this listing;
// whether the copy is guarded (e.g. for an empty range) cannot be seen here.
339 CollationDataWriter::copyData(const int32_t indexes
[], int32_t startIndex
,
340 const void *src
, uint8_t *dest
) {
341 int32_t start
= indexes
[startIndex
];
342 int32_t limit
= indexes
[startIndex
+ 1];
344 uprv_memcpy(dest
+ start
, src
, limit
- start
);
350 #endif // !UCONFIG_NO_COLLATION