/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatareader.cpp
*
* created on: 2013feb07
* created by: Markus W. Scherer
*/
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationdatareader.h"
23 #include "collationfastlatin.h"
24 #include "collationkeys.h"
25 #include "collationrootelements.h"
26 #include "collationsettings.h"
27 #include "collationtailoring.h"
28 #include "normalizer2impl.h"
// Number of elements in a statically sized array, as an int32_t.
// Only meaningful for true arrays: a pointer argument would silently yield
// sizeof(pointer)/sizeof(element) instead.
// The expansion is parenthesized as a whole so that it composes safely
// inside larger expressions.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
/**
 * Bounds-checked read of one slot of an indexes[] array.
 *
 * @param indexes the array of int32_t index values
 * @param length  the number of valid slots in indexes
 * @param i       the slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Reject negative slots as well as slots beyond the end, so that a bad
    // index can never read in front of the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
46 CollationDataReader::read(const CollationTailoring
*base
, const uint8_t *inBytes
, int32_t inLength
,
47 CollationTailoring
&tailoring
, UErrorCode
&errorCode
) {
48 if(U_FAILURE(errorCode
)) { return; }
50 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 24)) {
51 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
54 const DataHeader
*header
= reinterpret_cast<const DataHeader
*>(inBytes
);
55 if(!(header
->dataHeader
.magic1
== 0xda && header
->dataHeader
.magic2
== 0x27 &&
56 isAcceptable(tailoring
.version
, NULL
, NULL
, &header
->info
))) {
57 errorCode
= U_INVALID_FORMAT_ERROR
;
60 if(base
->getUCAVersion() != tailoring
.getUCAVersion()) {
61 errorCode
= U_COLLATOR_VERSION_MISMATCH
;
64 int32_t headerLength
= header
->dataHeader
.headerSize
;
65 inBytes
+= headerLength
;
67 inLength
-= headerLength
;
71 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 8)) {
72 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
75 const int32_t *inIndexes
= reinterpret_cast<const int32_t *>(inBytes
);
76 int32_t indexesLength
= inIndexes
[IX_INDEXES_LENGTH
];
77 if(indexesLength
< 2 || (0 <= inLength
&& inLength
< indexesLength
* 4)) {
78 errorCode
= U_INVALID_FORMAT_ERROR
; // Not enough indexes.
82 // Assume that the tailoring data is in initial state,
83 // with NULL pointers and 0 lengths.
85 // Set pointers to non-empty data parts.
86 // Do this in order of their byte offsets. (Should help porting to Java.)
88 int32_t index
; // one of the indexes[] slots
89 int32_t offset
; // byte offset for the index part
90 int32_t length
; // number of bytes in the index part
92 if(indexesLength
> IX_TOTAL_SIZE
) {
93 length
= inIndexes
[IX_TOTAL_SIZE
];
94 } else if(indexesLength
> IX_REORDER_CODES_OFFSET
) {
95 length
= inIndexes
[indexesLength
- 1];
97 length
= 0; // only indexes, and inLength was already checked for them
99 if(0 <= inLength
&& inLength
< length
) {
100 errorCode
= U_INVALID_FORMAT_ERROR
;
104 const CollationData
*baseData
= base
== NULL
? NULL
: base
->data
;
105 const int32_t *reorderCodes
= NULL
;
106 int32_t reorderCodesLength
= 0;
107 index
= IX_REORDER_CODES_OFFSET
;
108 offset
= getIndex(inIndexes
, indexesLength
, index
);
109 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
111 if(baseData
== NULL
) {
112 // We assume for collation settings that
113 // the base data does not have a reordering.
114 errorCode
= U_INVALID_FORMAT_ERROR
;
117 reorderCodes
= reinterpret_cast<const int32_t *>(inBytes
+ offset
);
118 reorderCodesLength
= length
/ 4;
121 // There should be a reorder table only if there are reorder codes.
122 // However, when there are reorder codes the reorder table may be omitted to reduce
124 const uint8_t *reorderTable
= NULL
;
125 index
= IX_REORDER_TABLE_OFFSET
;
126 offset
= getIndex(inIndexes
, indexesLength
, index
);
127 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
129 if(reorderCodesLength
== 0) {
130 errorCode
= U_INVALID_FORMAT_ERROR
; // Reordering table without reordering codes.
133 reorderTable
= inBytes
+ offset
;
135 // If we have reorder codes, then build the reorderTable at the end,
136 // when the CollationData is otherwise complete.
139 if(baseData
!= NULL
&& baseData
->numericPrimary
!= (inIndexes
[IX_OPTIONS
] & 0xff000000)) {
140 errorCode
= U_INVALID_FORMAT_ERROR
;
143 CollationData
*data
= NULL
; // Remains NULL if there are no mappings.
145 index
= IX_TRIE_OFFSET
;
146 offset
= getIndex(inIndexes
, indexesLength
, index
);
147 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
149 if(!tailoring
.ensureOwnedData(errorCode
)) { return; }
150 data
= tailoring
.ownedData
;
151 data
->base
= baseData
;
152 data
->numericPrimary
= inIndexes
[IX_OPTIONS
] & 0xff000000;
153 data
->trie
= tailoring
.trie
= utrie2_openFromSerialized(
154 UTRIE2_32_VALUE_BITS
, inBytes
+ offset
, length
, NULL
,
156 if(U_FAILURE(errorCode
)) { return; }
157 } else if(baseData
!= NULL
) {
158 // Use the base data. Only the settings are tailored.
159 tailoring
.data
= baseData
;
161 errorCode
= U_INVALID_FORMAT_ERROR
; // No mappings.
165 index
= IX_CES_OFFSET
;
166 offset
= getIndex(inIndexes
, indexesLength
, index
);
167 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
170 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ces without tailored trie.
173 data
->ces
= reinterpret_cast<const int64_t *>(inBytes
+ offset
);
174 data
->cesLength
= length
/ 8;
177 index
= IX_CE32S_OFFSET
;
178 offset
= getIndex(inIndexes
, indexesLength
, index
);
179 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
182 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ce32s without tailored trie.
185 data
->ce32s
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
186 data
->ce32sLength
= length
/ 4;
189 int32_t jamoCE32sStart
= getIndex(inIndexes
, indexesLength
, IX_JAMO_CE32S_START
);
190 if(jamoCE32sStart
>= 0) {
191 if(data
== NULL
|| data
->ce32s
== NULL
) {
192 errorCode
= U_INVALID_FORMAT_ERROR
; // Index into non-existent ce32s[].
195 data
->jamoCE32s
= data
->ce32s
+ jamoCE32sStart
;
196 } else if(data
== NULL
) {
198 } else if(baseData
!= NULL
) {
199 data
->jamoCE32s
= baseData
->jamoCE32s
;
201 errorCode
= U_INVALID_FORMAT_ERROR
; // No Jamo CE32s for Hangul processing.
205 index
= IX_ROOT_ELEMENTS_OFFSET
;
206 offset
= getIndex(inIndexes
, indexesLength
, index
);
207 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
210 if(data
== NULL
|| length
<= CollationRootElements::IX_SEC_TER_BOUNDARIES
) {
211 errorCode
= U_INVALID_FORMAT_ERROR
;
214 data
->rootElements
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
215 data
->rootElementsLength
= length
;
216 uint32_t commonSecTer
= data
->rootElements
[CollationRootElements::IX_COMMON_SEC_AND_TER_CE
];
217 if(commonSecTer
!= Collation::COMMON_SEC_AND_TER_CE
) {
218 errorCode
= U_INVALID_FORMAT_ERROR
;
221 uint32_t secTerBoundaries
= data
->rootElements
[CollationRootElements::IX_SEC_TER_BOUNDARIES
];
222 if((secTerBoundaries
>> 24) < CollationKeys::SEC_COMMON_HIGH
) {
223 // [fixed last secondary common byte] is too low,
224 // and secondary weights would collide with compressed common secondaries.
225 errorCode
= U_INVALID_FORMAT_ERROR
;
230 index
= IX_CONTEXTS_OFFSET
;
231 offset
= getIndex(inIndexes
, indexesLength
, index
);
232 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
235 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored contexts without tailored trie.
238 data
->contexts
= reinterpret_cast<const UChar
*>(inBytes
+ offset
);
239 data
->contextsLength
= length
/ 2;
242 index
= IX_UNSAFE_BWD_OFFSET
;
243 offset
= getIndex(inIndexes
, indexesLength
, index
);
244 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
247 errorCode
= U_INVALID_FORMAT_ERROR
;
250 if(baseData
== NULL
) {
251 // Create the unsafe-backward set for the root collator.
252 // Include all non-zero combining marks and trail surrogates.
253 // We do this at load time, rather than at build time,
254 // to simplify Unicode version bootstrapping:
255 // The root data builder only needs the new FractionalUCA.txt data,
256 // but it need not be built with a version of ICU already updated to
257 // the corresponding new Unicode Character Database.
259 // The following is an optimized version of
260 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
261 // It is faster and requires fewer code dependencies.
262 tailoring
.unsafeBackwardSet
= new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
263 if(tailoring
.unsafeBackwardSet
== NULL
) {
264 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
267 data
->nfcImpl
.addLcccChars(*tailoring
.unsafeBackwardSet
);
269 // Clone the root collator's set contents.
270 tailoring
.unsafeBackwardSet
= static_cast<UnicodeSet
*>(
271 baseData
->unsafeBackwardSet
->cloneAsThawed());
272 if(tailoring
.unsafeBackwardSet
== NULL
) {
273 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
277 // Add the ranges from the data file to the unsafe-backward set.
279 const uint16_t *unsafeData
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
280 if(!uset_getSerializedSet(&sset
, unsafeData
, length
/ 2)) {
281 errorCode
= U_INVALID_FORMAT_ERROR
;
284 int32_t count
= uset_getSerializedRangeCount(&sset
);
285 for(int32_t i
= 0; i
< count
; ++i
) {
287 uset_getSerializedRange(&sset
, i
, &start
, &end
);
288 tailoring
.unsafeBackwardSet
->add(start
, end
);
290 // Mark each lead surrogate as "unsafe"
291 // if any of its 1024 associated supplementary code points is "unsafe".
293 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
, c
+= 0x400) {
294 if(!tailoring
.unsafeBackwardSet
->containsNone(c
, c
+ 0x3ff)) {
295 tailoring
.unsafeBackwardSet
->add(lead
);
298 tailoring
.unsafeBackwardSet
->freeze();
299 data
->unsafeBackwardSet
= tailoring
.unsafeBackwardSet
;
300 } else if(data
== NULL
) {
302 } else if(baseData
!= NULL
) {
303 // No tailoring-specific data: Alias the root collator's set.
304 data
->unsafeBackwardSet
= baseData
->unsafeBackwardSet
;
306 errorCode
= U_INVALID_FORMAT_ERROR
; // No unsafeBackwardSet.
310 // If the fast Latin format version is different,
311 // or the version is set to 0 for "no fast Latin table",
312 // then just always use the normal string comparison path.
314 data
->fastLatinTable
= NULL
;
315 data
->fastLatinTableLength
= 0;
316 if(((inIndexes
[IX_OPTIONS
] >> 16) & 0xff) == CollationFastLatin::VERSION
) {
317 index
= IX_FAST_LATIN_TABLE_OFFSET
;
318 offset
= getIndex(inIndexes
, indexesLength
, index
);
319 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
321 data
->fastLatinTable
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
322 data
->fastLatinTableLength
= length
/ 2;
323 if((*data
->fastLatinTable
>> 8) != CollationFastLatin::VERSION
) {
324 errorCode
= U_INVALID_FORMAT_ERROR
; // header vs. table version mismatch
327 } else if(baseData
!= NULL
) {
328 data
->fastLatinTable
= baseData
->fastLatinTable
;
329 data
->fastLatinTableLength
= baseData
->fastLatinTableLength
;
334 index
= IX_SCRIPTS_OFFSET
;
335 offset
= getIndex(inIndexes
, indexesLength
, index
);
336 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
339 errorCode
= U_INVALID_FORMAT_ERROR
;
342 data
->scripts
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
343 data
->scriptsLength
= length
/ 2;
344 } else if(data
== NULL
) {
346 } else if(baseData
!= NULL
) {
347 data
->scripts
= baseData
->scripts
;
348 data
->scriptsLength
= baseData
->scriptsLength
;
351 index
= IX_COMPRESSIBLE_BYTES_OFFSET
;
352 offset
= getIndex(inIndexes
, indexesLength
, index
);
353 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
356 errorCode
= U_INVALID_FORMAT_ERROR
;
359 data
->compressibleBytes
= reinterpret_cast<const UBool
*>(inBytes
+ offset
);
360 } else if(data
== NULL
) {
362 } else if(baseData
!= NULL
) {
363 data
->compressibleBytes
= baseData
->compressibleBytes
;
365 errorCode
= U_INVALID_FORMAT_ERROR
; // No compressibleBytes[].
369 const CollationSettings
&ts
= *tailoring
.settings
;
370 int32_t options
= inIndexes
[IX_OPTIONS
] & 0xffff;
371 uint16_t fastLatinPrimaries
[CollationFastLatin::LATIN_LIMIT
];
372 int32_t fastLatinOptions
= CollationFastLatin::getOptions(
373 tailoring
.data
, ts
, fastLatinPrimaries
, LENGTHOF(fastLatinPrimaries
));
374 if(options
== ts
.options
&& ts
.variableTop
!= 0 &&
375 reorderCodesLength
== ts
.reorderCodesLength
&&
376 uprv_memcmp(reorderCodes
, ts
.reorderCodes
, reorderCodesLength
* 4) == 0 &&
377 fastLatinOptions
== ts
.fastLatinOptions
&&
378 (fastLatinOptions
< 0 ||
379 uprv_memcmp(fastLatinPrimaries
, ts
.fastLatinPrimaries
,
380 sizeof(fastLatinPrimaries
)) == 0)) {
384 CollationSettings
*settings
= SharedObject::copyOnWrite(tailoring
.settings
);
385 if(settings
== NULL
) {
386 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
389 settings
->options
= options
;
390 // Set variableTop from options and scripts data.
391 settings
->variableTop
= tailoring
.data
->getLastPrimaryForGroup(
392 UCOL_REORDER_CODE_FIRST
+ settings
->getMaxVariable());
393 if(settings
->variableTop
== 0) {
394 errorCode
= U_INVALID_FORMAT_ERROR
;
398 if(reorderCodesLength
== 0 || reorderTable
!= NULL
) {
399 settings
->aliasReordering(reorderCodes
, reorderCodesLength
, reorderTable
);
402 baseData
->makeReorderTable(reorderCodes
, reorderCodesLength
, table
, errorCode
);
403 if(U_FAILURE(errorCode
)) { return; }
404 if(!settings
->setReordering(reorderCodes
, reorderCodesLength
,table
)) {
405 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
410 settings
->fastLatinOptions
= CollationFastLatin::getOptions(
411 tailoring
.data
, *settings
,
412 settings
->fastLatinPrimaries
, LENGTHOF(settings
->fastLatinPrimaries
));
416 CollationDataReader::isAcceptable(void *context
,
417 const char * /* type */, const char * /*name*/,
418 const UDataInfo
*pInfo
) {
421 pInfo
->isBigEndian
== U_IS_BIG_ENDIAN
&&
422 pInfo
->charsetFamily
== U_CHARSET_FAMILY
&&
423 pInfo
->dataFormat
[0] == 0x55 && // dataFormat="UCol"
424 pInfo
->dataFormat
[1] == 0x43 &&
425 pInfo
->dataFormat
[2] == 0x6f &&
426 pInfo
->dataFormat
[3] == 0x6c &&
427 pInfo
->formatVersion
[0] == 4
429 UVersionInfo
*version
= static_cast<UVersionInfo
*>(context
);
430 if(version
!= NULL
) {
431 uprv_memcpy(version
, pInfo
->dataVersion
, 4);
441 #endif // !UCONFIG_NO_COLLATION