// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatareader.cpp
*
* created on: 2013feb07
* created by: Markus W. Scherer
*/
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/ucol.h"
19 #include "unicode/udata.h"
20 #include "unicode/uscript.h"
22 #include "collation.h"
23 #include "collationdata.h"
24 #include "collationdatareader.h"
25 #include "collationfastlatin.h"
26 #include "collationkeys.h"
27 #include "collationrootelements.h"
28 #include "collationsettings.h"
29 #include "collationtailoring.h"
30 #include "collunsafe.h"
31 #include "normalizer2impl.h"
/**
 * Bounds-checked read of one slot of an indexes[] array.
 *
 * @param indexes the array of int32_t index values
 * @param length  number of valid slots in indexes
 * @param i       slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Check both ends of the range so that a negative slot number
    // cannot read memory in front of the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
47 CollationDataReader::read(const CollationTailoring
*base
, const uint8_t *inBytes
, int32_t inLength
,
48 CollationTailoring
&tailoring
, UErrorCode
&errorCode
) {
49 if(U_FAILURE(errorCode
)) { return; }
51 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 24)) {
52 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
55 const DataHeader
*header
= reinterpret_cast<const DataHeader
*>(inBytes
);
56 if(!(header
->dataHeader
.magic1
== 0xda && header
->dataHeader
.magic2
== 0x27 &&
57 isAcceptable(tailoring
.version
, NULL
, NULL
, &header
->info
))) {
58 errorCode
= U_INVALID_FORMAT_ERROR
;
61 if(base
->getUCAVersion() != tailoring
.getUCAVersion()) {
62 errorCode
= U_COLLATOR_VERSION_MISMATCH
;
65 int32_t headerLength
= header
->dataHeader
.headerSize
;
66 inBytes
+= headerLength
;
68 inLength
-= headerLength
;
72 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 8)) {
73 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
76 const int32_t *inIndexes
= reinterpret_cast<const int32_t *>(inBytes
);
77 int32_t indexesLength
= inIndexes
[IX_INDEXES_LENGTH
];
78 if(indexesLength
< 2 || (0 <= inLength
&& inLength
< indexesLength
* 4)) {
79 errorCode
= U_INVALID_FORMAT_ERROR
; // Not enough indexes.
83 // Assume that the tailoring data is in initial state,
84 // with NULL pointers and 0 lengths.
86 // Set pointers to non-empty data parts.
87 // Do this in order of their byte offsets. (Should help porting to Java.)
89 int32_t index
; // one of the indexes[] slots
90 int32_t offset
; // byte offset for the index part
91 int32_t length
; // number of bytes in the index part
93 if(indexesLength
> IX_TOTAL_SIZE
) {
94 length
= inIndexes
[IX_TOTAL_SIZE
];
95 } else if(indexesLength
> IX_REORDER_CODES_OFFSET
) {
96 length
= inIndexes
[indexesLength
- 1];
98 length
= 0; // only indexes, and inLength was already checked for them
100 if(0 <= inLength
&& inLength
< length
) {
101 errorCode
= U_INVALID_FORMAT_ERROR
;
105 const CollationData
*baseData
= base
== NULL
? NULL
: base
->data
;
106 const int32_t *reorderCodes
= NULL
;
107 int32_t reorderCodesLength
= 0;
108 const uint32_t *reorderRanges
= NULL
;
109 int32_t reorderRangesLength
= 0;
110 index
= IX_REORDER_CODES_OFFSET
;
111 offset
= getIndex(inIndexes
, indexesLength
, index
);
112 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
114 if(baseData
== NULL
) {
115 // We assume for collation settings that
116 // the base data does not have a reordering.
117 errorCode
= U_INVALID_FORMAT_ERROR
;
120 reorderCodes
= reinterpret_cast<const int32_t *>(inBytes
+ offset
);
121 reorderCodesLength
= length
/ 4;
123 // The reorderRanges (if any) are the trailing reorderCodes entries.
124 // Split the array at the boundary.
125 // Script or reorder codes do not exceed 16-bit values.
126 // Range limits are stored in the upper 16 bits, and are never 0.
127 while(reorderRangesLength
< reorderCodesLength
&&
128 (reorderCodes
[reorderCodesLength
- reorderRangesLength
- 1] & 0xffff0000) != 0) {
129 ++reorderRangesLength
;
131 U_ASSERT(reorderRangesLength
< reorderCodesLength
);
132 if(reorderRangesLength
!= 0) {
133 reorderCodesLength
-= reorderRangesLength
;
134 reorderRanges
= reinterpret_cast<const uint32_t *>(reorderCodes
+ reorderCodesLength
);
138 // There should be a reorder table only if there are reorder codes.
139 // However, when there are reorder codes the reorder table may be omitted to reduce
141 const uint8_t *reorderTable
= NULL
;
142 index
= IX_REORDER_TABLE_OFFSET
;
143 offset
= getIndex(inIndexes
, indexesLength
, index
);
144 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
146 if(reorderCodesLength
== 0) {
147 errorCode
= U_INVALID_FORMAT_ERROR
; // Reordering table without reordering codes.
150 reorderTable
= inBytes
+ offset
;
152 // If we have reorder codes, then build the reorderTable at the end,
153 // when the CollationData is otherwise complete.
156 if(baseData
!= NULL
&& baseData
->numericPrimary
!= (inIndexes
[IX_OPTIONS
] & 0xff000000)) {
157 errorCode
= U_INVALID_FORMAT_ERROR
;
160 CollationData
*data
= NULL
; // Remains NULL if there are no mappings.
162 index
= IX_TRIE_OFFSET
;
163 offset
= getIndex(inIndexes
, indexesLength
, index
);
164 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
166 if(!tailoring
.ensureOwnedData(errorCode
)) { return; }
167 data
= tailoring
.ownedData
;
168 data
->base
= baseData
;
169 data
->numericPrimary
= inIndexes
[IX_OPTIONS
] & 0xff000000;
170 data
->trie
= tailoring
.trie
= utrie2_openFromSerialized(
171 UTRIE2_32_VALUE_BITS
, inBytes
+ offset
, length
, NULL
,
173 if(U_FAILURE(errorCode
)) { return; }
174 } else if(baseData
!= NULL
) {
175 // Use the base data. Only the settings are tailored.
176 tailoring
.data
= baseData
;
178 errorCode
= U_INVALID_FORMAT_ERROR
; // No mappings.
182 index
= IX_CES_OFFSET
;
183 offset
= getIndex(inIndexes
, indexesLength
, index
);
184 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
187 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ces without tailored trie.
190 data
->ces
= reinterpret_cast<const int64_t *>(inBytes
+ offset
);
191 data
->cesLength
= length
/ 8;
194 index
= IX_CE32S_OFFSET
;
195 offset
= getIndex(inIndexes
, indexesLength
, index
);
196 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
199 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ce32s without tailored trie.
202 data
->ce32s
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
203 data
->ce32sLength
= length
/ 4;
206 int32_t jamoCE32sStart
= getIndex(inIndexes
, indexesLength
, IX_JAMO_CE32S_START
);
207 if(jamoCE32sStart
>= 0) {
208 if(data
== NULL
|| data
->ce32s
== NULL
) {
209 errorCode
= U_INVALID_FORMAT_ERROR
; // Index into non-existent ce32s[].
212 data
->jamoCE32s
= data
->ce32s
+ jamoCE32sStart
;
213 } else if(data
== NULL
) {
215 } else if(baseData
!= NULL
) {
216 data
->jamoCE32s
= baseData
->jamoCE32s
;
218 errorCode
= U_INVALID_FORMAT_ERROR
; // No Jamo CE32s for Hangul processing.
222 index
= IX_ROOT_ELEMENTS_OFFSET
;
223 offset
= getIndex(inIndexes
, indexesLength
, index
);
224 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
227 if(data
== NULL
|| length
<= CollationRootElements::IX_SEC_TER_BOUNDARIES
) {
228 errorCode
= U_INVALID_FORMAT_ERROR
;
231 data
->rootElements
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
232 data
->rootElementsLength
= length
;
233 uint32_t commonSecTer
= data
->rootElements
[CollationRootElements::IX_COMMON_SEC_AND_TER_CE
];
234 if(commonSecTer
!= Collation::COMMON_SEC_AND_TER_CE
) {
235 errorCode
= U_INVALID_FORMAT_ERROR
;
238 uint32_t secTerBoundaries
= data
->rootElements
[CollationRootElements::IX_SEC_TER_BOUNDARIES
];
239 if((secTerBoundaries
>> 24) < CollationKeys::SEC_COMMON_HIGH
) {
240 // [fixed last secondary common byte] is too low,
241 // and secondary weights would collide with compressed common secondaries.
242 errorCode
= U_INVALID_FORMAT_ERROR
;
247 index
= IX_CONTEXTS_OFFSET
;
248 offset
= getIndex(inIndexes
, indexesLength
, index
);
249 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
252 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored contexts without tailored trie.
255 data
->contexts
= reinterpret_cast<const UChar
*>(inBytes
+ offset
);
256 data
->contextsLength
= length
/ 2;
259 index
= IX_UNSAFE_BWD_OFFSET
;
260 offset
= getIndex(inIndexes
, indexesLength
, index
);
261 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
264 errorCode
= U_INVALID_FORMAT_ERROR
;
267 if(baseData
== NULL
) {
268 #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
269 tailoring
.unsafeBackwardSet
= new UnicodeSet(unsafe_serializedData
, unsafe_serializedCount
, UnicodeSet::kSerialized
, errorCode
);
270 if(tailoring
.unsafeBackwardSet
== NULL
) {
271 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
273 } else if (U_FAILURE(errorCode
)) {
277 // Create the unsafe-backward set for the root collator.
278 // Include all non-zero combining marks and trail surrogates.
279 // We do this at load time, rather than at build time,
280 // to simplify Unicode version bootstrapping:
281 // The root data builder only needs the new FractionalUCA.txt data,
282 // but it need not be built with a version of ICU already updated to
283 // the corresponding new Unicode Character Database.
285 // The following is an optimized version of
286 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
287 // It is faster and requires fewer code dependencies.
288 tailoring
.unsafeBackwardSet
= new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
289 if(tailoring
.unsafeBackwardSet
== NULL
) {
290 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
293 data
->nfcImpl
.addLcccChars(*tailoring
.unsafeBackwardSet
);
294 #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION
296 // Clone the root collator's set contents.
297 tailoring
.unsafeBackwardSet
= static_cast<UnicodeSet
*>(
298 baseData
->unsafeBackwardSet
->cloneAsThawed());
299 if(tailoring
.unsafeBackwardSet
== NULL
) {
300 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
304 // Add the ranges from the data file to the unsafe-backward set.
306 const uint16_t *unsafeData
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
307 if(!uset_getSerializedSet(&sset
, unsafeData
, length
/ 2)) {
308 errorCode
= U_INVALID_FORMAT_ERROR
;
311 int32_t count
= uset_getSerializedRangeCount(&sset
);
312 for(int32_t i
= 0; i
< count
; ++i
) {
314 uset_getSerializedRange(&sset
, i
, &start
, &end
);
315 tailoring
.unsafeBackwardSet
->add(start
, end
);
317 // Mark each lead surrogate as "unsafe"
318 // if any of its 1024 associated supplementary code points is "unsafe".
320 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
, c
+= 0x400) {
321 if(!tailoring
.unsafeBackwardSet
->containsNone(c
, c
+ 0x3ff)) {
322 tailoring
.unsafeBackwardSet
->add(lead
);
325 tailoring
.unsafeBackwardSet
->freeze();
326 data
->unsafeBackwardSet
= tailoring
.unsafeBackwardSet
;
327 } else if(data
== NULL
) {
329 } else if(baseData
!= NULL
) {
330 // No tailoring-specific data: Alias the root collator's set.
331 data
->unsafeBackwardSet
= baseData
->unsafeBackwardSet
;
333 errorCode
= U_INVALID_FORMAT_ERROR
; // No unsafeBackwardSet.
337 // If the fast Latin format version is different,
338 // or the version is set to 0 for "no fast Latin table",
339 // then just always use the normal string comparison path.
341 data
->fastLatinTable
= NULL
;
342 data
->fastLatinTableLength
= 0;
343 if(((inIndexes
[IX_OPTIONS
] >> 16) & 0xff) == CollationFastLatin::VERSION
) {
344 index
= IX_FAST_LATIN_TABLE_OFFSET
;
345 offset
= getIndex(inIndexes
, indexesLength
, index
);
346 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
348 data
->fastLatinTable
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
349 data
->fastLatinTableLength
= length
/ 2;
350 if((*data
->fastLatinTable
>> 8) != CollationFastLatin::VERSION
) {
351 errorCode
= U_INVALID_FORMAT_ERROR
; // header vs. table version mismatch
354 } else if(baseData
!= NULL
) {
355 data
->fastLatinTable
= baseData
->fastLatinTable
;
356 data
->fastLatinTableLength
= baseData
->fastLatinTableLength
;
361 index
= IX_SCRIPTS_OFFSET
;
362 offset
= getIndex(inIndexes
, indexesLength
, index
);
363 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
366 errorCode
= U_INVALID_FORMAT_ERROR
;
369 const uint16_t *scripts
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
370 int32_t scriptsLength
= length
/ 2;
371 data
->numScripts
= scripts
[0];
372 // There must be enough entries for both arrays, including more than two range starts.
373 data
->scriptStartsLength
= scriptsLength
- (1 + data
->numScripts
+ 16);
374 if(data
->scriptStartsLength
<= 2 ||
375 CollationData::MAX_NUM_SCRIPT_RANGES
< data
->scriptStartsLength
) {
376 errorCode
= U_INVALID_FORMAT_ERROR
;
379 data
->scriptsIndex
= scripts
+ 1;
380 data
->scriptStarts
= scripts
+ 1 + data
->numScripts
+ 16;
381 if(!(data
->scriptStarts
[0] == 0 &&
382 data
->scriptStarts
[1] == ((Collation::MERGE_SEPARATOR_BYTE
+ 1) << 8) &&
383 data
->scriptStarts
[data
->scriptStartsLength
- 1] ==
384 (Collation::TRAIL_WEIGHT_BYTE
<< 8))) {
385 errorCode
= U_INVALID_FORMAT_ERROR
;
388 } else if(data
== NULL
) {
390 } else if(baseData
!= NULL
) {
391 data
->numScripts
= baseData
->numScripts
;
392 data
->scriptsIndex
= baseData
->scriptsIndex
;
393 data
->scriptStarts
= baseData
->scriptStarts
;
394 data
->scriptStartsLength
= baseData
->scriptStartsLength
;
397 index
= IX_COMPRESSIBLE_BYTES_OFFSET
;
398 offset
= getIndex(inIndexes
, indexesLength
, index
);
399 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
402 errorCode
= U_INVALID_FORMAT_ERROR
;
405 data
->compressibleBytes
= reinterpret_cast<const UBool
*>(inBytes
+ offset
);
406 } else if(data
== NULL
) {
408 } else if(baseData
!= NULL
) {
409 data
->compressibleBytes
= baseData
->compressibleBytes
;
411 errorCode
= U_INVALID_FORMAT_ERROR
; // No compressibleBytes[].
415 const CollationSettings
&ts
= *tailoring
.settings
;
416 int32_t options
= inIndexes
[IX_OPTIONS
] & 0xffff;
417 uint16_t fastLatinPrimaries
[CollationFastLatin::LATIN_LIMIT
];
418 int32_t fastLatinOptions
= CollationFastLatin::getOptions(
419 tailoring
.data
, ts
, fastLatinPrimaries
, UPRV_LENGTHOF(fastLatinPrimaries
));
420 if(options
== ts
.options
&& ts
.variableTop
!= 0 &&
421 reorderCodesLength
== ts
.reorderCodesLength
&&
422 (reorderCodesLength
== 0 ||
423 uprv_memcmp(reorderCodes
, ts
.reorderCodes
, reorderCodesLength
* 4) == 0) &&
424 fastLatinOptions
== ts
.fastLatinOptions
&&
425 (fastLatinOptions
< 0 ||
426 uprv_memcmp(fastLatinPrimaries
, ts
.fastLatinPrimaries
,
427 sizeof(fastLatinPrimaries
)) == 0)) {
431 CollationSettings
*settings
= SharedObject::copyOnWrite(tailoring
.settings
);
432 if(settings
== NULL
) {
433 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
436 settings
->options
= options
;
437 // Set variableTop from options and scripts data.
438 settings
->variableTop
= tailoring
.data
->getLastPrimaryForGroup(
439 UCOL_REORDER_CODE_FIRST
+ settings
->getMaxVariable());
440 if(settings
->variableTop
== 0) {
441 errorCode
= U_INVALID_FORMAT_ERROR
;
445 if(reorderCodesLength
!= 0) {
446 settings
->aliasReordering(*baseData
, reorderCodes
, reorderCodesLength
,
447 reorderRanges
, reorderRangesLength
,
448 reorderTable
, errorCode
);
451 settings
->fastLatinOptions
= CollationFastLatin::getOptions(
452 tailoring
.data
, *settings
,
453 settings
->fastLatinPrimaries
, UPRV_LENGTHOF(settings
->fastLatinPrimaries
));
457 CollationDataReader::isAcceptable(void *context
,
458 const char * /* type */, const char * /*name*/,
459 const UDataInfo
*pInfo
) {
462 pInfo
->isBigEndian
== U_IS_BIG_ENDIAN
&&
463 pInfo
->charsetFamily
== U_CHARSET_FAMILY
&&
464 pInfo
->dataFormat
[0] == 0x55 && // dataFormat="UCol"
465 pInfo
->dataFormat
[1] == 0x43 &&
466 pInfo
->dataFormat
[2] == 0x6f &&
467 pInfo
->dataFormat
[3] == 0x6c &&
468 pInfo
->formatVersion
[0] == 5
470 UVersionInfo
*version
= static_cast<UVersionInfo
*>(context
);
471 if(version
!= NULL
) {
472 uprv_memcpy(version
, pInfo
->dataVersion
, 4);
482 #endif // !UCONFIG_NO_COLLATION