2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdatareader.cpp
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationdatareader.h"
23 #include "collationfastlatin.h"
24 #include "collationkeys.h"
25 #include "collationrootelements.h"
26 #include "collationsettings.h"
27 #include "collationtailoring.h"
28 #include "collunsafe.h"
29 #include "normalizer2impl.h"
/**
 * Reads one slot of an int32_t indexes[] array with bounds checking.
 *
 * @param indexes the array to read from
 * @param length  number of valid entries in indexes[]
 * @param i       slot number to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Reject negative slot numbers as well as those at or past length,
    // so that a bad slot number never reads outside the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
45 CollationDataReader::read(const CollationTailoring
*base
, const uint8_t *inBytes
, int32_t inLength
,
46 CollationTailoring
&tailoring
, UErrorCode
&errorCode
) {
47 if(U_FAILURE(errorCode
)) { return; }
49 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 24)) {
50 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
53 const DataHeader
*header
= reinterpret_cast<const DataHeader
*>(inBytes
);
54 if(!(header
->dataHeader
.magic1
== 0xda && header
->dataHeader
.magic2
== 0x27 &&
55 isAcceptable(tailoring
.version
, NULL
, NULL
, &header
->info
))) {
56 errorCode
= U_INVALID_FORMAT_ERROR
;
59 if(base
->getUCAVersion() != tailoring
.getUCAVersion()) {
60 errorCode
= U_COLLATOR_VERSION_MISMATCH
;
63 int32_t headerLength
= header
->dataHeader
.headerSize
;
64 inBytes
+= headerLength
;
66 inLength
-= headerLength
;
70 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 8)) {
71 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
74 const int32_t *inIndexes
= reinterpret_cast<const int32_t *>(inBytes
);
75 int32_t indexesLength
= inIndexes
[IX_INDEXES_LENGTH
];
76 if(indexesLength
< 2 || (0 <= inLength
&& inLength
< indexesLength
* 4)) {
77 errorCode
= U_INVALID_FORMAT_ERROR
; // Not enough indexes.
81 // Assume that the tailoring data is in initial state,
82 // with NULL pointers and 0 lengths.
84 // Set pointers to non-empty data parts.
85 // Do this in order of their byte offsets. (Should help porting to Java.)
87 int32_t index
; // one of the indexes[] slots
88 int32_t offset
; // byte offset for the index part
89 int32_t length
; // number of bytes in the index part
91 if(indexesLength
> IX_TOTAL_SIZE
) {
92 length
= inIndexes
[IX_TOTAL_SIZE
];
93 } else if(indexesLength
> IX_REORDER_CODES_OFFSET
) {
94 length
= inIndexes
[indexesLength
- 1];
96 length
= 0; // only indexes, and inLength was already checked for them
98 if(0 <= inLength
&& inLength
< length
) {
99 errorCode
= U_INVALID_FORMAT_ERROR
;
103 const CollationData
*baseData
= base
== NULL
? NULL
: base
->data
;
104 const int32_t *reorderCodes
= NULL
;
105 int32_t reorderCodesLength
= 0;
106 const uint32_t *reorderRanges
= NULL
;
107 int32_t reorderRangesLength
= 0;
108 index
= IX_REORDER_CODES_OFFSET
;
109 offset
= getIndex(inIndexes
, indexesLength
, index
);
110 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
112 if(baseData
== NULL
) {
113 // We assume for collation settings that
114 // the base data does not have a reordering.
115 errorCode
= U_INVALID_FORMAT_ERROR
;
118 reorderCodes
= reinterpret_cast<const int32_t *>(inBytes
+ offset
);
119 reorderCodesLength
= length
/ 4;
121 // The reorderRanges (if any) are the trailing reorderCodes entries.
122 // Split the array at the boundary.
123 // Script or reorder codes do not exceed 16-bit values.
124 // Range limits are stored in the upper 16 bits, and are never 0.
125 while(reorderRangesLength
< reorderCodesLength
&&
126 (reorderCodes
[reorderCodesLength
- reorderRangesLength
- 1] & 0xffff0000) != 0) {
127 ++reorderRangesLength
;
129 U_ASSERT(reorderRangesLength
< reorderCodesLength
);
130 if(reorderRangesLength
!= 0) {
131 reorderCodesLength
-= reorderRangesLength
;
132 reorderRanges
= reinterpret_cast<const uint32_t *>(reorderCodes
+ reorderCodesLength
);
136 // There should be a reorder table only if there are reorder codes.
137 // However, when there are reorder codes the reorder table may be omitted to reduce
139 const uint8_t *reorderTable
= NULL
;
140 index
= IX_REORDER_TABLE_OFFSET
;
141 offset
= getIndex(inIndexes
, indexesLength
, index
);
142 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
144 if(reorderCodesLength
== 0) {
145 errorCode
= U_INVALID_FORMAT_ERROR
; // Reordering table without reordering codes.
148 reorderTable
= inBytes
+ offset
;
150 // If we have reorder codes, then build the reorderTable at the end,
151 // when the CollationData is otherwise complete.
154 if(baseData
!= NULL
&& baseData
->numericPrimary
!= (inIndexes
[IX_OPTIONS
] & 0xff000000)) {
155 errorCode
= U_INVALID_FORMAT_ERROR
;
158 CollationData
*data
= NULL
; // Remains NULL if there are no mappings.
160 index
= IX_TRIE_OFFSET
;
161 offset
= getIndex(inIndexes
, indexesLength
, index
);
162 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
164 if(!tailoring
.ensureOwnedData(errorCode
)) { return; }
165 data
= tailoring
.ownedData
;
166 data
->base
= baseData
;
167 data
->numericPrimary
= inIndexes
[IX_OPTIONS
] & 0xff000000;
168 data
->trie
= tailoring
.trie
= utrie2_openFromSerialized(
169 UTRIE2_32_VALUE_BITS
, inBytes
+ offset
, length
, NULL
,
171 if(U_FAILURE(errorCode
)) { return; }
172 } else if(baseData
!= NULL
) {
173 // Use the base data. Only the settings are tailored.
174 tailoring
.data
= baseData
;
176 errorCode
= U_INVALID_FORMAT_ERROR
; // No mappings.
180 index
= IX_CES_OFFSET
;
181 offset
= getIndex(inIndexes
, indexesLength
, index
);
182 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
185 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ces without tailored trie.
188 data
->ces
= reinterpret_cast<const int64_t *>(inBytes
+ offset
);
189 data
->cesLength
= length
/ 8;
192 index
= IX_CE32S_OFFSET
;
193 offset
= getIndex(inIndexes
, indexesLength
, index
);
194 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
197 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ce32s without tailored trie.
200 data
->ce32s
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
201 data
->ce32sLength
= length
/ 4;
204 int32_t jamoCE32sStart
= getIndex(inIndexes
, indexesLength
, IX_JAMO_CE32S_START
);
205 if(jamoCE32sStart
>= 0) {
206 if(data
== NULL
|| data
->ce32s
== NULL
) {
207 errorCode
= U_INVALID_FORMAT_ERROR
; // Index into non-existent ce32s[].
210 data
->jamoCE32s
= data
->ce32s
+ jamoCE32sStart
;
211 } else if(data
== NULL
) {
213 } else if(baseData
!= NULL
) {
214 data
->jamoCE32s
= baseData
->jamoCE32s
;
216 errorCode
= U_INVALID_FORMAT_ERROR
; // No Jamo CE32s for Hangul processing.
220 index
= IX_ROOT_ELEMENTS_OFFSET
;
221 offset
= getIndex(inIndexes
, indexesLength
, index
);
222 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
225 if(data
== NULL
|| length
<= CollationRootElements::IX_SEC_TER_BOUNDARIES
) {
226 errorCode
= U_INVALID_FORMAT_ERROR
;
229 data
->rootElements
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
230 data
->rootElementsLength
= length
;
231 uint32_t commonSecTer
= data
->rootElements
[CollationRootElements::IX_COMMON_SEC_AND_TER_CE
];
232 if(commonSecTer
!= Collation::COMMON_SEC_AND_TER_CE
) {
233 errorCode
= U_INVALID_FORMAT_ERROR
;
236 uint32_t secTerBoundaries
= data
->rootElements
[CollationRootElements::IX_SEC_TER_BOUNDARIES
];
237 if((secTerBoundaries
>> 24) < CollationKeys::SEC_COMMON_HIGH
) {
238 // [fixed last secondary common byte] is too low,
239 // and secondary weights would collide with compressed common secondaries.
240 errorCode
= U_INVALID_FORMAT_ERROR
;
245 index
= IX_CONTEXTS_OFFSET
;
246 offset
= getIndex(inIndexes
, indexesLength
, index
);
247 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
250 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored contexts without tailored trie.
253 data
->contexts
= reinterpret_cast<const UChar
*>(inBytes
+ offset
);
254 data
->contextsLength
= length
/ 2;
257 index
= IX_UNSAFE_BWD_OFFSET
;
258 offset
= getIndex(inIndexes
, indexesLength
, index
);
259 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
262 errorCode
= U_INVALID_FORMAT_ERROR
;
265 if(baseData
== NULL
) {
266 #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
267 tailoring
.unsafeBackwardSet
= new UnicodeSet(unsafe_serializedData
, unsafe_serializedCount
, UnicodeSet::kSerialized
, errorCode
);
268 if(tailoring
.unsafeBackwardSet
== NULL
) {
269 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
271 } else if (U_FAILURE(errorCode
)) {
275 // Create the unsafe-backward set for the root collator.
276 // Include all non-zero combining marks and trail surrogates.
277 // We do this at load time, rather than at build time,
278 // to simplify Unicode version bootstrapping:
279 // The root data builder only needs the new FractionalUCA.txt data,
280 // but it need not be built with a version of ICU already updated to
281 // the corresponding new Unicode Character Database.
283 // The following is an optimized version of
284 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
285 // It is faster and requires fewer code dependencies.
286 tailoring
.unsafeBackwardSet
= new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
287 if(tailoring
.unsafeBackwardSet
== NULL
) {
288 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
291 data
->nfcImpl
.addLcccChars(*tailoring
.unsafeBackwardSet
);
292 #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION
294 // Clone the root collator's set contents.
295 tailoring
.unsafeBackwardSet
= static_cast<UnicodeSet
*>(
296 baseData
->unsafeBackwardSet
->cloneAsThawed());
297 if(tailoring
.unsafeBackwardSet
== NULL
) {
298 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
302 // Add the ranges from the data file to the unsafe-backward set.
304 const uint16_t *unsafeData
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
305 if(!uset_getSerializedSet(&sset
, unsafeData
, length
/ 2)) {
306 errorCode
= U_INVALID_FORMAT_ERROR
;
309 int32_t count
= uset_getSerializedRangeCount(&sset
);
310 for(int32_t i
= 0; i
< count
; ++i
) {
312 uset_getSerializedRange(&sset
, i
, &start
, &end
);
313 tailoring
.unsafeBackwardSet
->add(start
, end
);
315 // Mark each lead surrogate as "unsafe"
316 // if any of its 1024 associated supplementary code points is "unsafe".
318 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
, c
+= 0x400) {
319 if(!tailoring
.unsafeBackwardSet
->containsNone(c
, c
+ 0x3ff)) {
320 tailoring
.unsafeBackwardSet
->add(lead
);
323 tailoring
.unsafeBackwardSet
->freeze();
324 data
->unsafeBackwardSet
= tailoring
.unsafeBackwardSet
;
325 } else if(data
== NULL
) {
327 } else if(baseData
!= NULL
) {
328 // No tailoring-specific data: Alias the root collator's set.
329 data
->unsafeBackwardSet
= baseData
->unsafeBackwardSet
;
331 errorCode
= U_INVALID_FORMAT_ERROR
; // No unsafeBackwardSet.
335 // If the fast Latin format version is different,
336 // or the version is set to 0 for "no fast Latin table",
337 // then just always use the normal string comparison path.
339 data
->fastLatinTable
= NULL
;
340 data
->fastLatinTableLength
= 0;
341 if(((inIndexes
[IX_OPTIONS
] >> 16) & 0xff) == CollationFastLatin::VERSION
) {
342 index
= IX_FAST_LATIN_TABLE_OFFSET
;
343 offset
= getIndex(inIndexes
, indexesLength
, index
);
344 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
346 data
->fastLatinTable
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
347 data
->fastLatinTableLength
= length
/ 2;
348 if((*data
->fastLatinTable
>> 8) != CollationFastLatin::VERSION
) {
349 errorCode
= U_INVALID_FORMAT_ERROR
; // header vs. table version mismatch
352 } else if(baseData
!= NULL
) {
353 data
->fastLatinTable
= baseData
->fastLatinTable
;
354 data
->fastLatinTableLength
= baseData
->fastLatinTableLength
;
359 index
= IX_SCRIPTS_OFFSET
;
360 offset
= getIndex(inIndexes
, indexesLength
, index
);
361 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
364 errorCode
= U_INVALID_FORMAT_ERROR
;
367 const uint16_t *scripts
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
368 int32_t scriptsLength
= length
/ 2;
369 data
->numScripts
= scripts
[0];
370 // There must be enough entries for both arrays, including more than two range starts.
371 data
->scriptStartsLength
= scriptsLength
- (1 + data
->numScripts
+ 16);
372 if(data
->scriptStartsLength
<= 2 ||
373 CollationData::MAX_NUM_SCRIPT_RANGES
< data
->scriptStartsLength
) {
374 errorCode
= U_INVALID_FORMAT_ERROR
;
377 data
->scriptsIndex
= scripts
+ 1;
378 data
->scriptStarts
= scripts
+ 1 + data
->numScripts
+ 16;
379 if(!(data
->scriptStarts
[0] == 0 &&
380 data
->scriptStarts
[1] == ((Collation::MERGE_SEPARATOR_BYTE
+ 1) << 8) &&
381 data
->scriptStarts
[data
->scriptStartsLength
- 1] ==
382 (Collation::TRAIL_WEIGHT_BYTE
<< 8))) {
383 errorCode
= U_INVALID_FORMAT_ERROR
;
386 } else if(data
== NULL
) {
388 } else if(baseData
!= NULL
) {
389 data
->numScripts
= baseData
->numScripts
;
390 data
->scriptsIndex
= baseData
->scriptsIndex
;
391 data
->scriptStarts
= baseData
->scriptStarts
;
392 data
->scriptStartsLength
= baseData
->scriptStartsLength
;
395 index
= IX_COMPRESSIBLE_BYTES_OFFSET
;
396 offset
= getIndex(inIndexes
, indexesLength
, index
);
397 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
400 errorCode
= U_INVALID_FORMAT_ERROR
;
403 data
->compressibleBytes
= reinterpret_cast<const UBool
*>(inBytes
+ offset
);
404 } else if(data
== NULL
) {
406 } else if(baseData
!= NULL
) {
407 data
->compressibleBytes
= baseData
->compressibleBytes
;
409 errorCode
= U_INVALID_FORMAT_ERROR
; // No compressibleBytes[].
413 const CollationSettings
&ts
= *tailoring
.settings
;
414 int32_t options
= inIndexes
[IX_OPTIONS
] & 0xffff;
415 uint16_t fastLatinPrimaries
[CollationFastLatin::LATIN_LIMIT
];
416 int32_t fastLatinOptions
= CollationFastLatin::getOptions(
417 tailoring
.data
, ts
, fastLatinPrimaries
, UPRV_LENGTHOF(fastLatinPrimaries
));
418 if(options
== ts
.options
&& ts
.variableTop
!= 0 &&
419 reorderCodesLength
== ts
.reorderCodesLength
&&
420 uprv_memcmp(reorderCodes
, ts
.reorderCodes
, reorderCodesLength
* 4) == 0 &&
421 fastLatinOptions
== ts
.fastLatinOptions
&&
422 (fastLatinOptions
< 0 ||
423 uprv_memcmp(fastLatinPrimaries
, ts
.fastLatinPrimaries
,
424 sizeof(fastLatinPrimaries
)) == 0)) {
428 CollationSettings
*settings
= SharedObject::copyOnWrite(tailoring
.settings
);
429 if(settings
== NULL
) {
430 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
433 settings
->options
= options
;
434 // Set variableTop from options and scripts data.
435 settings
->variableTop
= tailoring
.data
->getLastPrimaryForGroup(
436 UCOL_REORDER_CODE_FIRST
+ settings
->getMaxVariable());
437 if(settings
->variableTop
== 0) {
438 errorCode
= U_INVALID_FORMAT_ERROR
;
442 if(reorderCodesLength
!= 0) {
443 settings
->aliasReordering(*baseData
, reorderCodes
, reorderCodesLength
,
444 reorderRanges
, reorderRangesLength
,
445 reorderTable
, errorCode
);
448 settings
->fastLatinOptions
= CollationFastLatin::getOptions(
449 tailoring
.data
, *settings
,
450 settings
->fastLatinPrimaries
, UPRV_LENGTHOF(settings
->fastLatinPrimaries
));
454 CollationDataReader::isAcceptable(void *context
,
455 const char * /* type */, const char * /*name*/,
456 const UDataInfo
*pInfo
) {
459 pInfo
->isBigEndian
== U_IS_BIG_ENDIAN
&&
460 pInfo
->charsetFamily
== U_CHARSET_FAMILY
&&
461 pInfo
->dataFormat
[0] == 0x55 && // dataFormat="UCol"
462 pInfo
->dataFormat
[1] == 0x43 &&
463 pInfo
->dataFormat
[2] == 0x6f &&
464 pInfo
->dataFormat
[3] == 0x6c &&
465 pInfo
->formatVersion
[0] == 5
467 UVersionInfo
*version
= static_cast<UVersionInfo
*>(context
);
468 if(version
!= NULL
) {
469 uprv_memcpy(version
, pInfo
->dataVersion
, 4);
479 #endif // !UCONFIG_NO_COLLATION