2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdatareader.cpp
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationdatareader.h"
23 #include "collationfastlatin.h"
24 #include "collationkeys.h"
25 #include "collationrootelements.h"
26 #include "collationsettings.h"
27 #include "collationtailoring.h"
28 #include "normalizer2impl.h"
/**
 * Bounds-checked read of one slot of an int32_t index array.
 *
 * @param indexes the index array
 * @param length  number of valid slots in indexes
 * @param i       slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 *         (callers treat -1 as "slot not present")
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Guard both ends so that an out-of-range slot never touches memory.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
// CollationDataReader::read(): parses a serialized collation data blob
// (inBytes, inLength) and fills in `tailoring` (its data, trie, settings and
// unsafeBackwardSet), reporting problems through errorCode.
// A negative inLength is treated as "length unknown": every size check below
// is only applied when 0 <= inLength.
// NOTE(review): this listing carries stray leading integers (apparently the
// original line numbers) and appears to be missing lines (closing braces,
// `return;` statements and some `if` guards). Verify the control flow against
// the pristine file before relying on it.
44 CollationDataReader::read(const CollationTailoring
*base
, const uint8_t *inBytes
, int32_t inLength
,
45 CollationTailoring
&tailoring
, UErrorCode
&errorCode
) {
// Do nothing if the caller already has a failure code.
46 if(U_FAILURE(errorCode
)) { return; }
// Reject a NULL buffer, or a known length of fewer than 24 bytes.
48 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 24)) {
49 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
// View the start of the buffer as a DataHeader and validate it:
// magic bytes 0xda/0x27, and the UDataInfo via isAcceptable(),
// which receives tailoring.version as its context pointer.
52 const DataHeader
*header
= reinterpret_cast<const DataHeader
*>(inBytes
);
53 if(!(header
->dataHeader
.magic1
== 0xda && header
->dataHeader
.magic2
== 0x27 &&
54 isAcceptable(tailoring
.version
, NULL
, NULL
, &header
->info
))) {
55 errorCode
= U_INVALID_FORMAT_ERROR
;
// The UCA version of the base must equal that of the tailoring.
58 if(base
->getUCAVersion() != tailoring
.getUCAVersion()) {
59 errorCode
= U_COLLATOR_VERSION_MISMATCH
;
// Step past the header: advance inBytes and reduce inLength by headerSize.
62 int32_t headerLength
= header
->dataHeader
.headerSize
;
63 inBytes
+= headerLength
;
65 inLength
-= headerLength
;
// The index array needs a non-NULL buffer and, when the length is known,
// at least 8 bytes.
69 if(inBytes
== NULL
|| (0 <= inLength
&& inLength
< 8)) {
70 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
// The data starts with an int32_t index array; slot IX_INDEXES_LENGTH
// holds the number of indexes. Each index occupies 4 bytes.
73 const int32_t *inIndexes
= reinterpret_cast<const int32_t *>(inBytes
);
74 int32_t indexesLength
= inIndexes
[IX_INDEXES_LENGTH
];
75 if(indexesLength
< 2 || (0 <= inLength
&& inLength
< indexesLength
* 4)) {
76 errorCode
= U_INVALID_FORMAT_ERROR
; // Not enough indexes.
80 // Assume that the tailoring data is in initial state,
81 // with NULL pointers and 0 lengths.
83 // Set pointers to non-empty data parts.
84 // Do this in order of their byte offsets. (Should help porting to Java.)
86 int32_t index
; // one of the indexes[] slots
87 int32_t offset
; // byte offset for the index part
88 int32_t length
; // number of bytes in the index part
// Derive the expected total data size from the indexes and, when inLength
// is known, require that the buffer is at least that long.
90 if(indexesLength
> IX_TOTAL_SIZE
) {
91 length
= inIndexes
[IX_TOTAL_SIZE
];
92 } else if(indexesLength
> IX_REORDER_CODES_OFFSET
) {
93 length
= inIndexes
[indexesLength
- 1];
95 length
= 0; // only indexes, and inLength was already checked for them
97 if(0 <= inLength
&& inLength
< length
) {
98 errorCode
= U_INVALID_FORMAT_ERROR
;
// baseData is NULL exactly when no base tailoring was passed in.
102 const CollationData
*baseData
= base
== NULL
? NULL
: base
->data
;
103 const int32_t *reorderCodes
= NULL
;
104 int32_t reorderCodesLength
= 0;
105 const uint32_t *reorderRanges
= NULL
;
106 int32_t reorderRangesLength
= 0;
// Reorder codes. As for every part below, the part's byte range is
// [indexes[index], indexes[index + 1]); getIndex() yields -1 for a slot
// beyond indexesLength.
107 index
= IX_REORDER_CODES_OFFSET
;
108 offset
= getIndex(inIndexes
, indexesLength
, index
);
109 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
111 if(baseData
== NULL
) {
112 // We assume for collation settings that
113 // the base data does not have a reordering.
114 errorCode
= U_INVALID_FORMAT_ERROR
;
117 reorderCodes
= reinterpret_cast<const int32_t *>(inBytes
+ offset
);
118 reorderCodesLength
= length
/ 4;
120 // The reorderRanges (if any) are the trailing reorderCodes entries.
121 // Split the array at the boundary.
122 // Script or reorder codes do not exceed 16-bit values.
123 // Range limits are stored in the upper 16 bits, and are never 0.
124 while(reorderRangesLength
< reorderCodesLength
&&
125 (reorderCodes
[reorderCodesLength
- reorderRangesLength
- 1] & 0xffff0000) != 0) {
126 ++reorderRangesLength
;
128 U_ASSERT(reorderRangesLength
< reorderCodesLength
);
129 if(reorderRangesLength
!= 0) {
130 reorderCodesLength
-= reorderRangesLength
;
131 reorderRanges
= reinterpret_cast<const uint32_t *>(reorderCodes
+ reorderCodesLength
);
135 // There should be a reorder table only if there are reorder codes.
136 // However, when there are reorder codes the reorder table may be omitted to reduce
138 const uint8_t *reorderTable
= NULL
;
139 index
= IX_REORDER_TABLE_OFFSET
;
140 offset
= getIndex(inIndexes
, indexesLength
, index
);
141 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
143 if(reorderCodesLength
== 0) {
144 errorCode
= U_INVALID_FORMAT_ERROR
; // Reordering table without reordering codes.
147 reorderTable
= inBytes
+ offset
;
149 // If we have reorder codes, then build the reorderTable at the end,
150 // when the CollationData is otherwise complete.
// The numeric primary (top byte of IX_OPTIONS) must match the base data's.
153 if(baseData
!= NULL
&& baseData
->numericPrimary
!= (inIndexes
[IX_OPTIONS
] & 0xff000000)) {
154 errorCode
= U_INVALID_FORMAT_ERROR
;
157 CollationData
*data
= NULL
; // Remains NULL if there are no mappings.
// Trie part: the tailoring gets its own CollationData (ensureOwnedData),
// and the deserialized trie is stored in both tailoring.trie and data->trie.
159 index
= IX_TRIE_OFFSET
;
160 offset
= getIndex(inIndexes
, indexesLength
, index
);
161 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
163 if(!tailoring
.ensureOwnedData(errorCode
)) { return; }
164 data
= tailoring
.ownedData
;
165 data
->base
= baseData
;
166 data
->numericPrimary
= inIndexes
[IX_OPTIONS
] & 0xff000000;
167 data
->trie
= tailoring
.trie
= utrie2_openFromSerialized(
168 UTRIE2_32_VALUE_BITS
, inBytes
+ offset
, length
, NULL
,
170 if(U_FAILURE(errorCode
)) { return; }
171 } else if(baseData
!= NULL
) {
172 // Use the base data. Only the settings are tailored.
173 tailoring
.data
= baseData
;
175 errorCode
= U_INVALID_FORMAT_ERROR
; // No mappings.
// 64-bit CEs: cesLength is the part's byte length divided by 8.
179 index
= IX_CES_OFFSET
;
180 offset
= getIndex(inIndexes
, indexesLength
, index
);
181 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
184 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ces without tailored trie.
187 data
->ces
= reinterpret_cast<const int64_t *>(inBytes
+ offset
);
188 data
->cesLength
= length
/ 8;
// 32-bit CE32s: ce32sLength is the part's byte length divided by 4.
191 index
= IX_CE32S_OFFSET
;
192 offset
= getIndex(inIndexes
, indexesLength
, index
);
193 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
196 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored ce32s without tailored trie.
199 data
->ce32s
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
200 data
->ce32sLength
= length
/ 4;
// Jamo CE32s: a non-negative IX_JAMO_CE32S_START is an element offset into
// data->ce32s, so it requires tailored ce32s; otherwise the base data's
// jamoCE32s pointer is reused when there is base data.
203 int32_t jamoCE32sStart
= getIndex(inIndexes
, indexesLength
, IX_JAMO_CE32S_START
);
204 if(jamoCE32sStart
>= 0) {
205 if(data
== NULL
|| data
->ce32s
== NULL
) {
206 errorCode
= U_INVALID_FORMAT_ERROR
; // Index into non-existent ce32s[].
209 data
->jamoCE32s
= data
->ce32s
+ jamoCE32sStart
;
210 } else if(data
== NULL
) {
212 } else if(baseData
!= NULL
) {
213 data
->jamoCE32s
= baseData
->jamoCE32s
;
215 errorCode
= U_INVALID_FORMAT_ERROR
; // No Jamo CE32s for Hangul processing.
// Root elements: needs tailored data and enough entries to include
// IX_SEC_TER_BOUNDARIES; two of the leading entries are sanity-checked.
219 index
= IX_ROOT_ELEMENTS_OFFSET
;
220 offset
= getIndex(inIndexes
, indexesLength
, index
);
221 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
224 if(data
== NULL
|| length
<= CollationRootElements::IX_SEC_TER_BOUNDARIES
) {
225 errorCode
= U_INVALID_FORMAT_ERROR
;
228 data
->rootElements
= reinterpret_cast<const uint32_t *>(inBytes
+ offset
);
229 data
->rootElementsLength
= length
;
230 uint32_t commonSecTer
= data
->rootElements
[CollationRootElements::IX_COMMON_SEC_AND_TER_CE
];
231 if(commonSecTer
!= Collation::COMMON_SEC_AND_TER_CE
) {
232 errorCode
= U_INVALID_FORMAT_ERROR
;
235 uint32_t secTerBoundaries
= data
->rootElements
[CollationRootElements::IX_SEC_TER_BOUNDARIES
];
236 if((secTerBoundaries
>> 24) < CollationKeys::SEC_COMMON_HIGH
) {
237 // [fixed last secondary common byte] is too low,
238 // and secondary weights would collide with compressed common secondaries.
239 errorCode
= U_INVALID_FORMAT_ERROR
;
// Contexts: read as UChar units, so contextsLength is the byte length / 2.
244 index
= IX_CONTEXTS_OFFSET
;
245 offset
= getIndex(inIndexes
, indexesLength
, index
);
246 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
249 errorCode
= U_INVALID_FORMAT_ERROR
; // Tailored contexts without tailored trie.
252 data
->contexts
= reinterpret_cast<const UChar
*>(inBytes
+ offset
);
253 data
->contextsLength
= length
/ 2;
// Unsafe-backward set: built fresh for the root (no base data) or cloned
// from the base data's set, then extended with the ranges from the file.
256 index
= IX_UNSAFE_BWD_OFFSET
;
257 offset
= getIndex(inIndexes
, indexesLength
, index
);
258 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
261 errorCode
= U_INVALID_FORMAT_ERROR
;
264 if(baseData
== NULL
) {
265 // Create the unsafe-backward set for the root collator.
266 // Include all non-zero combining marks and trail surrogates.
267 // We do this at load time, rather than at build time,
268 // to simplify Unicode version bootstrapping:
269 // The root data builder only needs the new FractionalUCA.txt data,
270 // but it need not be built with a version of ICU already updated to
271 // the corresponding new Unicode Character Database.
273 // The following is an optimized version of
274 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
275 // It is faster and requires fewer code dependencies.
276 tailoring
.unsafeBackwardSet
= new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
277 if(tailoring
.unsafeBackwardSet
== NULL
) {
278 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
281 data
->nfcImpl
.addLcccChars(*tailoring
.unsafeBackwardSet
);
283 // Clone the root collator's set contents.
284 tailoring
.unsafeBackwardSet
= static_cast<UnicodeSet
*>(
285 baseData
->unsafeBackwardSet
->cloneAsThawed());
286 if(tailoring
.unsafeBackwardSet
== NULL
) {
287 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
291 // Add the ranges from the data file to the unsafe-backward set.
// NOTE(review): the declarations of sset, start, end and c are not present
// in this listing.
293 const uint16_t *unsafeData
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
294 if(!uset_getSerializedSet(&sset
, unsafeData
, length
/ 2)) {
295 errorCode
= U_INVALID_FORMAT_ERROR
;
298 int32_t count
= uset_getSerializedRangeCount(&sset
);
299 for(int32_t i
= 0; i
< count
; ++i
) {
301 uset_getSerializedRange(&sset
, i
, &start
, &end
);
302 tailoring
.unsafeBackwardSet
->add(start
, end
);
304 // Mark each lead surrogate as "unsafe"
305 // if any of its 1024 associated supplementary code points is "unsafe".
307 for(UChar lead
= 0xd800; lead
< 0xdc00; ++lead
, c
+= 0x400) {
308 if(!tailoring
.unsafeBackwardSet
->containsNone(c
, c
+ 0x3ff)) {
309 tailoring
.unsafeBackwardSet
->add(lead
);
// Freeze the finished set and let the data alias it.
312 tailoring
.unsafeBackwardSet
->freeze();
313 data
->unsafeBackwardSet
= tailoring
.unsafeBackwardSet
;
314 } else if(data
== NULL
) {
316 } else if(baseData
!= NULL
) {
317 // No tailoring-specific data: Alias the root collator's set.
318 data
->unsafeBackwardSet
= baseData
->unsafeBackwardSet
;
320 errorCode
= U_INVALID_FORMAT_ERROR
; // No unsafeBackwardSet.
324 // If the fast Latin format version is different,
325 // or the version is set to 0 for "no fast Latin table",
326 // then just always use the normal string comparison path.
// The fast Latin version is read from bits 23..16 of IX_OPTIONS and must
// also match the version in the high byte of the table's first unit.
328 data
->fastLatinTable
= NULL
;
329 data
->fastLatinTableLength
= 0;
330 if(((inIndexes
[IX_OPTIONS
] >> 16) & 0xff) == CollationFastLatin::VERSION
) {
331 index
= IX_FAST_LATIN_TABLE_OFFSET
;
332 offset
= getIndex(inIndexes
, indexesLength
, index
);
333 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
335 data
->fastLatinTable
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
336 data
->fastLatinTableLength
= length
/ 2;
337 if((*data
->fastLatinTable
>> 8) != CollationFastLatin::VERSION
) {
338 errorCode
= U_INVALID_FORMAT_ERROR
; // header vs. table version mismatch
341 } else if(baseData
!= NULL
) {
342 data
->fastLatinTable
= baseData
->fastLatinTable
;
343 data
->fastLatinTableLength
= baseData
->fastLatinTableLength
;
// Scripts data, read as uint16_t units: scripts[0] is numScripts,
// scriptsIndex starts at scripts + 1, and scriptStarts starts at
// scripts + 1 + numScripts + 16.
348 index
= IX_SCRIPTS_OFFSET
;
349 offset
= getIndex(inIndexes
, indexesLength
, index
);
350 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
353 errorCode
= U_INVALID_FORMAT_ERROR
;
356 const uint16_t *scripts
= reinterpret_cast<const uint16_t *>(inBytes
+ offset
);
357 int32_t scriptsLength
= length
/ 2;
358 data
->numScripts
= scripts
[0];
359 // There must be enough entries for both arrays, including more than two range starts.
360 data
->scriptStartsLength
= scriptsLength
- (1 + data
->numScripts
+ 16);
361 if(data
->scriptStartsLength
<= 2 ||
362 CollationData::MAX_NUM_SCRIPT_RANGES
< data
->scriptStartsLength
) {
363 errorCode
= U_INVALID_FORMAT_ERROR
;
366 data
->scriptsIndex
= scripts
+ 1;
367 data
->scriptStarts
= scripts
+ 1 + data
->numScripts
+ 16;
// The first, second and last scriptStarts entries must have fixed values.
368 if(!(data
->scriptStarts
[0] == 0 &&
369 data
->scriptStarts
[1] == ((Collation::MERGE_SEPARATOR_BYTE
+ 1) << 8) &&
370 data
->scriptStarts
[data
->scriptStartsLength
- 1] ==
371 (Collation::TRAIL_WEIGHT_BYTE
<< 8))) {
372 errorCode
= U_INVALID_FORMAT_ERROR
;
375 } else if(data
== NULL
) {
377 } else if(baseData
!= NULL
) {
378 data
->numScripts
= baseData
->numScripts
;
379 data
->scriptsIndex
= baseData
->scriptsIndex
;
380 data
->scriptStarts
= baseData
->scriptStarts
;
381 data
->scriptStartsLength
= baseData
->scriptStartsLength
;
// Compressible bytes: taken from this data, or else from the base data.
384 index
= IX_COMPRESSIBLE_BYTES_OFFSET
;
385 offset
= getIndex(inIndexes
, indexesLength
, index
);
386 length
= getIndex(inIndexes
, indexesLength
, index
+ 1) - offset
;
389 errorCode
= U_INVALID_FORMAT_ERROR
;
392 data
->compressibleBytes
= reinterpret_cast<const UBool
*>(inBytes
+ offset
);
393 } else if(data
== NULL
) {
395 } else if(baseData
!= NULL
) {
396 data
->compressibleBytes
= baseData
->compressibleBytes
;
398 errorCode
= U_INVALID_FORMAT_ERROR
; // No compressibleBytes[].
// Compare the settings implied by the data (options from the low 16 bits of
// IX_OPTIONS, reorder codes, fast Latin options/primaries) with the
// tailoring's current settings (ts).
// NOTE(review): the body of the following if is not present in this
// listing; presumably it returns early because nothing would change - confirm.
402 const CollationSettings
&ts
= *tailoring
.settings
;
403 int32_t options
= inIndexes
[IX_OPTIONS
] & 0xffff;
404 uint16_t fastLatinPrimaries
[CollationFastLatin::LATIN_LIMIT
];
405 int32_t fastLatinOptions
= CollationFastLatin::getOptions(
406 tailoring
.data
, ts
, fastLatinPrimaries
, UPRV_LENGTHOF(fastLatinPrimaries
));
407 if(options
== ts
.options
&& ts
.variableTop
!= 0 &&
408 reorderCodesLength
== ts
.reorderCodesLength
&&
409 uprv_memcmp(reorderCodes
, ts
.reorderCodes
, reorderCodesLength
* 4) == 0 &&
410 fastLatinOptions
== ts
.fastLatinOptions
&&
411 (fastLatinOptions
< 0 ||
412 uprv_memcmp(fastLatinPrimaries
, ts
.fastLatinPrimaries
,
413 sizeof(fastLatinPrimaries
)) == 0)) {
// Obtain a writable settings object via SharedObject::copyOnWrite()
// and update it from the data.
417 CollationSettings
*settings
= SharedObject::copyOnWrite(tailoring
.settings
);
418 if(settings
== NULL
) {
419 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
422 settings
->options
= options
;
423 // Set variableTop from options and scripts data.
424 settings
->variableTop
= tailoring
.data
->getLastPrimaryForGroup(
425 UCOL_REORDER_CODE_FIRST
+ settings
->getMaxVariable());
426 if(settings
->variableTop
== 0) {
427 errorCode
= U_INVALID_FORMAT_ERROR
;
// Apply the reordering only when there are reorder codes.
431 if(reorderCodesLength
!= 0) {
432 settings
->aliasReordering(*baseData
, reorderCodes
, reorderCodesLength
,
433 reorderRanges
, reorderRangesLength
,
434 reorderTable
, errorCode
);
// Recompute the fast Latin options and primaries for the updated settings.
437 settings
->fastLatinOptions
= CollationFastLatin::getOptions(
438 tailoring
.data
, *settings
,
439 settings
->fastLatinPrimaries
, UPRV_LENGTHOF(settings
->fastLatinPrimaries
));
// CollationDataReader::isAcceptable(): checks whether a UDataInfo describes
// data that this reader can use, and optionally reports the data version.
//   context - may be NULL; otherwise it is treated as a UVersionInfo* that
//             receives the 4 bytes of pInfo->dataVersion
//   type, name - unused
//   pInfo   - the data info to check
// NOTE(review): the return type line, the start of the condition, the
// return statements and the closing brace are not present in this listing.
443 CollationDataReader::isAcceptable(void *context
,
444 const char * /* type */, const char * /*name*/,
445 const UDataInfo
*pInfo
) {
// Endianness and charset family must match those of this platform.
448 pInfo
->isBigEndian
== U_IS_BIG_ENDIAN
&&
449 pInfo
->charsetFamily
== U_CHARSET_FAMILY
&&
450 pInfo
->dataFormat
[0] == 0x55 && // dataFormat="UCol"
451 pInfo
->dataFormat
[1] == 0x43 &&
452 pInfo
->dataFormat
[2] == 0x6f &&
453 pInfo
->dataFormat
[3] == 0x6c &&
// Only major format version 5 is accepted.
454 pInfo
->formatVersion
[0] == 5
// Hand the data version back to the caller when a destination was supplied.
456 UVersionInfo
*version
= static_cast<UVersionInfo
*>(context
);
457 if(version
!= NULL
) {
458 uprv_memcpy(version
, pInfo
->dataVersion
, 4);
468 #endif // !UCONFIG_NO_COLLATION