]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
b331163b | 3 | * Copyright (C) 2013-2015, International Business Machines |
57a6839d A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * collationdatareader.cpp | |
7 | * | |
8 | * created on: 2013feb07 | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | ||
14 | #if !UCONFIG_NO_COLLATION | |
15 | ||
16 | #include "unicode/ucol.h" | |
17 | #include "unicode/udata.h" | |
18 | #include "unicode/uscript.h" | |
19 | #include "cmemory.h" | |
20 | #include "collation.h" | |
21 | #include "collationdata.h" | |
22 | #include "collationdatareader.h" | |
23 | #include "collationfastlatin.h" | |
24 | #include "collationkeys.h" | |
25 | #include "collationrootelements.h" | |
26 | #include "collationsettings.h" | |
27 | #include "collationtailoring.h" | |
28 | #include "normalizer2impl.h" | |
29 | #include "uassert.h" | |
30 | #include "ucmndata.h" | |
31 | #include "utrie2.h" | |
32 | ||
57a6839d A |
33 | U_NAMESPACE_BEGIN |
34 | ||
35 | namespace { | |
36 | ||
/**
 * Bounds-checked read of one slot of the indexes[] array.
 *
 * @param indexes the array of int32_t index values
 * @param length  the number of valid entries in indexes[]
 * @param i       the slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Reject negative slots as well, so that a bad slot number can never read before the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
40 | ||
41 | } // namespace | |
42 | ||
43 | void | |
44 | CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, | |
45 | CollationTailoring &tailoring, UErrorCode &errorCode) { | |
46 | if(U_FAILURE(errorCode)) { return; } | |
47 | if(base != NULL) { | |
48 | if(inBytes == NULL || (0 <= inLength && inLength < 24)) { | |
49 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
50 | return; | |
51 | } | |
52 | const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes); | |
53 | if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 && | |
54 | isAcceptable(tailoring.version, NULL, NULL, &header->info))) { | |
55 | errorCode = U_INVALID_FORMAT_ERROR; | |
56 | return; | |
57 | } | |
58 | if(base->getUCAVersion() != tailoring.getUCAVersion()) { | |
59 | errorCode = U_COLLATOR_VERSION_MISMATCH; | |
60 | return; | |
61 | } | |
62 | int32_t headerLength = header->dataHeader.headerSize; | |
63 | inBytes += headerLength; | |
64 | if(inLength >= 0) { | |
65 | inLength -= headerLength; | |
66 | } | |
67 | } | |
68 | ||
69 | if(inBytes == NULL || (0 <= inLength && inLength < 8)) { | |
70 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
71 | return; | |
72 | } | |
73 | const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes); | |
74 | int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH]; | |
75 | if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) { | |
76 | errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes. | |
77 | return; | |
78 | } | |
79 | ||
80 | // Assume that the tailoring data is in initial state, | |
81 | // with NULL pointers and 0 lengths. | |
82 | ||
83 | // Set pointers to non-empty data parts. | |
84 | // Do this in order of their byte offsets. (Should help porting to Java.) | |
85 | ||
86 | int32_t index; // one of the indexes[] slots | |
87 | int32_t offset; // byte offset for the index part | |
88 | int32_t length; // number of bytes in the index part | |
89 | ||
90 | if(indexesLength > IX_TOTAL_SIZE) { | |
91 | length = inIndexes[IX_TOTAL_SIZE]; | |
92 | } else if(indexesLength > IX_REORDER_CODES_OFFSET) { | |
93 | length = inIndexes[indexesLength - 1]; | |
94 | } else { | |
95 | length = 0; // only indexes, and inLength was already checked for them | |
96 | } | |
97 | if(0 <= inLength && inLength < length) { | |
98 | errorCode = U_INVALID_FORMAT_ERROR; | |
99 | return; | |
100 | } | |
101 | ||
102 | const CollationData *baseData = base == NULL ? NULL : base->data; | |
103 | const int32_t *reorderCodes = NULL; | |
104 | int32_t reorderCodesLength = 0; | |
b331163b A |
105 | const uint32_t *reorderRanges = NULL; |
106 | int32_t reorderRangesLength = 0; | |
57a6839d A |
107 | index = IX_REORDER_CODES_OFFSET; |
108 | offset = getIndex(inIndexes, indexesLength, index); | |
109 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
110 | if(length >= 4) { | |
111 | if(baseData == NULL) { | |
112 | // We assume for collation settings that | |
113 | // the base data does not have a reordering. | |
114 | errorCode = U_INVALID_FORMAT_ERROR; | |
115 | return; | |
116 | } | |
117 | reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset); | |
118 | reorderCodesLength = length / 4; | |
b331163b A |
119 | |
120 | // The reorderRanges (if any) are the trailing reorderCodes entries. | |
121 | // Split the array at the boundary. | |
122 | // Script or reorder codes do not exceed 16-bit values. | |
123 | // Range limits are stored in the upper 16 bits, and are never 0. | |
124 | while(reorderRangesLength < reorderCodesLength && | |
125 | (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) { | |
126 | ++reorderRangesLength; | |
127 | } | |
128 | U_ASSERT(reorderRangesLength < reorderCodesLength); | |
129 | if(reorderRangesLength != 0) { | |
130 | reorderCodesLength -= reorderRangesLength; | |
131 | reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength); | |
132 | } | |
57a6839d A |
133 | } |
134 | ||
135 | // There should be a reorder table only if there are reorder codes. | |
136 | // However, when there are reorder codes the reorder table may be omitted to reduce | |
137 | // the data size. | |
138 | const uint8_t *reorderTable = NULL; | |
139 | index = IX_REORDER_TABLE_OFFSET; | |
140 | offset = getIndex(inIndexes, indexesLength, index); | |
141 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
142 | if(length >= 256) { | |
143 | if(reorderCodesLength == 0) { | |
144 | errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes. | |
145 | return; | |
146 | } | |
147 | reorderTable = inBytes + offset; | |
148 | } else { | |
149 | // If we have reorder codes, then build the reorderTable at the end, | |
150 | // when the CollationData is otherwise complete. | |
151 | } | |
152 | ||
153 | if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) { | |
154 | errorCode = U_INVALID_FORMAT_ERROR; | |
155 | return; | |
156 | } | |
157 | CollationData *data = NULL; // Remains NULL if there are no mappings. | |
158 | ||
159 | index = IX_TRIE_OFFSET; | |
160 | offset = getIndex(inIndexes, indexesLength, index); | |
161 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
162 | if(length >= 8) { | |
163 | if(!tailoring.ensureOwnedData(errorCode)) { return; } | |
164 | data = tailoring.ownedData; | |
165 | data->base = baseData; | |
166 | data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000; | |
167 | data->trie = tailoring.trie = utrie2_openFromSerialized( | |
168 | UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL, | |
169 | &errorCode); | |
170 | if(U_FAILURE(errorCode)) { return; } | |
171 | } else if(baseData != NULL) { | |
172 | // Use the base data. Only the settings are tailored. | |
173 | tailoring.data = baseData; | |
174 | } else { | |
175 | errorCode = U_INVALID_FORMAT_ERROR; // No mappings. | |
176 | return; | |
177 | } | |
178 | ||
179 | index = IX_CES_OFFSET; | |
180 | offset = getIndex(inIndexes, indexesLength, index); | |
181 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
182 | if(length >= 8) { | |
183 | if(data == NULL) { | |
184 | errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie. | |
185 | return; | |
186 | } | |
187 | data->ces = reinterpret_cast<const int64_t *>(inBytes + offset); | |
188 | data->cesLength = length / 8; | |
189 | } | |
190 | ||
191 | index = IX_CE32S_OFFSET; | |
192 | offset = getIndex(inIndexes, indexesLength, index); | |
193 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
194 | if(length >= 4) { | |
195 | if(data == NULL) { | |
196 | errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie. | |
197 | return; | |
198 | } | |
199 | data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset); | |
200 | data->ce32sLength = length / 4; | |
201 | } | |
202 | ||
203 | int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START); | |
204 | if(jamoCE32sStart >= 0) { | |
205 | if(data == NULL || data->ce32s == NULL) { | |
206 | errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[]. | |
207 | return; | |
208 | } | |
209 | data->jamoCE32s = data->ce32s + jamoCE32sStart; | |
210 | } else if(data == NULL) { | |
211 | // Nothing to do. | |
212 | } else if(baseData != NULL) { | |
213 | data->jamoCE32s = baseData->jamoCE32s; | |
214 | } else { | |
215 | errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing. | |
216 | return; | |
217 | } | |
218 | ||
219 | index = IX_ROOT_ELEMENTS_OFFSET; | |
220 | offset = getIndex(inIndexes, indexesLength, index); | |
221 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
222 | if(length >= 4) { | |
223 | length /= 4; | |
224 | if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) { | |
225 | errorCode = U_INVALID_FORMAT_ERROR; | |
226 | return; | |
227 | } | |
228 | data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset); | |
229 | data->rootElementsLength = length; | |
230 | uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE]; | |
231 | if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) { | |
232 | errorCode = U_INVALID_FORMAT_ERROR; | |
233 | return; | |
234 | } | |
235 | uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES]; | |
236 | if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) { | |
237 | // [fixed last secondary common byte] is too low, | |
238 | // and secondary weights would collide with compressed common secondaries. | |
239 | errorCode = U_INVALID_FORMAT_ERROR; | |
240 | return; | |
241 | } | |
242 | } | |
243 | ||
244 | index = IX_CONTEXTS_OFFSET; | |
245 | offset = getIndex(inIndexes, indexesLength, index); | |
246 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
247 | if(length >= 2) { | |
248 | if(data == NULL) { | |
249 | errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie. | |
250 | return; | |
251 | } | |
252 | data->contexts = reinterpret_cast<const UChar *>(inBytes + offset); | |
253 | data->contextsLength = length / 2; | |
254 | } | |
255 | ||
256 | index = IX_UNSAFE_BWD_OFFSET; | |
257 | offset = getIndex(inIndexes, indexesLength, index); | |
258 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
259 | if(length >= 2) { | |
260 | if(data == NULL) { | |
261 | errorCode = U_INVALID_FORMAT_ERROR; | |
262 | return; | |
263 | } | |
264 | if(baseData == NULL) { | |
265 | // Create the unsafe-backward set for the root collator. | |
266 | // Include all non-zero combining marks and trail surrogates. | |
267 | // We do this at load time, rather than at build time, | |
268 | // to simplify Unicode version bootstrapping: | |
269 | // The root data builder only needs the new FractionalUCA.txt data, | |
270 | // but it need not be built with a version of ICU already updated to | |
271 | // the corresponding new Unicode Character Database. | |
272 | // | |
273 | // The following is an optimized version of | |
274 | // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). | |
275 | // It is faster and requires fewer code dependencies. | |
276 | tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates | |
277 | if(tailoring.unsafeBackwardSet == NULL) { | |
278 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
279 | return; | |
280 | } | |
281 | data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); | |
282 | } else { | |
283 | // Clone the root collator's set contents. | |
284 | tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>( | |
285 | baseData->unsafeBackwardSet->cloneAsThawed()); | |
286 | if(tailoring.unsafeBackwardSet == NULL) { | |
287 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
288 | return; | |
289 | } | |
290 | } | |
291 | // Add the ranges from the data file to the unsafe-backward set. | |
292 | USerializedSet sset; | |
293 | const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset); | |
294 | if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) { | |
295 | errorCode = U_INVALID_FORMAT_ERROR; | |
296 | return; | |
297 | } | |
298 | int32_t count = uset_getSerializedRangeCount(&sset); | |
299 | for(int32_t i = 0; i < count; ++i) { | |
300 | UChar32 start, end; | |
301 | uset_getSerializedRange(&sset, i, &start, &end); | |
302 | tailoring.unsafeBackwardSet->add(start, end); | |
303 | } | |
304 | // Mark each lead surrogate as "unsafe" | |
305 | // if any of its 1024 associated supplementary code points is "unsafe". | |
306 | UChar32 c = 0x10000; | |
307 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { | |
308 | if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) { | |
309 | tailoring.unsafeBackwardSet->add(lead); | |
310 | } | |
311 | } | |
312 | tailoring.unsafeBackwardSet->freeze(); | |
313 | data->unsafeBackwardSet = tailoring.unsafeBackwardSet; | |
314 | } else if(data == NULL) { | |
315 | // Nothing to do. | |
316 | } else if(baseData != NULL) { | |
317 | // No tailoring-specific data: Alias the root collator's set. | |
318 | data->unsafeBackwardSet = baseData->unsafeBackwardSet; | |
319 | } else { | |
320 | errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet. | |
321 | return; | |
322 | } | |
323 | ||
324 | // If the fast Latin format version is different, | |
325 | // or the version is set to 0 for "no fast Latin table", | |
326 | // then just always use the normal string comparison path. | |
327 | if(data != NULL) { | |
328 | data->fastLatinTable = NULL; | |
329 | data->fastLatinTableLength = 0; | |
330 | if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) { | |
331 | index = IX_FAST_LATIN_TABLE_OFFSET; | |
332 | offset = getIndex(inIndexes, indexesLength, index); | |
333 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
334 | if(length >= 2) { | |
335 | data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset); | |
336 | data->fastLatinTableLength = length / 2; | |
337 | if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) { | |
338 | errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch | |
339 | return; | |
340 | } | |
341 | } else if(baseData != NULL) { | |
342 | data->fastLatinTable = baseData->fastLatinTable; | |
343 | data->fastLatinTableLength = baseData->fastLatinTableLength; | |
344 | } | |
345 | } | |
346 | } | |
347 | ||
348 | index = IX_SCRIPTS_OFFSET; | |
349 | offset = getIndex(inIndexes, indexesLength, index); | |
350 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
351 | if(length >= 2) { | |
352 | if(data == NULL) { | |
353 | errorCode = U_INVALID_FORMAT_ERROR; | |
354 | return; | |
355 | } | |
b331163b A |
356 | const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset); |
357 | int32_t scriptsLength = length / 2; | |
358 | data->numScripts = scripts[0]; | |
359 | // There must be enough entries for both arrays, including more than two range starts. | |
360 | data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16); | |
361 | if(data->scriptStartsLength <= 2 || | |
362 | CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) { | |
363 | errorCode = U_INVALID_FORMAT_ERROR; | |
364 | return; | |
365 | } | |
366 | data->scriptsIndex = scripts + 1; | |
367 | data->scriptStarts = scripts + 1 + data->numScripts + 16; | |
368 | if(!(data->scriptStarts[0] == 0 && | |
369 | data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) && | |
370 | data->scriptStarts[data->scriptStartsLength - 1] == | |
371 | (Collation::TRAIL_WEIGHT_BYTE << 8))) { | |
372 | errorCode = U_INVALID_FORMAT_ERROR; | |
373 | return; | |
374 | } | |
57a6839d A |
375 | } else if(data == NULL) { |
376 | // Nothing to do. | |
377 | } else if(baseData != NULL) { | |
b331163b A |
378 | data->numScripts = baseData->numScripts; |
379 | data->scriptsIndex = baseData->scriptsIndex; | |
380 | data->scriptStarts = baseData->scriptStarts; | |
381 | data->scriptStartsLength = baseData->scriptStartsLength; | |
57a6839d A |
382 | } |
383 | ||
384 | index = IX_COMPRESSIBLE_BYTES_OFFSET; | |
385 | offset = getIndex(inIndexes, indexesLength, index); | |
386 | length = getIndex(inIndexes, indexesLength, index + 1) - offset; | |
387 | if(length >= 256) { | |
388 | if(data == NULL) { | |
389 | errorCode = U_INVALID_FORMAT_ERROR; | |
390 | return; | |
391 | } | |
392 | data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset); | |
393 | } else if(data == NULL) { | |
394 | // Nothing to do. | |
395 | } else if(baseData != NULL) { | |
396 | data->compressibleBytes = baseData->compressibleBytes; | |
397 | } else { | |
398 | errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[]. | |
399 | return; | |
400 | } | |
401 | ||
402 | const CollationSettings &ts = *tailoring.settings; | |
403 | int32_t options = inIndexes[IX_OPTIONS] & 0xffff; | |
404 | uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT]; | |
405 | int32_t fastLatinOptions = CollationFastLatin::getOptions( | |
b331163b | 406 | tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries)); |
57a6839d A |
407 | if(options == ts.options && ts.variableTop != 0 && |
408 | reorderCodesLength == ts.reorderCodesLength && | |
409 | uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 && | |
410 | fastLatinOptions == ts.fastLatinOptions && | |
411 | (fastLatinOptions < 0 || | |
412 | uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries, | |
413 | sizeof(fastLatinPrimaries)) == 0)) { | |
414 | return; | |
415 | } | |
416 | ||
417 | CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings); | |
418 | if(settings == NULL) { | |
419 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
420 | return; | |
421 | } | |
422 | settings->options = options; | |
423 | // Set variableTop from options and scripts data. | |
424 | settings->variableTop = tailoring.data->getLastPrimaryForGroup( | |
425 | UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); | |
426 | if(settings->variableTop == 0) { | |
427 | errorCode = U_INVALID_FORMAT_ERROR; | |
428 | return; | |
429 | } | |
430 | ||
b331163b A |
431 | if(reorderCodesLength != 0) { |
432 | settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength, | |
433 | reorderRanges, reorderRangesLength, | |
434 | reorderTable, errorCode); | |
57a6839d A |
435 | } |
436 | ||
437 | settings->fastLatinOptions = CollationFastLatin::getOptions( | |
438 | tailoring.data, *settings, | |
b331163b | 439 | settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries)); |
57a6839d A |
440 | } |
441 | ||
442 | UBool U_CALLCONV | |
443 | CollationDataReader::isAcceptable(void *context, | |
444 | const char * /* type */, const char * /*name*/, | |
445 | const UDataInfo *pInfo) { | |
446 | if( | |
447 | pInfo->size >= 20 && | |
448 | pInfo->isBigEndian == U_IS_BIG_ENDIAN && | |
449 | pInfo->charsetFamily == U_CHARSET_FAMILY && | |
450 | pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol" | |
451 | pInfo->dataFormat[1] == 0x43 && | |
452 | pInfo->dataFormat[2] == 0x6f && | |
453 | pInfo->dataFormat[3] == 0x6c && | |
b331163b | 454 | pInfo->formatVersion[0] == 5 |
57a6839d A |
455 | ) { |
456 | UVersionInfo *version = static_cast<UVersionInfo *>(context); | |
457 | if(version != NULL) { | |
458 | uprv_memcpy(version, pInfo->dataVersion, 4); | |
459 | } | |
460 | return TRUE; | |
461 | } else { | |
462 | return FALSE; | |
463 | } | |
464 | } | |
465 | ||
466 | U_NAMESPACE_END | |
467 | ||
468 | #endif // !UCONFIG_NO_COLLATION |