]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationbasedatabuilder.cpp
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationbasedatabuilder.cpp
8 * created on: 2012aug11
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/localpointer.h"
17 #include "unicode/ucharstriebuilder.h"
18 #include "unicode/uniset.h"
19 #include "unicode/unistr.h"
20 #include "unicode/utf16.h"
21 #include "collation.h"
22 #include "collationbasedatabuilder.h"
23 #include "collationdata.h"
24 #include "collationdatabuilder.h"
25 #include "collationrootelements.h"
26 #include "normalizer2impl.h"
38 * Compare two signed int64_t values as if they were unsigned.
41 compareInt64AsUnsigned(int64_t a
, int64_t b
) {
42 if((uint64_t)a
< (uint64_t)b
) {
44 } else if((uint64_t)a
> (uint64_t)b
) {
51 // TODO: Try to merge this with the binarySearch in alphaindex.cpp.
53 * Like Java Collections.binarySearch(List, String, Comparator).
55 * @return the index>=0 where the item was found,
56 * or the index<0 for inserting the string at ~index in sorted order
59 binarySearch(const UVector64
&list
, int64_t ce
) {
60 if (list
.size() == 0) { return ~0; }
62 int32_t limit
= list
.size();
64 int32_t i
= (start
+ limit
) / 2;
65 int32_t cmp
= compareInt64AsUnsigned(ce
, list
.elementAti(i
));
70 return ~start
; // insert ce before i
75 return ~(start
+ 1); // insert ce after i
84 CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode
&errorCode
)
85 : CollationDataBuilder(errorCode
),
86 numericPrimary(0x12000000),
87 firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
88 rootElements(errorCode
) {
91 CollationBaseDataBuilder::~CollationBaseDataBuilder() {
95 CollationBaseDataBuilder::init(UErrorCode
&errorCode
) {
96 if(U_FAILURE(errorCode
)) { return; }
98 errorCode
= U_INVALID_STATE_ERROR
;
107 // Some scripts are compressible, some are not.
108 uprv_memset(compressibleBytes
, FALSE
, 256);
109 compressibleBytes
[Collation::UNASSIGNED_IMPLICIT_BYTE
] = TRUE
;
111 // For a base, the default is to compute an unassigned-character implicit CE.
112 // This includes surrogate code points; see the last option in
113 // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
114 trie
= utrie2_open(Collation::UNASSIGNED_CE32
, Collation::FFFD_CE32
, &errorCode
);
116 // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
117 for(UChar32 c
= 0; c
< 0x180; ++c
) {
118 utrie2_set32(trie
, c
, Collation::UNASSIGNED_CE32
, &errorCode
);
121 utrie2_set32(trie
, 0xfffe, Collation::MERGE_SEPARATOR_CE32
, &errorCode
);
122 // No root element for the merge separator which has 02 weights.
123 // Some code assumes that the root first primary CE is the "space first primary"
124 // from FractionalUCA.txt.
126 uint32_t hangulCE32
= Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG
, 0);
127 utrie2_setRange32(trie
, Hangul::HANGUL_BASE
, Hangul::HANGUL_END
, hangulCE32
, TRUE
, &errorCode
);
129 // Add a mapping for the first-unassigned boundary,
130 // which is the AlphabeticIndex overflow boundary.
131 UnicodeString
s((UChar
)0xfdd1); // Script boundary contractions start with U+FDD1.
132 s
.append((UChar
)0xfdd0); // Zzzz script sample character U+FDD0.
133 int64_t ce
= Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY
);
134 add(UnicodeString(), s
, &ce
, 1, errorCode
);
136 // Add a tailoring boundary, but not a mapping, for [first trailing].
137 ce
= Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY
);
138 rootElements
.addElement(ce
, errorCode
);
140 // U+FFFD maps to a CE with the third-highest primary weight,
141 // for predictable handling of ill-formed UTF-8.
142 uint32_t ce32
= Collation::FFFD_CE32
;
143 utrie2_set32(trie
, 0xfffd, ce32
, &errorCode
);
144 addRootElement(Collation::ceFromSimpleCE32(ce32
), errorCode
);
146 // U+FFFF maps to a CE with the highest primary weight.
147 ce32
= Collation::MAX_REGULAR_CE32
;
148 utrie2_set32(trie
, 0xffff, ce32
, &errorCode
);
149 addRootElement(Collation::ceFromSimpleCE32(ce32
), errorCode
);
153 CollationBaseDataBuilder::initHanRanges(const UChar32 ranges
[], int32_t length
,
154 UErrorCode
&errorCode
) {
155 if(U_FAILURE(errorCode
) || length
== 0) { return; }
156 if((length
& 1) != 0) { // incomplete start/end pairs
157 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
160 if(isAssigned(0x4e00)) { // already set
161 errorCode
= U_INVALID_STATE_ERROR
;
164 int32_t numHanCodePoints
= 0;
165 for(int32_t i
= 0; i
< length
; i
+= 2) {
166 UChar32 start
= ranges
[i
];
167 UChar32 end
= ranges
[i
+ 1];
168 numHanCodePoints
+= end
- start
+ 1;
170 // Multiply the number of code points by (gap+1).
171 // Add hanStep+2 for tailoring after the last Han character.
174 int32_t numHan
= numHanCodePoints
* hanStep
+ hanStep
+ 2;
175 // Numbers of Han primaries per lead byte determined by
176 // numbers of 2nd (not compressible) times 3rd primary byte values.
177 int32_t numHanPerLeadByte
= 254 * 254;
178 int32_t numHanLeadBytes
= (numHan
+ numHanPerLeadByte
- 1) / numHanPerLeadByte
;
179 uint32_t hanPrimary
= (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE
- numHanLeadBytes
) << 24;
180 hanPrimary
|= 0x20200;
181 firstHanPrimary
= hanPrimary
;
182 for(int32_t i
= 0; i
< length
; i
+= 2) {
183 UChar32 start
= ranges
[i
];
184 UChar32 end
= ranges
[i
+ 1];
185 hanPrimary
= setPrimaryRangeAndReturnNext(start
, end
, hanPrimary
, hanStep
, errorCode
);
187 // One past the actual last one, but that is harmless for tailoring.
188 // It saves us from subtracting "hanStep" and handling underflows.
189 lastHanPrimary
= hanPrimary
;
193 CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b
) const {
194 return compressibleBytes
[b
];
198 CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b
) {
199 compressibleBytes
[b
] = TRUE
;
203 CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1
, uint32_t p2
, UBool isCompressible
) {
204 if((p1
& 0xff000000) == (p2
& 0xff000000)) {
206 return (int32_t)(p2
- p1
) >> 16;
212 // Second byte for compressible lead byte: 251 bytes 04..FE
213 linear1
= (int32_t)((p1
>> 16) & 0xff) - 4;
214 linear2
= (int32_t)((p2
>> 16) & 0xff) - 4;
217 // Second byte for incompressible lead byte: 254 bytes 02..FF
218 linear1
= (int32_t)((p1
>> 16) & 0xff) - 2;
219 linear2
= (int32_t)((p2
>> 16) & 0xff) - 2;
222 linear1
+= factor
* (int32_t)((p1
>> 24) & 0xff);
223 linear2
+= factor
* (int32_t)((p2
>> 24) & 0xff);
224 return linear2
- linear1
;
229 CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1
, uint32_t p2
, UBool isCompressible
) {
230 if((p1
& 0xffff0000) == (p2
& 0xffff0000)) {
231 // Same first two bytes.
232 return (int32_t)(p2
- p1
) >> 8;
234 // Third byte: 254 bytes 02..FF
235 int32_t linear1
= (int32_t)((p1
>> 8) & 0xff) - 2;
236 int32_t linear2
= (int32_t)((p2
>> 8) & 0xff) - 2;
239 // Second byte for compressible lead byte: 251 bytes 04..FE
240 linear1
+= 254 * ((int32_t)((p1
>> 16) & 0xff) - 4);
241 linear2
+= 254 * ((int32_t)((p2
>> 16) & 0xff) - 4);
244 // Second byte for incompressible lead byte: 254 bytes 02..FF
245 linear1
+= 254 * ((int32_t)((p1
>> 16) & 0xff) - 2);
246 linear2
+= 254 * ((int32_t)((p2
>> 16) & 0xff) - 2);
249 linear1
+= factor
* (int32_t)((p1
>> 24) & 0xff);
250 linear2
+= factor
* (int32_t)((p2
>> 24) & 0xff);
251 return linear2
- linear1
;
256 CollationBaseDataBuilder::encodeCEs(const int64_t ces
[], int32_t cesLength
, UErrorCode
&errorCode
) {
257 addRootElements(ces
, cesLength
, errorCode
);
258 return CollationDataBuilder::encodeCEs(ces
, cesLength
, errorCode
);
262 CollationBaseDataBuilder::addRootElements(const int64_t ces
[], int32_t cesLength
,
263 UErrorCode
&errorCode
) {
264 if(U_FAILURE(errorCode
)) { return; }
265 for(int32_t i
= 0; i
< cesLength
; ++i
) {
266 addRootElement(ces
[i
], errorCode
);
271 CollationBaseDataBuilder::addRootElement(int64_t ce
, UErrorCode
&errorCode
) {
272 if(U_FAILURE(errorCode
) || ce
== 0) { return; }
274 ce
&= INT64_C(0xffffffffffff3fff);
275 U_ASSERT((ce
& 0xc0) == 0); // quaternary==0
276 // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights.
277 // We will add it later, as part of the Han ranges.
278 uint32_t p
= (uint32_t)(ce
>> 32);
279 uint32_t secTer
= (uint32_t)ce
;
280 if(secTer
== Collation::COMMON_SEC_AND_TER_CE
) {
281 if(firstHanPrimary
<= p
&& p
<= lastHanPrimary
) {
285 // Check that secondary and tertiary weights are >= "common".
286 uint32_t s
= secTer
>> 16;
287 uint32_t t
= secTer
& Collation::ONLY_TERTIARY_MASK
;
288 if((s
!= 0 && s
< Collation::COMMON_WEIGHT16
) || (t
!= 0 && t
< Collation::COMMON_WEIGHT16
)) {
289 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
293 // Check that primaries have at most 3 bytes.
294 if((p
& 0xff) != 0) {
295 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
298 int32_t i
= binarySearch(rootElements
, ce
);
300 rootElements
.insertElementAt(ce
, ~i
, errorCode
);
305 CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte
, uint32_t lastByte
,
306 const UnicodeString
&groupScripts
,
307 UErrorCode
&errorCode
) {
308 if(U_FAILURE(errorCode
)) { return; }
309 if(groupScripts
.isEmpty()) {
310 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
313 if(groupScripts
.indexOf((UChar
)USCRIPT_UNKNOWN
) >= 0) {
314 // Zzzz must not occur.
315 // It is the code used in the API to separate low and high scripts.
316 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
319 // Note: We are mostly trusting the input data,
320 // rather than verifying that reordering groups do not intersect
321 // with their lead byte ranges nor their sets of scripts,
322 // and that all script codes are valid.
323 scripts
.append((UChar
)((firstByte
<< 8) | lastByte
));
324 scripts
.append((UChar
)groupScripts
.length());
325 scripts
.append(groupScripts
);
329 CollationBaseDataBuilder::build(CollationData
&data
, UErrorCode
&errorCode
) {
330 buildMappings(data
, errorCode
);
331 data
.numericPrimary
= numericPrimary
;
332 data
.compressibleBytes
= compressibleBytes
;
333 data
.scripts
= reinterpret_cast<const uint16_t *>(scripts
.getBuffer());
334 data
.scriptsLength
= scripts
.length();
335 buildFastLatinTable(data
, errorCode
);
339 CollationBaseDataBuilder::buildRootElementsTable(UVector32
&table
, UErrorCode
&errorCode
) {
340 if(U_FAILURE(errorCode
)) { return; }
341 uint32_t nextHanPrimary
= firstHanPrimary
; // Set to 0xffffffff after the last Han range.
342 uint32_t prevPrimary
= 0; // Start with primary ignorable CEs.
343 UBool tryRange
= FALSE
;
344 for(int32_t i
= 0; i
< rootElements
.size(); ++i
) {
345 int64_t ce
= rootElements
.elementAti(i
);
346 uint32_t p
= (uint32_t)(ce
>> 32);
347 uint32_t secTer
= (uint32_t)ce
& Collation::ONLY_SEC_TER_MASK
;
348 if(p
!= prevPrimary
) {
349 U_ASSERT((p
& 0xff) == 0);
351 if(p
>= nextHanPrimary
) {
352 // Add a Han primary weight or range.
353 // We omitted them initially, and omitted all CEs with Han primaries
354 // and common secondary/tertiary weights.
355 U_ASSERT(p
> lastHanPrimary
|| secTer
!= Collation::COMMON_SEC_AND_TER_CE
);
356 if(p
== nextHanPrimary
) {
357 // One single Han primary with non-common secondary/tertiary weights.
358 table
.addElement((int32_t)p
, errorCode
);
359 if(p
< lastHanPrimary
) {
360 // Prepare for the next Han range.
361 nextHanPrimary
= Collation::incThreeBytePrimaryByOffset(p
, FALSE
, hanStep
);
363 // p is the last Han primary.
364 nextHanPrimary
= 0xffffffff;
367 // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary.
368 table
.addElement((int32_t)nextHanPrimary
, errorCode
);
369 if(nextHanPrimary
== lastHanPrimary
) {
370 // nextHanPrimary == lastHanPrimary < p
371 // We just wrote the single last Han primary.
372 nextHanPrimary
= 0xffffffff;
373 } else if(p
< lastHanPrimary
) {
374 // nextHanPrimary < p < lastHanPrimary
375 // End the Han range on p, prepare for the next range.
376 table
.addElement((int32_t)p
| hanStep
, errorCode
);
377 nextHanPrimary
= Collation::incThreeBytePrimaryByOffset(p
, FALSE
, hanStep
);
378 } else if(p
== lastHanPrimary
) {
379 // nextHanPrimary < p == lastHanPrimary
380 // End the last Han range on p.
381 table
.addElement((int32_t)p
| hanStep
, errorCode
);
382 nextHanPrimary
= 0xffffffff;
384 // nextHanPrimary < lastHanPrimary < p
385 // End the last Han range, then write p.
386 table
.addElement((int32_t)lastHanPrimary
| hanStep
, errorCode
);
387 nextHanPrimary
= 0xffffffff;
388 table
.addElement((int32_t)p
, errorCode
);
391 } else if(tryRange
&& secTer
== Collation::COMMON_SEC_AND_TER_CE
&&
392 (end
= writeRootElementsRange(prevPrimary
, p
, i
+ 1, table
, errorCode
)) != 0) {
393 // Multiple CEs with only common secondary/tertiary weights were
394 // combined into a primary range.
395 // The range end was written, ending with the primary of rootElements[end].
396 ce
= rootElements
.elementAti(end
);
397 p
= (uint32_t)(ce
>> 32);
398 secTer
= (uint32_t)ce
& Collation::ONLY_SEC_TER_MASK
;
401 // Write the primary weight of a normal CE.
402 table
.addElement((int32_t)p
, errorCode
);
406 if(secTer
== Collation::COMMON_SEC_AND_TER_CE
) {
407 // The common secondar/tertiary weights are implied in the primary unit.
408 // If there is no intervening delta unit, then we will try to combine
409 // the next several primaries into a range.
412 // For each new set of secondary/tertiary weights we write a delta unit.
413 table
.addElement((int32_t)secTer
| CollationRootElements::SEC_TER_DELTA_FLAG
, errorCode
);
418 // Limit sentinel for root elements.
419 // This allows us to reduce range checks at runtime.
420 table
.addElement(CollationRootElements::PRIMARY_SENTINEL
, errorCode
);
424 CollationBaseDataBuilder::writeRootElementsRange(
425 uint32_t prevPrimary
, uint32_t p
, int32_t i
,
426 UVector32
&table
, UErrorCode
&errorCode
) {
427 if(U_FAILURE(errorCode
) || i
>= rootElements
.size()) { return 0; }
428 U_ASSERT(prevPrimary
< p
);
429 // No ranges of single-byte primaries.
430 if((p
& prevPrimary
& 0xff0000) == 0) { return 0; }
431 // Lead bytes of compressible primaries must match.
432 UBool isCompressible
= isCompressiblePrimary(p
);
433 if((isCompressible
|| isCompressiblePrimary(prevPrimary
)) &&
434 (p
& 0xff000000) != (prevPrimary
& 0xff000000)) {
437 // Number of bytes in the primaries.
439 // Number of primaries from prevPrimary to p.
441 if((p
& 0xff00) == 0) {
443 if((prevPrimary
& 0xff00) != 0) { return 0; } // length mismatch
445 step
= diffTwoBytePrimaries(prevPrimary
, p
, isCompressible
);
448 if((prevPrimary
& 0xff00) == 0) { return 0; } // length mismatch
450 step
= diffThreeBytePrimaries(prevPrimary
, p
, isCompressible
);
452 if(step
> (int32_t)CollationRootElements::PRIMARY_STEP_MASK
) { return 0; }
453 // See if there are more than two CEs with primaries increasing by "step"
454 // and with only common secondary/tertiary weights on all but the last one.
455 int32_t end
= 0; // Initially 0: No range for just two primaries.
458 // Calculate which primary we expect next.
459 uint32_t nextPrimary
; // = p + step
461 nextPrimary
= Collation::incTwoBytePrimaryByOffset(p
, isCompressible
, step
);
463 nextPrimary
= Collation::incThreeBytePrimaryByOffset(p
, isCompressible
, step
);
465 // Fetch the actual next CE.
466 int64_t ce
= rootElements
.elementAti(i
);
467 p
= (uint32_t)(ce
>> 32);
468 uint32_t secTer
= (uint32_t)ce
& Collation::ONLY_SEC_TER_MASK
;
469 // Does this primary increase by "step" from the last one?
470 if(p
!= nextPrimary
||
471 // Do not cross into a new lead byte if either is compressible.
472 ((p
& 0xff000000) != (prevPrimary
& 0xff000000) &&
473 (isCompressible
|| isCompressiblePrimary(p
)))) {
474 // The range ends with the previous CE.
478 // Extend the range to include this primary.
480 // This primary is the last in the range if it has non-common weights
481 // or if we are at the end of the list.
482 if(secTer
!= Collation::COMMON_SEC_AND_TER_CE
|| i
>= rootElements
.size()) { break; }
485 table
.addElement((int32_t)p
| step
, errorCode
);
492 #endif // !UCONFIG_NO_COLLATION