1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2013-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * created on: 2013feb09
11 * created by: Markus W. Scherer
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/ucharstrie.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/ustringtrie.h"
22 #include "collation.h"
23 #include "collationdata.h"
24 #include "collationsets.h"
25 #include "normalizer2impl.h"
27 #include "utf16collationiterator.h"
34 static UBool U_CALLCONV
35 enumTailoredRange(const void *context
, UChar32 start
, UChar32 end
, uint32_t ce32
) {
36 if(ce32
== Collation::FALLBACK_CE32
) {
37 return TRUE
; // fallback to base, not tailored
39 TailoredSet
*ts
= (TailoredSet
*)context
;
40 return ts
->handleCE32(start
, end
, ce32
);
46 TailoredSet::forData(const CollationData
*d
, UErrorCode
&ec
) {
47 if(U_FAILURE(ec
)) { return; }
48 errorCode
= ec
; // Preserve info & warning codes.
51 U_ASSERT(baseData
!= NULL
);
52 utrie2_enum(data
->trie
, NULL
, enumTailoredRange
, this);
57 TailoredSet::handleCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) {
58 U_ASSERT(ce32
!= Collation::FALLBACK_CE32
);
59 if(Collation::isSpecialCE32(ce32
)) {
60 ce32
= data
->getIndirectCE32(ce32
);
61 if(ce32
== Collation::FALLBACK_CE32
) {
62 return U_SUCCESS(errorCode
);
66 uint32_t baseCE32
= baseData
->getFinalCE32(baseData
->getCE32(start
));
67 // Do not just continue if ce32 == baseCE32 because
68 // contractions and expansions in different data objects
69 // normally differ even if they have the same data offsets.
70 if(Collation::isSelfContainedCE32(ce32
) && Collation::isSelfContainedCE32(baseCE32
)) {
72 if(ce32
!= baseCE32
) {
76 compare(start
, ce32
, baseCE32
);
78 } while(++start
<= end
);
79 return U_SUCCESS(errorCode
);
83 TailoredSet::compare(UChar32 c
, uint32_t ce32
, uint32_t baseCE32
) {
84 if(Collation::isPrefixCE32(ce32
)) {
85 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
86 ce32
= data
->getFinalCE32(CollationData::readCE32(p
));
87 if(Collation::isPrefixCE32(baseCE32
)) {
88 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
89 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
90 comparePrefixes(c
, p
+ 2, q
+ 2);
92 addPrefixes(data
, c
, p
+ 2);
94 } else if(Collation::isPrefixCE32(baseCE32
)) {
95 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
96 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
97 addPrefixes(baseData
, c
, q
+ 2);
100 if(Collation::isContractionCE32(ce32
)) {
101 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
102 if((ce32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
103 ce32
= Collation::NO_CE32
;
105 ce32
= data
->getFinalCE32(CollationData::readCE32(p
));
107 if(Collation::isContractionCE32(baseCE32
)) {
108 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
109 if((baseCE32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
110 baseCE32
= Collation::NO_CE32
;
112 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
114 compareContractions(c
, p
+ 2, q
+ 2);
116 addContractions(c
, p
+ 2);
118 } else if(Collation::isContractionCE32(baseCE32
)) {
119 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
120 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
121 addContractions(c
, q
+ 2);
125 if(Collation::isSpecialCE32(ce32
)) {
126 tag
= Collation::tagFromCE32(ce32
);
127 U_ASSERT(tag
!= Collation::PREFIX_TAG
);
128 U_ASSERT(tag
!= Collation::CONTRACTION_TAG
);
129 // Currently, the tailoring data builder does not write offset tags.
130 // They might be useful for saving space,
131 // but they would complicate the builder,
132 // and in tailorings we assume that performance of tailored characters is more important.
133 U_ASSERT(tag
!= Collation::OFFSET_TAG
);
138 if(Collation::isSpecialCE32(baseCE32
)) {
139 baseTag
= Collation::tagFromCE32(baseCE32
);
140 U_ASSERT(baseTag
!= Collation::PREFIX_TAG
);
141 U_ASSERT(baseTag
!= Collation::CONTRACTION_TAG
);
146 // Non-contextual mappings, expansions, etc.
147 if(baseTag
== Collation::OFFSET_TAG
) {
148 // We might be comparing a tailoring CE which is a copy of
149 // a base offset-tag CE, via the [optimize [set]] syntax
150 // or when a single-character mapping was copied for tailored contractions.
151 // Offset tags always result in long-primary CEs,
152 // with common secondary/tertiary weights.
153 if(!Collation::isLongPrimaryCE32(ce32
)) {
157 int64_t dataCE
= baseData
->ces
[Collation::indexFromCE32(baseCE32
)];
158 uint32_t p
= Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
);
159 if(Collation::primaryFromLongPrimaryCE32(ce32
) != p
) {
170 if(tag
== Collation::EXPANSION32_TAG
) {
171 const uint32_t *ce32s
= data
->ce32s
+ Collation::indexFromCE32(ce32
);
172 int32_t length
= Collation::lengthFromCE32(ce32
);
174 const uint32_t *baseCE32s
= baseData
->ce32s
+ Collation::indexFromCE32(baseCE32
);
175 int32_t baseLength
= Collation::lengthFromCE32(baseCE32
);
177 if(length
!= baseLength
) {
181 for(int32_t i
= 0; i
< length
; ++i
) {
182 if(ce32s
[i
] != baseCE32s
[i
]) {
187 } else if(tag
== Collation::EXPANSION_TAG
) {
188 const int64_t *ces
= data
->ces
+ Collation::indexFromCE32(ce32
);
189 int32_t length
= Collation::lengthFromCE32(ce32
);
191 const int64_t *baseCEs
= baseData
->ces
+ Collation::indexFromCE32(baseCE32
);
192 int32_t baseLength
= Collation::lengthFromCE32(baseCE32
);
194 if(length
!= baseLength
) {
198 for(int32_t i
= 0; i
< length
; ++i
) {
199 if(ces
[i
] != baseCEs
[i
]) {
204 } else if(tag
== Collation::HANGUL_TAG
) {
206 int32_t length
= Hangul::decompose(c
, jamos
);
207 if(tailored
->contains(jamos
[0]) || tailored
->contains(jamos
[1]) ||
208 (length
== 3 && tailored
->contains(jamos
[2]))) {
211 } else if(ce32
!= baseCE32
) {
217 TailoredSet::comparePrefixes(UChar32 c
, const UChar
*p
, const UChar
*q
) {
218 // Parallel iteration over prefixes of both tables.
219 UCharsTrie::Iterator
prefixes(p
, 0, errorCode
);
220 UCharsTrie::Iterator
basePrefixes(q
, 0, errorCode
);
221 const UnicodeString
*tp
= NULL
; // Tailoring prefix.
222 const UnicodeString
*bp
= NULL
; // Base prefix.
223 // Use a string with a U+FFFF as the limit sentinel.
224 // U+FFFF is untailorable and will not occur in prefixes.
225 UnicodeString
none((UChar
)0xffff);
228 if(prefixes
.next(errorCode
)) {
229 tp
= &prefixes
.getString();
235 if(basePrefixes
.next(errorCode
)) {
236 bp
= &basePrefixes
.getString();
241 if(tp
== &none
&& bp
== &none
) { break; }
242 int32_t cmp
= tp
->compare(*bp
);
244 // tp occurs in the tailoring but not in the base.
245 addPrefix(data
, *tp
, c
, (uint32_t)prefixes
.getValue());
248 // bp occurs in the base but not in the tailoring.
249 addPrefix(baseData
, *bp
, c
, (uint32_t)basePrefixes
.getValue());
253 compare(c
, (uint32_t)prefixes
.getValue(), (uint32_t)basePrefixes
.getValue());
262 TailoredSet::compareContractions(UChar32 c
, const UChar
*p
, const UChar
*q
) {
263 // Parallel iteration over suffixes of both tables.
264 UCharsTrie::Iterator
suffixes(p
, 0, errorCode
);
265 UCharsTrie::Iterator
baseSuffixes(q
, 0, errorCode
);
266 const UnicodeString
*ts
= NULL
; // Tailoring suffix.
267 const UnicodeString
*bs
= NULL
; // Base suffix.
268 // Use a string with two U+FFFF as the limit sentinel.
269 // U+FFFF is untailorable and will not occur in contractions except maybe
270 // as a single suffix character for a root-collator boundary contraction.
271 UnicodeString
none((UChar
)0xffff);
272 none
.append((UChar
)0xffff);
275 if(suffixes
.next(errorCode
)) {
276 ts
= &suffixes
.getString();
282 if(baseSuffixes
.next(errorCode
)) {
283 bs
= &baseSuffixes
.getString();
288 if(ts
== &none
&& bs
== &none
) { break; }
289 int32_t cmp
= ts
->compare(*bs
);
291 // ts occurs in the tailoring but not in the base.
295 // bs occurs in the base but not in the tailoring.
300 compare(c
, (uint32_t)suffixes
.getValue(), (uint32_t)baseSuffixes
.getValue());
309 TailoredSet::addPrefixes(const CollationData
*d
, UChar32 c
, const UChar
*p
) {
310 UCharsTrie::Iterator
prefixes(p
, 0, errorCode
);
311 while(prefixes
.next(errorCode
)) {
312 addPrefix(d
, prefixes
.getString(), c
, (uint32_t)prefixes
.getValue());
317 TailoredSet::addPrefix(const CollationData
*d
, const UnicodeString
&pfx
, UChar32 c
, uint32_t ce32
) {
319 ce32
= d
->getFinalCE32(ce32
);
320 if(Collation::isContractionCE32(ce32
)) {
321 const UChar
*p
= d
->contexts
+ Collation::indexFromCE32(ce32
);
322 addContractions(c
, p
+ 2);
324 tailored
->add(UnicodeString(unreversedPrefix
).append(c
));
329 TailoredSet::addContractions(UChar32 c
, const UChar
*p
) {
330 UCharsTrie::Iterator
suffixes(p
, 0, errorCode
);
331 while(suffixes
.next(errorCode
)) {
332 addSuffix(c
, suffixes
.getString());
337 TailoredSet::addSuffix(UChar32 c
, const UnicodeString
&sfx
) {
338 tailored
->add(UnicodeString(unreversedPrefix
).append(c
).append(sfx
));
342 TailoredSet::add(UChar32 c
) {
343 if(unreversedPrefix
.isEmpty() && suffix
== NULL
) {
346 UnicodeString
s(unreversedPrefix
);
355 ContractionsAndExpansions::CESink::~CESink() {}
359 static UBool U_CALLCONV
360 enumCnERange(const void *context
, UChar32 start
, UChar32 end
, uint32_t ce32
) {
361 ContractionsAndExpansions
*cne
= (ContractionsAndExpansions
*)context
;
362 if(cne
->checkTailored
== 0) {
363 // There is no tailoring.
364 // No need to collect nor check the tailored set.
365 } else if(cne
->checkTailored
< 0) {
366 // Collect the set of code points with mappings in the tailoring data.
367 if(ce32
== Collation::FALLBACK_CE32
) {
368 return TRUE
; // fallback to base, not tailored
370 cne
->tailored
.add(start
, end
);
372 // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
373 } else if(start
== end
) {
374 if(cne
->tailored
.contains(start
)) {
377 } else if(cne
->tailored
.containsSome(start
, end
)) {
378 cne
->ranges
.set(start
, end
).removeAll(cne
->tailored
);
379 int32_t count
= cne
->ranges
.getRangeCount();
380 for(int32_t i
= 0; i
< count
; ++i
) {
381 cne
->handleCE32(cne
->ranges
.getRangeStart(i
), cne
->ranges
.getRangeEnd(i
), ce32
);
383 return U_SUCCESS(cne
->errorCode
);
385 cne
->handleCE32(start
, end
, ce32
);
386 return U_SUCCESS(cne
->errorCode
);
392 ContractionsAndExpansions::forData(const CollationData
*d
, UErrorCode
&ec
) {
393 if(U_FAILURE(ec
)) { return; }
394 errorCode
= ec
; // Preserve info & warning codes.
395 // Add all from the data, can be tailoring or base.
396 if(d
->base
!= NULL
) {
400 utrie2_enum(data
->trie
, NULL
, enumCnERange
, this);
401 if(d
->base
== NULL
|| U_FAILURE(errorCode
)) {
405 // Add all from the base data but only for un-tailored code points.
409 utrie2_enum(data
->trie
, NULL
, enumCnERange
, this);
414 ContractionsAndExpansions::forCodePoint(const CollationData
*d
, UChar32 c
, UErrorCode
&ec
) {
415 if(U_FAILURE(ec
)) { return; }
416 errorCode
= ec
; // Preserve info & warning codes.
417 uint32_t ce32
= d
->getCE32(c
);
418 if(ce32
== Collation::FALLBACK_CE32
) {
420 ce32
= d
->getCE32(c
);
423 handleCE32(c
, c
, ce32
);
428 ContractionsAndExpansions::handleCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) {
430 if((ce32
& 0xff) < Collation::SPECIAL_CE32_LOW_BYTE
) {
433 sink
->handleCE(Collation::ceFromSimpleCE32(ce32
));
437 switch(Collation::tagFromCE32(ce32
)) {
438 case Collation::FALLBACK_TAG
:
440 case Collation::RESERVED_TAG_3
:
441 case Collation::BUILDER_DATA_TAG
:
442 case Collation::LEAD_SURROGATE_TAG
:
443 if(U_SUCCESS(errorCode
)) { errorCode
= U_INTERNAL_PROGRAM_ERROR
; }
445 case Collation::LONG_PRIMARY_TAG
:
447 sink
->handleCE(Collation::ceFromLongPrimaryCE32(ce32
));
450 case Collation::LONG_SECONDARY_TAG
:
452 sink
->handleCE(Collation::ceFromLongSecondaryCE32(ce32
));
455 case Collation::LATIN_EXPANSION_TAG
:
457 ces
[0] = Collation::latinCE0FromCE32(ce32
);
458 ces
[1] = Collation::latinCE1FromCE32(ce32
);
459 sink
->handleExpansion(ces
, 2);
461 // Optimization: If we have a prefix,
462 // then the relevant strings have been added already.
463 if(unreversedPrefix
.isEmpty()) {
464 addExpansions(start
, end
);
467 case Collation::EXPANSION32_TAG
:
469 const uint32_t *ce32s
= data
->ce32s
+ Collation::indexFromCE32(ce32
);
470 int32_t length
= Collation::lengthFromCE32(ce32
);
471 for(int32_t i
= 0; i
< length
; ++i
) {
472 ces
[i
] = Collation::ceFromCE32(*ce32s
++);
474 sink
->handleExpansion(ces
, length
);
476 // Optimization: If we have a prefix,
477 // then the relevant strings have been added already.
478 if(unreversedPrefix
.isEmpty()) {
479 addExpansions(start
, end
);
482 case Collation::EXPANSION_TAG
:
484 int32_t length
= Collation::lengthFromCE32(ce32
);
485 sink
->handleExpansion(data
->ces
+ Collation::indexFromCE32(ce32
), length
);
487 // Optimization: If we have a prefix,
488 // then the relevant strings have been added already.
489 if(unreversedPrefix
.isEmpty()) {
490 addExpansions(start
, end
);
493 case Collation::PREFIX_TAG
:
494 handlePrefixes(start
, end
, ce32
);
496 case Collation::CONTRACTION_TAG
:
497 handleContractions(start
, end
, ce32
);
499 case Collation::DIGIT_TAG
:
500 // Fetch the non-numeric-collation CE32 and continue.
501 ce32
= data
->ce32s
[Collation::indexFromCE32(ce32
)];
503 case Collation::U0000_TAG
:
504 U_ASSERT(start
== 0 && end
== 0);
505 // Fetch the normal ce32 for U+0000 and continue.
506 ce32
= data
->ce32s
[0];
508 case Collation::HANGUL_TAG
:
510 // TODO: This should be optimized,
511 // especially if [start..end] is the complete Hangul range. (assert that)
512 UTF16CollationIterator
iter(data
, FALSE
, NULL
, NULL
, NULL
);
513 UChar hangul
[1] = { 0 };
514 for(UChar32 c
= start
; c
<= end
; ++c
) {
515 hangul
[0] = (UChar
)c
;
516 iter
.setText(hangul
, hangul
+ 1);
517 int32_t length
= iter
.fetchCEs(errorCode
);
518 if(U_FAILURE(errorCode
)) { return; }
519 // Ignore the terminating non-CE.
520 U_ASSERT(length
>= 2 && iter
.getCE(length
- 1) == Collation::NO_CE
);
521 sink
->handleExpansion(iter
.getCEs(), length
- 1);
524 // Optimization: If we have a prefix,
525 // then the relevant strings have been added already.
526 if(unreversedPrefix
.isEmpty()) {
527 addExpansions(start
, end
);
530 case Collation::OFFSET_TAG
:
531 // Currently no need to send offset CEs to the sink.
533 case Collation::IMPLICIT_TAG
:
534 // Currently no need to send implicit CEs to the sink.
541 ContractionsAndExpansions::handlePrefixes(
542 UChar32 start
, UChar32 end
, uint32_t ce32
) {
543 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
544 ce32
= CollationData::readCE32(p
); // Default if no prefix match.
545 handleCE32(start
, end
, ce32
);
546 if(!addPrefixes
) { return; }
547 UCharsTrie::Iterator
prefixes(p
+ 2, 0, errorCode
);
548 while(prefixes
.next(errorCode
)) {
549 setPrefix(prefixes
.getString());
550 // Prefix/pre-context mappings are special kinds of contractions
551 // that always yield expansions.
552 addStrings(start
, end
, contractions
);
553 addStrings(start
, end
, expansions
);
554 handleCE32(start
, end
, (uint32_t)prefixes
.getValue());
560 ContractionsAndExpansions::handleContractions(
561 UChar32 start
, UChar32 end
, uint32_t ce32
) {
562 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
563 if((ce32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
564 // No match on the single code point.
565 // We are underneath a prefix, and the default mapping is just
566 // a fallback to the mappings for a shorter prefix.
567 U_ASSERT(!unreversedPrefix
.isEmpty());
569 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
570 U_ASSERT(!Collation::isContractionCE32(ce32
));
571 handleCE32(start
, end
, ce32
);
573 UCharsTrie::Iterator
suffixes(p
+ 2, 0, errorCode
);
574 while(suffixes
.next(errorCode
)) {
575 suffix
= &suffixes
.getString();
576 addStrings(start
, end
, contractions
);
577 if(!unreversedPrefix
.isEmpty()) {
578 addStrings(start
, end
, expansions
);
580 handleCE32(start
, end
, (uint32_t)suffixes
.getValue());
586 ContractionsAndExpansions::addExpansions(UChar32 start
, UChar32 end
) {
587 if(unreversedPrefix
.isEmpty() && suffix
== NULL
) {
588 if(expansions
!= NULL
) {
589 expansions
->add(start
, end
);
592 addStrings(start
, end
, expansions
);
597 ContractionsAndExpansions::addStrings(UChar32 start
, UChar32 end
, UnicodeSet
*set
) {
598 if(set
== NULL
) { return; }
599 UnicodeString
s(unreversedPrefix
);
606 s
.truncate(unreversedPrefix
.length());
607 } while(++start
<= end
);
612 #endif // !UCONFIG_NO_COLLATION