2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * created on: 2013feb09
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucharstrie.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
19 #include "unicode/ustringtrie.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationsets.h"
23 #include "normalizer2impl.h"
25 #include "utf16collationiterator.h"
// Range callback handed to utrie2_enum() by TailoredSet::forData().
// context: the TailoredSet that started the enumeration (it passes `this`).
// start, end, ce32: the enumerated code point range and its CE32 value.
// Returns TRUE right away for FALLBACK_CE32 ranges; otherwise returns the
// result of TailoredSet::handleCE32() for the range.
32 static UBool U_CALLCONV
33 enumTailoredRange(const void *context
, UChar32 start
, UChar32 end
, uint32_t ce32
) {
34 if(ce32
== Collation::FALLBACK_CE32
) {
35 return TRUE
; // fallback to base, not tailored
// The context pointer is the TailoredSet supplied to utrie2_enum().
37 TailoredSet
*ts
= (TailoredSet
*)context
;
38 return ts
->handleCE32(start
, end
, ce32
);
// Runs enumTailoredRange() over every range of data->trie, with this object
// as the callback context.
// d:  collation data to examine.
//     NOTE(review): d is not referenced in the visible lines; presumably the
//     data and baseData members are set from it on lines missing from this
//     listing -- confirm.
// ec: caller's error code. Nothing is done if it already signals a failure;
//     otherwise its value is copied into the errorCode member first.
44 TailoredSet::forData(const CollationData
*d
, UErrorCode
&ec
) {
45 if(U_FAILURE(ec
)) { return; }
46 errorCode
= ec
; // Preserve info & warning codes.
// The comparison below needs base data to compare against.
49 U_ASSERT(baseData
!= NULL
);
50 utrie2_enum(data
->trie
, NULL
, enumTailoredRange
, this);
// Processes one enumerated range [start, end] whose tailoring value is ce32.
// A special ce32 is first resolved through data->getIndirectCE32(); if that
// yields FALLBACK_CE32 the range is skipped. Otherwise the code points from
// start through end are checked one by one against the base data's final
// CE32 for that code point.
// Returns U_SUCCESS(errorCode) on both visible exits.
// NOTE(review): the loop header and the statement executed when two
// self-contained CE32s differ are missing from this listing.
55 TailoredSet::handleCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) {
56 U_ASSERT(ce32
!= Collation::FALLBACK_CE32
);
57 if(Collation::isSpecialCE32(ce32
)) {
58 ce32
= data
->getIndirectCE32(ce32
);
59 if(ce32
== Collation::FALLBACK_CE32
) {
60 return U_SUCCESS(errorCode
);
// Base mapping for the current code point (start advances in the loop).
64 uint32_t baseCE32
= baseData
->getFinalCE32(baseData
->getCE32(start
));
65 // Do not just continue if ce32 == baseCE32 because
66 // contractions and expansions in different data objects
67 // normally differ even if they have the same data offsets.
// Two self-contained CE32s can be compared directly by value.
68 if(Collation::isSelfContainedCE32(ce32
) && Collation::isSelfContainedCE32(baseCE32
)) {
70 if(ce32
!= baseCE32
) {
// Any other combination is handed to compare() for a detailed comparison.
74 compare(start
, ce32
, baseCE32
);
76 } while(++start
<= end
);
77 return U_SUCCESS(errorCode
);
// Compares the tailoring mapping ce32 of code point c with the base mapping
// baseCE32. Works in stages: prefix data first, then contraction data, then
// the remaining (non-contextual) CE32 values.
// NOTE(review): several lines of this function (else branches, local
// declarations such as tag/baseTag/jamos, and the statements executed on a
// mismatch) are missing from this listing; the comments below describe only
// what the visible lines do.
81 TailoredSet::compare(UChar32 c
, uint32_t ce32
, uint32_t baseCE32
) {
// Stage 1: prefix data. Each prefix CE32 is replaced by the final CE32 read
// from the start of its context data; the prefix trie starts at offset 2
// (p + 2 / q + 2) and is either compared with the other side or added whole.
82 if(Collation::isPrefixCE32(ce32
)) {
83 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
84 ce32
= data
->getFinalCE32(CollationData::readCE32(p
));
85 if(Collation::isPrefixCE32(baseCE32
)) {
86 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
87 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
88 comparePrefixes(c
, p
+ 2, q
+ 2);
90 addPrefixes(data
, c
, p
+ 2);
92 } else if(Collation::isPrefixCE32(baseCE32
)) {
93 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
94 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
95 addPrefixes(baseData
, c
, q
+ 2);
// Stage 2: contraction data, laid out like the prefix data above.
// When the CONTRACT_SINGLE_CP_NO_MATCH bit is set the CE32 becomes NO_CE32.
// NOTE(review): the readCE32() assignment that follows each NO_CE32
// assignment is presumably the else branch; the else line is not visible.
98 if(Collation::isContractionCE32(ce32
)) {
99 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
100 if((ce32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
101 ce32
= Collation::NO_CE32
;
103 ce32
= data
->getFinalCE32(CollationData::readCE32(p
));
105 if(Collation::isContractionCE32(baseCE32
)) {
106 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
107 if((baseCE32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
108 baseCE32
= Collation::NO_CE32
;
110 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
112 compareContractions(c
, p
+ 2, q
+ 2);
114 addContractions(c
, p
+ 2);
116 } else if(Collation::isContractionCE32(baseCE32
)) {
117 const UChar
*q
= baseData
->contexts
+ Collation::indexFromCE32(baseCE32
);
118 baseCE32
= baseData
->getFinalCE32(CollationData::readCE32(q
));
119 addContractions(c
, q
+ 2);
// Stage 3: fetch the tags of the remaining special CE32s. Prefix and
// contraction tags are asserted to have been resolved by the stages above.
123 if(Collation::isSpecialCE32(ce32
)) {
124 tag
= Collation::tagFromCE32(ce32
);
125 U_ASSERT(tag
!= Collation::PREFIX_TAG
);
126 U_ASSERT(tag
!= Collation::CONTRACTION_TAG
);
127 // Currently, the tailoring data builder does not write offset tags.
128 // They might be useful for saving space,
129 // but they would complicate the builder,
130 // and in tailorings we assume that performance of tailored characters is more important.
131 U_ASSERT(tag
!= Collation::OFFSET_TAG
);
136 if(Collation::isSpecialCE32(baseCE32
)) {
137 baseTag
= Collation::tagFromCE32(baseCE32
);
138 U_ASSERT(baseTag
!= Collation::PREFIX_TAG
);
139 U_ASSERT(baseTag
!= Collation::CONTRACTION_TAG
);
144 // Non-contextual mappings, expansions, etc.
145 if(baseTag
== Collation::OFFSET_TAG
) {
146 // We might be comparing a tailoring CE which is a copy of
147 // a base offset-tag CE, via the [optimize [set]] syntax
148 // or when a single-character mapping was copied for tailored contractions.
149 // Offset tags always result in long-primary CEs,
150 // with common secondary/tertiary weights.
151 if(!Collation::isLongPrimaryCE32(ce32
)) {
// Compute the primary that the base offset data yields for c and compare it
// with the primary stored in the tailoring's long-primary CE32.
155 int64_t dataCE
= baseData
->ces
[Collation::indexFromCE32(baseCE32
)];
156 uint32_t p
= Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
);
157 if(Collation::primaryFromLongPrimaryCE32(ce32
) != p
) {
// 32-bit expansions: compare the lengths, then the CE32 arrays element-wise.
168 if(tag
== Collation::EXPANSION32_TAG
) {
169 const uint32_t *ce32s
= data
->ce32s
+ Collation::indexFromCE32(ce32
);
170 int32_t length
= Collation::lengthFromCE32(ce32
);
172 const uint32_t *baseCE32s
= baseData
->ce32s
+ Collation::indexFromCE32(baseCE32
);
173 int32_t baseLength
= Collation::lengthFromCE32(baseCE32
);
175 if(length
!= baseLength
) {
179 for(int32_t i
= 0; i
< length
; ++i
) {
180 if(ce32s
[i
] != baseCE32s
[i
]) {
// 64-bit expansions: same procedure on the CE arrays.
185 } else if(tag
== Collation::EXPANSION_TAG
) {
186 const int64_t *ces
= data
->ces
+ Collation::indexFromCE32(ce32
);
187 int32_t length
= Collation::lengthFromCE32(ce32
);
189 const int64_t *baseCEs
= baseData
->ces
+ Collation::indexFromCE32(baseCE32
);
190 int32_t baseLength
= Collation::lengthFromCE32(baseCE32
);
192 if(length
!= baseLength
) {
196 for(int32_t i
= 0; i
< length
; ++i
) {
197 if(ces
[i
] != baseCEs
[i
]) {
// Hangul: decompose c and test whether any resulting Jamo (two, or three
// when length == 3) is already contained in the tailored set.
202 } else if(tag
== Collation::HANGUL_TAG
) {
204 int32_t length
= Hangul::decompose(c
, jamos
);
205 if(tailored
->contains(jamos
[0]) || tailored
->contains(jamos
[1]) ||
206 (length
== 3 && tailored
->contains(jamos
[2]))) {
// Everything else is compared by CE32 value.
209 } else if(ce32
!= baseCE32
) {
// Walks the tailoring prefix trie at p and the base prefix trie at q side by
// side for code point c. According to the comments inside, a prefix found
// only in the tailoring is passed to addPrefix(data, ...), one found only in
// the base to addPrefix(baseData, ...); the visible compare() call receives
// the current values of both iterators.
// NOTE(review): the loop header, the handling of an exhausted iterator and
// the branch conditions on cmp are missing from this listing.
215 TailoredSet::comparePrefixes(UChar32 c
, const UChar
*p
, const UChar
*q
) {
216 // Parallel iteration over prefixes of both tables.
217 UCharsTrie::Iterator
prefixes(p
, 0, errorCode
);
218 UCharsTrie::Iterator
basePrefixes(q
, 0, errorCode
);
219 const UnicodeString
*tp
= NULL
; // Tailoring prefix.
220 const UnicodeString
*bp
= NULL
; // Base prefix.
221 // Use a string with a U+FFFF as the limit sentinel.
222 // U+FFFF is untailorable and will not occur in prefixes.
223 UnicodeString
none((UChar
)0xffff);
// Advance the tailoring iterator and remember its current string.
226 if(prefixes
.next(errorCode
)) {
227 tp
= &prefixes
.getString();
// Advance the base iterator and remember its current string.
233 if(basePrefixes
.next(errorCode
)) {
234 bp
= &basePrefixes
.getString();
// Both sides pointing at the sentinel ends the iteration.
239 if(tp
== &none
&& bp
== &none
) { break; }
240 int32_t cmp
= tp
->compare(*bp
);
242 // tp occurs in the tailoring but not in the base.
243 addPrefix(data
, *tp
, c
, (uint32_t)prefixes
.getValue());
246 // bp occurs in the base but not in the tailoring.
247 addPrefix(baseData
, *bp
, c
, (uint32_t)basePrefixes
.getValue());
// Compare the mappings stored for the current prefix on both sides.
251 compare(c
, (uint32_t)prefixes
.getValue(), (uint32_t)basePrefixes
.getValue());
// Walks the tailoring contraction-suffix trie at p and the base trie at q
// side by side for code point c; the visible compare() call receives the
// current values of both iterators.
// NOTE(review): the loop header, the handling of an exhausted iterator, the
// branch conditions on cmp and the statements for a suffix found on only one
// side are missing from this listing.
260 TailoredSet::compareContractions(UChar32 c
, const UChar
*p
, const UChar
*q
) {
261 // Parallel iteration over suffixes of both tables.
262 UCharsTrie::Iterator
suffixes(p
, 0, errorCode
);
263 UCharsTrie::Iterator
baseSuffixes(q
, 0, errorCode
);
264 const UnicodeString
*ts
= NULL
; // Tailoring suffix.
265 const UnicodeString
*bs
= NULL
; // Base suffix.
266 // Use a string with two U+FFFF as the limit sentinel.
267 // U+FFFF is untailorable and will not occur in contractions except maybe
268 // as a single suffix character for a root-collator boundary contraction.
269 UnicodeString
none((UChar
)0xffff);
270 none
.append((UChar
)0xffff);
// Advance the tailoring iterator and remember its current string.
273 if(suffixes
.next(errorCode
)) {
274 ts
= &suffixes
.getString();
// Advance the base iterator and remember its current string.
280 if(baseSuffixes
.next(errorCode
)) {
281 bs
= &baseSuffixes
.getString();
// Both sides pointing at the sentinel ends the iteration.
286 if(ts
== &none
&& bs
== &none
) { break; }
287 int32_t cmp
= ts
->compare(*bs
);
289 // ts occurs in the tailoring but not in the base.
293 // bs occurs in the base but not in the tailoring.
// Compare the mappings stored for the current suffix on both sides.
298 compare(c
, (uint32_t)suffixes
.getValue(), (uint32_t)baseSuffixes
.getValue());
// Calls addPrefix() for every (string, value) pair in the prefix trie at p.
// d: collation data the trie belongs to (forwarded to addPrefix()).
// c: code point the prefixes apply to.
// p: start of the prefix trie.
307 TailoredSet::addPrefixes(const CollationData
*d
, UChar32 c
, const UChar
*p
) {
308 UCharsTrie::Iterator
prefixes(p
, 0, errorCode
);
309 while(prefixes
.next(errorCode
)) {
310 addPrefix(d
, prefixes
.getString(), c
, (uint32_t)prefixes
.getValue());
// Records one prefix mapping of code point c in the tailored set.
// d:    collation data used to resolve ce32 and to locate context data.
// pfx:  the prefix string.
//       NOTE(review): pfx is not referenced in the visible lines; presumably
//       it is installed as unreversedPrefix on a line missing from this
//       listing -- confirm.
// c:    code point the prefix applies to.
// ce32: value stored for the prefix; resolved with d->getFinalCE32().
315 TailoredSet::addPrefix(const CollationData
*d
, const UnicodeString
&pfx
, UChar32 c
, uint32_t ce32
) {
317 ce32
= d
->getFinalCE32(ce32
);
// A contraction under this prefix: add its suffix strings too.
// The suffix trie starts at offset 2 of the context data.
318 if(Collation::isContractionCE32(ce32
)) {
319 const UChar
*p
= d
->contexts
+ Collation::indexFromCE32(ce32
);
320 addContractions(c
, p
+ 2);
// Add the string consisting of the current unreversedPrefix followed by c.
322 tailored
->add(UnicodeString(unreversedPrefix
).append(c
));
// Calls addSuffix(c, suffix) for every string in the suffix trie at p.
// c: code point that starts the contractions.
// p: start of the contraction-suffix trie.
327 TailoredSet::addContractions(UChar32 c
, const UChar
*p
) {
328 UCharsTrie::Iterator
suffixes(p
, 0, errorCode
);
329 while(suffixes
.next(errorCode
)) {
330 addSuffix(c
, suffixes
.getString());
// Adds to the tailored set the string made of the current unreversedPrefix,
// then code point c, then the suffix sfx.
335 TailoredSet::addSuffix(UChar32 c
, const UnicodeString
&sfx
) {
336 tailored
->add(UnicodeString(unreversedPrefix
).append(c
).append(sfx
));
// Adds code point c, taking the current prefix/suffix context into account.
// NOTE(review): only the test for "no prefix and no suffix" and the start of
// building a string from unreversedPrefix are visible; the statements that
// actually add to the set are missing from this listing.
340 TailoredSet::add(UChar32 c
) {
341 if(unreversedPrefix
.isEmpty() && suffix
== NULL
) {
// Otherwise a string is assembled, starting from a copy of the prefix.
344 UnicodeString
s(unreversedPrefix
);
// Empty out-of-line destructor of the nested CESink class.
353 ContractionsAndExpansions::CESink::~CESink() {}
// Range callback handed to utrie2_enum() by
// ContractionsAndExpansions::forData().
// context: the ContractionsAndExpansions object (forData() passes `this`).
// start, end, ce32: the enumerated code point range and its CE32 value.
// Behaviour depends on cne->checkTailored:
//   == 0: nothing to collect or check; the range is handled as is.
//   <  0: FALLBACK_CE32 ranges are skipped (returns TRUE); other ranges are
//         recorded in cne->tailored and then handled.
//   >  0: a range overlapping cne->tailored is reduced to its untailored
//         sub-ranges, each of which is handled separately.
// The paths that reach handleCE32() return U_SUCCESS(cne->errorCode).
// NOTE(review): the statement executed when a single code point is contained
// in cne->tailored is missing from this listing.
357 static UBool U_CALLCONV
358 enumCnERange(const void *context
, UChar32 start
, UChar32 end
, uint32_t ce32
) {
359 ContractionsAndExpansions
*cne
= (ContractionsAndExpansions
*)context
;
360 if(cne
->checkTailored
== 0) {
361 // There is no tailoring.
362 // No need to collect nor check the tailored set.
363 } else if(cne
->checkTailored
< 0) {
364 // Collect the set of code points with mappings in the tailoring data.
365 if(ce32
== Collation::FALLBACK_CE32
) {
366 return TRUE
; // fallback to base, not tailored
368 cne
->tailored
.add(start
, end
);
370 // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
371 } else if(start
== end
) {
372 if(cne
->tailored
.contains(start
)) {
375 } else if(cne
->tailored
.containsSome(start
, end
)) {
// cne->ranges is reused as scratch space: [start, end] minus the tailored set.
376 cne
->ranges
.set(start
, end
).removeAll(cne
->tailored
);
377 int32_t count
= cne
->ranges
.getRangeCount();
378 for(int32_t i
= 0; i
< count
; ++i
) {
379 cne
->handleCE32(cne
->ranges
.getRangeStart(i
), cne
->ranges
.getRangeEnd(i
), ce32
);
381 return U_SUCCESS(cne
->errorCode
);
// No overlap with the tailored set (or no check needed): handle the whole range.
383 cne
->handleCE32(start
, end
, ce32
);
384 return U_SUCCESS(cne
->errorCode
);
// Enumerates data->trie with enumCnERange(), once for the given data and,
// per the comments below, a second time for the base data restricted to
// un-tailored code points.
// d:  collation data, either a tailoring (d->base != NULL) or base data.
// ec: caller's error code. Nothing is done if it already signals a failure;
//     otherwise its value is copied into the errorCode member first.
// NOTE(review): the bodies of both if statements and the assignments made
// between the two utrie2_enum() calls are missing from this listing.
390 ContractionsAndExpansions::forData(const CollationData
*d
, UErrorCode
&ec
) {
391 if(U_FAILURE(ec
)) { return; }
392 errorCode
= ec
; // Preserve info & warning codes.
393 // Add all from the data, can be tailoring or base.
394 if(d
->base
!= NULL
) {
398 utrie2_enum(data
->trie
, NULL
, enumCnERange
, this);
// Without base data, or after an error, there is no second pass.
399 if(d
->base
== NULL
|| U_FAILURE(errorCode
)) {
403 // Add all from the base data but only for un-tailored code points.
407 utrie2_enum(data
->trie
, NULL
, enumCnERange
, this);
// Handles the single code point c: looks up its CE32 in d and passes it to
// handleCE32(c, c, ce32).
// d:  collation data to look c up in.
// c:  the code point.
// ec: caller's error code. Nothing is done if it already signals a failure;
//     otherwise its value is copied into the errorCode member first.
412 ContractionsAndExpansions::forCodePoint(const CollationData
*d
, UChar32 c
, UErrorCode
&ec
) {
413 if(U_FAILURE(ec
)) { return; }
414 errorCode
= ec
; // Preserve info & warning codes.
415 uint32_t ce32
= d
->getCE32(c
);
// NOTE(review): on FALLBACK_CE32 the lookup is repeated; presumably d is
// switched to the base data on a line missing from this listing -- confirm.
416 if(ce32
== Collation::FALLBACK_CE32
) {
418 ce32
= d
->getCE32(c
);
421 handleCE32(c
, c
, ce32
);
// Dispatches on the form of ce32 for the code point range [start, end]:
//  - a simple CE32 (low byte below SPECIAL_CE32_LOW_BYTE) and the
//    long-primary / long-secondary tags are converted to one CE for the sink;
//  - the Latin-expansion, EXPANSION32, EXPANSION and Hangul tags are sent to
//    sink->handleExpansion() and recorded through addExpansions();
//  - prefix and contraction tags go to handlePrefixes()/handleContractions();
//  - digit and U+0000 tags replace ce32 with another value from data->ce32s;
//  - RESERVED_TAG_3, BUILDER_DATA_TAG and LEAD_SURROGATE_TAG set errorCode to
//    U_INTERNAL_PROGRAM_ERROR unless it already holds a failure.
// NOTE(review): the enclosing loop header, the checks guarding the sink
// calls, the local ces array declaration, the FALLBACK_TAG action and all
// break/return/continue statements are missing from this listing.
426 ContractionsAndExpansions::handleCE32(UChar32 start
, UChar32 end
, uint32_t ce32
) {
428 if((ce32
& 0xff) < Collation::SPECIAL_CE32_LOW_BYTE
) {
431 sink
->handleCE(Collation::ceFromSimpleCE32(ce32
));
435 switch(Collation::tagFromCE32(ce32
)) {
436 case Collation::FALLBACK_TAG
:
438 case Collation::RESERVED_TAG_3
:
439 case Collation::BUILDER_DATA_TAG
:
440 case Collation::LEAD_SURROGATE_TAG
:
441 if(U_SUCCESS(errorCode
)) { errorCode
= U_INTERNAL_PROGRAM_ERROR
; }
443 case Collation::LONG_PRIMARY_TAG
:
445 sink
->handleCE(Collation::ceFromLongPrimaryCE32(ce32
));
448 case Collation::LONG_SECONDARY_TAG
:
450 sink
->handleCE(Collation::ceFromLongSecondaryCE32(ce32
));
// Latin expansion: two CEs are unpacked directly from the CE32.
453 case Collation::LATIN_EXPANSION_TAG
:
455 ces
[0] = Collation::latinCE0FromCE32(ce32
);
456 ces
[1] = Collation::latinCE1FromCE32(ce32
);
457 sink
->handleExpansion(ces
, 2);
459 // Optimization: If we have a prefix,
460 // then the relevant strings have been added already.
461 if(unreversedPrefix
.isEmpty()) {
462 addExpansions(start
, end
);
// 32-bit expansion: each stored CE32 is widened to a CE before delivery.
465 case Collation::EXPANSION32_TAG
:
467 const uint32_t *ce32s
= data
->ce32s
+ Collation::indexFromCE32(ce32
);
468 int32_t length
= Collation::lengthFromCE32(ce32
);
469 for(int32_t i
= 0; i
< length
; ++i
) {
470 ces
[i
] = Collation::ceFromCE32(*ce32s
++);
472 sink
->handleExpansion(ces
, length
);
474 // Optimization: If we have a prefix,
475 // then the relevant strings have been added already.
476 if(unreversedPrefix
.isEmpty()) {
477 addExpansions(start
, end
);
// 64-bit expansion: the CEs are delivered straight from data->ces.
480 case Collation::EXPANSION_TAG
:
482 int32_t length
= Collation::lengthFromCE32(ce32
);
483 sink
->handleExpansion(data
->ces
+ Collation::indexFromCE32(ce32
), length
);
485 // Optimization: If we have a prefix,
486 // then the relevant strings have been added already.
487 if(unreversedPrefix
.isEmpty()) {
488 addExpansions(start
, end
);
491 case Collation::PREFIX_TAG
:
492 handlePrefixes(start
, end
, ce32
);
494 case Collation::CONTRACTION_TAG
:
495 handleContractions(start
, end
, ce32
);
497 case Collation::DIGIT_TAG
:
498 // Fetch the non-numeric-collation CE32 and continue.
499 ce32
= data
->ce32s
[Collation::indexFromCE32(ce32
)];
501 case Collation::U0000_TAG
:
502 U_ASSERT(start
== 0 && end
== 0);
503 // Fetch the normal ce32 for U+0000 and continue.
504 ce32
= data
->ce32s
[0];
// Hangul: the CEs of every code point in [start, end] are fetched one at a
// time through a UTF16CollationIterator over a one-unit buffer.
506 case Collation::HANGUL_TAG
:
508 // TODO: This should be optimized,
509 // especially if [start..end] is the complete Hangul range. (assert that)
510 UTF16CollationIterator
iter(data
, FALSE
, NULL
, NULL
, NULL
);
511 UChar hangul
[1] = { 0 };
512 for(UChar32 c
= start
; c
<= end
; ++c
) {
513 hangul
[0] = (UChar
)c
;
514 iter
.setText(hangul
, hangul
+ 1);
515 int32_t length
= iter
.fetchCEs(errorCode
);
516 if(U_FAILURE(errorCode
)) { return; }
517 // Ignore the terminating non-CE.
518 U_ASSERT(length
>= 2 && iter
.getCE(length
- 1) == Collation::NO_CE
);
519 sink
->handleExpansion(iter
.getCEs(), length
- 1);
522 // Optimization: If we have a prefix,
523 // then the relevant strings have been added already.
524 if(unreversedPrefix
.isEmpty()) {
525 addExpansions(start
, end
);
528 case Collation::OFFSET_TAG
:
529 // Currently no need to send offset CEs to the sink.
531 case Collation::IMPLICIT_TAG
:
532 // Currently no need to send implicit CEs to the sink.
// Handles a prefix-tag CE32 for the range [start, end].
// First handles the default CE32 stored at the start of the context data.
// Then, only if the addPrefixes flag is set, iterates over the prefix trie at
// offset 2: each prefix is installed with setPrefix(), the range's strings
// are added to the contractions and expansions sets, and the prefix's own
// CE32 is handled.
// NOTE(review): any statements after the loop are missing from this listing.
539 ContractionsAndExpansions::handlePrefixes(
540 UChar32 start
, UChar32 end
, uint32_t ce32
) {
541 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
542 ce32
= CollationData::readCE32(p
); // Default if no prefix match.
543 handleCE32(start
, end
, ce32
);
544 if(!addPrefixes
) { return; }
545 UCharsTrie::Iterator
prefixes(p
+ 2, 0, errorCode
);
546 while(prefixes
.next(errorCode
)) {
547 setPrefix(prefixes
.getString());
548 // Prefix/pre-context mappings are special kinds of contractions
549 // that always yield expansions.
550 addStrings(start
, end
, contractions
);
551 addStrings(start
, end
, expansions
);
552 handleCE32(start
, end
, (uint32_t)prefixes
.getValue());
// Handles a contraction-tag CE32 for the range [start, end].
// Reads and handles the default CE32 stored at the start of the context
// data, then iterates over the suffix trie at offset 2: each suffix is made
// the current suffix, the range's strings are added to contractions (and to
// expansions when a prefix is active), and the suffix's own CE32 is handled.
// NOTE(review): the else line separating the CONTRACT_SINGLE_CP_NO_MATCH case
// from the default handling, and any statements after the loop, are missing
// from this listing.
558 ContractionsAndExpansions::handleContractions(
559 UChar32 start
, UChar32 end
, uint32_t ce32
) {
560 const UChar
*p
= data
->contexts
+ Collation::indexFromCE32(ce32
);
561 if((ce32
& Collation::CONTRACT_SINGLE_CP_NO_MATCH
) != 0) {
562 // No match on the single code point.
563 // We are underneath a prefix, and the default mapping is just
564 // a fallback to the mappings for a shorter prefix.
565 U_ASSERT(!unreversedPrefix
.isEmpty());
567 ce32
= CollationData::readCE32(p
); // Default if no suffix match.
568 U_ASSERT(!Collation::isContractionCE32(ce32
));
569 handleCE32(start
, end
, ce32
);
571 UCharsTrie::Iterator
suffixes(p
+ 2, 0, errorCode
);
572 while(suffixes
.next(errorCode
)) {
// The suffix member points at the iterator's current string.
573 suffix
= &suffixes
.getString();
574 addStrings(start
, end
, contractions
);
575 if(!unreversedPrefix
.isEmpty()) {
576 addStrings(start
, end
, expansions
);
578 handleCE32(start
, end
, (uint32_t)suffixes
.getValue());
// Records the range [start, end] as having expansions.
// With no prefix and no suffix active, the plain code point range is added
// to the expansions set, provided that set is not NULL. The other visible
// path calls addStrings(start, end, expansions).
// NOTE(review): the else line between the two paths is missing from this
// listing.
584 ContractionsAndExpansions::addExpansions(UChar32 start
, UChar32 end
) {
585 if(unreversedPrefix
.isEmpty() && suffix
== NULL
) {
586 if(expansions
!= NULL
) {
587 expansions
->add(start
, end
);
590 addStrings(start
, end
, expansions
);
// Adds strings for the code points start..end to *set; does nothing when set
// is NULL. The working string starts as a copy of unreversedPrefix and is
// truncated back to the prefix length at the end of each iteration.
// NOTE(review): the loop header and the statements that extend s and add it
// to the set are missing from this listing.
595 ContractionsAndExpansions::addStrings(UChar32 start
, UChar32 end
, UnicodeSet
*set
) {
596 if(set
== NULL
) { return; }
597 UnicodeString
s(unreversedPrefix
);
// Reset s to just the prefix before the next code point.
604 s
.truncate(unreversedPrefix
.length());
605 } while(++start
<= end
);
610 #endif // !UCONFIG_NO_COLLATION