]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationsets.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / i18n / collationsets.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationsets.cpp
9 *
10 * created on: 2013feb09
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/ucharstrie.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/ustringtrie.h"
22 #include "collation.h"
23 #include "collationdata.h"
24 #include "collationsets.h"
25 #include "normalizer2impl.h"
26 #include "uassert.h"
27 #include "utf16collationiterator.h"
28 #include "utrie2.h"
29
30 U_NAMESPACE_BEGIN
31
32 U_CDECL_BEGIN
33
34 static UBool U_CALLCONV
35 enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
36 if(ce32 == Collation::FALLBACK_CE32) {
37 return TRUE; // fallback to base, not tailored
38 }
39 TailoredSet *ts = (TailoredSet *)context;
40 return ts->handleCE32(start, end, ce32);
41 }
42
43 U_CDECL_END
44
45 void
46 TailoredSet::forData(const CollationData *d, UErrorCode &ec) {
47 if(U_FAILURE(ec)) { return; }
48 errorCode = ec; // Preserve info & warning codes.
49 data = d;
50 baseData = d->base;
51 U_ASSERT(baseData != NULL);
52 utrie2_enum(data->trie, NULL, enumTailoredRange, this);
53 ec = errorCode;
54 }
55
56 UBool
57 TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
58 U_ASSERT(ce32 != Collation::FALLBACK_CE32);
59 if(Collation::isSpecialCE32(ce32)) {
60 ce32 = data->getIndirectCE32(ce32);
61 if(ce32 == Collation::FALLBACK_CE32) {
62 return U_SUCCESS(errorCode);
63 }
64 }
65 do {
66 uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start));
67 // Do not just continue if ce32 == baseCE32 because
68 // contractions and expansions in different data objects
69 // normally differ even if they have the same data offsets.
70 if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) {
71 // fastpath
72 if(ce32 != baseCE32) {
73 tailored->add(start);
74 }
75 } else {
76 compare(start, ce32, baseCE32);
77 }
78 } while(++start <= end);
79 return U_SUCCESS(errorCode);
80 }
81
82 void
83 TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) {
84 if(Collation::isPrefixCE32(ce32)) {
85 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
86 ce32 = data->getFinalCE32(CollationData::readCE32(p));
87 if(Collation::isPrefixCE32(baseCE32)) {
88 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
89 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
90 comparePrefixes(c, p + 2, q + 2);
91 } else {
92 addPrefixes(data, c, p + 2);
93 }
94 } else if(Collation::isPrefixCE32(baseCE32)) {
95 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
96 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
97 addPrefixes(baseData, c, q + 2);
98 }
99
100 if(Collation::isContractionCE32(ce32)) {
101 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
102 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
103 ce32 = Collation::NO_CE32;
104 } else {
105 ce32 = data->getFinalCE32(CollationData::readCE32(p));
106 }
107 if(Collation::isContractionCE32(baseCE32)) {
108 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
109 if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
110 baseCE32 = Collation::NO_CE32;
111 } else {
112 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
113 }
114 compareContractions(c, p + 2, q + 2);
115 } else {
116 addContractions(c, p + 2);
117 }
118 } else if(Collation::isContractionCE32(baseCE32)) {
119 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
120 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
121 addContractions(c, q + 2);
122 }
123
124 int32_t tag;
125 if(Collation::isSpecialCE32(ce32)) {
126 tag = Collation::tagFromCE32(ce32);
127 U_ASSERT(tag != Collation::PREFIX_TAG);
128 U_ASSERT(tag != Collation::CONTRACTION_TAG);
129 // Currently, the tailoring data builder does not write offset tags.
130 // They might be useful for saving space,
131 // but they would complicate the builder,
132 // and in tailorings we assume that performance of tailored characters is more important.
133 U_ASSERT(tag != Collation::OFFSET_TAG);
134 } else {
135 tag = -1;
136 }
137 int32_t baseTag;
138 if(Collation::isSpecialCE32(baseCE32)) {
139 baseTag = Collation::tagFromCE32(baseCE32);
140 U_ASSERT(baseTag != Collation::PREFIX_TAG);
141 U_ASSERT(baseTag != Collation::CONTRACTION_TAG);
142 } else {
143 baseTag = -1;
144 }
145
146 // Non-contextual mappings, expansions, etc.
147 if(baseTag == Collation::OFFSET_TAG) {
148 // We might be comparing a tailoring CE which is a copy of
149 // a base offset-tag CE, via the [optimize [set]] syntax
150 // or when a single-character mapping was copied for tailored contractions.
151 // Offset tags always result in long-primary CEs,
152 // with common secondary/tertiary weights.
153 if(!Collation::isLongPrimaryCE32(ce32)) {
154 add(c);
155 return;
156 }
157 int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)];
158 uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
159 if(Collation::primaryFromLongPrimaryCE32(ce32) != p) {
160 add(c);
161 return;
162 }
163 }
164
165 if(tag != baseTag) {
166 add(c);
167 return;
168 }
169
170 if(tag == Collation::EXPANSION32_TAG) {
171 const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
172 int32_t length = Collation::lengthFromCE32(ce32);
173
174 const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32);
175 int32_t baseLength = Collation::lengthFromCE32(baseCE32);
176
177 if(length != baseLength) {
178 add(c);
179 return;
180 }
181 for(int32_t i = 0; i < length; ++i) {
182 if(ce32s[i] != baseCE32s[i]) {
183 add(c);
184 break;
185 }
186 }
187 } else if(tag == Collation::EXPANSION_TAG) {
188 const int64_t *ces = data->ces + Collation::indexFromCE32(ce32);
189 int32_t length = Collation::lengthFromCE32(ce32);
190
191 const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32);
192 int32_t baseLength = Collation::lengthFromCE32(baseCE32);
193
194 if(length != baseLength) {
195 add(c);
196 return;
197 }
198 for(int32_t i = 0; i < length; ++i) {
199 if(ces[i] != baseCEs[i]) {
200 add(c);
201 break;
202 }
203 }
204 } else if(tag == Collation::HANGUL_TAG) {
205 UChar jamos[3];
206 int32_t length = Hangul::decompose(c, jamos);
207 if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) ||
208 (length == 3 && tailored->contains(jamos[2]))) {
209 add(c);
210 }
211 } else if(ce32 != baseCE32) {
212 add(c);
213 }
214 }
215
216 void
217 TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) {
218 // Parallel iteration over prefixes of both tables.
219 UCharsTrie::Iterator prefixes(p, 0, errorCode);
220 UCharsTrie::Iterator basePrefixes(q, 0, errorCode);
221 const UnicodeString *tp = NULL; // Tailoring prefix.
222 const UnicodeString *bp = NULL; // Base prefix.
223 // Use a string with a U+FFFF as the limit sentinel.
224 // U+FFFF is untailorable and will not occur in prefixes.
225 UnicodeString none((UChar)0xffff);
226 for(;;) {
227 if(tp == NULL) {
228 if(prefixes.next(errorCode)) {
229 tp = &prefixes.getString();
230 } else {
231 tp = &none;
232 }
233 }
234 if(bp == NULL) {
235 if(basePrefixes.next(errorCode)) {
236 bp = &basePrefixes.getString();
237 } else {
238 bp = &none;
239 }
240 }
241 if(tp == &none && bp == &none) { break; }
242 int32_t cmp = tp->compare(*bp);
243 if(cmp < 0) {
244 // tp occurs in the tailoring but not in the base.
245 addPrefix(data, *tp, c, (uint32_t)prefixes.getValue());
246 tp = NULL;
247 } else if(cmp > 0) {
248 // bp occurs in the base but not in the tailoring.
249 addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue());
250 bp = NULL;
251 } else {
252 setPrefix(*tp);
253 compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue());
254 resetPrefix();
255 tp = NULL;
256 bp = NULL;
257 }
258 }
259 }
260
261 void
262 TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) {
263 // Parallel iteration over suffixes of both tables.
264 UCharsTrie::Iterator suffixes(p, 0, errorCode);
265 UCharsTrie::Iterator baseSuffixes(q, 0, errorCode);
266 const UnicodeString *ts = NULL; // Tailoring suffix.
267 const UnicodeString *bs = NULL; // Base suffix.
268 // Use a string with two U+FFFF as the limit sentinel.
269 // U+FFFF is untailorable and will not occur in contractions except maybe
270 // as a single suffix character for a root-collator boundary contraction.
271 UnicodeString none((UChar)0xffff);
272 none.append((UChar)0xffff);
273 for(;;) {
274 if(ts == NULL) {
275 if(suffixes.next(errorCode)) {
276 ts = &suffixes.getString();
277 } else {
278 ts = &none;
279 }
280 }
281 if(bs == NULL) {
282 if(baseSuffixes.next(errorCode)) {
283 bs = &baseSuffixes.getString();
284 } else {
285 bs = &none;
286 }
287 }
288 if(ts == &none && bs == &none) { break; }
289 int32_t cmp = ts->compare(*bs);
290 if(cmp < 0) {
291 // ts occurs in the tailoring but not in the base.
292 addSuffix(c, *ts);
293 ts = NULL;
294 } else if(cmp > 0) {
295 // bs occurs in the base but not in the tailoring.
296 addSuffix(c, *bs);
297 bs = NULL;
298 } else {
299 suffix = ts;
300 compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue());
301 suffix = NULL;
302 ts = NULL;
303 bs = NULL;
304 }
305 }
306 }
307
308 void
309 TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) {
310 UCharsTrie::Iterator prefixes(p, 0, errorCode);
311 while(prefixes.next(errorCode)) {
312 addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue());
313 }
314 }
315
316 void
317 TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) {
318 setPrefix(pfx);
319 ce32 = d->getFinalCE32(ce32);
320 if(Collation::isContractionCE32(ce32)) {
321 const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
322 addContractions(c, p + 2);
323 }
324 tailored->add(UnicodeString(unreversedPrefix).append(c));
325 resetPrefix();
326 }
327
328 void
329 TailoredSet::addContractions(UChar32 c, const UChar *p) {
330 UCharsTrie::Iterator suffixes(p, 0, errorCode);
331 while(suffixes.next(errorCode)) {
332 addSuffix(c, suffixes.getString());
333 }
334 }
335
336 void
337 TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) {
338 tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx));
339 }
340
341 void
342 TailoredSet::add(UChar32 c) {
343 if(unreversedPrefix.isEmpty() && suffix == NULL) {
344 tailored->add(c);
345 } else {
346 UnicodeString s(unreversedPrefix);
347 s.append(c);
348 if(suffix != NULL) {
349 s.append(*suffix);
350 }
351 tailored->add(s);
352 }
353 }
354
355 ContractionsAndExpansions::CESink::~CESink() {}
356
357 U_CDECL_BEGIN
358
359 static UBool U_CALLCONV
360 enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
361 ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context;
362 if(cne->checkTailored == 0) {
363 // There is no tailoring.
364 // No need to collect nor check the tailored set.
365 } else if(cne->checkTailored < 0) {
366 // Collect the set of code points with mappings in the tailoring data.
367 if(ce32 == Collation::FALLBACK_CE32) {
368 return TRUE; // fallback to base, not tailored
369 } else {
370 cne->tailored.add(start, end);
371 }
372 // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
373 } else if(start == end) {
374 if(cne->tailored.contains(start)) {
375 return TRUE;
376 }
377 } else if(cne->tailored.containsSome(start, end)) {
378 cne->ranges.set(start, end).removeAll(cne->tailored);
379 int32_t count = cne->ranges.getRangeCount();
380 for(int32_t i = 0; i < count; ++i) {
381 cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32);
382 }
383 return U_SUCCESS(cne->errorCode);
384 }
385 cne->handleCE32(start, end, ce32);
386 return U_SUCCESS(cne->errorCode);
387 }
388
389 U_CDECL_END
390
391 void
392 ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) {
393 if(U_FAILURE(ec)) { return; }
394 errorCode = ec; // Preserve info & warning codes.
395 // Add all from the data, can be tailoring or base.
396 if(d->base != NULL) {
397 checkTailored = -1;
398 }
399 data = d;
400 utrie2_enum(data->trie, NULL, enumCnERange, this);
401 if(d->base == NULL || U_FAILURE(errorCode)) {
402 ec = errorCode;
403 return;
404 }
405 // Add all from the base data but only for un-tailored code points.
406 tailored.freeze();
407 checkTailored = 1;
408 data = d->base;
409 utrie2_enum(data->trie, NULL, enumCnERange, this);
410 ec = errorCode;
411 }
412
413 void
414 ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) {
415 if(U_FAILURE(ec)) { return; }
416 errorCode = ec; // Preserve info & warning codes.
417 uint32_t ce32 = d->getCE32(c);
418 if(ce32 == Collation::FALLBACK_CE32) {
419 d = d->base;
420 ce32 = d->getCE32(c);
421 }
422 data = d;
423 handleCE32(c, c, ce32);
424 ec = errorCode;
425 }
426
427 void
428 ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
429 for(;;) {
430 if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) {
431 // !isSpecialCE32()
432 if(sink != NULL) {
433 sink->handleCE(Collation::ceFromSimpleCE32(ce32));
434 }
435 return;
436 }
437 switch(Collation::tagFromCE32(ce32)) {
438 case Collation::FALLBACK_TAG:
439 return;
440 case Collation::RESERVED_TAG_3:
441 case Collation::BUILDER_DATA_TAG:
442 case Collation::LEAD_SURROGATE_TAG:
443 if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
444 return;
445 case Collation::LONG_PRIMARY_TAG:
446 if(sink != NULL) {
447 sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32));
448 }
449 return;
450 case Collation::LONG_SECONDARY_TAG:
451 if(sink != NULL) {
452 sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32));
453 }
454 return;
455 case Collation::LATIN_EXPANSION_TAG:
456 if(sink != NULL) {
457 ces[0] = Collation::latinCE0FromCE32(ce32);
458 ces[1] = Collation::latinCE1FromCE32(ce32);
459 sink->handleExpansion(ces, 2);
460 }
461 // Optimization: If we have a prefix,
462 // then the relevant strings have been added already.
463 if(unreversedPrefix.isEmpty()) {
464 addExpansions(start, end);
465 }
466 return;
467 case Collation::EXPANSION32_TAG:
468 if(sink != NULL) {
469 const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
470 int32_t length = Collation::lengthFromCE32(ce32);
471 for(int32_t i = 0; i < length; ++i) {
472 ces[i] = Collation::ceFromCE32(*ce32s++);
473 }
474 sink->handleExpansion(ces, length);
475 }
476 // Optimization: If we have a prefix,
477 // then the relevant strings have been added already.
478 if(unreversedPrefix.isEmpty()) {
479 addExpansions(start, end);
480 }
481 return;
482 case Collation::EXPANSION_TAG:
483 if(sink != NULL) {
484 int32_t length = Collation::lengthFromCE32(ce32);
485 sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length);
486 }
487 // Optimization: If we have a prefix,
488 // then the relevant strings have been added already.
489 if(unreversedPrefix.isEmpty()) {
490 addExpansions(start, end);
491 }
492 return;
493 case Collation::PREFIX_TAG:
494 handlePrefixes(start, end, ce32);
495 return;
496 case Collation::CONTRACTION_TAG:
497 handleContractions(start, end, ce32);
498 return;
499 case Collation::DIGIT_TAG:
500 // Fetch the non-numeric-collation CE32 and continue.
501 ce32 = data->ce32s[Collation::indexFromCE32(ce32)];
502 break;
503 case Collation::U0000_TAG:
504 U_ASSERT(start == 0 && end == 0);
505 // Fetch the normal ce32 for U+0000 and continue.
506 ce32 = data->ce32s[0];
507 break;
508 case Collation::HANGUL_TAG:
509 if(sink != NULL) {
510 // TODO: This should be optimized,
511 // especially if [start..end] is the complete Hangul range. (assert that)
512 UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL);
513 UChar hangul[1] = { 0 };
514 for(UChar32 c = start; c <= end; ++c) {
515 hangul[0] = (UChar)c;
516 iter.setText(hangul, hangul + 1);
517 int32_t length = iter.fetchCEs(errorCode);
518 if(U_FAILURE(errorCode)) { return; }
519 // Ignore the terminating non-CE.
520 U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE);
521 sink->handleExpansion(iter.getCEs(), length - 1);
522 }
523 }
524 // Optimization: If we have a prefix,
525 // then the relevant strings have been added already.
526 if(unreversedPrefix.isEmpty()) {
527 addExpansions(start, end);
528 }
529 return;
530 case Collation::OFFSET_TAG:
531 // Currently no need to send offset CEs to the sink.
532 return;
533 case Collation::IMPLICIT_TAG:
534 // Currently no need to send implicit CEs to the sink.
535 return;
536 }
537 }
538 }
539
540 void
541 ContractionsAndExpansions::handlePrefixes(
542 UChar32 start, UChar32 end, uint32_t ce32) {
543 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
544 ce32 = CollationData::readCE32(p); // Default if no prefix match.
545 handleCE32(start, end, ce32);
546 if(!addPrefixes) { return; }
547 UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
548 while(prefixes.next(errorCode)) {
549 setPrefix(prefixes.getString());
550 // Prefix/pre-context mappings are special kinds of contractions
551 // that always yield expansions.
552 addStrings(start, end, contractions);
553 addStrings(start, end, expansions);
554 handleCE32(start, end, (uint32_t)prefixes.getValue());
555 }
556 resetPrefix();
557 }
558
559 void
560 ContractionsAndExpansions::handleContractions(
561 UChar32 start, UChar32 end, uint32_t ce32) {
562 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
563 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
564 // No match on the single code point.
565 // We are underneath a prefix, and the default mapping is just
566 // a fallback to the mappings for a shorter prefix.
567 U_ASSERT(!unreversedPrefix.isEmpty());
568 } else {
569 ce32 = CollationData::readCE32(p); // Default if no suffix match.
570 U_ASSERT(!Collation::isContractionCE32(ce32));
571 handleCE32(start, end, ce32);
572 }
573 UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
574 while(suffixes.next(errorCode)) {
575 suffix = &suffixes.getString();
576 addStrings(start, end, contractions);
577 if(!unreversedPrefix.isEmpty()) {
578 addStrings(start, end, expansions);
579 }
580 handleCE32(start, end, (uint32_t)suffixes.getValue());
581 }
582 suffix = NULL;
583 }
584
585 void
586 ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) {
587 if(unreversedPrefix.isEmpty() && suffix == NULL) {
588 if(expansions != NULL) {
589 expansions->add(start, end);
590 }
591 } else {
592 addStrings(start, end, expansions);
593 }
594 }
595
596 void
597 ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) {
598 if(set == NULL) { return; }
599 UnicodeString s(unreversedPrefix);
600 do {
601 s.append(start);
602 if(suffix != NULL) {
603 s.append(*suffix);
604 }
605 set->add(s);
606 s.truncate(unreversedPrefix.length());
607 } while(++start <= end);
608 }
609
610 U_NAMESPACE_END
611
612 #endif // !UCONFIG_NO_COLLATION