]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationsets.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / collationsets.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationsets.cpp
7 *
8 * created on: 2013feb09
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/ucharstrie.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
19 #include "unicode/ustringtrie.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationsets.h"
23 #include "normalizer2impl.h"
24 #include "uassert.h"
25 #include "utf16collationiterator.h"
26 #include "utrie2.h"
27
28 U_NAMESPACE_BEGIN
29
30 U_CDECL_BEGIN
31
32 static UBool U_CALLCONV
33 enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
34 if(ce32 == Collation::FALLBACK_CE32) {
35 return TRUE; // fallback to base, not tailored
36 }
37 TailoredSet *ts = (TailoredSet *)context;
38 return ts->handleCE32(start, end, ce32);
39 }
40
41 U_CDECL_END
42
43 void
44 TailoredSet::forData(const CollationData *d, UErrorCode &ec) {
45 if(U_FAILURE(ec)) { return; }
46 errorCode = ec; // Preserve info & warning codes.
47 data = d;
48 baseData = d->base;
49 U_ASSERT(baseData != NULL);
50 utrie2_enum(data->trie, NULL, enumTailoredRange, this);
51 ec = errorCode;
52 }
53
54 UBool
55 TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
56 U_ASSERT(ce32 != Collation::FALLBACK_CE32);
57 if(Collation::isSpecialCE32(ce32)) {
58 ce32 = data->getIndirectCE32(ce32);
59 if(ce32 == Collation::FALLBACK_CE32) {
60 return U_SUCCESS(errorCode);
61 }
62 }
63 do {
64 uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start));
65 // Do not just continue if ce32 == baseCE32 because
66 // contractions and expansions in different data objects
67 // normally differ even if they have the same data offsets.
68 if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) {
69 // fastpath
70 if(ce32 != baseCE32) {
71 tailored->add(start);
72 }
73 } else {
74 compare(start, ce32, baseCE32);
75 }
76 } while(++start <= end);
77 return U_SUCCESS(errorCode);
78 }
79
80 void
81 TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) {
82 if(Collation::isPrefixCE32(ce32)) {
83 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
84 ce32 = data->getFinalCE32(CollationData::readCE32(p));
85 if(Collation::isPrefixCE32(baseCE32)) {
86 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
87 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
88 comparePrefixes(c, p + 2, q + 2);
89 } else {
90 addPrefixes(data, c, p + 2);
91 }
92 } else if(Collation::isPrefixCE32(baseCE32)) {
93 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
94 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
95 addPrefixes(baseData, c, q + 2);
96 }
97
98 if(Collation::isContractionCE32(ce32)) {
99 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
100 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
101 ce32 = Collation::NO_CE32;
102 } else {
103 ce32 = data->getFinalCE32(CollationData::readCE32(p));
104 }
105 if(Collation::isContractionCE32(baseCE32)) {
106 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
107 if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
108 baseCE32 = Collation::NO_CE32;
109 } else {
110 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
111 }
112 compareContractions(c, p + 2, q + 2);
113 } else {
114 addContractions(c, p + 2);
115 }
116 } else if(Collation::isContractionCE32(baseCE32)) {
117 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
118 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
119 addContractions(c, q + 2);
120 }
121
122 int32_t tag;
123 if(Collation::isSpecialCE32(ce32)) {
124 tag = Collation::tagFromCE32(ce32);
125 U_ASSERT(tag != Collation::PREFIX_TAG);
126 U_ASSERT(tag != Collation::CONTRACTION_TAG);
127 // Currently, the tailoring data builder does not write offset tags.
128 // They might be useful for saving space,
129 // but they would complicate the builder,
130 // and in tailorings we assume that performance of tailored characters is more important.
131 U_ASSERT(tag != Collation::OFFSET_TAG);
132 } else {
133 tag = -1;
134 }
135 int32_t baseTag;
136 if(Collation::isSpecialCE32(baseCE32)) {
137 baseTag = Collation::tagFromCE32(baseCE32);
138 U_ASSERT(baseTag != Collation::PREFIX_TAG);
139 U_ASSERT(baseTag != Collation::CONTRACTION_TAG);
140 } else {
141 baseTag = -1;
142 }
143
144 // Non-contextual mappings, expansions, etc.
145 if(baseTag == Collation::OFFSET_TAG) {
146 // We might be comparing a tailoring CE which is a copy of
147 // a base offset-tag CE, via the [optimize [set]] syntax
148 // or when a single-character mapping was copied for tailored contractions.
149 // Offset tags always result in long-primary CEs,
150 // with common secondary/tertiary weights.
151 if(!Collation::isLongPrimaryCE32(ce32)) {
152 add(c);
153 return;
154 }
155 int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)];
156 uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
157 if(Collation::primaryFromLongPrimaryCE32(ce32) != p) {
158 add(c);
159 return;
160 }
161 }
162
163 if(tag != baseTag) {
164 add(c);
165 return;
166 }
167
168 if(tag == Collation::EXPANSION32_TAG) {
169 const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
170 int32_t length = Collation::lengthFromCE32(ce32);
171
172 const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32);
173 int32_t baseLength = Collation::lengthFromCE32(baseCE32);
174
175 if(length != baseLength) {
176 add(c);
177 return;
178 }
179 for(int32_t i = 0; i < length; ++i) {
180 if(ce32s[i] != baseCE32s[i]) {
181 add(c);
182 break;
183 }
184 }
185 } else if(tag == Collation::EXPANSION_TAG) {
186 const int64_t *ces = data->ces + Collation::indexFromCE32(ce32);
187 int32_t length = Collation::lengthFromCE32(ce32);
188
189 const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32);
190 int32_t baseLength = Collation::lengthFromCE32(baseCE32);
191
192 if(length != baseLength) {
193 add(c);
194 return;
195 }
196 for(int32_t i = 0; i < length; ++i) {
197 if(ces[i] != baseCEs[i]) {
198 add(c);
199 break;
200 }
201 }
202 } else if(tag == Collation::HANGUL_TAG) {
203 UChar jamos[3];
204 int32_t length = Hangul::decompose(c, jamos);
205 if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) ||
206 (length == 3 && tailored->contains(jamos[2]))) {
207 add(c);
208 }
209 } else if(ce32 != baseCE32) {
210 add(c);
211 }
212 }
213
214 void
215 TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) {
216 // Parallel iteration over prefixes of both tables.
217 UCharsTrie::Iterator prefixes(p, 0, errorCode);
218 UCharsTrie::Iterator basePrefixes(q, 0, errorCode);
219 const UnicodeString *tp = NULL; // Tailoring prefix.
220 const UnicodeString *bp = NULL; // Base prefix.
221 // Use a string with a U+FFFF as the limit sentinel.
222 // U+FFFF is untailorable and will not occur in prefixes.
223 UnicodeString none((UChar)0xffff);
224 for(;;) {
225 if(tp == NULL) {
226 if(prefixes.next(errorCode)) {
227 tp = &prefixes.getString();
228 } else {
229 tp = &none;
230 }
231 }
232 if(bp == NULL) {
233 if(basePrefixes.next(errorCode)) {
234 bp = &basePrefixes.getString();
235 } else {
236 bp = &none;
237 }
238 }
239 if(tp == &none && bp == &none) { break; }
240 int32_t cmp = tp->compare(*bp);
241 if(cmp < 0) {
242 // tp occurs in the tailoring but not in the base.
243 addPrefix(data, *tp, c, (uint32_t)prefixes.getValue());
244 tp = NULL;
245 } else if(cmp > 0) {
246 // bp occurs in the base but not in the tailoring.
247 addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue());
248 bp = NULL;
249 } else {
250 setPrefix(*tp);
251 compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue());
252 resetPrefix();
253 tp = NULL;
254 bp = NULL;
255 }
256 }
257 }
258
259 void
260 TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) {
261 // Parallel iteration over suffixes of both tables.
262 UCharsTrie::Iterator suffixes(p, 0, errorCode);
263 UCharsTrie::Iterator baseSuffixes(q, 0, errorCode);
264 const UnicodeString *ts = NULL; // Tailoring suffix.
265 const UnicodeString *bs = NULL; // Base suffix.
266 // Use a string with two U+FFFF as the limit sentinel.
267 // U+FFFF is untailorable and will not occur in contractions except maybe
268 // as a single suffix character for a root-collator boundary contraction.
269 UnicodeString none((UChar)0xffff);
270 none.append((UChar)0xffff);
271 for(;;) {
272 if(ts == NULL) {
273 if(suffixes.next(errorCode)) {
274 ts = &suffixes.getString();
275 } else {
276 ts = &none;
277 }
278 }
279 if(bs == NULL) {
280 if(baseSuffixes.next(errorCode)) {
281 bs = &baseSuffixes.getString();
282 } else {
283 bs = &none;
284 }
285 }
286 if(ts == &none && bs == &none) { break; }
287 int32_t cmp = ts->compare(*bs);
288 if(cmp < 0) {
289 // ts occurs in the tailoring but not in the base.
290 addSuffix(c, *ts);
291 ts = NULL;
292 } else if(cmp > 0) {
293 // bs occurs in the base but not in the tailoring.
294 addSuffix(c, *bs);
295 bs = NULL;
296 } else {
297 suffix = ts;
298 compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue());
299 suffix = NULL;
300 ts = NULL;
301 bs = NULL;
302 }
303 }
304 }
305
306 void
307 TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) {
308 UCharsTrie::Iterator prefixes(p, 0, errorCode);
309 while(prefixes.next(errorCode)) {
310 addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue());
311 }
312 }
313
314 void
315 TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) {
316 setPrefix(pfx);
317 ce32 = d->getFinalCE32(ce32);
318 if(Collation::isContractionCE32(ce32)) {
319 const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
320 addContractions(c, p + 2);
321 }
322 tailored->add(UnicodeString(unreversedPrefix).append(c));
323 resetPrefix();
324 }
325
326 void
327 TailoredSet::addContractions(UChar32 c, const UChar *p) {
328 UCharsTrie::Iterator suffixes(p, 0, errorCode);
329 while(suffixes.next(errorCode)) {
330 addSuffix(c, suffixes.getString());
331 }
332 }
333
334 void
335 TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) {
336 tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx));
337 }
338
339 void
340 TailoredSet::add(UChar32 c) {
341 if(unreversedPrefix.isEmpty() && suffix == NULL) {
342 tailored->add(c);
343 } else {
344 UnicodeString s(unreversedPrefix);
345 s.append(c);
346 if(suffix != NULL) {
347 s.append(*suffix);
348 }
349 tailored->add(s);
350 }
351 }
352
353 ContractionsAndExpansions::CESink::~CESink() {}
354
355 U_CDECL_BEGIN
356
357 static UBool U_CALLCONV
358 enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
359 ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context;
360 if(cne->checkTailored == 0) {
361 // There is no tailoring.
362 // No need to collect nor check the tailored set.
363 } else if(cne->checkTailored < 0) {
364 // Collect the set of code points with mappings in the tailoring data.
365 if(ce32 == Collation::FALLBACK_CE32) {
366 return TRUE; // fallback to base, not tailored
367 } else {
368 cne->tailored.add(start, end);
369 }
370 // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
371 } else if(start == end) {
372 if(cne->tailored.contains(start)) {
373 return TRUE;
374 }
375 } else if(cne->tailored.containsSome(start, end)) {
376 cne->ranges.set(start, end).removeAll(cne->tailored);
377 int32_t count = cne->ranges.getRangeCount();
378 for(int32_t i = 0; i < count; ++i) {
379 cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32);
380 }
381 return U_SUCCESS(cne->errorCode);
382 }
383 cne->handleCE32(start, end, ce32);
384 return U_SUCCESS(cne->errorCode);
385 }
386
387 U_CDECL_END
388
389 void
390 ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) {
391 if(U_FAILURE(ec)) { return; }
392 errorCode = ec; // Preserve info & warning codes.
393 // Add all from the data, can be tailoring or base.
394 if(d->base != NULL) {
395 checkTailored = -1;
396 }
397 data = d;
398 utrie2_enum(data->trie, NULL, enumCnERange, this);
399 if(d->base == NULL || U_FAILURE(errorCode)) {
400 ec = errorCode;
401 return;
402 }
403 // Add all from the base data but only for un-tailored code points.
404 tailored.freeze();
405 checkTailored = 1;
406 data = d->base;
407 utrie2_enum(data->trie, NULL, enumCnERange, this);
408 ec = errorCode;
409 }
410
411 void
412 ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) {
413 if(U_FAILURE(ec)) { return; }
414 errorCode = ec; // Preserve info & warning codes.
415 uint32_t ce32 = d->getCE32(c);
416 if(ce32 == Collation::FALLBACK_CE32) {
417 d = d->base;
418 ce32 = d->getCE32(c);
419 }
420 data = d;
421 handleCE32(c, c, ce32);
422 ec = errorCode;
423 }
424
425 void
426 ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
427 for(;;) {
428 if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) {
429 // !isSpecialCE32()
430 if(sink != NULL) {
431 sink->handleCE(Collation::ceFromSimpleCE32(ce32));
432 }
433 return;
434 }
435 switch(Collation::tagFromCE32(ce32)) {
436 case Collation::FALLBACK_TAG:
437 return;
438 case Collation::RESERVED_TAG_3:
439 case Collation::BUILDER_DATA_TAG:
440 case Collation::LEAD_SURROGATE_TAG:
441 if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
442 return;
443 case Collation::LONG_PRIMARY_TAG:
444 if(sink != NULL) {
445 sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32));
446 }
447 return;
448 case Collation::LONG_SECONDARY_TAG:
449 if(sink != NULL) {
450 sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32));
451 }
452 return;
453 case Collation::LATIN_EXPANSION_TAG:
454 if(sink != NULL) {
455 ces[0] = Collation::latinCE0FromCE32(ce32);
456 ces[1] = Collation::latinCE1FromCE32(ce32);
457 sink->handleExpansion(ces, 2);
458 }
459 // Optimization: If we have a prefix,
460 // then the relevant strings have been added already.
461 if(unreversedPrefix.isEmpty()) {
462 addExpansions(start, end);
463 }
464 return;
465 case Collation::EXPANSION32_TAG:
466 if(sink != NULL) {
467 const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
468 int32_t length = Collation::lengthFromCE32(ce32);
469 for(int32_t i = 0; i < length; ++i) {
470 ces[i] = Collation::ceFromCE32(*ce32s++);
471 }
472 sink->handleExpansion(ces, length);
473 }
474 // Optimization: If we have a prefix,
475 // then the relevant strings have been added already.
476 if(unreversedPrefix.isEmpty()) {
477 addExpansions(start, end);
478 }
479 return;
480 case Collation::EXPANSION_TAG:
481 if(sink != NULL) {
482 int32_t length = Collation::lengthFromCE32(ce32);
483 sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length);
484 }
485 // Optimization: If we have a prefix,
486 // then the relevant strings have been added already.
487 if(unreversedPrefix.isEmpty()) {
488 addExpansions(start, end);
489 }
490 return;
491 case Collation::PREFIX_TAG:
492 handlePrefixes(start, end, ce32);
493 return;
494 case Collation::CONTRACTION_TAG:
495 handleContractions(start, end, ce32);
496 return;
497 case Collation::DIGIT_TAG:
498 // Fetch the non-numeric-collation CE32 and continue.
499 ce32 = data->ce32s[Collation::indexFromCE32(ce32)];
500 break;
501 case Collation::U0000_TAG:
502 U_ASSERT(start == 0 && end == 0);
503 // Fetch the normal ce32 for U+0000 and continue.
504 ce32 = data->ce32s[0];
505 break;
506 case Collation::HANGUL_TAG:
507 if(sink != NULL) {
508 // TODO: This should be optimized,
509 // especially if [start..end] is the complete Hangul range. (assert that)
510 UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL);
511 UChar hangul[1] = { 0 };
512 for(UChar32 c = start; c <= end; ++c) {
513 hangul[0] = (UChar)c;
514 iter.setText(hangul, hangul + 1);
515 int32_t length = iter.fetchCEs(errorCode);
516 if(U_FAILURE(errorCode)) { return; }
517 // Ignore the terminating non-CE.
518 U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE);
519 sink->handleExpansion(iter.getCEs(), length - 1);
520 }
521 }
522 // Optimization: If we have a prefix,
523 // then the relevant strings have been added already.
524 if(unreversedPrefix.isEmpty()) {
525 addExpansions(start, end);
526 }
527 return;
528 case Collation::OFFSET_TAG:
529 // Currently no need to send offset CEs to the sink.
530 return;
531 case Collation::IMPLICIT_TAG:
532 // Currently no need to send implicit CEs to the sink.
533 return;
534 }
535 }
536 }
537
538 void
539 ContractionsAndExpansions::handlePrefixes(
540 UChar32 start, UChar32 end, uint32_t ce32) {
541 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
542 ce32 = CollationData::readCE32(p); // Default if no prefix match.
543 handleCE32(start, end, ce32);
544 if(!addPrefixes) { return; }
545 UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
546 while(prefixes.next(errorCode)) {
547 setPrefix(prefixes.getString());
548 // Prefix/pre-context mappings are special kinds of contractions
549 // that always yield expansions.
550 addStrings(start, end, contractions);
551 addStrings(start, end, expansions);
552 handleCE32(start, end, (uint32_t)prefixes.getValue());
553 }
554 resetPrefix();
555 }
556
557 void
558 ContractionsAndExpansions::handleContractions(
559 UChar32 start, UChar32 end, uint32_t ce32) {
560 const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
561 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
562 // No match on the single code point.
563 // We are underneath a prefix, and the default mapping is just
564 // a fallback to the mappings for a shorter prefix.
565 U_ASSERT(!unreversedPrefix.isEmpty());
566 } else {
567 ce32 = CollationData::readCE32(p); // Default if no suffix match.
568 U_ASSERT(!Collation::isContractionCE32(ce32));
569 handleCE32(start, end, ce32);
570 }
571 UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
572 while(suffixes.next(errorCode)) {
573 suffix = &suffixes.getString();
574 addStrings(start, end, contractions);
575 if(!unreversedPrefix.isEmpty()) {
576 addStrings(start, end, expansions);
577 }
578 handleCE32(start, end, (uint32_t)suffixes.getValue());
579 }
580 suffix = NULL;
581 }
582
583 void
584 ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) {
585 if(unreversedPrefix.isEmpty() && suffix == NULL) {
586 if(expansions != NULL) {
587 expansions->add(start, end);
588 }
589 } else {
590 addStrings(start, end, expansions);
591 }
592 }
593
594 void
595 ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) {
596 if(set == NULL) { return; }
597 UnicodeString s(unreversedPrefix);
598 do {
599 s.append(start);
600 if(suffix != NULL) {
601 s.append(*suffix);
602 }
603 set->add(s);
604 s.truncate(unreversedPrefix.length());
605 } while(++start <= end);
606 }
607
608 U_NAMESPACE_END
609
610 #endif // !UCONFIG_NO_COLLATION