]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/i18n/collationsets.cpp
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / i18n / collationsets.cpp
diff --git a/icuSources/i18n/collationsets.cpp b/icuSources/i18n/collationsets.cpp
new file mode 100644 (file)
index 0000000..ab282d8
--- /dev/null
@@ -0,0 +1,610 @@
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others.  All Rights Reserved.
+*******************************************************************************
+* collationsets.cpp
+*
+* created on: 2013feb09
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/ucharstrie.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/ustringtrie.h"
+#include "collation.h"
+#include "collationdata.h"
+#include "collationsets.h"
+#include "normalizer2impl.h"
+#include "uassert.h"
+#include "utf16collationiterator.h"
+#include "utrie2.h"
+
+U_NAMESPACE_BEGIN
+
+U_CDECL_BEGIN
+
+// Range callback handed to utrie2_enum() by TailoredSet::forData().
+// context is the TailoredSet that started the enumeration.
+// Ranges whose value is the fallback CE32 are not tailored: TRUE is returned
+// for them without touching the set. Every other range is forwarded to
+// TailoredSet::handleCE32(), whose result becomes the return value.
+static UBool U_CALLCONV
+enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
+    if(ce32 != Collation::FALLBACK_CE32) {
+        return ((TailoredSet *)context)->handleCE32(start, end, ce32);
+    }
+    // fallback to base, not tailored
+    return TRUE;
+}
+
+U_CDECL_END
+
+// Enumerates the trie of the tailoring data d with enumTailoredRange(),
+// comparing its mappings against d->base (which must not be NULL).
+// Does nothing if ec already indicates a failure; otherwise ec receives
+// the error code accumulated during the enumeration.
+void
+TailoredSet::forData(const CollationData *d, UErrorCode &ec) {
+    if(U_SUCCESS(ec)) {
+        // Start from the caller's code so that info & warning codes are preserved.
+        errorCode = ec;
+        data = d;
+        baseData = d->base;
+        U_ASSERT(baseData != NULL);
+        utrie2_enum(d->trie, NULL, enumTailoredRange, this);
+        ec = errorCode;
+    }
+}
+
+UBool
+TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
+    U_ASSERT(ce32 != Collation::FALLBACK_CE32);
+    if(Collation::isSpecialCE32(ce32)) {
+        ce32 = data->getIndirectCE32(ce32);
+        if(ce32 == Collation::FALLBACK_CE32) {
+            return U_SUCCESS(errorCode);
+        }
+    }
+    do {
+        uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start));
+        // Do not just continue if ce32 == baseCE32 because
+        // contractions and expansions in different data objects
+        // normally differ even if they have the same data offsets.
+        if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) {
+            // fastpath
+            if(ce32 != baseCE32) {
+                tailored->add(start);
+            }
+        } else {
+            compare(start, ce32, baseCE32);
+        }
+    } while(++start <= end);
+    return U_SUCCESS(errorCode);
+}
+
+void
+TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) {
+    if(Collation::isPrefixCE32(ce32)) {
+        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
+        ce32 = data->getFinalCE32(CollationData::readCE32(p));
+        if(Collation::isPrefixCE32(baseCE32)) {
+            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
+            baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
+            comparePrefixes(c, p + 2, q + 2);
+        } else {
+            addPrefixes(data, c, p + 2);
+        }
+    } else if(Collation::isPrefixCE32(baseCE32)) {
+        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
+        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
+        addPrefixes(baseData, c, q + 2);
+    }
+
+    if(Collation::isContractionCE32(ce32)) {
+        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
+        if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
+            ce32 = Collation::NO_CE32;
+        } else {
+            ce32 = data->getFinalCE32(CollationData::readCE32(p));
+        }
+        if(Collation::isContractionCE32(baseCE32)) {
+            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
+            if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
+                baseCE32 = Collation::NO_CE32;
+            } else {
+                baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
+            }
+            compareContractions(c, p + 2, q + 2);
+        } else {
+            addContractions(c, p + 2);
+        }
+    } else if(Collation::isContractionCE32(baseCE32)) {
+        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
+        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
+        addContractions(c, q + 2);
+    }
+
+    int32_t tag;
+    if(Collation::isSpecialCE32(ce32)) {
+        tag = Collation::tagFromCE32(ce32);
+        U_ASSERT(tag != Collation::PREFIX_TAG);
+        U_ASSERT(tag != Collation::CONTRACTION_TAG);
+        // Currently, the tailoring data builder does not write offset tags.
+        // They might be useful for saving space,
+        // but they would complicate the builder,
+        // and in tailorings we assume that performance of tailored characters is more important.
+        U_ASSERT(tag != Collation::OFFSET_TAG);
+    } else {
+        tag = -1;
+    }
+    int32_t baseTag;
+    if(Collation::isSpecialCE32(baseCE32)) {
+        baseTag = Collation::tagFromCE32(baseCE32);
+        U_ASSERT(baseTag != Collation::PREFIX_TAG);
+        U_ASSERT(baseTag != Collation::CONTRACTION_TAG);
+    } else {
+        baseTag = -1;
+    }
+
+    // Non-contextual mappings, expansions, etc.
+    if(baseTag == Collation::OFFSET_TAG) {
+        // We might be comparing a tailoring CE which is a copy of
+        // a base offset-tag CE, via the [optimize [set]] syntax
+        // or when a single-character mapping was copied for tailored contractions.
+        // Offset tags always result in long-primary CEs,
+        // with common secondary/tertiary weights.
+        if(!Collation::isLongPrimaryCE32(ce32)) {
+            add(c);
+            return;
+        }
+        int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)];
+        uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
+        if(Collation::primaryFromLongPrimaryCE32(ce32) != p) {
+            add(c);
+            return;
+        }
+    }
+
+    if(tag != baseTag) {
+        add(c);
+        return;
+    }
+
+    if(tag == Collation::EXPANSION32_TAG) {
+        const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
+        int32_t length = Collation::lengthFromCE32(ce32);
+
+        const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32);
+        int32_t baseLength = Collation::lengthFromCE32(baseCE32);
+
+        if(length != baseLength) {
+            add(c);
+            return;
+        }
+        for(int32_t i = 0; i < length; ++i) {
+            if(ce32s[i] != baseCE32s[i]) {
+                add(c);
+                break;
+            }
+        }
+    } else if(tag == Collation::EXPANSION_TAG) {
+        const int64_t *ces = data->ces + Collation::indexFromCE32(ce32);
+        int32_t length = Collation::lengthFromCE32(ce32);
+
+        const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32);
+        int32_t baseLength = Collation::lengthFromCE32(baseCE32);
+
+        if(length != baseLength) {
+            add(c);
+            return;
+        }
+        for(int32_t i = 0; i < length; ++i) {
+            if(ces[i] != baseCEs[i]) {
+                add(c);
+                break;
+            }
+        }
+    } else if(tag == Collation::HANGUL_TAG) {
+        UChar jamos[3];
+        int32_t length = Hangul::decompose(c, jamos);
+        if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) ||
+                (length == 3 && tailored->contains(jamos[2]))) {
+            add(c);
+        }
+    } else if(ce32 != baseCE32) {
+        add(c);
+    }
+}
+
+void
+TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) {
+    // Parallel iteration over prefixes of both tables.
+    UCharsTrie::Iterator prefixes(p, 0, errorCode);
+    UCharsTrie::Iterator basePrefixes(q, 0, errorCode);
+    const UnicodeString *tp = NULL;  // Tailoring prefix.
+    const UnicodeString *bp = NULL;  // Base prefix.
+    // Use a string with a U+FFFF as the limit sentinel.
+    // U+FFFF is untailorable and will not occur in prefixes.
+    UnicodeString none((UChar)0xffff);
+    for(;;) {
+        if(tp == NULL) {
+            if(prefixes.next(errorCode)) {
+                tp = &prefixes.getString();
+            } else {
+                tp = &none;
+            }
+        }
+        if(bp == NULL) {
+            if(basePrefixes.next(errorCode)) {
+                bp = &basePrefixes.getString();
+            } else {
+                bp = &none;
+            }
+        }
+        if(tp == &none && bp == &none) { break; }
+        int32_t cmp = tp->compare(*bp);
+        if(cmp < 0) {
+            // tp occurs in the tailoring but not in the base.
+            addPrefix(data, *tp, c, (uint32_t)prefixes.getValue());
+            tp = NULL;
+        } else if(cmp > 0) {
+            // bp occurs in the base but not in the tailoring.
+            addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue());
+            bp = NULL;
+        } else {
+            setPrefix(*tp);
+            compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue());
+            resetPrefix();
+            tp = NULL;
+            bp = NULL;
+        }
+    }
+}
+
+void
+TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) {
+    // Parallel iteration over suffixes of both tables.
+    UCharsTrie::Iterator suffixes(p, 0, errorCode);
+    UCharsTrie::Iterator baseSuffixes(q, 0, errorCode);
+    const UnicodeString *ts = NULL;  // Tailoring suffix.
+    const UnicodeString *bs = NULL;  // Base suffix.
+    // Use a string with two U+FFFF as the limit sentinel.
+    // U+FFFF is untailorable and will not occur in contractions except maybe
+    // as a single suffix character for a root-collator boundary contraction.
+    UnicodeString none((UChar)0xffff);
+    none.append((UChar)0xffff);
+    for(;;) {
+        if(ts == NULL) {
+            if(suffixes.next(errorCode)) {
+                ts = &suffixes.getString();
+            } else {
+                ts = &none;
+            }
+        }
+        if(bs == NULL) {
+            if(baseSuffixes.next(errorCode)) {
+                bs = &baseSuffixes.getString();
+            } else {
+                bs = &none;
+            }
+        }
+        if(ts == &none && bs == &none) { break; }
+        int32_t cmp = ts->compare(*bs);
+        if(cmp < 0) {
+            // ts occurs in the tailoring but not in the base.
+            addSuffix(c, *ts);
+            ts = NULL;
+        } else if(cmp > 0) {
+            // bs occurs in the base but not in the tailoring.
+            addSuffix(c, *bs);
+            bs = NULL;
+        } else {
+            suffix = ts;
+            compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue());
+            suffix = NULL;
+            ts = NULL;
+            bs = NULL;
+        }
+    }
+}
+
+// Calls addPrefix() for every string in the prefix trie at p,
+// passing along the data object d that the trie belongs to.
+void
+TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) {
+    for(UCharsTrie::Iterator iter(p, 0, errorCode); iter.next(errorCode);) {
+        const UnicodeString &pfx = iter.getString();
+        const uint32_t value = (uint32_t)iter.getValue();
+        addPrefix(d, pfx, c, value);
+    }
+}
+
+// Adds the string "prefix + c" to the tailored set, where the prefix is
+// unreversedPrefix after setPrefix(pfx).
+// If the mapping ce32 under this prefix resolves (via d->getFinalCE32())
+// to a contraction, all of its suffixes are added first, with the prefix still set.
+// The current prefix is reset before returning.
+void
+TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) {
+    setPrefix(pfx);
+    const uint32_t finalCE32 = d->getFinalCE32(ce32);
+    if(Collation::isContractionCE32(finalCE32)) {
+        // The suffix trie starts two units after the start of the contexts entry.
+        addContractions(c, d->contexts + Collation::indexFromCE32(finalCE32) + 2);
+    }
+    UnicodeString prefixed(unreversedPrefix);
+    prefixed.append(c);
+    tailored->add(prefixed);
+    resetPrefix();
+}
+
+// Calls addSuffix() for every string in the contraction-suffix trie at p.
+void
+TailoredSet::addContractions(UChar32 c, const UChar *p) {
+    for(UCharsTrie::Iterator iter(p, 0, errorCode); iter.next(errorCode);) {
+        addSuffix(c, iter.getString());
+    }
+}
+
+// Adds the string "current prefix + c + sfx" to the tailored set.
+void
+TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) {
+    UnicodeString contraction(unreversedPrefix);
+    contraction.append(c);
+    contraction.append(sfx);
+    tailored->add(contraction);
+}
+
+// Records c as tailored.
+// Without a current prefix or suffix the code point itself is added;
+// otherwise the string "prefix + c + suffix" is added.
+void
+TailoredSet::add(UChar32 c) {
+    if(unreversedPrefix.isEmpty() && suffix == NULL) {
+        // No context: a plain code point.
+        tailored->add(c);
+        return;
+    }
+    UnicodeString withContext(unreversedPrefix);
+    withContext.append(c);
+    if(suffix != NULL) {
+        withContext.append(*suffix);
+    }
+    tailored->add(withContext);
+}
+
+// Out-of-line definition of the CESink destructor; its body is empty.
+ContractionsAndExpansions::CESink::~CESink() {
+}
+
+U_CDECL_BEGIN
+
+static UBool U_CALLCONV
+enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
+    ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context;
+    if(cne->checkTailored == 0) {
+        // There is no tailoring.
+        // No need to collect nor check the tailored set.
+    } else if(cne->checkTailored < 0) {
+        // Collect the set of code points with mappings in the tailoring data.
+        if(ce32 == Collation::FALLBACK_CE32) {
+            return TRUE;  // fallback to base, not tailored
+        } else {
+            cne->tailored.add(start, end);
+        }
+        // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
+    } else if(start == end) {
+        if(cne->tailored.contains(start)) {
+            return TRUE;
+        }
+    } else if(cne->tailored.containsSome(start, end)) {
+        cne->ranges.set(start, end).removeAll(cne->tailored);
+        int32_t count = cne->ranges.getRangeCount();
+        for(int32_t i = 0; i < count; ++i) {
+            cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32);
+        }
+        return U_SUCCESS(cne->errorCode);
+    }
+    cne->handleCE32(start, end, ce32);
+    return U_SUCCESS(cne->errorCode);
+}
+
+U_CDECL_END
+
+void
+ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) {
+    if(U_FAILURE(ec)) { return; }
+    errorCode = ec;  // Preserve info & warning codes.
+    // Add all from the data, can be tailoring or base.
+    if(d->base != NULL) {
+        checkTailored = -1;
+    }
+    data = d;
+    utrie2_enum(data->trie, NULL, enumCnERange, this);
+    if(d->base == NULL || U_FAILURE(errorCode)) {
+        ec = errorCode;
+        return;
+    }
+    // Add all from the base data but only for un-tailored code points.
+    tailored.freeze();
+    checkTailored = 1;
+    data = d->base;
+    utrie2_enum(data->trie, NULL, enumCnERange, this);
+    ec = errorCode;
+}
+
+void
+ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) {
+    if(U_FAILURE(ec)) { return; }
+    errorCode = ec;  // Preserve info & warning codes.
+    uint32_t ce32 = d->getCE32(c);
+    if(ce32 == Collation::FALLBACK_CE32) {
+        d = d->base;
+        ce32 = d->getCE32(c);
+    }
+    data = d;
+    handleCE32(c, c, ce32);
+    ec = errorCode;
+}
+
+// Processes the mapping ce32 that applies to the code point range [start, end]
+// in the current data object.
+// - CEs and expansions are reported to sink when sink is not NULL.
+// - Expansion-type mappings are recorded via addExpansions(), unless a prefix
+//   is currently set.
+// - Prefix and contraction CE32s are delegated to handlePrefixes() and
+//   handleContractions().
+// The loop exists for the DIGIT_TAG and U0000_TAG cases, which replace ce32
+// with another CE32 and dispatch again; every other case returns.
+// Tags that are not expected here set errorCode to U_INTERNAL_PROGRAM_ERROR
+// (unless errorCode already indicates a failure).
+void
+ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
+    for(;;) {
+        if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) {
+            // !isSpecialCE32()
+            if(sink != NULL) {
+                sink->handleCE(Collation::ceFromSimpleCE32(ce32));
+            }
+            return;
+        }
+        switch(Collation::tagFromCE32(ce32)) {
+        case Collation::FALLBACK_TAG:
+            // Nothing to report for a fallback CE32.
+            return;
+        case Collation::RESERVED_TAG_3:
+        case Collation::BUILDER_DATA_TAG:
+        case Collation::LEAD_SURROGATE_TAG:
+            // Treated as an internal error; an earlier failure code is not overwritten.
+            if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
+            return;
+        case Collation::LONG_PRIMARY_TAG:
+            if(sink != NULL) {
+                sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32));
+            }
+            return;
+        case Collation::LONG_SECONDARY_TAG:
+            if(sink != NULL) {
+                sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32));
+            }
+            return;
+        case Collation::LATIN_EXPANSION_TAG:
+            // Two CEs are derived from the CE32 itself.
+            // (ces is declared outside this function.)
+            if(sink != NULL) {
+                ces[0] = Collation::latinCE0FromCE32(ce32);
+                ces[1] = Collation::latinCE1FromCE32(ce32);
+                sink->handleExpansion(ces, 2);
+            }
+            // Optimization: If we have a prefix,
+            // then the relevant strings have been added already.
+            if(unreversedPrefix.isEmpty()) {
+                addExpansions(start, end);
+            }
+            return;
+        case Collation::EXPANSION32_TAG:
+            // The expansion is stored as CE32s; each is converted to a CE for the sink.
+            if(sink != NULL) {
+                const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
+                int32_t length = Collation::lengthFromCE32(ce32);
+                for(int32_t i = 0; i < length; ++i) {
+                    ces[i] = Collation::ceFromCE32(*ce32s++);
+                }
+                sink->handleExpansion(ces, length);
+            }
+            // Optimization: If we have a prefix,
+            // then the relevant strings have been added already.
+            if(unreversedPrefix.isEmpty()) {
+                addExpansions(start, end);
+            }
+            return;
+        case Collation::EXPANSION_TAG:
+            // The expansion is stored as CEs and is passed to the sink directly.
+            if(sink != NULL) {
+                int32_t length = Collation::lengthFromCE32(ce32);
+                sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length);
+            }
+            // Optimization: If we have a prefix,
+            // then the relevant strings have been added already.
+            if(unreversedPrefix.isEmpty()) {
+                addExpansions(start, end);
+            }
+            return;
+        case Collation::PREFIX_TAG:
+            handlePrefixes(start, end, ce32);
+            return;
+        case Collation::CONTRACTION_TAG:
+            handleContractions(start, end, ce32);
+            return;
+        case Collation::DIGIT_TAG:
+            // Fetch the non-numeric-collation CE32 and continue.
+            ce32 = data->ce32s[Collation::indexFromCE32(ce32)];
+            break;
+        case Collation::U0000_TAG:
+            U_ASSERT(start == 0 && end == 0);
+            // Fetch the normal ce32 for U+0000 and continue.
+            ce32 = data->ce32s[0];
+            break;
+        case Collation::HANGUL_TAG:
+            if(sink != NULL) {
+                // TODO: This should be optimized,
+                // especially if [start..end] is the complete Hangul range. (assert that)
+                // For now each code point of the range is run through a collation
+                // iterator as a one-unit text, and the CEs it fetches go to the sink.
+                UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL);
+                UChar hangul[1] = { 0 };
+                for(UChar32 c = start; c <= end; ++c) {
+                    hangul[0] = (UChar)c;
+                    iter.setText(hangul, hangul + 1);
+                    int32_t length = iter.fetchCEs(errorCode);
+                    if(U_FAILURE(errorCode)) { return; }
+                    // Ignore the terminating non-CE.
+                    U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE);
+                    sink->handleExpansion(iter.getCEs(), length - 1);
+                }
+            }
+            // Optimization: If we have a prefix,
+            // then the relevant strings have been added already.
+            if(unreversedPrefix.isEmpty()) {
+                addExpansions(start, end);
+            }
+            return;
+        case Collation::OFFSET_TAG:
+            // Currently no need to send offset CEs to the sink.
+            return;
+        case Collation::IMPLICIT_TAG:
+            // Currently no need to send implicit CEs to the sink.
+            return;
+        }
+    }
+}
+
+void
+ContractionsAndExpansions::handlePrefixes(
+        UChar32 start, UChar32 end, uint32_t ce32) {
+    const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
+    ce32 = CollationData::readCE32(p);  // Default if no prefix match.
+    handleCE32(start, end, ce32);
+    if(!addPrefixes) { return; }
+    UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
+    while(prefixes.next(errorCode)) {
+        setPrefix(prefixes.getString());
+        // Prefix/pre-context mappings are special kinds of contractions
+        // that always yield expansions.
+        addStrings(start, end, contractions);
+        addStrings(start, end, expansions);
+        handleCE32(start, end, (uint32_t)prefixes.getValue());
+    }
+    resetPrefix();
+}
+
+void
+ContractionsAndExpansions::handleContractions(
+        UChar32 start, UChar32 end, uint32_t ce32) {
+    const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
+    if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
+        // No match on the single code point.
+        // We are underneath a prefix, and the default mapping is just
+        // a fallback to the mappings for a shorter prefix.
+        U_ASSERT(!unreversedPrefix.isEmpty());
+    } else {
+        ce32 = CollationData::readCE32(p);  // Default if no suffix match.
+        U_ASSERT(!Collation::isContractionCE32(ce32));
+        handleCE32(start, end, ce32);
+    }
+    UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
+    while(suffixes.next(errorCode)) {
+        suffix = &suffixes.getString();
+        addStrings(start, end, contractions);
+        if(!unreversedPrefix.isEmpty()) {
+            addStrings(start, end, expansions);
+        }
+        handleCE32(start, end, (uint32_t)suffixes.getValue());
+    }
+    suffix = NULL;
+}
+
+// Records [start, end] as having expansions.
+// With a current prefix or suffix, the strings built by addStrings() are added;
+// otherwise the plain code point range is added, if an expansions set was supplied.
+void
+ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) {
+    if(!unreversedPrefix.isEmpty() || suffix != NULL) {
+        addStrings(start, end, expansions);
+        return;
+    }
+    if(expansions != NULL) {
+        expansions->add(start, end);
+    }
+}
+
+void
+ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) {
+    if(set == NULL) { return; }
+    UnicodeString s(unreversedPrefix);
+    do {
+        s.append(start);
+        if(suffix != NULL) {
+            s.append(*suffix);
+        }
+        set->add(s);
+        s.truncate(unreversedPrefix.length());
+    } while(++start <= end);
+}
+
+U_NAMESPACE_END
+
+#endif  // !UCONFIG_NO_COLLATION