[apple/icu.git] / icuSources / i18n / bmsearch.cpp

/*
 ******************************************************************************
 *   Copyright (C) 1996-2011, International Business Machines                 *
 *   Corporation and others.  All Rights Reserved.                            *
 ******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

#include "unicode/unistr.h"
#include "unicode/putil.h"
#include "unicode/usearch.h"

#include "cmemory.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/ucoleitr.h"

#include "unicode/regex.h"        // TODO: make conditional on regexp being built.

#include "unicode/uniset.h"
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "uhash.h"
#include "ucol_imp.h"
#include "normalizer2impl.h"

#include "unicode/colldata.h"
#include "unicode/bmsearch.h"

U_NAMESPACE_BEGIN

#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))


struct CEI
{
    uint32_t order;
    int32_t  lowOffset;
    int32_t  highOffset;
};

class Target : public UMemory
{
public:
    Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
    ~Target();

    void setTargetString(const UnicodeString *target);

    const CEI *nextCE(int32_t offset);
    const CEI *prevCE(int32_t offset);

    int32_t stringLength();
    UChar charAt(int32_t offset);

    UBool isBreakBoundary(int32_t offset);
    int32_t nextBreakBoundary(int32_t offset);
    int32_t nextSafeBoundary(int32_t offset);

    UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);

    void setOffset(int32_t offset);
    void setLast(int32_t last);
    int32_t getOffset();

private:
    CEI *ceb;
    int32_t bufferSize;
    int32_t bufferMin;
    int32_t bufferMax;

    uint32_t strengthMask;
    UCollationStrength strength;
    uint32_t variableTop;
    UBool toShift;
    UCollator *coll;
    const Normalizer2 &nfd;

    const UnicodeString *targetString;
    const UChar *targetBuffer;
    int32_t targetLength;

    UCollationElements *elements;
    UBreakIterator *charBreakIterator;
};

Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      nfd(*Normalizer2Factory::getNFDInstance(status)),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    strength = ucol_getStrength(coll);
    toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
    variableTop = ucol_getVariableTop(coll, &status);

    // find the largest expansion
    uint8_t maxExpansion = 0;
    for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
        if (*expansion > maxExpansion) {
            maxExpansion = *expansion;
        }
    }

    // room for an extra character on each end, plus 4 for safety
    bufferSize = patternLength + (2 * maxExpansion) + 4;

    ceb = NEW_ARRAY(CEI, bufferSize);

    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    switch (strength)
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        /* fall through */

    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        /* fall through */

    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }
}

Target::~Target()
{
    ubrk_close(charBreakIterator);
    ucol_closeElements(elements);

    DELETE_ARRAY(ceb);
}

void Target::setTargetString(const UnicodeString *target)
{
    if (charBreakIterator != NULL) {
        ubrk_close(charBreakIterator);
        ucol_closeElements(elements);
    }

    targetString = target;

    if (targetString != NULL) {
        UErrorCode status = U_ZERO_ERROR;

        targetBuffer = targetString->getBuffer();
        targetLength = targetString->length();

        elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
        ucol_forceHanImplicit(elements, &status);

        charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
                                      targetBuffer, targetLength, &status);
    } else {
        targetBuffer = NULL;
        targetLength = 0;
    }
}

const CEI *Target::nextCE(int32_t offset)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t low = -1, high = -1;
    uint32_t order;
    UBool cont = FALSE;

    if (offset >= bufferMin && offset < bufferMax) {
        return &ceb[offset];
    }

    if (bufferMax >= bufferSize || offset != bufferMax) {
        return NULL;
    }

    do {
        low   = ucol_getOffset(elements);
        order = ucol_next(elements, &status);
        high  = ucol_getOffset(elements);

        if (order == (uint32_t)UCOL_NULLORDER) {
          //high = low = -1;
            break;
        }

        cont = isContinuation(order);
        order &= strengthMask;

        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                order &= UCOL_PRIMARYORDERMASK;
            } else {
                order = UCOL_IGNORABLE;
            }
        }
    } while (order == UCOL_IGNORABLE);

    if (cont) {
        order |= UCOL_CONTINUATION_MARKER;
    }

    ceb[offset].order = order;
    ceb[offset].lowOffset = low;
    ceb[offset].highOffset = high;

    bufferMax += 1;

    return &ceb[offset];
}

const CEI *Target::prevCE(int32_t offset)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t low = -1, high = -1;
    uint32_t order;
    UBool cont = FALSE;

    if (offset >= bufferMin && offset < bufferMax) {
        return &ceb[offset];
    }

    if (bufferMax >= bufferSize || offset != bufferMax) {
        return NULL;
    }

    do {
        high  = ucol_getOffset(elements);
        order = ucol_previous(elements, &status);
        low   = ucol_getOffset(elements);

        if (order == (uint32_t)UCOL_NULLORDER) {
            break;
        }

        cont = isContinuation(order);
        order &= strengthMask;

        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                order &= UCOL_PRIMARYORDERMASK;
            } else {
                order = UCOL_IGNORABLE;
            }
        }
    } while (order == UCOL_IGNORABLE);

    bufferMax += 1;

    if (cont) {
        order |= UCOL_CONTINUATION_MARKER;
    }

    ceb[offset].order       = order;
    ceb[offset].lowOffset   = low;
    ceb[offset].highOffset = high;

    return &ceb[offset];
}

int32_t Target::stringLength()
{
    if (targetString != NULL) {
        return targetLength;
    }

    return 0;
}

UChar Target::charAt(int32_t offset)
{
    if (targetString != NULL) {
        return targetBuffer[offset];
    }

    return 0x0000;
}

void Target::setOffset(int32_t offset)
{
    UErrorCode status = U_ZERO_ERROR;

    bufferMin = 0;
    bufferMax = 0;

    ucol_setOffset(elements, offset, &status);
}

void Target::setLast(int32_t last)
{
    UErrorCode status = U_ZERO_ERROR;

    bufferMin = 0;
    bufferMax = 1;

    ceb[0].order      = UCOL_NULLORDER;
    ceb[0].lowOffset  = last;
    ceb[0].highOffset = last;

    ucol_setOffset(elements, last, &status);
}

int32_t Target::getOffset()
{
    return ucol_getOffset(elements);
}

UBool Target::isBreakBoundary(int32_t offset)
{
    return ubrk_isBoundary(charBreakIterator, offset);
}

int32_t Target::nextBreakBoundary(int32_t offset)
{
    return ubrk_following(charBreakIterator, offset);
}

int32_t Target::nextSafeBoundary(int32_t offset)
{
    while (offset < targetLength) {
      //UChar ch = charAt(offset);
        UChar ch = targetBuffer[offset];

        if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
            return offset;
        }

        offset += 1;
    }

    return targetLength;
}

UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
{
    if (strength < UCOL_IDENTICAL) {
        return TRUE;
    }

    // Note: We could use Normalizer::compare() or similar, but for short strings
    // which may not be in FCD it might be faster to just NFD them.
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString t2, p2;
    nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
    nfd.normalize(pattern, p2, status);
    // return FALSE if NFD failed
    return U_SUCCESS(status) && t2 == p2;
}

#define HASH_TABLE_SIZE 257

class BadCharacterTable : public UMemory
{
public:
    BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
    ~BadCharacterTable();

    int32_t operator[](uint32_t ce) const;
    int32_t getMaxSkip() const;
    int32_t minLengthInChars(int32_t index);

private:
    static int32_t hash(uint32_t ce);

    int32_t maxSkip;
    int32_t badCharacterTable[HASH_TABLE_SIZE];

    int32_t *minLengthCache;
};

BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    : minLengthCache(NULL)
{
    int32_t plen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || plen == 0) {
        return;
    }

    int32_t *history = NEW_ARRAY(int32_t, plen);

    if (history == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int32_t i = 0; i < plen; i += 1) {
        history[i] = -1;
    }

    minLengthCache = NEW_ARRAY(int32_t, plen + 1);

    if (minLengthCache == NULL) {
        DELETE_ARRAY(history);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);

    for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = maxSkip;
    }

    for(int32_t p = 1; p < plen; p += 1) {
        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);

        // Make sure this entry is not bigger than the previous one.
        // Otherwise, we might skip too far in some cases.
        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
            minLengthCache[p] = minLengthCache[p - 1];
        }
    }

    minLengthCache[plen] = 0;

    for(int32_t p = 0; p < plen - 1; p += 1) {
        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    }

    DELETE_ARRAY(history);
}

BadCharacterTable::~BadCharacterTable()
{
    DELETE_ARRAY(minLengthCache);
}

int32_t BadCharacterTable::operator[](uint32_t ce) const
{
    return badCharacterTable[hash(ce)];
}

int32_t BadCharacterTable::getMaxSkip() const
{
    return maxSkip;
}

int32_t BadCharacterTable::minLengthInChars(int32_t index)
{
    return minLengthCache[index];
}

int32_t BadCharacterTable::hash(uint32_t ce)
{
    return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
}

class GoodSuffixTable : public UMemory
{
public:
    GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
    ~GoodSuffixTable();

    int32_t operator[](int32_t offset) const;

private:
    int32_t *goodSuffixTable;
};

GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
    : goodSuffixTable(NULL)
{
    int32_t patlen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || patlen <= 0) {
        return;
    }

    int32_t *suff  = NEW_ARRAY(int32_t, patlen);
    int32_t start = patlen - 1, end = - 1;
    int32_t maxSkip = badCharacterTable.getMaxSkip();

    if (suff == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // initialze suff
    suff[patlen - 1] = patlen;

    for (int32_t i = patlen - 2; i >= 0; i -= 1) {
        // (i > start) means we're inside the last suffix match we found
        // ((patlen - 1) - end) is how far the end of that match is from end of pattern
        // (i - start) is how far we are from start of that match
        // (i + (patlen - 1) - end) is index of same character at end of pattern
        // so if any suffix match at that character doesn't extend beyond the last match,
        // it's the suffix for this character as well
        if (i > start && suff[i + patlen - 1 - end] < i - start) {
            suff[i] = suff[i + patlen - 1 - end];
        } else {
            start = end = i;

            int32_t s = patlen;

            while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
                start -= 1;
            }

            suff[i] = end - start;
        }
    }

    // now build goodSuffixTable
    goodSuffixTable  = NEW_ARRAY(int32_t, patlen);

    if (goodSuffixTable == NULL) {
        DELETE_ARRAY(suff);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }


    // initialize entries to minLengthInChars of the pattern
    for (int32_t i = 0; i < patlen; i += 1) {
        goodSuffixTable[i] = maxSkip;
    }

    int32_t prefix = 0;

    for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
        if (suff[i] == i + 1) {
            // this matching suffix is a prefix of the pattern
            int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);

            // for any mis-match before this suffix, we should skip
            // so that the front of the pattern (i.e. the prefix)
            // lines up with the front of the suffix.
            // (patlen - 1 - i) is the start of the suffix
            while (prefix < patlen - 1 - i) {
                // value of maxSkip means never set...
                if (goodSuffixTable[prefix] == maxSkip) {
                    goodSuffixTable[prefix] = prefixSkip;
                }

                prefix += 1;
            }
        }
    }

    for (int32_t i = 0; i < patlen - 1; i += 1) {
        goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
    }

    DELETE_ARRAY(suff);
}

GoodSuffixTable::~GoodSuffixTable()
{
    DELETE_ARRAY(goodSuffixTable);
}

int32_t GoodSuffixTable::operator[](int32_t offset) const
{
    return goodSuffixTable[offset];
}

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)


UBool BoyerMooreSearch::empty()
{
    return patCEs->size() <= 0;
}

CollData *BoyerMooreSearch::getData()
{
    return data;
}

CEList *BoyerMooreSearch::getPatternCEs()
{
    return patCEs;
}

BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
{
    return badCharacterTable;
}

GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
{
    return goodSuffixTable;
}

BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
                                   UErrorCode &status)
    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
{

    if (U_FAILURE(status)) {
        return;
    }

    UCollator *collator = data->getCollator();

    patCEs = new CEList(collator, patternString, status);

    if (patCEs == NULL || U_FAILURE(status)) {
        return;
    }

    badCharacterTable = new BadCharacterTable(*patCEs, data, status);

    if (badCharacterTable == NULL || U_FAILURE(status)) {
        return;
    }

    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);

    if (targetString != NULL) {
        target = new Target(collator, targetString, patCEs->size(), status);
    }
}

BoyerMooreSearch::~BoyerMooreSearch()
{
    delete target;
    delete goodSuffixTable;
    delete badCharacterTable;
    delete patCEs;
}

void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return;
    }

    if (target == NULL) {
        target = new Target(data->getCollator(), targetString, patCEs->size(), status);
    } else {
        target->setTargetString(targetString);
    }
}

// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
/*
 * TODO:
 *  * deal with trailing (and leading?) ignorables.
 *  * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
 */
UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
{
    /*UCollator *coll =*/ data->getCollator();
    int32_t plen = patCEs->size();
    int32_t tlen = target->stringLength();
    int32_t maxSkip = badCharacterTable->getMaxSkip();
    int32_t tOffset = offset + maxSkip;

    if (plen <= 0) {
        // Searching for a zero length pattern always fails.
        start = end = -1;
        return FALSE;
    }

    while (tOffset <= tlen) {
        int32_t pIndex = plen - 1;
        int32_t tIndex = 0;
        int32_t lIndex = 0;

        if (tOffset < tlen) {
            // **** we really want to skip ahead enough to  ****
            // **** be sure we get at least 1 non-ignorable ****
            // **** CE after the end of the pattern.        ****
            int32_t next = target->nextSafeBoundary(tOffset + 1);

            target->setOffset(next);

            for (lIndex = 0; ; lIndex += 1) {
                const CEI *cei = target->prevCE(lIndex);
                int32_t low = cei->lowOffset;
                int32_t high = cei->highOffset;

                if (high == 0 || (low < high && low <= tOffset)) {
                    if (low < tOffset) {
                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
                            lIndex -= 1;
                        }

                        if (high > tOffset) {
                            tOffset = high;
                        }
                    }

                    break;
                }
            }
        } else {
            target->setLast(tOffset);
            lIndex = 0;
        }

        tIndex = ++lIndex;

        // Iterate backward until we hit the beginning of the pattern
        while (pIndex >= 0) {
            uint32_t pce = (*patCEs)[pIndex];
            const CEI *tcei = target->prevCE(tIndex++);


            if (tcei->order != pce) {
                // There is a mismatch at this position.  Decide how far
                // over to shift the pattern, then try again.

                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
#ifdef EXTRA_CAUTIOUS
                int32_t old = tOffset;
#endif

                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);

                if (gsOffset > tOffset) {
                    tOffset = gsOffset;
                }

#ifdef EXTRA_CAUTIOUS
                // Make sure we don't skip backwards...
                if (tOffset <= old) {
                    tOffset = old + 1;
                }
#endif

                break;
            }

            pIndex -= 1;
        }

        if (pIndex < 0) {
            // We made it back to the beginning of the pattern,
            // which means we matched it all.  Return the location.
            const CEI firstCEI = *target->prevCE(tIndex - 1);
            const CEI lastCEI  = *target->prevCE(lIndex);
            int32_t mStart   = firstCEI.lowOffset;
            int32_t minLimit = lastCEI.lowOffset;
            int32_t maxLimit = lastCEI.highOffset;
            int32_t mLimit;
            UBool found = TRUE;

            target->setOffset(/*tOffset*/maxLimit);

            const CEI nextCEI = *target->nextCE(0);

            if (nextCEI.lowOffset > maxLimit) {
                maxLimit = nextCEI.lowOffset;
            }

            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mStart)) {
                found = FALSE;
            }

            if (firstCEI.lowOffset == firstCEI.highOffset) {
                found = FALSE;
            }

            mLimit = maxLimit;
            if (minLimit < maxLimit) {
                // When the last CE's low index is same with its high index, the CE is likely
                // a part of expansion. In this case, the index is located just after the
                // character corresponding to the CEs compared above. If the index is right
                // at the break boundary, move the position to the next boundary will result
                // incorrect match length when there are ignorable characters exist between
                // the position and the next character produces CE(s). See ticket#8482.
                if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) {
                    mLimit = minLimit;
                } else {
                    int32_t nbb = target->nextBreakBoundary(minLimit);

                    if (nbb >= lastCEI.highOffset) {
                        mLimit = nbb;
                    }
                }
            }

            if (mLimit > maxLimit) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mLimit)) {
                found = FALSE;
            }

            if (! target->isIdentical(pattern, mStart, mLimit)) {
                found = FALSE;
            }

            if (found) {
                start = mStart;
                end   = mLimit;

                return TRUE;
            }

            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
        }
        // Otherwise, we're here because of a mismatch, so keep going....
    }

    // no match
   start = -1;
   end = -1;
   return FALSE;
}

U_NAMESPACE_END

#endif // #if !UCONFIG_NO_COLLATION
Commit	Line	Data
729e4ab9 A	1	/*
729e4ab9 A	2	******************************************************************************
4388f060	3	* Copyright (C) 1996-2011, International Business Machines *
729e4ab9 A	4	* Corporation and others. All Rights Reserved. *
	5	******************************************************************************
	6	*/
	7
	8	#include "unicode/utypes.h"
	9
	10	#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
	11
	12	#include "unicode/unistr.h"
	13	#include "unicode/putil.h"
	14	#include "unicode/usearch.h"
	15
	16	#include "cmemory.h"
	17	#include "unicode/coll.h"
	18	#include "unicode/tblcoll.h"
	19	#include "unicode/coleitr.h"
	20	#include "unicode/ucoleitr.h"
	21
	22	#include "unicode/regex.h" // TODO: make conditional on regexp being built.
	23
	24	#include "unicode/uniset.h"
	25	#include "unicode/uset.h"
	26	#include "unicode/ustring.h"
	27	#include "hash.h"
	28	#include "uhash.h"
	29	#include "ucol_imp.h"
	30	#include "normalizer2impl.h"
	31
	32	#include "unicode/colldata.h"
	33	#include "unicode/bmsearch.h"
	34
	35	U_NAMESPACE_BEGIN
	36
	37	#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
	38	#define NEW_ARRAY(type, count) (type ) uprv_malloc((count) sizeof(type))
	39	#define DELETE_ARRAY(array) uprv_free((void *) (array))
	40
	41
	42	struct CEI
	43	{
	44	uint32_t order;
	45	int32_t lowOffset;
	46	int32_t highOffset;
	47	};
	48
	49	class Target : public UMemory
	50	{
	51	public:
	52	Target(UCollator theCollator, const UnicodeString target, int32_t patternLength, UErrorCode &status);
	53	~Target();
	54
	55	void setTargetString(const UnicodeString *target);
	56
	57	const CEI *nextCE(int32_t offset);
	58	const CEI *prevCE(int32_t offset);
	59
	60	int32_t stringLength();
	61	UChar charAt(int32_t offset);
	62
	63	UBool isBreakBoundary(int32_t offset);
	64	int32_t nextBreakBoundary(int32_t offset);
	65	int32_t nextSafeBoundary(int32_t offset);
	66
	67	UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
68
69	void setOffset(int32_t offset);
70	void setLast(int32_t last);
71	int32_t getOffset();
72
73	private:
74	CEI *ceb;
75	int32_t bufferSize;
76	int32_t bufferMin;
77	int32_t bufferMax;
78
79	uint32_t strengthMask;
80	UCollationStrength strength;
81	uint32_t variableTop;
82	UBool toShift;
83	UCollator *coll;
84	const Normalizer2 &nfd;
85
86	const UnicodeString *targetString;
87	const UChar *targetBuffer;
88	int32_t targetLength;
89
90	UCollationElements *elements;
91	UBreakIterator *charBreakIterator;
92	};
93
94	Target::Target(UCollator theCollator, const UnicodeString target, int32_t patternLength, UErrorCode &status)
95	: bufferSize(0), bufferMin(0), bufferMax(0),
96	strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
97	nfd(*Normalizer2Factory::getNFDInstance(status)),
98	targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
99	{
100	strength = ucol_getStrength(coll);
101	toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
102	variableTop = ucol_getVariableTop(coll, &status);
103
104	// find the largest expansion
105	uint8_t maxExpansion = 0;
106	for (const uint8_t expansion = coll->expansionCESize; expansion != 0; expansion += 1) {
107	if (*expansion > maxExpansion) {
108	maxExpansion = *expansion;
109	}
110	}
111
112	// room for an extra character on each end, plus 4 for safety
113	bufferSize = patternLength + (2 * maxExpansion) + 4;
114
115	ceb = NEW_ARRAY(CEI, bufferSize);
116
117	if (ceb == NULL) {
118	status = U_MEMORY_ALLOCATION_ERROR;
119	return;
120	}
121
122	if (target != NULL) {
123	setTargetString(target);
124	}
125
126	switch (strength)
127	{
128	default:
129	strengthMask \|= UCOL_TERTIARYORDERMASK;
130	/* fall through */
131
132	case UCOL_SECONDARY:
133	strengthMask \|= UCOL_SECONDARYORDERMASK;
134	/* fall through */
135
136	case UCOL_PRIMARY:
137	strengthMask \|= UCOL_PRIMARYORDERMASK;
138	}
139	}
140
141	Target::~Target()
142	{
143	ubrk_close(charBreakIterator);
144	ucol_closeElements(elements);
145
146	DELETE_ARRAY(ceb);
147	}
148
149	void Target::setTargetString(const UnicodeString *target)
150	{
151	if (charBreakIterator != NULL) {
152	ubrk_close(charBreakIterator);
153	ucol_closeElements(elements);
154	}
155
156	targetString = target;
157
158	if (targetString != NULL) {
159	UErrorCode status = U_ZERO_ERROR;
160
161	targetBuffer = targetString->getBuffer();
162	targetLength = targetString->length();
163
164	elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
165	ucol_forceHanImplicit(elements, &status);
166
167	charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
168	targetBuffer, targetLength, &status);
169	} else {
170	targetBuffer = NULL;
171	targetLength = 0;
172	}
173	}
174
175	const CEI *Target::nextCE(int32_t offset)
176	{
177	UErrorCode status = U_ZERO_ERROR;
178	int32_t low = -1, high = -1;
179	uint32_t order;
180	UBool cont = FALSE;
181
182	if (offset >= bufferMin && offset < bufferMax) {
183	return &ceb[offset];
184	}
185
186	if (bufferMax >= bufferSize \|\| offset != bufferMax) {
187	return NULL;
188	}
189
190	do {
191	low = ucol_getOffset(elements);
192	order = ucol_next(elements, &status);
193	high = ucol_getOffset(elements);
194
195	if (order == (uint32_t)UCOL_NULLORDER) {
196	//high = low = -1;
197	break;
198	}
199
200	cont = isContinuation(order);
201	order &= strengthMask;
202
203	if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
204	if (strength >= UCOL_QUATERNARY) {
205	order &= UCOL_PRIMARYORDERMASK;
206	} else {
207	order = UCOL_IGNORABLE;
208	}
209	}
210	} while (order == UCOL_IGNORABLE);
211
212	if (cont) {
213	order \|= UCOL_CONTINUATION_MARKER;
214	}
215
216	ceb[offset].order = order;
217	ceb[offset].lowOffset = low;
218	ceb[offset].highOffset = high;
219
220	bufferMax += 1;
221
222	return &ceb[offset];
223	}
224
225	const CEI *Target::prevCE(int32_t offset)
226	{
227	UErrorCode status = U_ZERO_ERROR;
228	int32_t low = -1, high = -1;
229	uint32_t order;
230	UBool cont = FALSE;
231
232	if (offset >= bufferMin && offset < bufferMax) {
233	return &ceb[offset];
234	}
235
236	if (bufferMax >= bufferSize \|\| offset != bufferMax) {
237	return NULL;
238	}
239
240	do {
241	high = ucol_getOffset(elements);
242	order = ucol_previous(elements, &status);
243	low = ucol_getOffset(elements);
244
245	if (order == (uint32_t)UCOL_NULLORDER) {
246	break;
247	}
248
249	cont = isContinuation(order);
250	order &= strengthMask;
251
252	if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
253	if (strength >= UCOL_QUATERNARY) {
254	order &= UCOL_PRIMARYORDERMASK;
255	} else {
256	order = UCOL_IGNORABLE;
257	}
258	}
259	} while (order == UCOL_IGNORABLE);
260
261	bufferMax += 1;
262
263	if (cont) {
264	order \|= UCOL_CONTINUATION_MARKER;
265	}
266
267	ceb[offset].order = order;
268	ceb[offset].lowOffset = low;
269	ceb[offset].highOffset = high;
270
271	return &ceb[offset];
272	}
273
274	int32_t Target::stringLength()
275	{
276	if (targetString != NULL) {
277	return targetLength;
278	}
279
280	return 0;
281	}
282
283	UChar Target::charAt(int32_t offset)
284	{
285	if (targetString != NULL) {
286	return targetBuffer[offset];
287	}
288
289	return 0x0000;
290	}
291
292	void Target::setOffset(int32_t offset)
293	{
294	UErrorCode status = U_ZERO_ERROR;
295
296	bufferMin = 0;
297	bufferMax = 0;
298
299	ucol_setOffset(elements, offset, &status);
300	}
301
302	void Target::setLast(int32_t last)
303	{
304	UErrorCode status = U_ZERO_ERROR;
305
306	bufferMin = 0;
307	bufferMax = 1;
308
309	ceb[0].order = UCOL_NULLORDER;
310	ceb[0].lowOffset = last;
311	ceb[0].highOffset = last;
312
313	ucol_setOffset(elements, last, &status);
314	}
315
316	int32_t Target::getOffset()
317	{
318	return ucol_getOffset(elements);
319	}
320
321	UBool Target::isBreakBoundary(int32_t offset)
322	{
323	return ubrk_isBoundary(charBreakIterator, offset);
324	}
325
326	int32_t Target::nextBreakBoundary(int32_t offset)
327	{
328	return ubrk_following(charBreakIterator, offset);
329	}
330
331	int32_t Target::nextSafeBoundary(int32_t offset)
332	{
333	while (offset < targetLength) {
334	//UChar ch = charAt(offset);
335	UChar ch = targetBuffer[offset];
336
337	if (U_IS_LEAD(ch) \|\| ! ucol_unsafeCP(ch, coll)) {
338	return offset;
339	}
340
341	offset += 1;
342	}
343
344	return targetLength;
345	}
346
347	UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
348	{
349	if (strength < UCOL_IDENTICAL) {
350	return TRUE;
351	}
352
353	// Note: We could use Normalizer::compare() or similar, but for short strings
354	// which may not be in FCD it might be faster to just NFD them.
355	UErrorCode status = U_ZERO_ERROR;
356	UnicodeString t2, p2;
357	nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
358	nfd.normalize(pattern, p2, status);
359	// return FALSE if NFD failed
360	return U_SUCCESS(status) && t2 == p2;
361	}
362
363	#define HASH_TABLE_SIZE 257
364
365	class BadCharacterTable : public UMemory
366	{
367	public:
368	BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
369	~BadCharacterTable();
370
371	int32_t operator[](uint32_t ce) const;
372	int32_t getMaxSkip() const;
373	int32_t minLengthInChars(int32_t index);
374
375	private:
376	static int32_t hash(uint32_t ce);
377
378	int32_t maxSkip;
379	int32_t badCharacterTable[HASH_TABLE_SIZE];
380
381	int32_t *minLengthCache;
382	};
383
384	BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
385	: minLengthCache(NULL)
386	{
387	int32_t plen = patternCEs.size();
388
389	// ** need a better way to deal with this **
390	if (U_FAILURE(status) \|\| plen == 0) {
391	return;
392	}
393
394	int32_t *history = NEW_ARRAY(int32_t, plen);
395
396	if (history == NULL) {
397	status = U_MEMORY_ALLOCATION_ERROR;
398	return;
399	}
400
401	for (int32_t i = 0; i < plen; i += 1) {
402	history[i] = -1;
403	}
404
405	minLengthCache = NEW_ARRAY(int32_t, plen + 1);
406
407	if (minLengthCache == NULL) {
408	DELETE_ARRAY(history);
409	status = U_MEMORY_ALLOCATION_ERROR;
410	return;
411	}
412
413	maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
414
415	for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
416	badCharacterTable[j] = maxSkip;
417	}
418
419	for(int32_t p = 1; p < plen; p += 1) {
420	minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
421
422	// Make sure this entry is not bigger than the previous one.
423	// Otherwise, we might skip too far in some cases.
424	if (minLengthCache[p] < 0 \|\| minLengthCache[p] > minLengthCache[p - 1]) {
425	minLengthCache[p] = minLengthCache[p - 1];
426	}
427	}
428
429	minLengthCache[plen] = 0;
430
431	for(int32_t p = 0; p < plen - 1; p += 1) {
432	badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
433	}
434
435	DELETE_ARRAY(history);
436	}
437
438	BadCharacterTable::~BadCharacterTable()
439	{
440	DELETE_ARRAY(minLengthCache);
441	}
442
443	int32_t BadCharacterTable::operator[](uint32_t ce) const
444	{
445	return badCharacterTable[hash(ce)];
446	}
447
448	int32_t BadCharacterTable::getMaxSkip() const
449	{
450	return maxSkip;
451	}
452
453	int32_t BadCharacterTable::minLengthInChars(int32_t index)
454	{
455	return minLengthCache[index];
456	}
457
458	int32_t BadCharacterTable::hash(uint32_t ce)
459	{
460	return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
461	}
462
463	class GoodSuffixTable : public UMemory
464	{
465	public:
466	GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
467	~GoodSuffixTable();
468
469	int32_t operator[](int32_t offset) const;
470
471	private:
472	int32_t *goodSuffixTable;
473	};
474
475	GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
476	: goodSuffixTable(NULL)
477	{
478	int32_t patlen = patternCEs.size();
479
480	// ** need a better way to deal with this **
481	if (U_FAILURE(status) \|\| patlen <= 0) {
482	return;
483	}
484
485	int32_t *suff = NEW_ARRAY(int32_t, patlen);
486	int32_t start = patlen - 1, end = - 1;
487	int32_t maxSkip = badCharacterTable.getMaxSkip();
488
489	if (suff == NULL) {
490	status = U_MEMORY_ALLOCATION_ERROR;
491	return;
492	}
493
494	// initialze suff
495	suff[patlen - 1] = patlen;
496
497	for (int32_t i = patlen - 2; i >= 0; i -= 1) {
498	// (i > start) means we're inside the last suffix match we found
499	// ((patlen - 1) - end) is how far the end of that match is from end of pattern
500	// (i - start) is how far we are from start of that match
501	// (i + (patlen - 1) - end) is index of same character at end of pattern
502	// so if any suffix match at that character doesn't extend beyond the last match,
503	// it's the suffix for this character as well
504	if (i > start && suff[i + patlen - 1 - end] < i - start) {
505	suff[i] = suff[i + patlen - 1 - end];
506	} else {
507	start = end = i;
508
509	int32_t s = patlen;
510
511	while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
512	start -= 1;
513	}
514
515	suff[i] = end - start;
516	}
517	}
518
519	// now build goodSuffixTable
520	goodSuffixTable = NEW_ARRAY(int32_t, patlen);
521
522	if (goodSuffixTable == NULL) {
523	DELETE_ARRAY(suff);
524	status = U_MEMORY_ALLOCATION_ERROR;
525	return;
526	}
527
528
529	// initialize entries to minLengthInChars of the pattern
530	for (int32_t i = 0; i < patlen; i += 1) {
531	goodSuffixTable[i] = maxSkip;
532	}
533
534	int32_t prefix = 0;
535
536	for (int32_t i = patlen - /1/ 2; i >= 0; i -= 1) {
537	if (suff[i] == i + 1) {
538	// this matching suffix is a prefix of the pattern
539	int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
540
541	// for any mis-match before this suffix, we should skip
542	// so that the front of the pattern (i.e. the prefix)
543	// lines up with the front of the suffix.
544	// (patlen - 1 - i) is the start of the suffix
545	while (prefix < patlen - 1 - i) {
546	// value of maxSkip means never set...
547	if (goodSuffixTable[prefix] == maxSkip) {
548	goodSuffixTable[prefix] = prefixSkip;
549	}
550
551	prefix += 1;
552	}
553	}
554	}
555
556	for (int32_t i = 0; i < patlen - 1; i += 1) {
557	goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
558	}
559
560	DELETE_ARRAY(suff);
561	}
562
563	GoodSuffixTable::~GoodSuffixTable()
564	{
565	DELETE_ARRAY(goodSuffixTable);
566	}
567
568	int32_t GoodSuffixTable::operator[](int32_t offset) const
569	{
570	return goodSuffixTable[offset];
571	}
572
573	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
574
575
576	UBool BoyerMooreSearch::empty()
577	{
578	return patCEs->size() <= 0;
579	}
580
581	CollData *BoyerMooreSearch::getData()
582	{
583	return data;
584	}
585
586	CEList *BoyerMooreSearch::getPatternCEs()
587	{
588	return patCEs;
589	}
590
591	BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
592	{
593	return badCharacterTable;
594	}
595
596	GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
597	{
598	return goodSuffixTable;
599	}
600
601	BoyerMooreSearch::BoyerMooreSearch(CollData theData, const UnicodeString &patternString, const UnicodeString targetString,
602	UErrorCode &status)
603	: data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
604	{
605
606	if (U_FAILURE(status)) {
607	return;
608	}
609
610	UCollator *collator = data->getCollator();
611
612	patCEs = new CEList(collator, patternString, status);
613
614	if (patCEs == NULL \|\| U_FAILURE(status)) {
615	return;
616	}
617
618	badCharacterTable = new BadCharacterTable(*patCEs, data, status);
619
620	if (badCharacterTable == NULL \|\| U_FAILURE(status)) {
621	return;
622	}
623
624	goodSuffixTable = new GoodSuffixTable(patCEs, badCharacterTable, status);
625
626	if (targetString != NULL) {
627	target = new Target(collator, targetString, patCEs->size(), status);
628	}
629	}
630
631	BoyerMooreSearch::~BoyerMooreSearch()
632	{
633	delete target;
634	delete goodSuffixTable;
635	delete badCharacterTable;
636	delete patCEs;
637	}
638
639	void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
640	{
641	if (U_FAILURE(status)) {
642	return;
643	}
644
645	if (target == NULL) {
646	target = new Target(data->getCollator(), targetString, patCEs->size(), status);
647	} else {
648	target->setTargetString(targetString);
649	}
650	}
651
652	// ** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. **
653	/*
654	* TODO:
655	* * deal with trailing (and leading?) ignorables.
656	* * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
657	*/
658	UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
659	{
660	/UCollator coll =*/ data->getCollator();
661	int32_t plen = patCEs->size();
662	int32_t tlen = target->stringLength();
663	int32_t maxSkip = badCharacterTable->getMaxSkip();
664	int32_t tOffset = offset + maxSkip;
665
666	if (plen <= 0) {
667	// Searching for a zero length pattern always fails.
668	start = end = -1;
669	return FALSE;
670	}
671
672	while (tOffset <= tlen) {
673	int32_t pIndex = plen - 1;
674	int32_t tIndex = 0;
675	int32_t lIndex = 0;
676
677	if (tOffset < tlen) {
678	// ** we really want to skip ahead enough to **
679	// ** be sure we get at least 1 non-ignorable **
680	// ** CE after the end of the pattern. **
681	int32_t next = target->nextSafeBoundary(tOffset + 1);
682
683	target->setOffset(next);
684
685	for (lIndex = 0; ; lIndex += 1) {
686	const CEI *cei = target->prevCE(lIndex);
687	int32_t low = cei->lowOffset;
688	int32_t high = cei->highOffset;
689
690	if (high == 0 \|\| (low < high && low <= tOffset)) {
691	if (low < tOffset) {
692	while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
693	lIndex -= 1;
694	}
695
696	if (high > tOffset) {
697	tOffset = high;
698	}
699	}
700
701	break;
702	}
703	}
704	} else {
705	target->setLast(tOffset);
706	lIndex = 0;
707	}
708
709	tIndex = ++lIndex;
710
711	// Iterate backward until we hit the beginning of the pattern
712	while (pIndex >= 0) {
713	uint32_t pce = (*patCEs)[pIndex];
714	const CEI *tcei = target->prevCE(tIndex++);
715
716
717	if (tcei->order != pce) {
718	// There is a mismatch at this position. Decide how far
719	// over to shift the pattern, then try again.
720
721	int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
722	#ifdef EXTRA_CAUTIOUS
723	int32_t old = tOffset;
724	#endif
725
726	tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
727
728	if (gsOffset > tOffset) {
729	tOffset = gsOffset;
730	}
731
732	#ifdef EXTRA_CAUTIOUS
733	// Make sure we don't skip backwards...
734	if (tOffset <= old) {
735	tOffset = old + 1;
736	}
737	#endif
738
739	break;
740	}
741
742	pIndex -= 1;
743	}
744
745	if (pIndex < 0) {
746	// We made it back to the beginning of the pattern,
747	// which means we matched it all. Return the location.
748	const CEI firstCEI = *target->prevCE(tIndex - 1);
749	const CEI lastCEI = *target->prevCE(lIndex);
750	int32_t mStart = firstCEI.lowOffset;
751	int32_t minLimit = lastCEI.lowOffset;
752	int32_t maxLimit = lastCEI.highOffset;
753	int32_t mLimit;
754	UBool found = TRUE;
755
756	target->setOffset(/tOffset/maxLimit);
757
758	const CEI nextCEI = *target->nextCE(0);
759
760	if (nextCEI.lowOffset > maxLimit) {
761	maxLimit = nextCEI.lowOffset;
762	}
763
764	if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
765	found = FALSE;
766	}
767
768	if (! target->isBreakBoundary(mStart)) {
769	found = FALSE;
770	}
771
772	if (firstCEI.lowOffset == firstCEI.highOffset) {
773	found = FALSE;
774	}
775
776	mLimit = maxLimit;
777	if (minLimit < maxLimit) {
4388f060 A	778	// When the last CE's low index is same with its high index, the CE is likely
	779	// a part of expansion. In this case, the index is located just after the
	780	// character corresponding to the CEs compared above. If the index is right
	781	// at the break boundary, move the position to the next boundary will result
	782	// incorrect match length when there are ignorable characters exist between
	783	// the position and the next character produces CE(s). See ticket#8482.
	784	if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) {
	785	mLimit = minLimit;
	786	} else {
	787	int32_t nbb = target->nextBreakBoundary(minLimit);
	788
	789	if (nbb >= lastCEI.highOffset) {
	790	mLimit = nbb;
	791	}
729e4ab9 A	792	}
	793	}
	794
	795	if (mLimit > maxLimit) {
	796	found = FALSE;
	797	}
	798
	799	if (! target->isBreakBoundary(mLimit)) {
	800	found = FALSE;
	801	}
	802
	803	if (! target->isIdentical(pattern, mStart, mLimit)) {
	804	found = FALSE;
	805	}
	806
	807	if (found) {
	808	start = mStart;
	809	end = mLimit;
	810
	811	return TRUE;
	812	}
	813
	814	tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
	815	}
	816	// Otherwise, we're here because of a mismatch, so keep going....
	817	}
	818
	819	// no match
	820	start = -1;
	821	end = -1;
	822	return FALSE;
	823	}
	824
	825	U_NAMESPACE_END
	826
	827	#endif // #if !UCONFIG_NO_COLLATION