X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/4388f060552cc537e71e957d32f35e9d75a61233..249c4c5ea9376c24572daf9c2effa7484a282f14:/icuSources/i18n/usearch.cpp?ds=sidebyside diff --git a/icuSources/i18n/usearch.cpp b/icuSources/i18n/usearch.cpp index ba463e60..645db01e 100644 --- a/icuSources/i18n/usearch.cpp +++ b/icuSources/i18n/usearch.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** -* Copyright (C) 2001-2011 IBM and others. All rights reserved. +* Copyright (C) 2001-2015 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 07/02/2001 synwee Creation. @@ -16,7 +18,6 @@ #include "unicode/uchar.h" #include "unicode/utf16.h" #include "normalizer2impl.h" -#include "ucol_imp.h" #include "usrchimp.h" #include "cmemory.h" #include "ucln_in.h" @@ -29,8 +30,6 @@ U_NAMESPACE_USE // (and if we decide to turn this on again there are several new TODOs that will need to be addressed) #define BOYER_MOORE 0 -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - // internal definition --------------------------------------------------- #define LAST_BYTE_MASK_ 0xFF @@ -51,17 +50,10 @@ static inline void setColEIterOffset(UCollationElements *elems, int32_t offset) { - collIterate *ci = &(elems->iteratordata_); - ci->pos = ci->string + offset; - ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs; - if (ci->flags & UCOL_ITER_INNORMBUF) { - ci->flags = ci->origFlags; - } - ci->fcdPosition = NULL; - - ci->offsetReturn = NULL; - ci->offsetStore = ci->offsetBuffer; - ci->offsetRepeatCount = ci->offsetRepeatValue = 0; + // Note: Not "fast" any more after the 2013 collation rewrite. + // We do not want to expose more internals than necessary. + UErrorCode status = U_ZERO_ERROR; + ucol_setOffset(elems, offset, &status); } /** @@ -85,18 +77,22 @@ inline uint32_t getMask(UCollationStrength strength) } /** -* This is to squeeze the 21bit ces into a 256 table -* @param ce collation element -* @return collapsed version of the collation element +* @param ce 32-bit collation element +* @return hash code */ static -inline int hash(uint32_t ce) +inline int hashFromCE32(uint32_t ce) { - // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work - // well with the new collation where most of the latin 1 characters - // are of the value xx000xxx. their hashes will most of the time be 0 - // to be discussed on the hash algo. - return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; + int hc = (int)( + ((((((ce >> 24) * 37) + + (ce >> 16)) * 37) + + (ce >> 8)) * 37) + + ce); + hc %= MAX_TABLE_SIZE_; + if (hc < 0) { + hc += MAX_TABLE_SIZE_; + } + return hc; } U_CDECL_BEGIN @@ -228,7 +224,7 @@ inline int32_t * addTouint32_tArray(int32_t *destination, if (U_FAILURE(*status)) { return NULL; } - uprv_memcpy(temp, destination, sizeof(int32_t) * offset); + uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset); *destinationlength = newlength; destination = temp; } @@ -270,7 +266,7 @@ inline int64_t * addTouint64_tArray(int64_t *destination, return NULL; } - uprv_memcpy(temp, destination, sizeof(int64_t) * offset); + uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset); *destinationlength = newlength; destination = temp; } @@ -298,7 +294,7 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch, { UPattern *pattern = &(strsrch->pattern); uint32_t cetablesize = INITIAL_ARRAY_SIZE_; - int32_t *cetable = pattern->CEBuffer; + int32_t *cetable = pattern->cesBuffer; uint32_t patternlength = pattern->textLength; UCollationElements *coleiter = strsrch->utilIter; @@ -311,17 +307,14 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch, strsrch->utilIter = coleiter; } else { - uprv_init_collIterate(strsrch->collator, pattern->text, - pattern->textLength, - &coleiter->iteratordata_, - status); + ucol_setText(coleiter, pattern->text, pattern->textLength, status); } if(U_FAILURE(*status)) { return 0; } - if (pattern->CE != cetable && pattern->CE) { - uprv_free(pattern->CE); + if (pattern->ces != cetable && pattern->ces) { + uprv_free(pattern->ces); } uint16_t offset = 0; @@ -340,7 +333,7 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch, return 0; } offset ++; - if (cetable != temp && cetable != pattern->CEBuffer) { + if (cetable != temp && cetable != pattern->cesBuffer) { uprv_free(cetable); } cetable = temp; @@ -349,8 +342,8 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch, } cetable[offset] = 0; - pattern->CE = cetable; - pattern->CELength = offset; + pattern->ces = cetable; + pattern->cesLength = offset; return result; } @@ -373,7 +366,7 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch, { UPattern *pattern = &(strsrch->pattern); uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; - int64_t *pcetable = pattern->PCEBuffer; + int64_t *pcetable = pattern->pcesBuffer; uint32_t patternlength = pattern->textLength; UCollationElements *coleiter = strsrch->utilIter; @@ -385,29 +378,26 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch, // returned. strsrch->utilIter = coleiter; } else { - uprv_init_collIterate(strsrch->collator, pattern->text, - pattern->textLength, - &coleiter->iteratordata_, - status); + ucol_setText(coleiter, pattern->text, pattern->textLength, status); } if(U_FAILURE(*status)) { return 0; } - if (pattern->PCE != pcetable && pattern->PCE != NULL) { - uprv_free(pattern->PCE); + if (pattern->pces != pcetable && pattern->pces != NULL) { + uprv_free(pattern->pces); } uint16_t offset = 0; uint16_t result = 0; int64_t pce; - uprv_init_pce(coleiter); + icu::UCollationPCE iter(coleiter); // ** Should processed CEs be signed or unsigned? // ** (the rest of the code in this file seems to play fast-and-loose with // ** whether a CE is signed or unsigned. For example, look at routine above this one.) - while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && + while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && U_SUCCESS(*status)) { int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, pce, @@ -420,7 +410,7 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch, offset += 1; - if (pcetable != temp && pcetable != pattern->PCEBuffer) { + if (pcetable != temp && pcetable != pattern->pcesBuffer) { uprv_free(pcetable); } @@ -429,8 +419,8 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch, } pcetable[offset] = 0; - pattern->PCE = pcetable; - pattern->PCELength = offset; + pattern->pces = pcetable; + pattern->pcesLength = offset; return result; } @@ -446,6 +436,7 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch, static inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) { + if (U_FAILURE(*status)) { return 0; } UPattern *pattern = &(strsrch->pattern); const UChar *patterntext = pattern->text; int32_t length = pattern->textLength; @@ -465,12 +456,12 @@ inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) } // ** HACK ** - if (strsrch->pattern.PCE != NULL) { - if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { - uprv_free(strsrch->pattern.PCE); + if (strsrch->pattern.pces != NULL) { + if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { + uprv_free(strsrch->pattern.pces); } - strsrch->pattern.PCE = NULL; + strsrch->pattern.pces = NULL; } // since intializePattern is an internal method status is a success. @@ -507,22 +498,22 @@ inline void setShiftTable(int16_t shift[], int16_t backshift[], for (count = 0; count < cesize; count ++) { // number of ces from right of array to the count int temp = defaultforward - count - 1; - shift[hash(cetable[count])] = temp > 1 ? temp : 1; + shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1; } - shift[hash(cetable[cesize])] = 1; + shift[hashFromCE32(cetable[cesize])] = 1; // for ignorables we just shift by one. see test examples. - shift[hash(0)] = 1; + shift[hashFromCE32(0)] = 1; for (count = 0; count < MAX_TABLE_SIZE_; count ++) { backshift[count] = defaultbackward; } for (count = cesize; count > 0; count --) { // the original value count does not seem to work - backshift[hash(cetable[count])] = count > expansionsize ? + backshift[hashFromCE32(cetable[count])] = count > expansionsize ? (int16_t)(count - expansionsize) : 1; } - backshift[hash(cetable[0])] = 1; - backshift[hash(0)] = 1; + backshift[hashFromCE32(cetable[0])] = 1; + backshift[hashFromCE32(0)] = 1; } /** @@ -557,14 +548,14 @@ static inline void initialize(UStringSearch *strsrch, UErrorCode *status) { int16_t expandlength = initializePattern(strsrch, status); - if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) { + if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) { UPattern *pattern = &strsrch->pattern; - int32_t cesize = pattern->CELength; + int32_t cesize = pattern->cesLength; int16_t minlength = cesize > expandlength ? (int16_t)cesize - expandlength : 1; pattern->defaultShiftSize = minlength; - setShiftTable(pattern->shift, pattern->backShift, pattern->CE, + setShiftTable(pattern->shift, pattern->backShift, pattern->ces, cesize, expandlength, minlength, minlength); return; } @@ -640,14 +631,14 @@ UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, start; UErrorCode status = U_ZERO_ERROR; ucol_setText(coleiter, text, end - start, &status); - for (int32_t count = 0; count < strsrch->pattern.CELength; + for (int32_t count = 0; count < strsrch->pattern.cesLength; count ++) { int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); if (ce == UCOL_IGNORABLE) { count --; continue; } - if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) { + if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) { return FALSE; } } @@ -745,10 +736,10 @@ inline int32_t shiftForward(UStringSearch *strsrch, { UPattern *pattern = &(strsrch->pattern); if (ce != UCOL_NULLORDER) { - int32_t shift = pattern->shift[hash(ce)]; + int32_t shift = pattern->shift[hashFromCE32(ce)]; // this is to adjust for characters in the middle of the // substring for matching that failed. - int32_t adjust = pattern->CELength - patternceindex; + int32_t adjust = pattern->cesLength - patternceindex; if (adjust > 1 && shift >= adjust) { shift -= adjust - 1; } @@ -882,7 +873,7 @@ UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, UCollationElements *coleiter = strsrch->utilIter; ucol_setText(coleiter, norm, size, status); - uint32_t firstce = strsrch->pattern.CE[0]; + uint32_t firstce = strsrch->pattern.ces[0]; UBool ignorable = TRUE; uint32_t ce = UCOL_IGNORABLE; while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { @@ -935,7 +926,7 @@ UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, UErrorCode status = U_ZERO_ERROR; // we have been iterating forwards previously uint32_t ignorable = TRUE; - int32_t firstce = strsrch->pattern.CE[0]; + int32_t firstce = strsrch->pattern.ces[0]; setColEIterOffset(coleiter, start); int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); @@ -1017,7 +1008,7 @@ UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, int32_t textlength = strsrch->search->textLength; U16_BACK_1(text, 0, temp); if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { - int32_t firstce = strsrch->pattern.CE[0]; + int32_t firstce = strsrch->pattern.ces[0]; UCollationElements *coleiter = strsrch->textIter; UErrorCode status = U_ZERO_ERROR; int32_t ce; @@ -1028,7 +1019,7 @@ UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, } } int32_t count = 1; - while (count < strsrch->pattern.CELength) { + while (count < strsrch->pattern.cesLength) { if (getCE(strsrch, ucol_next(coleiter, &status)) == UCOL_IGNORABLE) { // Thai can give an ignorable here. @@ -1212,8 +1203,8 @@ UBool checkNextExactContractionMatch(UStringSearch *strsrch, expansion --; } - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t count = 0; while (count < patterncelength) { int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); @@ -1390,7 +1381,7 @@ inline UChar * addToUCharArray( UChar *destination, } } if (source1length != 0) { - uprv_memcpy(destination, source1, sizeof(UChar) * source1length); + u_memcpy(destination, source1, source1length); } if (source2length != 0) { uprv_memcpy(destination + source1length, source2, @@ -1415,8 +1406,8 @@ static inline UBool checkCollationMatch(const UStringSearch *strsrch, UCollationElements *coleiter) { - int patternceindex = strsrch->pattern.CELength; - int32_t *patternce = strsrch->pattern.CE; + int patternceindex = strsrch->pattern.cesLength; + int32_t *patternce = strsrch->pattern.ces; UErrorCode status = U_ZERO_ERROR; while (patternceindex > 0) { int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); @@ -1615,8 +1606,8 @@ int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, ucol_setText(coleiter, safetext, safetextlength, status); // status checked in loop below - int32_t *ce = strsrch->pattern.CE; - int32_t celength = strsrch->pattern.CELength; + int32_t *ce = strsrch->pattern.ces; + int32_t celength = strsrch->pattern.cesLength; int ceindex = celength - 1; UBool isSafe = TRUE; // indication flag for position in safe zone @@ -1855,8 +1846,8 @@ UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, expansion --; } - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t count = 0; int32_t textlength = strsrch->search->textLength; while (count < patterncelength) { @@ -1986,7 +1977,7 @@ inline int32_t reverseShift(UStringSearch *strsrch, } else { if (ce != UCOL_NULLORDER) { - int32_t shift = strsrch->pattern.backShift[hash(ce)]; + int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)]; // this is to adjust for characters in the middle of the substring // for matching that failed. @@ -2053,8 +2044,8 @@ UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, expansion --; } - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t count = patterncelength; while (count > 0) { int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); @@ -2278,8 +2269,8 @@ int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, ucol_setText(coleiter, safetext, safetextlength, status); // status checked in loop below - int32_t *ce = strsrch->pattern.CE; - int32_t celength = strsrch->pattern.CELength; + int32_t *ce = strsrch->pattern.ces; + int32_t celength = strsrch->pattern.cesLength; int ceindex = 0; UBool isSafe = TRUE; // safe zone indication flag for position int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); @@ -2493,8 +2484,8 @@ UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, expansion --; } - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t count = patterncelength; while (count > 0) { int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); @@ -2700,7 +2691,7 @@ U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( UCOL_SHIFTED; result->variableTop = ucol_getVariableTop(collator, status); - result->nfd = Normalizer2Factory::getNFDInstance(*status); + result->nfd = Normalizer2::getNFDInstance(*status); if (U_FAILURE(*status)) { uprv_free(result); @@ -2719,8 +2710,8 @@ U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( result->pattern.text = pattern; result->pattern.textLength = patternlength; - result->pattern.CE = NULL; - result->pattern.PCE = NULL; + result->pattern.ces = NULL; + result->pattern.pces = NULL; result->search->breakIter = breakiter; #if !UCONFIG_NO_BREAK_ITERATION @@ -2736,6 +2727,7 @@ U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( result->utilIter = NULL; result->textIter = ucol_openElements(collator, text, textlength, status); + result->textProcessedIter = NULL; if (U_FAILURE(*status)) { usearch_close(result); return NULL; @@ -2762,16 +2754,17 @@ U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) { if (strsrch) { - if (strsrch->pattern.CE != strsrch->pattern.CEBuffer && - strsrch->pattern.CE) { - uprv_free(strsrch->pattern.CE); + if (strsrch->pattern.ces != strsrch->pattern.cesBuffer && + strsrch->pattern.ces) { + uprv_free(strsrch->pattern.ces); } - if (strsrch->pattern.PCE != NULL && - strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { - uprv_free(strsrch->pattern.PCE); + if (strsrch->pattern.pces != NULL && + strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { + uprv_free(strsrch->pattern.pces); } + delete strsrch->textProcessedIter; ucol_closeElements(strsrch->textIter); ucol_closeElements(strsrch->utilIter); @@ -2790,6 +2783,24 @@ U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) } } +namespace { + +UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { + if (U_FAILURE(*status)) { return FALSE; } + if (strsrch->textProcessedIter == NULL) { + strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter); + if (strsrch->textProcessedIter == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + } else { + strsrch->textProcessedIter->init(strsrch->textIter); + } + return TRUE; +} + +} + // set and get methods -------------------------------------------------- U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, @@ -3010,6 +3021,11 @@ U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, } if (strsrch) { + delete strsrch->textProcessedIter; + strsrch->textProcessedIter = NULL; + ucol_closeElements(strsrch->textIter); + ucol_closeElements(strsrch->utilIter); + strsrch->textIter = strsrch->utilIter = NULL; if (strsrch->ownCollator && (strsrch->collator != collator)) { ucol_close((UCollator *)strsrch->collator); strsrch->ownCollator = FALSE; @@ -3028,23 +3044,19 @@ U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, UCOL_SHIFTED; // if status is a failure, ucol_getVariableTop returns 0 strsrch->variableTop = ucol_getVariableTop(collator, status); - if (U_SUCCESS(*status)) { - initialize(strsrch, status); - if (U_SUCCESS(*status)) { - /* free offset buffer to avoid memory leak before initializing. */ - ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); - uprv_init_collIterate(collator, strsrch->search->text, - strsrch->search->textLength, - &(strsrch->textIter->iteratordata_), - status); - strsrch->utilIter->iteratordata_.coll = collator; - } - } + strsrch->textIter = ucol_openElements(collator, + strsrch->search->text, + strsrch->search->textLength, + status); + strsrch->utilIter = ucol_openElements( + collator, strsrch->pattern.text, strsrch->pattern.textLength, status); + // initialize() _after_ setting the iterators for the new collator. + initialize(strsrch, status); } // **** are these calls needed? // **** we call uprv_init_pce in initializePatternPCETable - // **** and the CEBuffer constructor... + // **** and the CEIBuffer constructor... #if 0 uprv_init_pce(strsrch->textIter); uprv_init_pce(strsrch->utilIter); @@ -3222,7 +3234,7 @@ U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, } if (U_SUCCESS(*status)) { - if (strsrch->pattern.CELength == 0) { + if (strsrch->pattern.cesLength == 0) { if (search->matchedIndex == USEARCH_DONE) { search->matchedIndex = offset; } @@ -3333,7 +3345,7 @@ U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, } if (U_SUCCESS(*status)) { - if (strsrch->pattern.CELength == 0) { + if (strsrch->pattern.cesLength == 0) { search->matchedIndex = (matchedindex == USEARCH_DONE ? offset : matchedindex); if (search->matchedIndex == 0) { @@ -3416,11 +3428,8 @@ U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) if (!sameCollAttribute) { initialize(strsrch, &status); } - /* free offset buffer to avoid memory leak before initializing. */ - ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); - uprv_init_collIterate(strsrch->collator, strsrch->search->text, + ucol_setText(strsrch->textIter, strsrch->search->text, strsrch->search->textLength, - &(strsrch->textIter->iteratordata_), &status); strsrch->search->matchedLength = 0; strsrch->search->matchedIndex = USEARCH_DONE; @@ -3444,9 +3453,9 @@ struct CEI { U_NAMESPACE_BEGIN - +namespace { // -// CEBuffer A circular buffer of CEs from the text being searched. +// CEIBuffer A circular buffer of CEs-with-index from the text being searched. // #define DEFAULT_CEBUFFER_SIZE 96 #define CEBUFFER_EXTRA 32 @@ -3455,7 +3464,7 @@ U_NAMESPACE_BEGIN #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) -struct CEBuffer { +struct CEIBuffer { CEI defBuf[DEFAULT_CEBUFFER_SIZE]; CEI *buf; int32_t bufSize; @@ -3466,17 +3475,17 @@ struct CEBuffer { - CEBuffer(UStringSearch *ss, UErrorCode *status); - ~CEBuffer(); + CEIBuffer(UStringSearch *ss, UErrorCode *status); + ~CEIBuffer(); const CEI *get(int32_t index); const CEI *getPrevious(int32_t index); }; -CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { +CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { buf = defBuf; strSearch = ss; - bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA; + bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA; if (ss->search->elementComparisonType != 0) { const UChar * patText = ss->pattern.text; if (patText) { @@ -3496,7 +3505,7 @@ CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { firstIx = 0; limitIx = 0; - uprv_init_pce(ceIter); + if (!initTextProcessedIter(ss, status)) { return; } if (bufSize>DEFAULT_CEBUFFER_SIZE) { buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); @@ -3509,7 +3518,7 @@ CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { // TODO: add a reset or init function so that allocated // buffers can be retained & reused. -CEBuffer::~CEBuffer() { +CEIBuffer::~CEIBuffer() { if (buf != defBuf) { uprv_free(buf); } @@ -3522,7 +3531,7 @@ CEBuffer::~CEBuffer() { // where n is the largest index to have been fetched by some previous call to this function. // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. // -const CEI *CEBuffer::get(int32_t index) { +const CEI *CEIBuffer::get(int32_t index) { int i = index % bufSize; if (index>=firstIx && indextextProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); return &buf[i]; } @@ -3561,7 +3570,7 @@ const CEI *CEBuffer::get(int32_t index) { // where n is the largest index to have been fetched by some previous call to this function. // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. // -const CEI *CEBuffer::getPrevious(int32_t index) { +const CEI *CEIBuffer::getPrevious(int32_t index) { int i = index % bufSize; if (index>=firstIx && indextextProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); return &buf[i]; } +} + U_NAMESPACE_END @@ -3800,6 +3811,28 @@ static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t com // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s #endif +namespace { + +UChar32 codePointAt(const USearch &search, int32_t index) { + if (index < search.textLength) { + UChar32 c; + U16_NEXT(search.text, index, search.textLength, c); + return c; + } + return U_SENTINEL; +} + +UChar32 codePointBefore(const USearch &search, int32_t index) { + if (0 < index) { + UChar32 c; + U16_PREV(search.text, 0, index, c); + return c; + } + return U_SENTINEL; +} + +} // namespace + U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, int32_t startIdx, int32_t *matchStart, @@ -3815,8 +3848,8 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, #ifdef USEARCH_DEBUG if (getenv("USEARCH_DEBUG") != NULL) { printf("Pattern CEs\n"); - for (int ii=0; iipattern.CELength; ii++) { - printf(" %8x", strsrch->pattern.CE[ii]); + for (int ii=0; iipattern.cesLength; ii++) { + printf(" %8x", strsrch->pattern.ces[ii]); } printf("\n"); } @@ -3825,20 +3858,20 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, // Input parameter sanity check. // TODO: should input indicies clip to the text length // in the same way that UText does. - if(strsrch->pattern.CELength == 0 || + if(strsrch->pattern.cesLength == 0 || startIdx < 0 || startIdx > strsrch->search->textLength || - strsrch->pattern.CE == NULL) { + strsrch->pattern.ces == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return FALSE; } - if (strsrch->pattern.PCE == NULL) { + if (strsrch->pattern.pces == NULL) { initializePatternPCETable(strsrch, status); } ucol_setOffset(strsrch->textIter, startIdx, status); - CEBuffer ceb(strsrch, status); + CEIBuffer ceb(strsrch, status); int32_t targetIx = 0; @@ -3884,8 +3917,8 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, break; } - for (patIx=0; patIxpattern.PCELength; patIx++) { - patCE = strsrch->pattern.PCE[patIx]; + for (patIx=0; patIxpattern.pcesLength; patIx++) { + patCE = strsrch->pattern.pces[patIx]; targetCEI = ceb.get(targetIx+patIx+targetIxOffset); // Compare CE from target string with CE from the pattern. // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, @@ -3905,7 +3938,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, } } } - targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far + targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { // No match at this targetIx. Try again at the next. @@ -3993,6 +4026,34 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, found = FALSE; } + // Allow matches to end in the middle of a grapheme cluster if the following + // conditions are met; this is needed to make prefix search work properly in + // Indic, see #11750 + // * the default breakIter is being used + // * the next collation element after this combining sequence + // - has non-zero primary weight + // - corresponds to a separate character following the one at end of the current match + // (the second of these conditions, and perhaps both, may be redundant given the + // subsequent check for normalization boundary; however they are likely much faster + // tests in any case) + // * the match limit is a normalization boundary + UBool allowMidclusterMatch = FALSE; + if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { + allowMidclusterMatch = + strsrch->search->breakIter == NULL && + nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && + maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && + (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || + strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); + } + // If those conditions are met, then: + // * do NOT advance the candidate match limit (mLimit) to a break boundary; however + // the match limit may be backed off to a previous break boundary. This handles + // cases in which mLimit includes target characters that are ignorable with current + // settings (such as space) and which extend beyond the pattern match. + // * do NOT require that end of the combining sequence not extend beyond the match in CE space + // * do NOT require that match limit be on a breakIter boundary + // Advance the match end position to the first acceptable match boundary. // This advances the index over any combining charcters. mLimit = maxLimit; @@ -4007,7 +4068,10 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, mLimit = minLimit; } else { int32_t nba = nextBoundaryAfter(strsrch, minLimit); - if (nba >= lastCEI->highIndex) { + // Note that we can have nba < maxLimit && nba >= minLImit, in which + // case we want to set mLimit to nba regardless of allowMidclusterMatch + // (i.e. we back off mLimit to the previous breakIterator boundary). + if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { mLimit = nba; } } @@ -4019,14 +4083,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, } #endif - // If advancing to the end of a combining sequence in character indexing space - // advanced us beyond the end of the match in CE space, reject this match. - if (mLimit > maxLimit) { - found = FALSE; - } + if (!allowMidclusterMatch) { + // If advancing to the end of a combining sequence in character indexing space + // advanced us beyond the end of the match in CE space, reject this match. + if (mLimit > maxLimit) { + found = FALSE; + } - if (!isBreakBoundary(strsrch, mLimit)) { - found = FALSE; + if (!isBreakBoundary(strsrch, mLimit)) { + found = FALSE; + } } if (! checkIdentical(strsrch, mStart, mLimit)) { @@ -4082,8 +4148,8 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, #ifdef USEARCH_DEBUG if (getenv("USEARCH_DEBUG") != NULL) { printf("Pattern CEs\n"); - for (int ii=0; iipattern.CELength; ii++) { - printf(" %8x", strsrch->pattern.CE[ii]); + for (int ii=0; iipattern.cesLength; ii++) { + printf(" %8x", strsrch->pattern.ces[ii]); } printf("\n"); } @@ -4092,19 +4158,19 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, // Input parameter sanity check. // TODO: should input indicies clip to the text length // in the same way that UText does. - if(strsrch->pattern.CELength == 0 || + if(strsrch->pattern.cesLength == 0 || startIdx < 0 || startIdx > strsrch->search->textLength || - strsrch->pattern.CE == NULL) { + strsrch->pattern.ces == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return FALSE; } - if (strsrch->pattern.PCE == NULL) { + if (strsrch->pattern.pces == NULL) { initializePatternPCETable(strsrch, status); } - CEBuffer ceb(strsrch, status); + CEIBuffer ceb(strsrch, status); int32_t targetIx = 0; /* @@ -4165,10 +4231,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, // Inner loop checks for a match beginning at each // position from the outer loop. int32_t targetIxOffset = 0; - for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) { - int64_t patCE = strsrch->pattern.PCE[patIx]; + for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) { + int64_t patCE = strsrch->pattern.pces[patIx]; - targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset); + targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset); // Compare CE from target string with CE from the pattern. // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, // which will fail the compare, below. @@ -4204,7 +4270,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, // There still is a chance of match failure if the CE range not correspond to // an acceptable character range. // - const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset); + const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset); mStart = firstCEI->lowIndex; // Check for the start of the match being within a combining sequence. @@ -4243,25 +4309,57 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, mLimit = maxLimit = nextCEI->lowIndex; + // Allow matches to end in the middle of a grapheme cluster if the following + // conditions are met; this is needed to make prefix search work properly in + // Indic, see #11750 + // * the default breakIter is being used + // * the next collation element after this combining sequence + // - has non-zero primary weight + // - corresponds to a separate character following the one at end of the current match + // (the second of these conditions, and perhaps both, may be redundant given the + // subsequent check for normalization boundary; however they are likely much faster + // tests in any case) + // * the match limit is a normalization boundary + UBool allowMidclusterMatch = FALSE; + if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { + allowMidclusterMatch = + strsrch->search->breakIter == NULL && + nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && + maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && + (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || + strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); + } + // If those conditions are met, then: + // * do NOT advance the candidate match limit (mLimit) to a break boundary; however + // the match limit may be backed off to a previous break boundary. This handles + // cases in which mLimit includes target characters that are ignorable with current + // settings (such as space) and which extend beyond the pattern match. + // * do NOT require that end of the combining sequence not extend beyond the match in CE space + // * do NOT require that match limit be on a breakIter boundary + // Advance the match end position to the first acceptable match boundary. - // This advances the index over any combining charcters. + // This advances the index over any combining characters. if (minLimit < maxLimit) { int32_t nba = nextBoundaryAfter(strsrch, minLimit); - - if (nba >= lastCEI->highIndex) { + // Note that we can have nba < maxLimit && nba >= minLImit, in which + // case we want to set mLimit to nba regardless of allowMidclusterMatch + // (i.e. we back off mLimit to the previous breakIterator boundary). + if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { mLimit = nba; } } - // If advancing to the end of a combining sequence in character indexing space - // advanced us beyond the end of the match in CE space, reject this match. - if (mLimit > maxLimit) { - found = FALSE; - } + if (!allowMidclusterMatch) { + // If advancing to the end of a combining sequence in character indexing space + // advanced us beyond the end of the match in CE space, reject this match. + if (mLimit > maxLimit) { + found = FALSE; + } - // Make sure the end of the match is on a break boundary - if (!isBreakBoundary(strsrch, mLimit)) { - found = FALSE; + // Make sure the end of the match is on a break boundary + if (!isBreakBoundary(strsrch, mLimit)) { + found = FALSE; + } } } else { @@ -4330,8 +4428,8 @@ UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) #if BOYER_MOORE UCollationElements *coleiter = strsrch->textIter; int32_t textlength = strsrch->search->textLength; - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t textoffset = ucol_getOffset(coleiter); // status used in setting coleiter offset, since offset is checked in @@ -4444,8 +4542,8 @@ UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) #if BOYER_MOORE UCollationElements *coleiter = strsrch->textIter; int32_t textlength = strsrch->search->textLength; - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t textoffset = ucol_getOffset(coleiter); UBool hasPatternAccents = strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; @@ -4558,8 +4656,8 @@ UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) #if BOYER_MOORE UCollationElements *coleiter = strsrch->textIter; - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t textoffset = ucol_getOffset(coleiter); // shifting it check for setting offset @@ -4659,8 +4757,12 @@ UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) } else { // move the start position at the end of possible match initializePatternPCETable(strsrch, status); - for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { - int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); + if (!initTextProcessedIter(strsrch, status)) { + setMatchNotFound(strsrch); + return FALSE; + } + for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { + int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); if (pce == UCOL_PROCESSED_NULLORDER) { // at the end of the text break; @@ -4700,8 +4802,8 @@ UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, #if BOYER_MOORE UCollationElements *coleiter = strsrch->textIter; - int32_t *patternce = strsrch->pattern.CE; - int32_t patterncelength = strsrch->pattern.CELength; + int32_t *patternce = strsrch->pattern.ces; + int32_t patterncelength = strsrch->pattern.cesLength; int32_t textoffset = ucol_getOffset(coleiter); UBool hasPatternAccents = strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; @@ -4808,8 +4910,12 @@ UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, } else { // move the start position at the end of possible match initializePatternPCETable(strsrch, status); - for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { - int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); + if (!initTextProcessedIter(strsrch, status)) { + setMatchNotFound(strsrch); + return FALSE; + } + for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { + int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); if (pce == UCOL_PROCESSED_NULLORDER) { // at the end of the text break;