2 ******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
22 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
30 #include "normalizer2impl.h"
32 #include "unicode/colldata.h"
33 #include "unicode/bmsearch.h"
37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
49 class Target
: public UMemory
52 Target(UCollator
*theCollator
, const UnicodeString
*target
, int32_t patternLength
, UErrorCode
&status
);
55 void setTargetString(const UnicodeString
*target
);
57 const CEI
*nextCE(int32_t offset
);
58 const CEI
*prevCE(int32_t offset
);
60 int32_t stringLength();
61 UChar
charAt(int32_t offset
);
63 UBool
isBreakBoundary(int32_t offset
);
64 int32_t nextBreakBoundary(int32_t offset
);
65 int32_t nextSafeBoundary(int32_t offset
);
67 UBool
isIdentical(UnicodeString
&pattern
, int32_t start
, int32_t end
);
69 void setOffset(int32_t offset
);
70 void setLast(int32_t last
);
79 uint32_t strengthMask
;
80 UCollationStrength strength
;
84 const Normalizer2
&nfd
;
86 const UnicodeString
*targetString
;
87 const UChar
*targetBuffer
;
90 UCollationElements
*elements
;
91 UBreakIterator
*charBreakIterator
;
94 Target::Target(UCollator
*theCollator
, const UnicodeString
*target
, int32_t patternLength
, UErrorCode
&status
)
95 : bufferSize(0), bufferMin(0), bufferMax(0),
96 strengthMask(0), strength(UCOL_PRIMARY
), variableTop(0), toShift(FALSE
), coll(theCollator
),
97 nfd(*Normalizer2Factory::getNFDInstance(status
)),
98 targetString(NULL
), targetBuffer(NULL
), targetLength(0), elements(NULL
), charBreakIterator(NULL
)
100 strength
= ucol_getStrength(coll
);
101 toShift
= ucol_getAttribute(coll
, UCOL_ALTERNATE_HANDLING
, &status
) == UCOL_SHIFTED
;
102 variableTop
= ucol_getVariableTop(coll
, &status
);
104 // find the largest expansion
105 uint8_t maxExpansion
= 0;
106 for (const uint8_t *expansion
= coll
->expansionCESize
; *expansion
!= 0; expansion
+= 1) {
107 if (*expansion
> maxExpansion
) {
108 maxExpansion
= *expansion
;
112 // room for an extra character on each end, plus 4 for safety
113 bufferSize
= patternLength
+ (2 * maxExpansion
) + 4;
115 ceb
= NEW_ARRAY(CEI
, bufferSize
);
118 status
= U_MEMORY_ALLOCATION_ERROR
;
122 if (target
!= NULL
) {
123 setTargetString(target
);
129 strengthMask
|= UCOL_TERTIARYORDERMASK
;
133 strengthMask
|= UCOL_SECONDARYORDERMASK
;
137 strengthMask
|= UCOL_PRIMARYORDERMASK
;
143 ubrk_close(charBreakIterator
);
144 ucol_closeElements(elements
);
149 void Target::setTargetString(const UnicodeString
*target
)
151 if (charBreakIterator
!= NULL
) {
152 ubrk_close(charBreakIterator
);
153 ucol_closeElements(elements
);
156 targetString
= target
;
158 if (targetString
!= NULL
) {
159 UErrorCode status
= U_ZERO_ERROR
;
161 targetBuffer
= targetString
->getBuffer();
162 targetLength
= targetString
->length();
164 elements
= ucol_openElements(coll
, target
->getBuffer(), target
->length(), &status
);
165 ucol_forceHanImplicit(elements
, &status
);
167 charBreakIterator
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(coll
, ULOC_VALID_LOCALE
, &status
),
168 targetBuffer
, targetLength
, &status
);
175 const CEI
*Target::nextCE(int32_t offset
)
177 UErrorCode status
= U_ZERO_ERROR
;
178 int32_t low
= -1, high
= -1;
182 if (offset
>= bufferMin
&& offset
< bufferMax
) {
186 if (bufferMax
>= bufferSize
|| offset
!= bufferMax
) {
191 low
= ucol_getOffset(elements
);
192 order
= ucol_next(elements
, &status
);
193 high
= ucol_getOffset(elements
);
195 if (order
== (uint32_t)UCOL_NULLORDER
) {
200 cont
= isContinuation(order
);
201 order
&= strengthMask
;
203 if (toShift
&& variableTop
> order
&& (order
& UCOL_PRIMARYORDERMASK
) != 0) {
204 if (strength
>= UCOL_QUATERNARY
) {
205 order
&= UCOL_PRIMARYORDERMASK
;
207 order
= UCOL_IGNORABLE
;
210 } while (order
== UCOL_IGNORABLE
);
213 order
|= UCOL_CONTINUATION_MARKER
;
216 ceb
[offset
].order
= order
;
217 ceb
[offset
].lowOffset
= low
;
218 ceb
[offset
].highOffset
= high
;
225 const CEI
*Target::prevCE(int32_t offset
)
227 UErrorCode status
= U_ZERO_ERROR
;
228 int32_t low
= -1, high
= -1;
232 if (offset
>= bufferMin
&& offset
< bufferMax
) {
236 if (bufferMax
>= bufferSize
|| offset
!= bufferMax
) {
241 high
= ucol_getOffset(elements
);
242 order
= ucol_previous(elements
, &status
);
243 low
= ucol_getOffset(elements
);
245 if (order
== (uint32_t)UCOL_NULLORDER
) {
249 cont
= isContinuation(order
);
250 order
&= strengthMask
;
252 if (toShift
&& variableTop
> order
&& (order
& UCOL_PRIMARYORDERMASK
) != 0) {
253 if (strength
>= UCOL_QUATERNARY
) {
254 order
&= UCOL_PRIMARYORDERMASK
;
256 order
= UCOL_IGNORABLE
;
259 } while (order
== UCOL_IGNORABLE
);
264 order
|= UCOL_CONTINUATION_MARKER
;
267 ceb
[offset
].order
= order
;
268 ceb
[offset
].lowOffset
= low
;
269 ceb
[offset
].highOffset
= high
;
274 int32_t Target::stringLength()
276 if (targetString
!= NULL
) {
283 UChar
Target::charAt(int32_t offset
)
285 if (targetString
!= NULL
) {
286 return targetBuffer
[offset
];
292 void Target::setOffset(int32_t offset
)
294 UErrorCode status
= U_ZERO_ERROR
;
299 ucol_setOffset(elements
, offset
, &status
);
302 void Target::setLast(int32_t last
)
304 UErrorCode status
= U_ZERO_ERROR
;
309 ceb
[0].order
= UCOL_NULLORDER
;
310 ceb
[0].lowOffset
= last
;
311 ceb
[0].highOffset
= last
;
313 ucol_setOffset(elements
, last
, &status
);
316 int32_t Target::getOffset()
318 return ucol_getOffset(elements
);
321 UBool
Target::isBreakBoundary(int32_t offset
)
323 return ubrk_isBoundary(charBreakIterator
, offset
);
326 int32_t Target::nextBreakBoundary(int32_t offset
)
328 return ubrk_following(charBreakIterator
, offset
);
331 int32_t Target::nextSafeBoundary(int32_t offset
)
333 while (offset
< targetLength
) {
334 //UChar ch = charAt(offset);
335 UChar ch
= targetBuffer
[offset
];
337 if (U_IS_LEAD(ch
) || ! ucol_unsafeCP(ch
, coll
)) {
347 UBool
Target::isIdentical(UnicodeString
&pattern
, int32_t start
, int32_t end
)
349 if (strength
< UCOL_IDENTICAL
) {
353 // Note: We could use Normalizer::compare() or similar, but for short strings
354 // which may not be in FCD it might be faster to just NFD them.
355 UErrorCode status
= U_ZERO_ERROR
;
356 UnicodeString t2
, p2
;
357 nfd
.normalize(UnicodeString(FALSE
, targetBuffer
+ start
, end
- start
), t2
, status
);
358 nfd
.normalize(pattern
, p2
, status
);
359 // return FALSE if NFD failed
360 return U_SUCCESS(status
) && t2
== p2
;
363 #define HASH_TABLE_SIZE 257
365 class BadCharacterTable
: public UMemory
368 BadCharacterTable(CEList
&patternCEs
, CollData
*data
, UErrorCode
&status
);
369 ~BadCharacterTable();
371 int32_t operator[](uint32_t ce
) const;
372 int32_t getMaxSkip() const;
373 int32_t minLengthInChars(int32_t index
);
376 static int32_t hash(uint32_t ce
);
379 int32_t badCharacterTable
[HASH_TABLE_SIZE
];
381 int32_t *minLengthCache
;
384 BadCharacterTable::BadCharacterTable(CEList
&patternCEs
, CollData
*data
, UErrorCode
&status
)
385 : minLengthCache(NULL
)
387 int32_t plen
= patternCEs
.size();
389 // **** need a better way to deal with this ****
390 if (U_FAILURE(status
) || plen
== 0) {
394 int32_t *history
= NEW_ARRAY(int32_t, plen
);
396 if (history
== NULL
) {
397 status
= U_MEMORY_ALLOCATION_ERROR
;
401 for (int32_t i
= 0; i
< plen
; i
+= 1) {
405 minLengthCache
= NEW_ARRAY(int32_t, plen
+ 1);
407 if (minLengthCache
== NULL
) {
408 DELETE_ARRAY(history
);
409 status
= U_MEMORY_ALLOCATION_ERROR
;
413 maxSkip
= minLengthCache
[0] = data
->minLengthInChars(&patternCEs
, 0, history
);
415 for(int32_t j
= 0; j
< HASH_TABLE_SIZE
; j
+= 1) {
416 badCharacterTable
[j
] = maxSkip
;
419 for(int32_t p
= 1; p
< plen
; p
+= 1) {
420 minLengthCache
[p
] = data
->minLengthInChars(&patternCEs
, p
, history
);
422 // Make sure this entry is not bigger than the previous one.
423 // Otherwise, we might skip too far in some cases.
424 if (minLengthCache
[p
] < 0 || minLengthCache
[p
] > minLengthCache
[p
- 1]) {
425 minLengthCache
[p
] = minLengthCache
[p
- 1];
429 minLengthCache
[plen
] = 0;
431 for(int32_t p
= 0; p
< plen
- 1; p
+= 1) {
432 badCharacterTable
[hash(patternCEs
[p
])] = minLengthCache
[p
+ 1];
435 DELETE_ARRAY(history
);
438 BadCharacterTable::~BadCharacterTable()
440 DELETE_ARRAY(minLengthCache
);
443 int32_t BadCharacterTable::operator[](uint32_t ce
) const
445 return badCharacterTable
[hash(ce
)];
448 int32_t BadCharacterTable::getMaxSkip() const
453 int32_t BadCharacterTable::minLengthInChars(int32_t index
)
455 return minLengthCache
[index
];
458 int32_t BadCharacterTable::hash(uint32_t ce
)
460 return UCOL_PRIMARYORDER(ce
) % HASH_TABLE_SIZE
;
463 class GoodSuffixTable
: public UMemory
466 GoodSuffixTable(CEList
&patternCEs
, BadCharacterTable
&badCharacterTable
, UErrorCode
&status
);
469 int32_t operator[](int32_t offset
) const;
472 int32_t *goodSuffixTable
;
475 GoodSuffixTable::GoodSuffixTable(CEList
&patternCEs
, BadCharacterTable
&badCharacterTable
, UErrorCode
&status
)
476 : goodSuffixTable(NULL
)
478 int32_t patlen
= patternCEs
.size();
480 // **** need a better way to deal with this ****
481 if (U_FAILURE(status
) || patlen
<= 0) {
485 int32_t *suff
= NEW_ARRAY(int32_t, patlen
);
486 int32_t start
= patlen
- 1, end
= - 1;
487 int32_t maxSkip
= badCharacterTable
.getMaxSkip();
490 status
= U_MEMORY_ALLOCATION_ERROR
;
495 suff
[patlen
- 1] = patlen
;
497 for (int32_t i
= patlen
- 2; i
>= 0; i
-= 1) {
498 // (i > start) means we're inside the last suffix match we found
499 // ((patlen - 1) - end) is how far the end of that match is from end of pattern
500 // (i - start) is how far we are from start of that match
501 // (i + (patlen - 1) - end) is index of same character at end of pattern
502 // so if any suffix match at that character doesn't extend beyond the last match,
503 // it's the suffix for this character as well
504 if (i
> start
&& suff
[i
+ patlen
- 1 - end
] < i
- start
) {
505 suff
[i
] = suff
[i
+ patlen
- 1 - end
];
511 while (start
>= 0 && patternCEs
[start
] == patternCEs
[--s
]) {
515 suff
[i
] = end
- start
;
519 // now build goodSuffixTable
520 goodSuffixTable
= NEW_ARRAY(int32_t, patlen
);
522 if (goodSuffixTable
== NULL
) {
524 status
= U_MEMORY_ALLOCATION_ERROR
;
529 // initialize entries to minLengthInChars of the pattern
530 for (int32_t i
= 0; i
< patlen
; i
+= 1) {
531 goodSuffixTable
[i
] = maxSkip
;
536 for (int32_t i
= patlen
- /*1*/ 2; i
>= 0; i
-= 1) {
537 if (suff
[i
] == i
+ 1) {
538 // this matching suffix is a prefix of the pattern
539 int32_t prefixSkip
= badCharacterTable
.minLengthInChars(i
+ 1);
541 // for any mis-match before this suffix, we should skip
542 // so that the front of the pattern (i.e. the prefix)
543 // lines up with the front of the suffix.
544 // (patlen - 1 - i) is the start of the suffix
545 while (prefix
< patlen
- 1 - i
) {
546 // value of maxSkip means never set...
547 if (goodSuffixTable
[prefix
] == maxSkip
) {
548 goodSuffixTable
[prefix
] = prefixSkip
;
556 for (int32_t i
= 0; i
< patlen
- 1; i
+= 1) {
557 goodSuffixTable
[patlen
- 1 - suff
[i
]] = badCharacterTable
.minLengthInChars(i
+ 1);
563 GoodSuffixTable::~GoodSuffixTable()
565 DELETE_ARRAY(goodSuffixTable
);
568 int32_t GoodSuffixTable::operator[](int32_t offset
) const
570 return goodSuffixTable
[offset
];
573 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch
)
576 UBool
BoyerMooreSearch::empty()
578 return patCEs
->size() <= 0;
581 CollData
*BoyerMooreSearch::getData()
586 CEList
*BoyerMooreSearch::getPatternCEs()
591 BadCharacterTable
*BoyerMooreSearch::getBadCharacterTable()
593 return badCharacterTable
;
596 GoodSuffixTable
*BoyerMooreSearch::getGoodSuffixTable()
598 return goodSuffixTable
;
601 BoyerMooreSearch::BoyerMooreSearch(CollData
*theData
, const UnicodeString
&patternString
, const UnicodeString
*targetString
,
603 : data(theData
), patCEs(NULL
), badCharacterTable(NULL
), goodSuffixTable(NULL
), pattern(patternString
), target(NULL
)
606 if (U_FAILURE(status
)) {
610 UCollator
*collator
= data
->getCollator();
612 patCEs
= new CEList(collator
, patternString
, status
);
614 if (patCEs
== NULL
|| U_FAILURE(status
)) {
618 badCharacterTable
= new BadCharacterTable(*patCEs
, data
, status
);
620 if (badCharacterTable
== NULL
|| U_FAILURE(status
)) {
624 goodSuffixTable
= new GoodSuffixTable(*patCEs
, *badCharacterTable
, status
);
626 if (targetString
!= NULL
) {
627 target
= new Target(collator
, targetString
, patCEs
->size(), status
);
631 BoyerMooreSearch::~BoyerMooreSearch()
634 delete goodSuffixTable
;
635 delete badCharacterTable
;
639 void BoyerMooreSearch::setTargetString(const UnicodeString
*targetString
, UErrorCode
&status
)
641 if (U_FAILURE(status
)) {
645 if (target
== NULL
) {
646 target
= new Target(data
->getCollator(), targetString
, patCEs
->size(), status
);
648 target
->setTargetString(targetString
);
652 // **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
655 * * deal with trailing (and leading?) ignorables.
656 * * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
658 UBool
BoyerMooreSearch::search(int32_t offset
, int32_t &start
, int32_t &end
)
660 /*UCollator *coll =*/ data
->getCollator();
661 int32_t plen
= patCEs
->size();
662 int32_t tlen
= target
->stringLength();
663 int32_t maxSkip
= badCharacterTable
->getMaxSkip();
664 int32_t tOffset
= offset
+ maxSkip
;
667 // Searching for a zero length pattern always fails.
672 while (tOffset
<= tlen
) {
673 int32_t pIndex
= plen
- 1;
677 if (tOffset
< tlen
) {
678 // **** we really want to skip ahead enough to ****
679 // **** be sure we get at least 1 non-ignorable ****
680 // **** CE after the end of the pattern. ****
681 int32_t next
= target
->nextSafeBoundary(tOffset
+ 1);
683 target
->setOffset(next
);
685 for (lIndex
= 0; ; lIndex
+= 1) {
686 const CEI
*cei
= target
->prevCE(lIndex
);
687 int32_t low
= cei
->lowOffset
;
688 int32_t high
= cei
->highOffset
;
690 if (high
== 0 || (low
< high
&& low
<= tOffset
)) {
692 while (lIndex
>= 0 && target
->prevCE(lIndex
)->highOffset
== high
) {
696 if (high
> tOffset
) {
705 target
->setLast(tOffset
);
711 // Iterate backward until we hit the beginning of the pattern
712 while (pIndex
>= 0) {
713 uint32_t pce
= (*patCEs
)[pIndex
];
714 const CEI
*tcei
= target
->prevCE(tIndex
++);
717 if (tcei
->order
!= pce
) {
718 // There is a mismatch at this position. Decide how far
719 // over to shift the pattern, then try again.
721 int32_t gsOffset
= tOffset
+ (*goodSuffixTable
)[pIndex
];
722 #ifdef EXTRA_CAUTIOUS
723 int32_t old
= tOffset
;
726 tOffset
+= (*badCharacterTable
)[tcei
->order
] - badCharacterTable
->minLengthInChars(pIndex
+ 1);
728 if (gsOffset
> tOffset
) {
732 #ifdef EXTRA_CAUTIOUS
733 // Make sure we don't skip backwards...
734 if (tOffset
<= old
) {
746 // We made it back to the beginning of the pattern,
747 // which means we matched it all. Return the location.
748 const CEI firstCEI
= *target
->prevCE(tIndex
- 1);
749 const CEI lastCEI
= *target
->prevCE(lIndex
);
750 int32_t mStart
= firstCEI
.lowOffset
;
751 int32_t minLimit
= lastCEI
.lowOffset
;
752 int32_t maxLimit
= lastCEI
.highOffset
;
756 target
->setOffset(/*tOffset*/maxLimit
);
758 const CEI nextCEI
= *target
->nextCE(0);
760 if (nextCEI
.lowOffset
> maxLimit
) {
761 maxLimit
= nextCEI
.lowOffset
;
764 if (nextCEI
.lowOffset
== nextCEI
.highOffset
&& nextCEI
.order
!= (uint32_t)UCOL_NULLORDER
) {
768 if (! target
->isBreakBoundary(mStart
)) {
772 if (firstCEI
.lowOffset
== firstCEI
.highOffset
) {
777 if (minLimit
< maxLimit
) {
778 // When the last CE's low index is same with its high index, the CE is likely
779 // a part of expansion. In this case, the index is located just after the
780 // character corresponding to the CEs compared above. If the index is right
781 // at the break boundary, move the position to the next boundary will result
782 // incorrect match length when there are ignorable characters exist between
783 // the position and the next character produces CE(s). See ticket#8482.
784 if (minLimit
== lastCEI
.highOffset
&& target
->isBreakBoundary(minLimit
)) {
787 int32_t nbb
= target
->nextBreakBoundary(minLimit
);
789 if (nbb
>= lastCEI
.highOffset
) {
795 if (mLimit
> maxLimit
) {
799 if (! target
->isBreakBoundary(mLimit
)) {
803 if (! target
->isIdentical(pattern
, mStart
, mLimit
)) {
814 tOffset
+= (*goodSuffixTable
)[0]; // really? Maybe += 1 or += maxSkip?
816 // Otherwise, we're here because of a mismatch, so keep going....
827 #endif // #if !UCONFIG_NO_COLLATION