2 ******************************************************************************
4 * Copyright (C) 2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: unisetspan.cpp
10 * tab size: 8 (not used)
13 * created on: 2007mar01
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
22 #include "unisetspan.h"
27 * List of offsets from the current position from where to try matching
28 * a code point or a string.
29 * Store offsets rather than indexes to simplify the code and use the same list
30 * for both increments (in span()) and decrements (in spanBack()).
32 * Assumption: The maximum offset is limited, and the offsets that are stored
33 * at any one time are relatively dense, that is, there are normally no gaps of
34 * hundreds or thousands of offset values.
36 * The implementation uses a circular buffer of byte flags,
37 * each indicating whether the corresponding offset is in the list.
38 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
39 * physically moving part of the list.
41 * Note: In principle, the caller should setMaxLength() to the maximum of the
42 * max string length and U16_LENGTH/U8_LENGTH to account for
43 * "long" single code points.
44 * However, this implementation uses at least a staticList with more than
45 * U8_LENGTH entries anyway.
47 * Note: If maxLength were guaranteed to be no more than 32 or 64,
48 * the list could be stored as bit flags in a single integer.
49 * Rather than handling a circular buffer with a start list index,
50 * the integer would simply be shifted when lower offsets are removed.
51 * UnicodeSet does not have a limit on the lengths of strings.
53 class OffsetList
{ // Only ever stack-allocated, does not need to inherit UMemory.
55 OffsetList() : list(staticList
), capacity(0), length(0), start(0) {}
58 if(list
!=staticList
) {
63 // Call exactly once if the list is to be used.
64 void setMaxLength(int32_t maxLength
) {
65 if(maxLength
<=(int32_t)sizeof(staticList
)) {
66 capacity
=(int32_t)sizeof(staticList
);
68 UBool
*l
=(UBool
*)uprv_malloc(maxLength
);
74 uprv_memset(list
, 0, capacity
);
78 uprv_memset(list
, 0, capacity
);
82 UBool
isEmpty() const {
83 return (UBool
)(length
==0);
86 // Reduce all stored offsets by delta, used when the current position
88 // There must not be any offsets lower than delta.
89 // If there is an offset equal to delta, it is removed.
90 // delta=[1..maxLength]
91 void shift(int32_t delta
) {
92 int32_t i
=start
+delta
;
103 // Add an offset. The list must not contain it yet.
104 // offset=[1..maxLength]
105 void addOffset(int32_t offset
) {
106 int32_t i
=start
+offset
;
114 // offset=[1..maxLength]
115 UBool
containsOffset(int32_t offset
) const {
116 int32_t i
=start
+offset
;
123 // Find the lowest stored offset from a non-empty list, remove it,
124 // and reduce all other offsets by this minimum.
125 // Returns [1..maxLength].
126 int32_t popMinimum() {
127 // Look for the next offset in list[start+1..capacity-1].
128 int32_t i
=start
, result
;
129 while(++i
<capacity
) {
140 // Wrap around and look for the next offset in list[0..start].
141 // Since the list is not empty, there will be one.
142 result
=capacity
-start
;
159 UBool staticList
[16];
162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
164 getUTF8Length(const UChar
*s
, int32_t length
) {
165 UErrorCode errorCode
=U_ZERO_ERROR
;
167 u_strToUTF8(NULL
, 0, &length8
, s
, length
, &errorCode
);
168 if(U_SUCCESS(errorCode
) || errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
171 // The string contains an unpaired surrogate.
172 // Ignore this string.
177 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
179 appendUTF8(const UChar
*s
, int32_t length
, uint8_t *t
, int32_t capacity
) {
180 UErrorCode errorCode
=U_ZERO_ERROR
;
182 u_strToUTF8((char *)t
, capacity
, &length8
, s
, length
, &errorCode
);
183 if(U_SUCCESS(errorCode
)) {
186 // The string contains an unpaired surrogate.
187 // Ignore this string.
192 static inline uint8_t
193 makeSpanLengthByte(int32_t spanLength
) {
194 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
195 return spanLength
<0xfe ? (uint8_t)spanLength
: (uint8_t)0xfe;
198 // Construct for all variants of span(), or only for any one variant.
199 // Initialize as little as possible, for single use.
200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet
&set
,
201 const UVector
&setStrings
,
203 : spanSet(0, 0x10ffff), pSpanNotSet(NULL
), strings(setStrings
),
204 utf8Lengths(NULL
), spanLengths(NULL
), utf8(NULL
),
206 maxLength16(0), maxLength8(0),
207 all((UBool
)(which
==ALL
)) {
208 spanSet
.retainAll(set
);
209 if(which
&NOT_CONTAINED
) {
210 // Default to the same sets.
211 // addToSpanNotSet() will create a separate set if necessary.
212 pSpanNotSet
=&spanSet
;
215 // Determine if the strings even need to be taken into account at all for span() etc.
216 // If any string is relevant, then all strings need to be used for
217 // span(longest match) but only the relevant ones for span(while contained).
218 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
219 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
220 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
221 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
222 int32_t stringsLength
=strings
.size();
224 int32_t i
, spanLength
;
225 UBool someRelevant
=FALSE
;
226 for(i
=0; i
<stringsLength
; ++i
) {
227 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
228 const UChar
*s16
=string
.getBuffer();
229 int32_t length16
=string
.length();
231 spanLength
=spanSet
.span(s16
, length16
, USET_SPAN_CONTAINED
);
232 if(spanLength
<length16
) { // Relevant string.
233 someRelevant
=thisRelevant
=TRUE
;
237 if((which
&UTF16
) && length16
>maxLength16
) {
238 maxLength16
=length16
;
240 if((which
&UTF8
) && (thisRelevant
|| (which
&CONTAINED
))) {
241 int32_t length8
=getUTF8Length(s16
, length16
);
243 if(length8
>maxLength8
) {
249 maxLength16
=maxLength8
=0;
253 // Freeze after checking for the need to use strings at all because freezing
254 // a set takes some time and memory which are wasted if there are no relevant strings.
259 uint8_t *spanBackLengths
;
260 uint8_t *spanUTF8Lengths
;
261 uint8_t *spanBackUTF8Lengths
;
263 // Allocate a block of meta data.
266 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
267 allocSize
=stringsLength
*(4+1+1+1+1)+utf8Length
;
269 allocSize
=stringsLength
; // One set of span lengths.
271 // UTF-8 lengths and UTF-8 strings.
272 allocSize
+=stringsLength
*4+utf8Length
;
275 if(allocSize
<=(int32_t)sizeof(staticLengths
)) {
276 utf8Lengths
=staticLengths
;
278 utf8Lengths
=(int32_t *)uprv_malloc(allocSize
);
279 if(utf8Lengths
==NULL
) {
280 maxLength16
=maxLength8
=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
281 return; // Out of memory.
286 // Store span lengths for all span() variants.
287 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
288 spanBackLengths
=spanLengths
+stringsLength
;
289 spanUTF8Lengths
=spanBackLengths
+stringsLength
;
290 spanBackUTF8Lengths
=spanUTF8Lengths
+stringsLength
;
291 utf8
=spanBackUTF8Lengths
+stringsLength
;
293 // Store span lengths for only one span() variant.
295 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
296 utf8
=spanLengths
+stringsLength
;
298 spanLengths
=(uint8_t *)utf8Lengths
;
300 spanBackLengths
=spanUTF8Lengths
=spanBackUTF8Lengths
=spanLengths
;
303 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
304 int32_t utf8Count
=0; // Count UTF-8 bytes written so far.
306 for(i
=0; i
<stringsLength
; ++i
) {
307 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
308 const UChar
*s16
=string
.getBuffer();
309 int32_t length16
=string
.length();
310 spanLength
=spanSet
.span(s16
, length16
, USET_SPAN_CONTAINED
);
311 if(spanLength
<length16
) { // Relevant string.
313 if(which
&CONTAINED
) {
315 spanLengths
[i
]=makeSpanLengthByte(spanLength
);
318 spanLength
=length16
-spanSet
.spanBack(s16
, length16
, USET_SPAN_CONTAINED
);
319 spanBackLengths
[i
]=makeSpanLengthByte(spanLength
);
321 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
322 spanLengths
[i
]=spanBackLengths
[i
]=0; // Only store a relevant/irrelevant flag.
326 uint8_t *s8
=utf8
+utf8Count
;
327 int32_t length8
=appendUTF8(s16
, length16
, s8
, utf8Length
-utf8Count
);
328 utf8Count
+=utf8Lengths
[i
]=length8
;
329 if(length8
==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
330 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=(uint8_t)ALL_CP_CONTAINED
;
331 } else { // Relevant for UTF-8.
332 if(which
&CONTAINED
) {
334 spanLength
=spanSet
.spanUTF8((const char *)s8
, length8
, USET_SPAN_CONTAINED
);
335 spanUTF8Lengths
[i
]=makeSpanLengthByte(spanLength
);
338 spanLength
=length8
-spanSet
.spanBackUTF8((const char *)s8
, length8
, USET_SPAN_CONTAINED
);
339 spanBackUTF8Lengths
[i
]=makeSpanLengthByte(spanLength
);
341 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
342 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=0; // Only store a relevant/irrelevant flag.
346 if(which
&NOT_CONTAINED
) {
347 // Add string start and end code points to the spanNotSet so that
348 // a span(while not contained) stops before any string.
352 U16_NEXT(s16
, len
, length16
, c
);
356 int32_t len
=length16
;
357 U16_PREV(s16
, 0, len
, c
);
361 } else { // Irrelevant string.
363 if(which
&CONTAINED
) { // Only necessary for LONGEST_MATCH.
364 uint8_t *s8
=utf8
+utf8Count
;
365 int32_t length8
=appendUTF8(s16
, length16
, s8
, utf8Length
-utf8Count
);
366 utf8Count
+=utf8Lengths
[i
]=length8
;
372 spanLengths
[i
]=spanBackLengths
[i
]=
373 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=
374 (uint8_t)ALL_CP_CONTAINED
;
376 // All spanXYZLengths pointers contain the same address.
377 spanLengths
[i
]=(uint8_t)ALL_CP_CONTAINED
;
384 pSpanNotSet
->freeze();
388 // Copy constructor. Assumes which==ALL for a frozen set.
389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan
&otherStringSpan
,
390 const UVector
&newParentSetStrings
)
391 : spanSet(otherStringSpan
.spanSet
), pSpanNotSet(NULL
), strings(newParentSetStrings
),
392 utf8Lengths(NULL
), spanLengths(NULL
), utf8(NULL
),
393 utf8Length(otherStringSpan
.utf8Length
),
394 maxLength16(otherStringSpan
.maxLength16
), maxLength8(otherStringSpan
.maxLength8
),
396 if(otherStringSpan
.pSpanNotSet
==&otherStringSpan
.spanSet
) {
397 pSpanNotSet
=&spanSet
;
399 pSpanNotSet
=(UnicodeSet
*)otherStringSpan
.pSpanNotSet
->clone();
402 // Allocate a block of meta data.
403 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
404 int32_t stringsLength
=strings
.size();
405 int32_t allocSize
=stringsLength
*(4+1+1+1+1)+utf8Length
;
406 if(allocSize
<=(int32_t)sizeof(staticLengths
)) {
407 utf8Lengths
=staticLengths
;
409 utf8Lengths
=(int32_t *)uprv_malloc(allocSize
);
410 if(utf8Lengths
==NULL
) {
411 maxLength16
=maxLength8
=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
412 return; // Out of memory.
416 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
417 utf8
=spanLengths
+stringsLength
*4;
418 uprv_memcpy(utf8Lengths
, otherStringSpan
.utf8Lengths
, allocSize
);
421 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
422 if(pSpanNotSet
!=NULL
&& pSpanNotSet
!=&spanSet
) {
425 if(utf8Lengths
!=NULL
&& utf8Lengths
!=staticLengths
) {
426 uprv_free(utf8Lengths
);
430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c
) {
431 if(pSpanNotSet
==NULL
|| pSpanNotSet
==&spanSet
) {
432 if(spanSet
.contains(c
)) {
433 return; // Nothing to do.
435 UnicodeSet
*newSet
=(UnicodeSet
*)spanSet
.cloneAsThawed();
437 return; // Out of memory.
445 // Compare strings without any argument checks. Requires length>0.
447 matches16(const UChar
*s
, const UChar
*t
, int32_t length
) {
457 matches8(const uint8_t *s
, const uint8_t *t
, int32_t length
) {
466 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
467 // at code point boundaries.
468 // That is, each edge of a match must not be in the middle of a surrogate pair.
470 matches16CPB(const UChar
*s
, int32_t start
, int32_t limit
, const UChar
*t
, int32_t length
) {
473 return matches16(s
, t
, length
) &&
474 !(0<start
&& U16_IS_LEAD(s
[-1]) && U16_IS_TRAIL(s
[0])) &&
475 !(length
<limit
&& U16_IS_LEAD(s
[length
-1]) && U16_IS_TRAIL(s
[length
]));
478 // Does the set contain the next code point?
479 // If so, return its length; otherwise return its negative length.
480 static inline int32_t
481 spanOne(const UnicodeSet
&set
, const UChar
*s
, int32_t length
) {
483 if(c
>=0xd800 && c
<=0xdbff && length
>=2 && U16_IS_TRAIL(c2
=s
[1])) {
484 return set
.contains(U16_GET_SUPPLEMENTARY(c
, c2
)) ? 2 : -2;
486 return set
.contains(c
) ? 1 : -1;
489 static inline int32_t
490 spanOneBack(const UnicodeSet
&set
, const UChar
*s
, int32_t length
) {
491 UChar c
=s
[length
-1], c2
;
492 if(c
>=0xdc00 && c
<=0xdfff && length
>=2 && U16_IS_LEAD(c2
=s
[length
-2])) {
493 return set
.contains(U16_GET_SUPPLEMENTARY(c2
, c
)) ? 2 : -2;
495 return set
.contains(c
) ? 1 : -1;
498 static inline int32_t
499 spanOneUTF8(const UnicodeSet
&set
, const uint8_t *s
, int32_t length
) {
502 return set
.contains(c
) ? 1 : -1;
504 // Take advantage of non-ASCII fastpaths in U8_NEXT().
506 U8_NEXT(s
, i
, length
, c
);
507 return set
.contains(c
) ? i
: -i
;
510 static inline int32_t
511 spanOneBackUTF8(const UnicodeSet
&set
, const uint8_t *s
, int32_t length
) {
512 UChar32 c
=s
[length
-1];
514 return set
.contains(c
) ? 1 : -1;
517 c
=utf8_prevCharSafeBody(s
, 0, &i
, c
, -1);
519 return set
.contains(c
) ? length
: -length
;
523 * Note: In span() when spanLength==0 (after a string match, or at the beginning
524 * after an empty code point span) and in spanNot() and spanNotUTF8(),
525 * string matching could use a binary search
526 * because all string matches are done from the same start index.
528 * For UTF-8, this would require a comparison function that returns UTF-16 order.
530 * This optimization should not be necessary for normal UnicodeSets because
531 * most sets have no strings, and most sets with strings have
532 * very few very short strings.
533 * For cases with many strings, it might be better to use a different API
534 * and implementation with a DFA (state machine).
538 * Algorithm for span(USET_SPAN_CONTAINED)
540 * Theoretical algorithm:
541 * - Iterate through the string, and at each code point boundary:
542 * + If the code point there is in the set, then remember to continue after it.
543 * + If a set string matches at the current position, then remember to continue after it.
544 * + Either recursively span for each code point or string match,
545 * or recursively span for all but the shortest one and
546 * iteratively continue the span with the shortest local match.
547 * + Remember the longest recursive span (the farthest end point).
548 * + If there is no match at the current position, neither for the code point there
549 * nor for any set string, then stop and return the longest recursive span length.
551 * Optimized implementation:
553 * (We assume that most sets will have very few very short strings.
554 * A span using a string-less set is extremely fast.)
556 * Create and cache a spanSet which contains all of the single code points
557 * of the original set but none of its strings.
559 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
561 * + Try to match each set string at the end of the spanLength.
562 * ~ Set strings that start with set-contained code points must be matched
563 * with a partial overlap because the recursive algorithm would have tried
564 * to match them at every position.
565 * ~ Set strings that entirely consist of set-contained code points
566 * are irrelevant for span(USET_SPAN_CONTAINED) because the
567 * recursive algorithm would continue after them anyway
568 * and find the longest recursive match from their end.
569 * ~ Rather than recursing, note each end point of a set string match.
570 * + If no set string matched after spanSet.span(), then return
571 * with where the spanSet.span() ended.
572 * + If at least one set string matched after spanSet.span(), then
573 * pop the shortest string match end point and continue
574 * the loop, trying to match all set strings from there.
575 * + If at least one more set string matched after a previous string match,
576 * then test if the code point after the previous string match is also
577 * contained in the set.
578 * Continue the loop with the shortest end point of either this code point
579 * or a matching set string.
580 * + If no more set string matched after a previous string match,
581 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
582 * Stop if spanLength==0, otherwise continue the loop.
584 * By noting each end point of a set string match,
585 * the function visits each string position at most once and finishes
588 * The recursive algorithm may visit the same string position many times
589 * if multiple paths lead to it and finishes in exponential time.
593 * Algorithm for span(USET_SPAN_SIMPLE)
595 * Theoretical algorithm:
596 * - Iterate through the string, and at each code point boundary:
597 * + If the code point there is in the set, then remember to continue after it.
598 * + If a set string matches at the current position, then remember to continue after it.
599 * + Continue from the farthest match position and ignore all others.
600 * + If there is no match at the current position,
601 * then stop and return the current position.
603 * Optimized implementation:
605 * (Same assumption and spanSet as above.)
607 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
609 * + Try to match each set string at the end of the spanLength.
610 * ~ Set strings that start with set-contained code points must be matched
611 * with a partial overlap because the standard algorithm would have tried
612 * to match them earlier.
613 * ~ Set strings that entirely consist of set-contained code points
614 * must be matched with a full overlap because the longest-match algorithm
615 * would hide set string matches that end earlier.
616 * Such set strings need not be matched earlier inside the code point span
617 * because the standard algorithm would then have continued after
618 * the set string match anyway.
619 * ~ Remember the longest set string match (farthest end point) from the earliest
621 * + If no set string matched after spanSet.span(), then return
622 * with where the spanSet.span() ended.
623 * + If at least one set string matched, then continue the loop after the
624 * longest match from the earliest position.
625 * + If no more set string matched after a previous string match,
626 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
627 * Stop if spanLength==0, otherwise continue the loop.
630 int32_t UnicodeSetStringSpan::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
631 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
632 return spanNot(s
, length
);
634 int32_t spanLength
=spanSet
.span(s
, length
, USET_SPAN_CONTAINED
);
635 if(spanLength
==length
) {
639 // Consider strings; they may overlap with the span.
641 if(spanCondition
==USET_SPAN_CONTAINED
) {
642 // Use offset list to try all possibilities.
643 offsets
.setMaxLength(maxLength16
);
645 int32_t pos
=spanLength
, rest
=length
-pos
;
646 int32_t i
, stringsLength
=strings
.size();
648 if(spanCondition
==USET_SPAN_CONTAINED
) {
649 for(i
=0; i
<stringsLength
; ++i
) {
650 int32_t overlap
=spanLengths
[i
];
651 if(overlap
==ALL_CP_CONTAINED
) {
652 continue; // Irrelevant string.
654 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
655 const UChar
*s16
=string
.getBuffer();
656 int32_t length16
=string
.length();
658 // Try to match this string at pos-overlap..pos.
659 if(overlap
>=LONG_SPAN
) {
661 // While contained: No point matching fully inside the code point span.
662 U16_BACK_1(s16
, 0, overlap
); // Length of the string minus the last code point.
664 if(overlap
>spanLength
) {
667 int32_t inc
=length16
-overlap
; // Keep overlap+inc==length16.
672 // Try to match if the increment is not listed already.
673 if(!offsets
.containsOffset(inc
) && matches16CPB(s
, pos
-overlap
, length
, s16
, length16
)) {
675 return length
; // Reached the end of the string.
677 offsets
.addOffset(inc
);
686 } else /* USET_SPAN_SIMPLE */ {
687 int32_t maxInc
=0, maxOverlap
=0;
688 for(i
=0; i
<stringsLength
; ++i
) {
689 int32_t overlap
=spanLengths
[i
];
690 // For longest match, we do need to try to match even an all-contained string
691 // to find the match from the earliest start.
693 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
694 const UChar
*s16
=string
.getBuffer();
695 int32_t length16
=string
.length();
697 // Try to match this string at pos-overlap..pos.
698 if(overlap
>=LONG_SPAN
) {
700 // Longest match: Need to match fully inside the code point span
701 // to find the match from the earliest start.
703 if(overlap
>spanLength
) {
706 int32_t inc
=length16
-overlap
; // Keep overlap+inc==length16.
708 if(inc
>rest
|| overlap
<maxOverlap
) {
711 // Try to match if the string is longer or starts earlier.
712 if( (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ inc
>maxInc
) &&
713 matches16CPB(s
, pos
-overlap
, length
, s16
, length16
)
715 maxInc
=inc
; // Longest match from earliest start.
724 if(maxInc
!=0 || maxOverlap
!=0) {
725 // Longest-match algorithm, and there was a string match.
726 // Simply continue after it.
730 return length
; // Reached the end of the string.
732 spanLength
=0; // Match strings from after a string match.
736 // Finished trying to match all strings at pos.
738 if(spanLength
!=0 || pos
==0) {
739 // The position is after an unlimited code point span (spanLength!=0),
740 // not after a string match.
741 // The only position where spanLength==0 after a span is pos==0.
742 // Otherwise, an unlimited code point span is only tried again when no
743 // strings match, and if such a non-initial span fails we stop.
744 if(offsets
.isEmpty()) {
745 return pos
; // No strings matched after a span.
747 // Match strings from after the next string match.
749 // The position is after a string match (or a single code point).
750 if(offsets
.isEmpty()) {
751 // No more strings matched after a previous string match.
752 // Try another code point span from after the last string match.
753 spanLength
=spanSet
.span(s
+pos
, rest
, USET_SPAN_CONTAINED
);
754 if( spanLength
==rest
|| // Reached the end of the string, or
755 spanLength
==0 // neither strings nor span progressed.
757 return pos
+spanLength
;
761 continue; // spanLength>0: Match strings from after a span.
763 // Try to match only one code point from after a string match if some
764 // string matched beyond it, so that we try all possible positions
765 // and don't overshoot.
766 spanLength
=spanOne(spanSet
, s
+pos
, rest
);
768 if(spanLength
==rest
) {
769 return length
; // Reached the end of the string.
771 // Match strings after this code point.
772 // There cannot be any increments below it because UnicodeSet strings
773 // contain multiple code points.
776 offsets
.shift(spanLength
);
778 continue; // Match strings from after a single code point.
780 // Match strings from after the next string match.
783 int32_t minOffset
=offsets
.popMinimum();
786 spanLength
=0; // Match strings from after a string match.
790 int32_t UnicodeSetStringSpan::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
791 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
792 return spanNotBack(s
, length
);
794 int32_t pos
=spanSet
.spanBack(s
, length
, USET_SPAN_CONTAINED
);
798 int32_t spanLength
=length
-pos
;
800 // Consider strings; they may overlap with the span.
802 if(spanCondition
==USET_SPAN_CONTAINED
) {
803 // Use offset list to try all possibilities.
804 offsets
.setMaxLength(maxLength16
);
806 int32_t i
, stringsLength
=strings
.size();
807 uint8_t *spanBackLengths
=spanLengths
;
809 spanBackLengths
+=stringsLength
;
812 if(spanCondition
==USET_SPAN_CONTAINED
) {
813 for(i
=0; i
<stringsLength
; ++i
) {
814 int32_t overlap
=spanBackLengths
[i
];
815 if(overlap
==ALL_CP_CONTAINED
) {
816 continue; // Irrelevant string.
818 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
819 const UChar
*s16
=string
.getBuffer();
820 int32_t length16
=string
.length();
822 // Try to match this string at pos-(length16-overlap)..pos-length16.
823 if(overlap
>=LONG_SPAN
) {
825 // While contained: No point matching fully inside the code point span.
827 U16_FWD_1(s16
, len1
, overlap
);
828 overlap
-=len1
; // Length of the string minus the first code point.
830 if(overlap
>spanLength
) {
833 int32_t dec
=length16
-overlap
; // Keep dec+overlap==length16.
838 // Try to match if the decrement is not listed already.
839 if(!offsets
.containsOffset(dec
) && matches16CPB(s
, pos
-dec
, length
, s16
, length16
)) {
841 return 0; // Reached the start of the string.
843 offsets
.addOffset(dec
);
852 } else /* USET_SPAN_SIMPLE */ {
853 int32_t maxDec
=0, maxOverlap
=0;
854 for(i
=0; i
<stringsLength
; ++i
) {
855 int32_t overlap
=spanBackLengths
[i
];
856 // For longest match, we do need to try to match even an all-contained string
857 // to find the match from the latest end.
859 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
860 const UChar
*s16
=string
.getBuffer();
861 int32_t length16
=string
.length();
863 // Try to match this string at pos-(length16-overlap)..pos-length16.
864 if(overlap
>=LONG_SPAN
) {
866 // Longest match: Need to match fully inside the code point span
867 // to find the match from the latest end.
869 if(overlap
>spanLength
) {
872 int32_t dec
=length16
-overlap
; // Keep dec+overlap==length16.
874 if(dec
>pos
|| overlap
<maxOverlap
) {
877 // Try to match if the string is longer or ends later.
878 if( (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ dec
>maxDec
) &&
879 matches16CPB(s
, pos
-dec
, length
, s16
, length16
)
881 maxDec
=dec
; // Longest match from latest end.
890 if(maxDec
!=0 || maxOverlap
!=0) {
891 // Longest-match algorithm, and there was a string match.
892 // Simply continue before it.
895 return 0; // Reached the start of the string.
897 spanLength
=0; // Match strings from before a string match.
901 // Finished trying to match all strings at pos.
903 if(spanLength
!=0 || pos
==length
) {
904 // The position is before an unlimited code point span (spanLength!=0),
905 // not before a string match.
906 // The only position where spanLength==0 before a span is pos==length.
907 // Otherwise, an unlimited code point span is only tried again when no
908 // strings match, and if such a non-initial span fails we stop.
909 if(offsets
.isEmpty()) {
910 return pos
; // No strings matched before a span.
912 // Match strings from before the next string match.
914 // The position is before a string match (or a single code point).
915 if(offsets
.isEmpty()) {
916 // No more strings matched before a previous string match.
917 // Try another code point span from before the last string match.
919 pos
=spanSet
.spanBack(s
, oldPos
, USET_SPAN_CONTAINED
);
920 spanLength
=oldPos
-pos
;
921 if( pos
==0 || // Reached the start of the string, or
922 spanLength
==0 // neither strings nor span progressed.
926 continue; // spanLength>0: Match strings from before a span.
928 // Try to match only one code point from before a string match if some
929 // string matched beyond it, so that we try all possible positions
930 // and don't overshoot.
931 spanLength
=spanOneBack(spanSet
, s
, pos
);
933 if(spanLength
==pos
) {
934 return 0; // Reached the start of the string.
936 // Match strings before this code point.
937 // There cannot be any decrements below it because UnicodeSet strings
938 // contain multiple code points.
940 offsets
.shift(spanLength
);
942 continue; // Match strings from before a single code point.
944 // Match strings from before the next string match.
947 pos
-=offsets
.popMinimum();
948 spanLength
=0; // Match strings from before a string match.
952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
953 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
954 return spanNotUTF8(s
, length
);
956 int32_t spanLength
=spanSet
.spanUTF8((const char *)s
, length
, USET_SPAN_CONTAINED
);
957 if(spanLength
==length
) {
961 // Consider strings; they may overlap with the span.
963 if(spanCondition
==USET_SPAN_CONTAINED
) {
964 // Use offset list to try all possibilities.
965 offsets
.setMaxLength(maxLength8
);
967 int32_t pos
=spanLength
, rest
=length
-pos
;
968 int32_t i
, stringsLength
=strings
.size();
969 uint8_t *spanUTF8Lengths
=spanLengths
;
971 spanUTF8Lengths
+=2*stringsLength
;
974 const uint8_t *s8
=utf8
;
976 if(spanCondition
==USET_SPAN_CONTAINED
) {
977 for(i
=0; i
<stringsLength
; ++i
) {
978 length8
=utf8Lengths
[i
];
980 continue; // String not representable in UTF-8.
982 int32_t overlap
=spanUTF8Lengths
[i
];
983 if(overlap
==ALL_CP_CONTAINED
) {
985 continue; // Irrelevant string.
988 // Try to match this string at pos-overlap..pos.
989 if(overlap
>=LONG_SPAN
) {
991 // While contained: No point matching fully inside the code point span.
992 U8_BACK_1(s8
, 0, overlap
); // Length of the string minus the last code point.
994 if(overlap
>spanLength
) {
997 int32_t inc
=length8
-overlap
; // Keep overlap+inc==length8.
1002 // Try to match if the increment is not listed already.
1003 // Match at code point boundaries. (The UTF-8 strings were converted
1004 // from UTF-16 and are guaranteed to be well-formed.)
1005 if( !U8_IS_TRAIL(s
[pos
-overlap
]) &&
1006 !offsets
.containsOffset(inc
) &&
1007 matches8(s
+pos
-overlap
, s8
, length8
)
1011 return length
; // Reached the end of the string.
1013 offsets
.addOffset(inc
);
1023 } else /* USET_SPAN_SIMPLE */ {
1024 int32_t maxInc
=0, maxOverlap
=0;
1025 for(i
=0; i
<stringsLength
; ++i
) {
1026 length8
=utf8Lengths
[i
];
1028 continue; // String not representable in UTF-8.
1030 int32_t overlap
=spanUTF8Lengths
[i
];
1031 // For longest match, we do need to try to match even an all-contained string
1032 // to find the match from the earliest start.
1034 // Try to match this string at pos-overlap..pos.
1035 if(overlap
>=LONG_SPAN
) {
1037 // Longest match: Need to match fully inside the code point span
1038 // to find the match from the earliest start.
1040 if(overlap
>spanLength
) {
1043 int32_t inc
=length8
-overlap
; // Keep overlap+inc==length8.
1045 if(inc
>rest
|| overlap
<maxOverlap
) {
1048 // Try to match if the string is longer or starts earlier.
1049 // Match at code point boundaries. (The UTF-8 strings were converted
1050 // from UTF-16 and are guaranteed to be well-formed.)
1051 if( !U8_IS_TRAIL(s
[pos
-overlap
]) &&
1052 (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ inc
>maxInc
) &&
1053 matches8(s
+pos
-overlap
, s8
, length8
)
1056 maxInc
=inc
; // Longest match from earliest start.
1066 if(maxInc
!=0 || maxOverlap
!=0) {
1067 // Longest-match algorithm, and there was a string match.
1068 // Simply continue after it.
1072 return length
; // Reached the end of the string.
1074 spanLength
=0; // Match strings from after a string match.
1078 // Finished trying to match all strings at pos.
1080 if(spanLength
!=0 || pos
==0) {
1081 // The position is after an unlimited code point span (spanLength!=0),
1082 // not after a string match.
1083 // The only position where spanLength==0 after a span is pos==0.
1084 // Otherwise, an unlimited code point span is only tried again when no
1085 // strings match, and if such a non-initial span fails we stop.
1086 if(offsets
.isEmpty()) {
1087 return pos
; // No strings matched after a span.
1089 // Match strings from after the next string match.
1091 // The position is after a string match (or a single code point).
1092 if(offsets
.isEmpty()) {
1093 // No more strings matched after a previous string match.
1094 // Try another code point span from after the last string match.
1095 spanLength
=spanSet
.spanUTF8((const char *)s
+pos
, rest
, USET_SPAN_CONTAINED
);
1096 if( spanLength
==rest
|| // Reached the end of the string, or
1097 spanLength
==0 // neither strings nor span progressed.
1099 return pos
+spanLength
;
1103 continue; // spanLength>0: Match strings from after a span.
1105 // Try to match only one code point from after a string match if some
1106 // string matched beyond it, so that we try all possible positions
1107 // and don't overshoot.
1108 spanLength
=spanOneUTF8(spanSet
, s
+pos
, rest
);
1110 if(spanLength
==rest
) {
1111 return length
; // Reached the end of the string.
1113 // Match strings after this code point.
1114 // There cannot be any increments below it because UnicodeSet strings
1115 // contain multiple code points.
1118 offsets
.shift(spanLength
);
1120 continue; // Match strings from after a single code point.
1122 // Match strings from after the next string match.
1125 int32_t minOffset
=offsets
.popMinimum();
1128 spanLength
=0; // Match strings from after a string match.
1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
1133 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
1134 return spanNotBackUTF8(s
, length
);
1136 int32_t pos
=spanSet
.spanBackUTF8((const char *)s
, length
, USET_SPAN_CONTAINED
);
1140 int32_t spanLength
=length
-pos
;
1142 // Consider strings; they may overlap with the span.
1144 if(spanCondition
==USET_SPAN_CONTAINED
) {
1145 // Use offset list to try all possibilities.
1146 offsets
.setMaxLength(maxLength8
);
1148 int32_t i
, stringsLength
=strings
.size();
1149 uint8_t *spanBackUTF8Lengths
=spanLengths
;
1151 spanBackUTF8Lengths
+=3*stringsLength
;
1154 const uint8_t *s8
=utf8
;
1156 if(spanCondition
==USET_SPAN_CONTAINED
) {
1157 for(i
=0; i
<stringsLength
; ++i
) {
1158 length8
=utf8Lengths
[i
];
1160 continue; // String not representable in UTF-8.
1162 int32_t overlap
=spanBackUTF8Lengths
[i
];
1163 if(overlap
==ALL_CP_CONTAINED
) {
1165 continue; // Irrelevant string.
1168 // Try to match this string at pos-(length8-overlap)..pos-length8.
1169 if(overlap
>=LONG_SPAN
) {
1171 // While contained: No point matching fully inside the code point span.
1173 U8_FWD_1(s8
, len1
, overlap
);
1174 overlap
-=len1
; // Length of the string minus the first code point.
1176 if(overlap
>spanLength
) {
1179 int32_t dec
=length8
-overlap
; // Keep dec+overlap==length8.
1184 // Try to match if the decrement is not listed already.
1185 // Match at code point boundaries. (The UTF-8 strings were converted
1186 // from UTF-16 and are guaranteed to be well-formed.)
1187 if( !U8_IS_TRAIL(s
[pos
-dec
]) &&
1188 !offsets
.containsOffset(dec
) &&
1189 matches8(s
+pos
-dec
, s8
, length8
)
1192 return 0; // Reached the start of the string.
1194 offsets
.addOffset(dec
);
1204 } else /* USET_SPAN_SIMPLE */ {
1205 int32_t maxDec
=0, maxOverlap
=0;
1206 for(i
=0; i
<stringsLength
; ++i
) {
1207 length8
=utf8Lengths
[i
];
1209 continue; // String not representable in UTF-8.
1211 int32_t overlap
=spanBackUTF8Lengths
[i
];
1212 // For longest match, we do need to try to match even an all-contained string
1213 // to find the match from the latest end.
1215 // Try to match this string at pos-(length8-overlap)..pos-length8.
1216 if(overlap
>=LONG_SPAN
) {
1218 // Longest match: Need to match fully inside the code point span
1219 // to find the match from the latest end.
1221 if(overlap
>spanLength
) {
1224 int32_t dec
=length8
-overlap
; // Keep dec+overlap==length8.
1226 if(dec
>pos
|| overlap
<maxOverlap
) {
1229 // Try to match if the string is longer or ends later.
1230 // Match at code point boundaries. (The UTF-8 strings were converted
1231 // from UTF-16 and are guaranteed to be well-formed.)
1232 if( !U8_IS_TRAIL(s
[pos
-dec
]) &&
1233 (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ dec
>maxDec
) &&
1234 matches8(s
+pos
-dec
, s8
, length8
)
1236 maxDec
=dec
; // Longest match from latest end.
1246 if(maxDec
!=0 || maxOverlap
!=0) {
1247 // Longest-match algorithm, and there was a string match.
1248 // Simply continue before it.
1251 return 0; // Reached the start of the string.
1253 spanLength
=0; // Match strings from before a string match.
1257 // Finished trying to match all strings at pos.
1259 if(spanLength
!=0 || pos
==length
) {
1260 // The position is before an unlimited code point span (spanLength!=0),
1261 // not before a string match.
1262 // The only position where spanLength==0 before a span is pos==length.
1263 // Otherwise, an unlimited code point span is only tried again when no
1264 // strings match, and if such a non-initial span fails we stop.
1265 if(offsets
.isEmpty()) {
1266 return pos
; // No strings matched before a span.
1268 // Match strings from before the next string match.
1270 // The position is before a string match (or a single code point).
1271 if(offsets
.isEmpty()) {
1272 // No more strings matched before a previous string match.
1273 // Try another code point span from before the last string match.
1275 pos
=spanSet
.spanBackUTF8((const char *)s
, oldPos
, USET_SPAN_CONTAINED
);
1276 spanLength
=oldPos
-pos
;
1277 if( pos
==0 || // Reached the start of the string, or
1278 spanLength
==0 // neither strings nor span progressed.
1282 continue; // spanLength>0: Match strings from before a span.
1284 // Try to match only one code point from before a string match if some
1285 // string matched beyond it, so that we try all possible positions
1286 // and don't overshoot.
1287 spanLength
=spanOneBackUTF8(spanSet
, s
, pos
);
1289 if(spanLength
==pos
) {
1290 return 0; // Reached the start of the string.
1292 // Match strings before this code point.
1293 // There cannot be any decrements below it because UnicodeSet strings
1294 // contain multiple code points.
1296 offsets
.shift(spanLength
);
1298 continue; // Match strings from before a single code point.
1300 // Match strings from before the next string match.
1303 pos
-=offsets
.popMinimum();
1304 spanLength
=0; // Match strings from before a string match.
1309 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1311 * Theoretical algorithm:
1312 * - Iterate through the string, and at each code point boundary:
1313 * + If the code point there is in the set, then return with the current position.
1314 * + If a set string matches at the current position, then return with the current position.
1316 * Optimized implementation:
1318 * (Same assumption as for span() above.)
1320 * Create and cache a spanNotSet which contains all of the single code points
1321 * of the original set but none of its strings.
1322 * For each set string add its initial code point to the spanNotSet.
1323 * (Also add its final code point for spanNotBack().)
1326 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1327 * + If the current code point is in the original set, then
1328 * return the current position.
1329 * + If any set string matches at the current position, then
1330 * return the current position.
1331 * + If there is no match at the current position, neither for the code point there
1332 * nor for any set string, then skip this code point and continue the loop.
1333 * This happens for set-string-initial code points that were added to spanNotSet
1334 * when there is not actually a match for such a set string.
1337 int32_t UnicodeSetStringSpan::spanNot(const UChar
*s
, int32_t length
) const {
1338 int32_t pos
=0, rest
=length
;
1339 int32_t i
, stringsLength
=strings
.size();
1341 // Span until we find a code point from the set,
1342 // or a code point that starts or ends some string.
1343 i
=pSpanNotSet
->span(s
+pos
, rest
, USET_SPAN_NOT_CONTAINED
);
1345 return length
; // Reached the end of the string.
1350 // Check whether the current code point is in the original set,
1351 // without the string starts and ends.
1352 int32_t cpLength
=spanOne(spanSet
, s
+pos
, rest
);
1354 return pos
; // There is a set element at pos.
1357 // Try to match the strings at pos.
1358 for(i
=0; i
<stringsLength
; ++i
) {
1359 if(spanLengths
[i
]==ALL_CP_CONTAINED
) {
1360 continue; // Irrelevant string.
1362 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
1363 const UChar
*s16
=string
.getBuffer();
1364 int32_t length16
=string
.length();
1365 if(length16
<=rest
&& matches16CPB(s
, pos
, length
, s16
, length16
)) {
1366 return pos
; // There is a set element at pos.
1370 // The span(while not contained) ended on a string start/end which is
1371 // not in the original set. Skip this code point and continue.
1376 return length
; // Reached the end of the string.
1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar
*s
, int32_t length
) const {
1381 int32_t i
, stringsLength
=strings
.size();
1383 // Span until we find a code point from the set,
1384 // or a code point that starts or ends some string.
1385 pos
=pSpanNotSet
->spanBack(s
, pos
, USET_SPAN_NOT_CONTAINED
);
1387 return 0; // Reached the start of the string.
1390 // Check whether the current code point is in the original set,
1391 // without the string starts and ends.
1392 int32_t cpLength
=spanOneBack(spanSet
, s
, pos
);
1394 return pos
; // There is a set element at pos.
1397 // Try to match the strings at pos.
1398 for(i
=0; i
<stringsLength
; ++i
) {
1399 // Use spanLengths rather than a spanBackLengths pointer because
1400 // it is easier and we only need to know whether the string is irrelevant
1401 // which is the same in either array.
1402 if(spanLengths
[i
]==ALL_CP_CONTAINED
) {
1403 continue; // Irrelevant string.
1405 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
1406 const UChar
*s16
=string
.getBuffer();
1407 int32_t length16
=string
.length();
1408 if(length16
<=pos
&& matches16CPB(s
, pos
-length16
, length
, s16
, length16
)) {
1409 return pos
; // There is a set element at pos.
1413 // The span(while not contained) ended on a string start/end which is
1414 // not in the original set. Skip this code point and continue.
1418 return 0; // Reached the start of the string.
1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s
, int32_t length
) const {
1422 int32_t pos
=0, rest
=length
;
1423 int32_t i
, stringsLength
=strings
.size();
1424 uint8_t *spanUTF8Lengths
=spanLengths
;
1426 spanUTF8Lengths
+=2*stringsLength
;
1429 // Span until we find a code point from the set,
1430 // or a code point that starts or ends some string.
1431 i
=pSpanNotSet
->spanUTF8((const char *)s
+pos
, rest
, USET_SPAN_NOT_CONTAINED
);
1433 return length
; // Reached the end of the string.
1438 // Check whether the current code point is in the original set,
1439 // without the string starts and ends.
1440 int32_t cpLength
=spanOneUTF8(spanSet
, s
+pos
, rest
);
1442 return pos
; // There is a set element at pos.
1445 // Try to match the strings at pos.
1446 const uint8_t *s8
=utf8
;
1448 for(i
=0; i
<stringsLength
; ++i
) {
1449 length8
=utf8Lengths
[i
];
1450 // ALL_CP_CONTAINED: Irrelevant string.
1451 if(length8
!=0 && spanUTF8Lengths
[i
]!=ALL_CP_CONTAINED
&& length8
<=rest
&& matches8(s
+pos
, s8
, length8
)) {
1452 return pos
; // There is a set element at pos.
1457 // The span(while not contained) ended on a string start/end which is
1458 // not in the original set. Skip this code point and continue.
1463 return length
; // Reached the end of the string.
1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s
, int32_t length
) const {
1468 int32_t i
, stringsLength
=strings
.size();
1469 uint8_t *spanBackUTF8Lengths
=spanLengths
;
1471 spanBackUTF8Lengths
+=3*stringsLength
;
1474 // Span until we find a code point from the set,
1475 // or a code point that starts or ends some string.
1476 pos
=pSpanNotSet
->spanBackUTF8((const char *)s
, pos
, USET_SPAN_NOT_CONTAINED
);
1478 return 0; // Reached the start of the string.
1481 // Check whether the current code point is in the original set,
1482 // without the string starts and ends.
1483 int32_t cpLength
=spanOneBackUTF8(spanSet
, s
, pos
);
1485 return pos
; // There is a set element at pos.
1488 // Try to match the strings at pos.
1489 const uint8_t *s8
=utf8
;
1491 for(i
=0; i
<stringsLength
; ++i
) {
1492 length8
=utf8Lengths
[i
];
1493 // ALL_CP_CONTAINED: Irrelevant string.
1494 if(length8
!=0 && spanBackUTF8Lengths
[i
]!=ALL_CP_CONTAINED
&& length8
<=pos
&& matches8(s
+pos
-length8
, s8
, length8
)) {
1495 return pos
; // There is a set element at pos.
1500 // The span(while not contained) ended on a string start/end which is
1501 // not in the original set. Skip this code point and continue.
1505 return 0; // Reached the start of the string.