1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2007-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: unisetspan.cpp
12 * tab size: 8 (not used)
15 * created on: 2007mar01
16 * created by: Markus W. Scherer
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22 #include "unicode/utf8.h"
23 #include "unicode/utf16.h"
26 #include "unisetspan.h"
31 * List of offsets from the current position from where to try matching
32 * a code point or a string.
33 * Store offsets rather than indexes to simplify the code and use the same list
34 * for both increments (in span()) and decrements (in spanBack()).
36 * Assumption: The maximum offset is limited, and the offsets that are stored
37 * at any one time are relatively dense, that is, there are normally no gaps of
38 * hundreds or thousands of offset values.
40 * The implementation uses a circular buffer of byte flags,
41 * each indicating whether the corresponding offset is in the list.
42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43 * physically moving part of the list.
45 * Note: In principle, the caller should setMaxLength() to the maximum of the
46 * max string length and U16_LENGTH/U8_LENGTH to account for
47 * "long" single code points.
48 * However, this implementation uses at least a staticList with more than
49 * U8_LENGTH entries anyway.
51 * Note: If maxLength were guaranteed to be no more than 32 or 64,
52 * the list could be stored as bit flags in a single integer.
53 * Rather than handling a circular buffer with a start list index,
54 * the integer would simply be shifted when lower offsets are removed.
55 * UnicodeSet does not have a limit on the lengths of strings.
57 class OffsetList
{ // Only ever stack-allocated, does not need to inherit UMemory.
59 OffsetList() : list(staticList
), capacity(0), length(0), start(0) {}
62 if(list
!=staticList
) {
67 // Call exactly once if the list is to be used.
68 void setMaxLength(int32_t maxLength
) {
69 if(maxLength
<=(int32_t)sizeof(staticList
)) {
70 capacity
=(int32_t)sizeof(staticList
);
72 UBool
*l
=(UBool
*)uprv_malloc(maxLength
);
78 uprv_memset(list
, 0, capacity
);
82 uprv_memset(list
, 0, capacity
);
86 UBool
isEmpty() const {
87 return (UBool
)(length
==0);
90 // Reduce all stored offsets by delta, used when the current position
92 // There must not be any offsets lower than delta.
93 // If there is an offset equal to delta, it is removed.
94 // delta=[1..maxLength]
95 void shift(int32_t delta
) {
96 int32_t i
=start
+delta
;
107 // Add an offset. The list must not contain it yet.
108 // offset=[1..maxLength]
109 void addOffset(int32_t offset
) {
110 int32_t i
=start
+offset
;
118 // offset=[1..maxLength]
119 UBool
containsOffset(int32_t offset
) const {
120 int32_t i
=start
+offset
;
127 // Find the lowest stored offset from a non-empty list, remove it,
128 // and reduce all other offsets by this minimum.
129 // Returns [1..maxLength].
130 int32_t popMinimum() {
131 // Look for the next offset in list[start+1..capacity-1].
132 int32_t i
=start
, result
;
133 while(++i
<capacity
) {
144 // Wrap around and look for the next offset in list[0..start].
145 // Since the list is not empty, there will be one.
146 result
=capacity
-start
;
163 UBool staticList
[16];
166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
168 getUTF8Length(const UChar
*s
, int32_t length
) {
169 UErrorCode errorCode
=U_ZERO_ERROR
;
171 u_strToUTF8(NULL
, 0, &length8
, s
, length
, &errorCode
);
172 if(U_SUCCESS(errorCode
) || errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
175 // The string contains an unpaired surrogate.
176 // Ignore this string.
181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
183 appendUTF8(const UChar
*s
, int32_t length
, uint8_t *t
, int32_t capacity
) {
184 UErrorCode errorCode
=U_ZERO_ERROR
;
186 u_strToUTF8((char *)t
, capacity
, &length8
, s
, length
, &errorCode
);
187 if(U_SUCCESS(errorCode
)) {
190 // The string contains an unpaired surrogate.
191 // Ignore this string.
196 static inline uint8_t
197 makeSpanLengthByte(int32_t spanLength
) {
198 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199 return spanLength
<0xfe ? (uint8_t)spanLength
: (uint8_t)0xfe;
202 // Construct for all variants of span(), or only for any one variant.
203 // Initialize as little as possible, for single use.
204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet
&set
,
205 const UVector
&setStrings
,
207 : spanSet(0, 0x10ffff), pSpanNotSet(NULL
), strings(setStrings
),
208 utf8Lengths(NULL
), spanLengths(NULL
), utf8(NULL
),
210 maxLength16(0), maxLength8(0),
211 all((UBool
)(which
==ALL
)) {
212 spanSet
.retainAll(set
);
213 if(which
&NOT_CONTAINED
) {
214 // Default to the same sets.
215 // addToSpanNotSet() will create a separate set if necessary.
216 pSpanNotSet
=&spanSet
;
219 // Determine if the strings even need to be taken into account at all for span() etc.
220 // If any string is relevant, then all strings need to be used for
221 // span(longest match) but only the relevant ones for span(while contained).
222 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226 int32_t stringsLength
=strings
.size();
228 int32_t i
, spanLength
;
229 UBool someRelevant
=FALSE
;
230 for(i
=0; i
<stringsLength
; ++i
) {
231 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
232 const UChar
*s16
=string
.getBuffer();
233 int32_t length16
=string
.length();
235 spanLength
=spanSet
.span(s16
, length16
, USET_SPAN_CONTAINED
);
236 if(spanLength
<length16
) { // Relevant string.
237 someRelevant
=thisRelevant
=TRUE
;
241 if((which
&UTF16
) && length16
>maxLength16
) {
242 maxLength16
=length16
;
244 if((which
&UTF8
) && (thisRelevant
|| (which
&CONTAINED
))) {
245 int32_t length8
=getUTF8Length(s16
, length16
);
247 if(length8
>maxLength8
) {
253 maxLength16
=maxLength8
=0;
257 // Freeze after checking for the need to use strings at all because freezing
258 // a set takes some time and memory which are wasted if there are no relevant strings.
263 uint8_t *spanBackLengths
;
264 uint8_t *spanUTF8Lengths
;
265 uint8_t *spanBackUTF8Lengths
;
267 // Allocate a block of meta data.
270 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
271 allocSize
=stringsLength
*(4+1+1+1+1)+utf8Length
;
273 allocSize
=stringsLength
; // One set of span lengths.
275 // UTF-8 lengths and UTF-8 strings.
276 allocSize
+=stringsLength
*4+utf8Length
;
279 if(allocSize
<=(int32_t)sizeof(staticLengths
)) {
280 utf8Lengths
=staticLengths
;
282 utf8Lengths
=(int32_t *)uprv_malloc(allocSize
);
283 if(utf8Lengths
==NULL
) {
284 maxLength16
=maxLength8
=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
285 return; // Out of memory.
290 // Store span lengths for all span() variants.
291 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
292 spanBackLengths
=spanLengths
+stringsLength
;
293 spanUTF8Lengths
=spanBackLengths
+stringsLength
;
294 spanBackUTF8Lengths
=spanUTF8Lengths
+stringsLength
;
295 utf8
=spanBackUTF8Lengths
+stringsLength
;
297 // Store span lengths for only one span() variant.
299 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
300 utf8
=spanLengths
+stringsLength
;
302 spanLengths
=(uint8_t *)utf8Lengths
;
304 spanBackLengths
=spanUTF8Lengths
=spanBackUTF8Lengths
=spanLengths
;
307 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
308 int32_t utf8Count
=0; // Count UTF-8 bytes written so far.
310 for(i
=0; i
<stringsLength
; ++i
) {
311 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
312 const UChar
*s16
=string
.getBuffer();
313 int32_t length16
=string
.length();
314 spanLength
=spanSet
.span(s16
, length16
, USET_SPAN_CONTAINED
);
315 if(spanLength
<length16
) { // Relevant string.
317 if(which
&CONTAINED
) {
319 spanLengths
[i
]=makeSpanLengthByte(spanLength
);
322 spanLength
=length16
-spanSet
.spanBack(s16
, length16
, USET_SPAN_CONTAINED
);
323 spanBackLengths
[i
]=makeSpanLengthByte(spanLength
);
325 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
326 spanLengths
[i
]=spanBackLengths
[i
]=0; // Only store a relevant/irrelevant flag.
330 uint8_t *s8
=utf8
+utf8Count
;
331 int32_t length8
=appendUTF8(s16
, length16
, s8
, utf8Length
-utf8Count
);
332 utf8Count
+=utf8Lengths
[i
]=length8
;
333 if(length8
==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
334 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=(uint8_t)ALL_CP_CONTAINED
;
335 } else { // Relevant for UTF-8.
336 if(which
&CONTAINED
) {
338 spanLength
=spanSet
.spanUTF8((const char *)s8
, length8
, USET_SPAN_CONTAINED
);
339 spanUTF8Lengths
[i
]=makeSpanLengthByte(spanLength
);
342 spanLength
=length8
-spanSet
.spanBackUTF8((const char *)s8
, length8
, USET_SPAN_CONTAINED
);
343 spanBackUTF8Lengths
[i
]=makeSpanLengthByte(spanLength
);
345 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
346 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=0; // Only store a relevant/irrelevant flag.
350 if(which
&NOT_CONTAINED
) {
351 // Add string start and end code points to the spanNotSet so that
352 // a span(while not contained) stops before any string.
356 U16_NEXT(s16
, len
, length16
, c
);
360 int32_t len
=length16
;
361 U16_PREV(s16
, 0, len
, c
);
365 } else { // Irrelevant string.
367 if(which
&CONTAINED
) { // Only necessary for LONGEST_MATCH.
368 uint8_t *s8
=utf8
+utf8Count
;
369 int32_t length8
=appendUTF8(s16
, length16
, s8
, utf8Length
-utf8Count
);
370 utf8Count
+=utf8Lengths
[i
]=length8
;
376 spanLengths
[i
]=spanBackLengths
[i
]=
377 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=
378 (uint8_t)ALL_CP_CONTAINED
;
380 // All spanXYZLengths pointers contain the same address.
381 spanLengths
[i
]=(uint8_t)ALL_CP_CONTAINED
;
388 pSpanNotSet
->freeze();
392 // Copy constructor. Assumes which==ALL for a frozen set.
393 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan
&otherStringSpan
,
394 const UVector
&newParentSetStrings
)
395 : spanSet(otherStringSpan
.spanSet
), pSpanNotSet(NULL
), strings(newParentSetStrings
),
396 utf8Lengths(NULL
), spanLengths(NULL
), utf8(NULL
),
397 utf8Length(otherStringSpan
.utf8Length
),
398 maxLength16(otherStringSpan
.maxLength16
), maxLength8(otherStringSpan
.maxLength8
),
400 if(otherStringSpan
.pSpanNotSet
==&otherStringSpan
.spanSet
) {
401 pSpanNotSet
=&spanSet
;
403 pSpanNotSet
=(UnicodeSet
*)otherStringSpan
.pSpanNotSet
->clone();
406 // Allocate a block of meta data.
407 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
408 int32_t stringsLength
=strings
.size();
409 int32_t allocSize
=stringsLength
*(4+1+1+1+1)+utf8Length
;
410 if(allocSize
<=(int32_t)sizeof(staticLengths
)) {
411 utf8Lengths
=staticLengths
;
413 utf8Lengths
=(int32_t *)uprv_malloc(allocSize
);
414 if(utf8Lengths
==NULL
) {
415 maxLength16
=maxLength8
=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
416 return; // Out of memory.
420 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
421 utf8
=spanLengths
+stringsLength
*4;
422 uprv_memcpy(utf8Lengths
, otherStringSpan
.utf8Lengths
, allocSize
);
425 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
426 if(pSpanNotSet
!=NULL
&& pSpanNotSet
!=&spanSet
) {
429 if(utf8Lengths
!=NULL
&& utf8Lengths
!=staticLengths
) {
430 uprv_free(utf8Lengths
);
434 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c
) {
435 if(pSpanNotSet
==NULL
|| pSpanNotSet
==&spanSet
) {
436 if(spanSet
.contains(c
)) {
437 return; // Nothing to do.
439 UnicodeSet
*newSet
=(UnicodeSet
*)spanSet
.cloneAsThawed();
441 return; // Out of memory.
449 // Compare strings without any argument checks. Requires length>0.
451 matches16(const UChar
*s
, const UChar
*t
, int32_t length
) {
461 matches8(const uint8_t *s
, const uint8_t *t
, int32_t length
) {
470 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
471 // at code point boundaries.
472 // That is, each edge of a match must not be in the middle of a surrogate pair.
474 matches16CPB(const UChar
*s
, int32_t start
, int32_t limit
, const UChar
*t
, int32_t length
) {
477 return matches16(s
, t
, length
) &&
478 !(0<start
&& U16_IS_LEAD(s
[-1]) && U16_IS_TRAIL(s
[0])) &&
479 !(length
<limit
&& U16_IS_LEAD(s
[length
-1]) && U16_IS_TRAIL(s
[length
]));
482 // Does the set contain the next code point?
483 // If so, return its length; otherwise return its negative length.
484 static inline int32_t
485 spanOne(const UnicodeSet
&set
, const UChar
*s
, int32_t length
) {
487 if(c
>=0xd800 && c
<=0xdbff && length
>=2 && U16_IS_TRAIL(c2
=s
[1])) {
488 return set
.contains(U16_GET_SUPPLEMENTARY(c
, c2
)) ? 2 : -2;
490 return set
.contains(c
) ? 1 : -1;
493 static inline int32_t
494 spanOneBack(const UnicodeSet
&set
, const UChar
*s
, int32_t length
) {
495 UChar c
=s
[length
-1], c2
;
496 if(c
>=0xdc00 && c
<=0xdfff && length
>=2 && U16_IS_LEAD(c2
=s
[length
-2])) {
497 return set
.contains(U16_GET_SUPPLEMENTARY(c2
, c
)) ? 2 : -2;
499 return set
.contains(c
) ? 1 : -1;
502 static inline int32_t
503 spanOneUTF8(const UnicodeSet
&set
, const uint8_t *s
, int32_t length
) {
506 return set
.contains(c
) ? 1 : -1;
508 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
510 U8_NEXT_OR_FFFD(s
, i
, length
, c
);
511 return set
.contains(c
) ? i
: -i
;
514 static inline int32_t
515 spanOneBackUTF8(const UnicodeSet
&set
, const uint8_t *s
, int32_t length
) {
516 UChar32 c
=s
[length
-1];
518 return set
.contains(c
) ? 1 : -1;
521 c
=utf8_prevCharSafeBody(s
, 0, &i
, c
, -3);
523 return set
.contains(c
) ? length
: -length
;
527 * Note: In span() when spanLength==0 (after a string match, or at the beginning
528 * after an empty code point span) and in spanNot() and spanNotUTF8(),
529 * string matching could use a binary search
530 * because all string matches are done from the same start index.
532 * For UTF-8, this would require a comparison function that returns UTF-16 order.
534 * This optimization should not be necessary for normal UnicodeSets because
535 * most sets have no strings, and most sets with strings have
536 * very few very short strings.
537 * For cases with many strings, it might be better to use a different API
538 * and implementation with a DFA (state machine).
542 * Algorithm for span(USET_SPAN_CONTAINED)
544 * Theoretical algorithm:
545 * - Iterate through the string, and at each code point boundary:
546 * + If the code point there is in the set, then remember to continue after it.
547 * + If a set string matches at the current position, then remember to continue after it.
548 * + Either recursively span for each code point or string match,
549 * or recursively span for all but the shortest one and
550 * iteratively continue the span with the shortest local match.
551 * + Remember the longest recursive span (the farthest end point).
552 * + If there is no match at the current position, neither for the code point there
553 * nor for any set string, then stop and return the longest recursive span length.
555 * Optimized implementation:
557 * (We assume that most sets will have very few very short strings.
558 * A span using a string-less set is extremely fast.)
560 * Create and cache a spanSet which contains all of the single code points
561 * of the original set but none of its strings.
563 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
565 * + Try to match each set string at the end of the spanLength.
566 * ~ Set strings that start with set-contained code points must be matched
567 * with a partial overlap because the recursive algorithm would have tried
568 * to match them at every position.
569 * ~ Set strings that entirely consist of set-contained code points
570 * are irrelevant for span(USET_SPAN_CONTAINED) because the
571 * recursive algorithm would continue after them anyway
572 * and find the longest recursive match from their end.
573 * ~ Rather than recursing, note each end point of a set string match.
574 * + If no set string matched after spanSet.span(), then return
575 * with where the spanSet.span() ended.
576 * + If at least one set string matched after spanSet.span(), then
577 * pop the shortest string match end point and continue
578 * the loop, trying to match all set strings from there.
579 * + If at least one more set string matched after a previous string match,
580 * then test if the code point after the previous string match is also
581 * contained in the set.
582 * Continue the loop with the shortest end point of either this code point
583 * or a matching set string.
584 * + If no more set string matched after a previous string match,
585 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
586 * Stop if spanLength==0, otherwise continue the loop.
588 * By noting each end point of a set string match,
589 * the function visits each string position at most once and finishes
592 * The recursive algorithm may visit the same string position many times
593 * if multiple paths lead to it and finishes in exponential time.
597 * Algorithm for span(USET_SPAN_SIMPLE)
599 * Theoretical algorithm:
600 * - Iterate through the string, and at each code point boundary:
601 * + If the code point there is in the set, then remember to continue after it.
602 * + If a set string matches at the current position, then remember to continue after it.
603 * + Continue from the farthest match position and ignore all others.
604 * + If there is no match at the current position,
605 * then stop and return the current position.
607 * Optimized implementation:
609 * (Same assumption and spanSet as above.)
611 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
613 * + Try to match each set string at the end of the spanLength.
614 * ~ Set strings that start with set-contained code points must be matched
615 * with a partial overlap because the standard algorithm would have tried
616 * to match them earlier.
617 * ~ Set strings that entirely consist of set-contained code points
618 * must be matched with a full overlap because the longest-match algorithm
619 * would hide set string matches that end earlier.
620 * Such set strings need not be matched earlier inside the code point span
621 * because the standard algorithm would then have continued after
622 * the set string match anyway.
623 * ~ Remember the longest set string match (farthest end point) from the earliest
625 * + If no set string matched after spanSet.span(), then return
626 * with where the spanSet.span() ended.
627 * + If at least one set string matched, then continue the loop after the
628 * longest match from the earliest position.
629 * + If no more set string matched after a previous string match,
630 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
631 * Stop if spanLength==0, otherwise continue the loop.
634 int32_t UnicodeSetStringSpan::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
635 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
636 return spanNot(s
, length
);
638 int32_t spanLength
=spanSet
.span(s
, length
, USET_SPAN_CONTAINED
);
639 if(spanLength
==length
) {
643 // Consider strings; they may overlap with the span.
645 if(spanCondition
==USET_SPAN_CONTAINED
) {
646 // Use offset list to try all possibilities.
647 offsets
.setMaxLength(maxLength16
);
649 int32_t pos
=spanLength
, rest
=length
-pos
;
650 int32_t i
, stringsLength
=strings
.size();
652 if(spanCondition
==USET_SPAN_CONTAINED
) {
653 for(i
=0; i
<stringsLength
; ++i
) {
654 int32_t overlap
=spanLengths
[i
];
655 if(overlap
==ALL_CP_CONTAINED
) {
656 continue; // Irrelevant string.
658 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
659 const UChar
*s16
=string
.getBuffer();
660 int32_t length16
=string
.length();
662 // Try to match this string at pos-overlap..pos.
663 if(overlap
>=LONG_SPAN
) {
665 // While contained: No point matching fully inside the code point span.
666 U16_BACK_1(s16
, 0, overlap
); // Length of the string minus the last code point.
668 if(overlap
>spanLength
) {
671 int32_t inc
=length16
-overlap
; // Keep overlap+inc==length16.
676 // Try to match if the increment is not listed already.
677 if(!offsets
.containsOffset(inc
) && matches16CPB(s
, pos
-overlap
, length
, s16
, length16
)) {
679 return length
; // Reached the end of the string.
681 offsets
.addOffset(inc
);
690 } else /* USET_SPAN_SIMPLE */ {
691 int32_t maxInc
=0, maxOverlap
=0;
692 for(i
=0; i
<stringsLength
; ++i
) {
693 int32_t overlap
=spanLengths
[i
];
694 // For longest match, we do need to try to match even an all-contained string
695 // to find the match from the earliest start.
697 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
698 const UChar
*s16
=string
.getBuffer();
699 int32_t length16
=string
.length();
701 // Try to match this string at pos-overlap..pos.
702 if(overlap
>=LONG_SPAN
) {
704 // Longest match: Need to match fully inside the code point span
705 // to find the match from the earliest start.
707 if(overlap
>spanLength
) {
710 int32_t inc
=length16
-overlap
; // Keep overlap+inc==length16.
712 if(inc
>rest
|| overlap
<maxOverlap
) {
715 // Try to match if the string is longer or starts earlier.
716 if( (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ inc
>maxInc
) &&
717 matches16CPB(s
, pos
-overlap
, length
, s16
, length16
)
719 maxInc
=inc
; // Longest match from earliest start.
728 if(maxInc
!=0 || maxOverlap
!=0) {
729 // Longest-match algorithm, and there was a string match.
730 // Simply continue after it.
734 return length
; // Reached the end of the string.
736 spanLength
=0; // Match strings from after a string match.
740 // Finished trying to match all strings at pos.
742 if(spanLength
!=0 || pos
==0) {
743 // The position is after an unlimited code point span (spanLength!=0),
744 // not after a string match.
745 // The only position where spanLength==0 after a span is pos==0.
746 // Otherwise, an unlimited code point span is only tried again when no
747 // strings match, and if such a non-initial span fails we stop.
748 if(offsets
.isEmpty()) {
749 return pos
; // No strings matched after a span.
751 // Match strings from after the next string match.
753 // The position is after a string match (or a single code point).
754 if(offsets
.isEmpty()) {
755 // No more strings matched after a previous string match.
756 // Try another code point span from after the last string match.
757 spanLength
=spanSet
.span(s
+pos
, rest
, USET_SPAN_CONTAINED
);
758 if( spanLength
==rest
|| // Reached the end of the string, or
759 spanLength
==0 // neither strings nor span progressed.
761 return pos
+spanLength
;
765 continue; // spanLength>0: Match strings from after a span.
767 // Try to match only one code point from after a string match if some
768 // string matched beyond it, so that we try all possible positions
769 // and don't overshoot.
770 spanLength
=spanOne(spanSet
, s
+pos
, rest
);
772 if(spanLength
==rest
) {
773 return length
; // Reached the end of the string.
775 // Match strings after this code point.
776 // There cannot be any increments below it because UnicodeSet strings
777 // contain multiple code points.
780 offsets
.shift(spanLength
);
782 continue; // Match strings from after a single code point.
784 // Match strings from after the next string match.
787 int32_t minOffset
=offsets
.popMinimum();
790 spanLength
=0; // Match strings from after a string match.
794 int32_t UnicodeSetStringSpan::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
795 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
796 return spanNotBack(s
, length
);
798 int32_t pos
=spanSet
.spanBack(s
, length
, USET_SPAN_CONTAINED
);
802 int32_t spanLength
=length
-pos
;
804 // Consider strings; they may overlap with the span.
806 if(spanCondition
==USET_SPAN_CONTAINED
) {
807 // Use offset list to try all possibilities.
808 offsets
.setMaxLength(maxLength16
);
810 int32_t i
, stringsLength
=strings
.size();
811 uint8_t *spanBackLengths
=spanLengths
;
813 spanBackLengths
+=stringsLength
;
816 if(spanCondition
==USET_SPAN_CONTAINED
) {
817 for(i
=0; i
<stringsLength
; ++i
) {
818 int32_t overlap
=spanBackLengths
[i
];
819 if(overlap
==ALL_CP_CONTAINED
) {
820 continue; // Irrelevant string.
822 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
823 const UChar
*s16
=string
.getBuffer();
824 int32_t length16
=string
.length();
826 // Try to match this string at pos-(length16-overlap)..pos-length16.
827 if(overlap
>=LONG_SPAN
) {
829 // While contained: No point matching fully inside the code point span.
831 U16_FWD_1(s16
, len1
, overlap
);
832 overlap
-=len1
; // Length of the string minus the first code point.
834 if(overlap
>spanLength
) {
837 int32_t dec
=length16
-overlap
; // Keep dec+overlap==length16.
842 // Try to match if the decrement is not listed already.
843 if(!offsets
.containsOffset(dec
) && matches16CPB(s
, pos
-dec
, length
, s16
, length16
)) {
845 return 0; // Reached the start of the string.
847 offsets
.addOffset(dec
);
856 } else /* USET_SPAN_SIMPLE */ {
857 int32_t maxDec
=0, maxOverlap
=0;
858 for(i
=0; i
<stringsLength
; ++i
) {
859 int32_t overlap
=spanBackLengths
[i
];
860 // For longest match, we do need to try to match even an all-contained string
861 // to find the match from the latest end.
863 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
864 const UChar
*s16
=string
.getBuffer();
865 int32_t length16
=string
.length();
867 // Try to match this string at pos-(length16-overlap)..pos-length16.
868 if(overlap
>=LONG_SPAN
) {
870 // Longest match: Need to match fully inside the code point span
871 // to find the match from the latest end.
873 if(overlap
>spanLength
) {
876 int32_t dec
=length16
-overlap
; // Keep dec+overlap==length16.
878 if(dec
>pos
|| overlap
<maxOverlap
) {
881 // Try to match if the string is longer or ends later.
882 if( (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ dec
>maxDec
) &&
883 matches16CPB(s
, pos
-dec
, length
, s16
, length16
)
885 maxDec
=dec
; // Longest match from latest end.
894 if(maxDec
!=0 || maxOverlap
!=0) {
895 // Longest-match algorithm, and there was a string match.
896 // Simply continue before it.
899 return 0; // Reached the start of the string.
901 spanLength
=0; // Match strings from before a string match.
905 // Finished trying to match all strings at pos.
907 if(spanLength
!=0 || pos
==length
) {
908 // The position is before an unlimited code point span (spanLength!=0),
909 // not before a string match.
910 // The only position where spanLength==0 before a span is pos==length.
911 // Otherwise, an unlimited code point span is only tried again when no
912 // strings match, and if such a non-initial span fails we stop.
913 if(offsets
.isEmpty()) {
914 return pos
; // No strings matched before a span.
916 // Match strings from before the next string match.
918 // The position is before a string match (or a single code point).
919 if(offsets
.isEmpty()) {
920 // No more strings matched before a previous string match.
921 // Try another code point span from before the last string match.
923 pos
=spanSet
.spanBack(s
, oldPos
, USET_SPAN_CONTAINED
);
924 spanLength
=oldPos
-pos
;
925 if( pos
==0 || // Reached the start of the string, or
926 spanLength
==0 // neither strings nor span progressed.
930 continue; // spanLength>0: Match strings from before a span.
932 // Try to match only one code point from before a string match if some
933 // string matched beyond it, so that we try all possible positions
934 // and don't overshoot.
935 spanLength
=spanOneBack(spanSet
, s
, pos
);
937 if(spanLength
==pos
) {
938 return 0; // Reached the start of the string.
940 // Match strings before this code point.
941 // There cannot be any decrements below it because UnicodeSet strings
942 // contain multiple code points.
944 offsets
.shift(spanLength
);
946 continue; // Match strings from before a single code point.
948 // Match strings from before the next string match.
951 pos
-=offsets
.popMinimum();
952 spanLength
=0; // Match strings from before a string match.
956 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
957 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
958 return spanNotUTF8(s
, length
);
960 int32_t spanLength
=spanSet
.spanUTF8((const char *)s
, length
, USET_SPAN_CONTAINED
);
961 if(spanLength
==length
) {
965 // Consider strings; they may overlap with the span.
967 if(spanCondition
==USET_SPAN_CONTAINED
) {
968 // Use offset list to try all possibilities.
969 offsets
.setMaxLength(maxLength8
);
971 int32_t pos
=spanLength
, rest
=length
-pos
;
972 int32_t i
, stringsLength
=strings
.size();
973 uint8_t *spanUTF8Lengths
=spanLengths
;
975 spanUTF8Lengths
+=2*stringsLength
;
978 const uint8_t *s8
=utf8
;
980 if(spanCondition
==USET_SPAN_CONTAINED
) {
981 for(i
=0; i
<stringsLength
; ++i
) {
982 length8
=utf8Lengths
[i
];
984 continue; // String not representable in UTF-8.
986 int32_t overlap
=spanUTF8Lengths
[i
];
987 if(overlap
==ALL_CP_CONTAINED
) {
989 continue; // Irrelevant string.
992 // Try to match this string at pos-overlap..pos.
993 if(overlap
>=LONG_SPAN
) {
995 // While contained: No point matching fully inside the code point span.
996 U8_BACK_1(s8
, 0, overlap
); // Length of the string minus the last code point.
998 if(overlap
>spanLength
) {
1001 int32_t inc
=length8
-overlap
; // Keep overlap+inc==length8.
1006 // Try to match if the increment is not listed already.
1007 // Match at code point boundaries. (The UTF-8 strings were converted
1008 // from UTF-16 and are guaranteed to be well-formed.)
1009 if( !U8_IS_TRAIL(s
[pos
-overlap
]) &&
1010 !offsets
.containsOffset(inc
) &&
1011 matches8(s
+pos
-overlap
, s8
, length8
)
1015 return length
; // Reached the end of the string.
1017 offsets
.addOffset(inc
);
1027 } else /* USET_SPAN_SIMPLE */ {
1028 int32_t maxInc
=0, maxOverlap
=0;
1029 for(i
=0; i
<stringsLength
; ++i
) {
1030 length8
=utf8Lengths
[i
];
1032 continue; // String not representable in UTF-8.
1034 int32_t overlap
=spanUTF8Lengths
[i
];
1035 // For longest match, we do need to try to match even an all-contained string
1036 // to find the match from the earliest start.
1038 // Try to match this string at pos-overlap..pos.
1039 if(overlap
>=LONG_SPAN
) {
1041 // Longest match: Need to match fully inside the code point span
1042 // to find the match from the earliest start.
1044 if(overlap
>spanLength
) {
1047 int32_t inc
=length8
-overlap
; // Keep overlap+inc==length8.
1049 if(inc
>rest
|| overlap
<maxOverlap
) {
1052 // Try to match if the string is longer or starts earlier.
1053 // Match at code point boundaries. (The UTF-8 strings were converted
1054 // from UTF-16 and are guaranteed to be well-formed.)
1055 if( !U8_IS_TRAIL(s
[pos
-overlap
]) &&
1056 (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ inc
>maxInc
) &&
1057 matches8(s
+pos
-overlap
, s8
, length8
)
1060 maxInc
=inc
; // Longest match from earliest start.
1070 if(maxInc
!=0 || maxOverlap
!=0) {
1071 // Longest-match algorithm, and there was a string match.
1072 // Simply continue after it.
1076 return length
; // Reached the end of the string.
1078 spanLength
=0; // Match strings from after a string match.
1082 // Finished trying to match all strings at pos.
1084 if(spanLength
!=0 || pos
==0) {
1085 // The position is after an unlimited code point span (spanLength!=0),
1086 // not after a string match.
1087 // The only position where spanLength==0 after a span is pos==0.
1088 // Otherwise, an unlimited code point span is only tried again when no
1089 // strings match, and if such a non-initial span fails we stop.
1090 if(offsets
.isEmpty()) {
1091 return pos
; // No strings matched after a span.
1093 // Match strings from after the next string match.
1095 // The position is after a string match (or a single code point).
1096 if(offsets
.isEmpty()) {
1097 // No more strings matched after a previous string match.
1098 // Try another code point span from after the last string match.
1099 spanLength
=spanSet
.spanUTF8((const char *)s
+pos
, rest
, USET_SPAN_CONTAINED
);
1100 if( spanLength
==rest
|| // Reached the end of the string, or
1101 spanLength
==0 // neither strings nor span progressed.
1103 return pos
+spanLength
;
1107 continue; // spanLength>0: Match strings from after a span.
1109 // Try to match only one code point from after a string match if some
1110 // string matched beyond it, so that we try all possible positions
1111 // and don't overshoot.
1112 spanLength
=spanOneUTF8(spanSet
, s
+pos
, rest
);
1114 if(spanLength
==rest
) {
1115 return length
; // Reached the end of the string.
1117 // Match strings after this code point.
1118 // There cannot be any increments below it because UnicodeSet strings
1119 // contain multiple code points.
1122 offsets
.shift(spanLength
);
1124 continue; // Match strings from after a single code point.
1126 // Match strings from after the next string match.
1129 int32_t minOffset
=offsets
.popMinimum();
1132 spanLength
=0; // Match strings from after a string match.
1136 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
1137 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
1138 return spanNotBackUTF8(s
, length
);
1140 int32_t pos
=spanSet
.spanBackUTF8((const char *)s
, length
, USET_SPAN_CONTAINED
);
1144 int32_t spanLength
=length
-pos
;
1146 // Consider strings; they may overlap with the span.
1148 if(spanCondition
==USET_SPAN_CONTAINED
) {
1149 // Use offset list to try all possibilities.
1150 offsets
.setMaxLength(maxLength8
);
1152 int32_t i
, stringsLength
=strings
.size();
1153 uint8_t *spanBackUTF8Lengths
=spanLengths
;
1155 spanBackUTF8Lengths
+=3*stringsLength
;
1158 const uint8_t *s8
=utf8
;
1160 if(spanCondition
==USET_SPAN_CONTAINED
) {
1161 for(i
=0; i
<stringsLength
; ++i
) {
1162 length8
=utf8Lengths
[i
];
1164 continue; // String not representable in UTF-8.
1166 int32_t overlap
=spanBackUTF8Lengths
[i
];
1167 if(overlap
==ALL_CP_CONTAINED
) {
1169 continue; // Irrelevant string.
1172 // Try to match this string at pos-(length8-overlap)..pos-length8.
1173 if(overlap
>=LONG_SPAN
) {
1175 // While contained: No point matching fully inside the code point span.
1177 U8_FWD_1(s8
, len1
, overlap
);
1178 overlap
-=len1
; // Length of the string minus the first code point.
1180 if(overlap
>spanLength
) {
1183 int32_t dec
=length8
-overlap
; // Keep dec+overlap==length8.
1188 // Try to match if the decrement is not listed already.
1189 // Match at code point boundaries. (The UTF-8 strings were converted
1190 // from UTF-16 and are guaranteed to be well-formed.)
1191 if( !U8_IS_TRAIL(s
[pos
-dec
]) &&
1192 !offsets
.containsOffset(dec
) &&
1193 matches8(s
+pos
-dec
, s8
, length8
)
1196 return 0; // Reached the start of the string.
1198 offsets
.addOffset(dec
);
1208 } else /* USET_SPAN_SIMPLE */ {
1209 int32_t maxDec
=0, maxOverlap
=0;
1210 for(i
=0; i
<stringsLength
; ++i
) {
1211 length8
=utf8Lengths
[i
];
1213 continue; // String not representable in UTF-8.
1215 int32_t overlap
=spanBackUTF8Lengths
[i
];
1216 // For longest match, we do need to try to match even an all-contained string
1217 // to find the match from the latest end.
1219 // Try to match this string at pos-(length8-overlap)..pos-length8.
1220 if(overlap
>=LONG_SPAN
) {
1222 // Longest match: Need to match fully inside the code point span
1223 // to find the match from the latest end.
1225 if(overlap
>spanLength
) {
1228 int32_t dec
=length8
-overlap
; // Keep dec+overlap==length8.
1230 if(dec
>pos
|| overlap
<maxOverlap
) {
1233 // Try to match if the string is longer or ends later.
1234 // Match at code point boundaries. (The UTF-8 strings were converted
1235 // from UTF-16 and are guaranteed to be well-formed.)
1236 if( !U8_IS_TRAIL(s
[pos
-dec
]) &&
1237 (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ dec
>maxDec
) &&
1238 matches8(s
+pos
-dec
, s8
, length8
)
1240 maxDec
=dec
; // Longest match from latest end.
1250 if(maxDec
!=0 || maxOverlap
!=0) {
1251 // Longest-match algorithm, and there was a string match.
1252 // Simply continue before it.
1255 return 0; // Reached the start of the string.
1257 spanLength
=0; // Match strings from before a string match.
1261 // Finished trying to match all strings at pos.
1263 if(spanLength
!=0 || pos
==length
) {
1264 // The position is before an unlimited code point span (spanLength!=0),
1265 // not before a string match.
1266 // The only position where spanLength==0 before a span is pos==length.
1267 // Otherwise, an unlimited code point span is only tried again when no
1268 // strings match, and if such a non-initial span fails we stop.
1269 if(offsets
.isEmpty()) {
1270 return pos
; // No strings matched before a span.
1272 // Match strings from before the next string match.
1274 // The position is before a string match (or a single code point).
1275 if(offsets
.isEmpty()) {
1276 // No more strings matched before a previous string match.
1277 // Try another code point span from before the last string match.
1279 pos
=spanSet
.spanBackUTF8((const char *)s
, oldPos
, USET_SPAN_CONTAINED
);
1280 spanLength
=oldPos
-pos
;
1281 if( pos
==0 || // Reached the start of the string, or
1282 spanLength
==0 // neither strings nor span progressed.
1286 continue; // spanLength>0: Match strings from before a span.
1288 // Try to match only one code point from before a string match if some
1289 // string matched beyond it, so that we try all possible positions
1290 // and don't overshoot.
1291 spanLength
=spanOneBackUTF8(spanSet
, s
, pos
);
1293 if(spanLength
==pos
) {
1294 return 0; // Reached the start of the string.
1296 // Match strings before this code point.
1297 // There cannot be any decrements below it because UnicodeSet strings
1298 // contain multiple code points.
1300 offsets
.shift(spanLength
);
1302 continue; // Match strings from before a single code point.
1304 // Match strings from before the next string match.
1307 pos
-=offsets
.popMinimum();
1308 spanLength
=0; // Match strings from before a string match.
1313 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1315 * Theoretical algorithm:
1316 * - Iterate through the string, and at each code point boundary:
1317 * + If the code point there is in the set, then return with the current position.
1318 * + If a set string matches at the current position, then return with the current position.
1320 * Optimized implementation:
1322 * (Same assumption as for span() above.)
1324 * Create and cache a spanNotSet which contains all of the single code points
1325 * of the original set but none of its strings.
1326 * For each set string add its initial code point to the spanNotSet.
1327 * (Also add its final code point for spanNotBack().)
1330 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1331 * + If the current code point is in the original set, then
1332 * return the current position.
1333 * + If any set string matches at the current position, then
1334 * return the current position.
1335 * + If there is no match at the current position, neither for the code point there
1336 * nor for any set string, then skip this code point and continue the loop.
1337 * This happens for set-string-initial code points that were added to spanNotSet
1338 * when there is not actually a match for such a set string.
1341 int32_t UnicodeSetStringSpan::spanNot(const UChar
*s
, int32_t length
) const {
1342 int32_t pos
=0, rest
=length
;
1343 int32_t i
, stringsLength
=strings
.size();
1345 // Span until we find a code point from the set,
1346 // or a code point that starts or ends some string.
1347 i
=pSpanNotSet
->span(s
+pos
, rest
, USET_SPAN_NOT_CONTAINED
);
1349 return length
; // Reached the end of the string.
1354 // Check whether the current code point is in the original set,
1355 // without the string starts and ends.
1356 int32_t cpLength
=spanOne(spanSet
, s
+pos
, rest
);
1358 return pos
; // There is a set element at pos.
1361 // Try to match the strings at pos.
1362 for(i
=0; i
<stringsLength
; ++i
) {
1363 if(spanLengths
[i
]==ALL_CP_CONTAINED
) {
1364 continue; // Irrelevant string.
1366 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
1367 const UChar
*s16
=string
.getBuffer();
1368 int32_t length16
=string
.length();
1369 if(length16
<=rest
&& matches16CPB(s
, pos
, length
, s16
, length16
)) {
1370 return pos
; // There is a set element at pos.
1374 // The span(while not contained) ended on a string start/end which is
1375 // not in the original set. Skip this code point and continue.
1380 return length
; // Reached the end of the string.
1383 int32_t UnicodeSetStringSpan::spanNotBack(const UChar
*s
, int32_t length
) const {
1385 int32_t i
, stringsLength
=strings
.size();
1387 // Span until we find a code point from the set,
1388 // or a code point that starts or ends some string.
1389 pos
=pSpanNotSet
->spanBack(s
, pos
, USET_SPAN_NOT_CONTAINED
);
1391 return 0; // Reached the start of the string.
1394 // Check whether the current code point is in the original set,
1395 // without the string starts and ends.
1396 int32_t cpLength
=spanOneBack(spanSet
, s
, pos
);
1398 return pos
; // There is a set element at pos.
1401 // Try to match the strings at pos.
1402 for(i
=0; i
<stringsLength
; ++i
) {
1403 // Use spanLengths rather than a spanBackLengths pointer because
1404 // it is easier and we only need to know whether the string is irrelevant
1405 // which is the same in either array.
1406 if(spanLengths
[i
]==ALL_CP_CONTAINED
) {
1407 continue; // Irrelevant string.
1409 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
1410 const UChar
*s16
=string
.getBuffer();
1411 int32_t length16
=string
.length();
1412 if(length16
<=pos
&& matches16CPB(s
, pos
-length16
, length
, s16
, length16
)) {
1413 return pos
; // There is a set element at pos.
1417 // The span(while not contained) ended on a string start/end which is
1418 // not in the original set. Skip this code point and continue.
1422 return 0; // Reached the start of the string.
1425 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s
, int32_t length
) const {
1426 int32_t pos
=0, rest
=length
;
1427 int32_t i
, stringsLength
=strings
.size();
1428 uint8_t *spanUTF8Lengths
=spanLengths
;
1430 spanUTF8Lengths
+=2*stringsLength
;
1433 // Span until we find a code point from the set,
1434 // or a code point that starts or ends some string.
1435 i
=pSpanNotSet
->spanUTF8((const char *)s
+pos
, rest
, USET_SPAN_NOT_CONTAINED
);
1437 return length
; // Reached the end of the string.
1442 // Check whether the current code point is in the original set,
1443 // without the string starts and ends.
1444 int32_t cpLength
=spanOneUTF8(spanSet
, s
+pos
, rest
);
1446 return pos
; // There is a set element at pos.
1449 // Try to match the strings at pos.
1450 const uint8_t *s8
=utf8
;
1452 for(i
=0; i
<stringsLength
; ++i
) {
1453 length8
=utf8Lengths
[i
];
1454 // ALL_CP_CONTAINED: Irrelevant string.
1455 if(length8
!=0 && spanUTF8Lengths
[i
]!=ALL_CP_CONTAINED
&& length8
<=rest
&& matches8(s
+pos
, s8
, length8
)) {
1456 return pos
; // There is a set element at pos.
1461 // The span(while not contained) ended on a string start/end which is
1462 // not in the original set. Skip this code point and continue.
1467 return length
; // Reached the end of the string.
1470 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s
, int32_t length
) const {
1472 int32_t i
, stringsLength
=strings
.size();
1473 uint8_t *spanBackUTF8Lengths
=spanLengths
;
1475 spanBackUTF8Lengths
+=3*stringsLength
;
1478 // Span until we find a code point from the set,
1479 // or a code point that starts or ends some string.
1480 pos
=pSpanNotSet
->spanBackUTF8((const char *)s
, pos
, USET_SPAN_NOT_CONTAINED
);
1482 return 0; // Reached the start of the string.
1485 // Check whether the current code point is in the original set,
1486 // without the string starts and ends.
1487 int32_t cpLength
=spanOneBackUTF8(spanSet
, s
, pos
);
1489 return pos
; // There is a set element at pos.
1492 // Try to match the strings at pos.
1493 const uint8_t *s8
=utf8
;
1495 for(i
=0; i
<stringsLength
; ++i
) {
1496 length8
=utf8Lengths
[i
];
1497 // ALL_CP_CONTAINED: Irrelevant string.
1498 if(length8
!=0 && spanBackUTF8Lengths
[i
]!=ALL_CP_CONTAINED
&& length8
<=pos
&& matches8(s
+pos
-length8
, s8
, length8
)) {
1499 return pos
; // There is a set element at pos.
1504 // The span(while not contained) ended on a string start/end which is
1505 // not in the original set. Skip this code point and continue.
1509 return 0; // Reached the start of the string.