2 ******************************************************************************
4 * Copyright (C) 2007-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: unisetspan.cpp
10 * tab size: 8 (not used)
13 * created on: 2007mar01
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
20 #include "unicode/utf8.h"
21 #include "unicode/utf16.h"
24 #include "unisetspan.h"
29 * List of offsets from the current position from where to try matching
30 * a code point or a string.
31 * Store offsets rather than indexes to simplify the code and use the same list
32 * for both increments (in span()) and decrements (in spanBack()).
34 * Assumption: The maximum offset is limited, and the offsets that are stored
35 * at any one time are relatively dense, that is, there are normally no gaps of
36 * hundreds or thousands of offset values.
38 * The implementation uses a circular buffer of byte flags,
39 * each indicating whether the corresponding offset is in the list.
40 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
41 * physically moving part of the list.
43 * Note: In principle, the caller should setMaxLength() to the maximum of the
44 * max string length and U16_LENGTH/U8_LENGTH to account for
45 * "long" single code points.
46 * However, this implementation uses at least a staticList with more than
47 * U8_LENGTH entries anyway.
49 * Note: If maxLength were guaranteed to be no more than 32 or 64,
50 * the list could be stored as bit flags in a single integer.
51 * Rather than handling a circular buffer with a start list index,
52 * the integer would simply be shifted when lower offsets are removed.
53 * UnicodeSet does not have a limit on the lengths of strings.
55 class OffsetList
{ // Only ever stack-allocated, does not need to inherit UMemory.
57 OffsetList() : list(staticList
), capacity(0), length(0), start(0) {}
60 if(list
!=staticList
) {
65 // Call exactly once if the list is to be used.
66 void setMaxLength(int32_t maxLength
) {
67 if(maxLength
<=(int32_t)sizeof(staticList
)) {
68 capacity
=(int32_t)sizeof(staticList
);
70 UBool
*l
=(UBool
*)uprv_malloc(maxLength
);
76 uprv_memset(list
, 0, capacity
);
80 uprv_memset(list
, 0, capacity
);
84 UBool
isEmpty() const {
85 return (UBool
)(length
==0);
88 // Reduce all stored offsets by delta, used when the current position
90 // There must not be any offsets lower than delta.
91 // If there is an offset equal to delta, it is removed.
92 // delta=[1..maxLength]
93 void shift(int32_t delta
) {
94 int32_t i
=start
+delta
;
105 // Add an offset. The list must not contain it yet.
106 // offset=[1..maxLength]
107 void addOffset(int32_t offset
) {
108 int32_t i
=start
+offset
;
116 // offset=[1..maxLength]
117 UBool
containsOffset(int32_t offset
) const {
118 int32_t i
=start
+offset
;
125 // Find the lowest stored offset from a non-empty list, remove it,
126 // and reduce all other offsets by this minimum.
127 // Returns [1..maxLength].
128 int32_t popMinimum() {
129 // Look for the next offset in list[start+1..capacity-1].
130 int32_t i
=start
, result
;
131 while(++i
<capacity
) {
142 // Wrap around and look for the next offset in list[0..start].
143 // Since the list is not empty, there will be one.
144 result
=capacity
-start
;
161 UBool staticList
[16];
164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
166 getUTF8Length(const UChar
*s
, int32_t length
) {
167 UErrorCode errorCode
=U_ZERO_ERROR
;
169 u_strToUTF8(NULL
, 0, &length8
, s
, length
, &errorCode
);
170 if(U_SUCCESS(errorCode
) || errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
173 // The string contains an unpaired surrogate.
174 // Ignore this string.
179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
181 appendUTF8(const UChar
*s
, int32_t length
, uint8_t *t
, int32_t capacity
) {
182 UErrorCode errorCode
=U_ZERO_ERROR
;
184 u_strToUTF8((char *)t
, capacity
, &length8
, s
, length
, &errorCode
);
185 if(U_SUCCESS(errorCode
)) {
188 // The string contains an unpaired surrogate.
189 // Ignore this string.
194 static inline uint8_t
195 makeSpanLengthByte(int32_t spanLength
) {
196 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
197 return spanLength
<0xfe ? (uint8_t)spanLength
: (uint8_t)0xfe;
200 // Construct for all variants of span(), or only for any one variant.
201 // Initialize as little as possible, for single use.
202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet
&set
,
203 const UVector
&setStrings
,
205 : spanSet(0, 0x10ffff), pSpanNotSet(NULL
), strings(setStrings
),
206 utf8Lengths(NULL
), spanLengths(NULL
), utf8(NULL
),
208 maxLength16(0), maxLength8(0),
209 all((UBool
)(which
==ALL
)) {
210 spanSet
.retainAll(set
);
211 if(which
&NOT_CONTAINED
) {
212 // Default to the same sets.
213 // addToSpanNotSet() will create a separate set if necessary.
214 pSpanNotSet
=&spanSet
;
217 // Determine if the strings even need to be taken into account at all for span() etc.
218 // If any string is relevant, then all strings need to be used for
219 // span(longest match) but only the relevant ones for span(while contained).
220 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
221 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
222 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
223 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
224 int32_t stringsLength
=strings
.size();
226 int32_t i
, spanLength
;
227 UBool someRelevant
=FALSE
;
228 for(i
=0; i
<stringsLength
; ++i
) {
229 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
230 const UChar
*s16
=string
.getBuffer();
231 int32_t length16
=string
.length();
233 spanLength
=spanSet
.span(s16
, length16
, USET_SPAN_CONTAINED
);
234 if(spanLength
<length16
) { // Relevant string.
235 someRelevant
=thisRelevant
=TRUE
;
239 if((which
&UTF16
) && length16
>maxLength16
) {
240 maxLength16
=length16
;
242 if((which
&UTF8
) && (thisRelevant
|| (which
&CONTAINED
))) {
243 int32_t length8
=getUTF8Length(s16
, length16
);
245 if(length8
>maxLength8
) {
251 maxLength16
=maxLength8
=0;
255 // Freeze after checking for the need to use strings at all because freezing
256 // a set takes some time and memory which are wasted if there are no relevant strings.
261 uint8_t *spanBackLengths
;
262 uint8_t *spanUTF8Lengths
;
263 uint8_t *spanBackUTF8Lengths
;
265 // Allocate a block of meta data.
268 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
269 allocSize
=stringsLength
*(4+1+1+1+1)+utf8Length
;
271 allocSize
=stringsLength
; // One set of span lengths.
273 // UTF-8 lengths and UTF-8 strings.
274 allocSize
+=stringsLength
*4+utf8Length
;
277 if(allocSize
<=(int32_t)sizeof(staticLengths
)) {
278 utf8Lengths
=staticLengths
;
280 utf8Lengths
=(int32_t *)uprv_malloc(allocSize
);
281 if(utf8Lengths
==NULL
) {
282 maxLength16
=maxLength8
=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
283 return; // Out of memory.
288 // Store span lengths for all span() variants.
289 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
290 spanBackLengths
=spanLengths
+stringsLength
;
291 spanUTF8Lengths
=spanBackLengths
+stringsLength
;
292 spanBackUTF8Lengths
=spanUTF8Lengths
+stringsLength
;
293 utf8
=spanBackUTF8Lengths
+stringsLength
;
295 // Store span lengths for only one span() variant.
297 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
298 utf8
=spanLengths
+stringsLength
;
300 spanLengths
=(uint8_t *)utf8Lengths
;
302 spanBackLengths
=spanUTF8Lengths
=spanBackUTF8Lengths
=spanLengths
;
305 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
306 int32_t utf8Count
=0; // Count UTF-8 bytes written so far.
308 for(i
=0; i
<stringsLength
; ++i
) {
309 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
310 const UChar
*s16
=string
.getBuffer();
311 int32_t length16
=string
.length();
312 spanLength
=spanSet
.span(s16
, length16
, USET_SPAN_CONTAINED
);
313 if(spanLength
<length16
) { // Relevant string.
315 if(which
&CONTAINED
) {
317 spanLengths
[i
]=makeSpanLengthByte(spanLength
);
320 spanLength
=length16
-spanSet
.spanBack(s16
, length16
, USET_SPAN_CONTAINED
);
321 spanBackLengths
[i
]=makeSpanLengthByte(spanLength
);
323 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
324 spanLengths
[i
]=spanBackLengths
[i
]=0; // Only store a relevant/irrelevant flag.
328 uint8_t *s8
=utf8
+utf8Count
;
329 int32_t length8
=appendUTF8(s16
, length16
, s8
, utf8Length
-utf8Count
);
330 utf8Count
+=utf8Lengths
[i
]=length8
;
331 if(length8
==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
332 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=(uint8_t)ALL_CP_CONTAINED
;
333 } else { // Relevant for UTF-8.
334 if(which
&CONTAINED
) {
336 spanLength
=spanSet
.spanUTF8((const char *)s8
, length8
, USET_SPAN_CONTAINED
);
337 spanUTF8Lengths
[i
]=makeSpanLengthByte(spanLength
);
340 spanLength
=length8
-spanSet
.spanBackUTF8((const char *)s8
, length8
, USET_SPAN_CONTAINED
);
341 spanBackUTF8Lengths
[i
]=makeSpanLengthByte(spanLength
);
343 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
344 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=0; // Only store a relevant/irrelevant flag.
348 if(which
&NOT_CONTAINED
) {
349 // Add string start and end code points to the spanNotSet so that
350 // a span(while not contained) stops before any string.
354 U16_NEXT(s16
, len
, length16
, c
);
358 int32_t len
=length16
;
359 U16_PREV(s16
, 0, len
, c
);
363 } else { // Irrelevant string.
365 if(which
&CONTAINED
) { // Only necessary for LONGEST_MATCH.
366 uint8_t *s8
=utf8
+utf8Count
;
367 int32_t length8
=appendUTF8(s16
, length16
, s8
, utf8Length
-utf8Count
);
368 utf8Count
+=utf8Lengths
[i
]=length8
;
374 spanLengths
[i
]=spanBackLengths
[i
]=
375 spanUTF8Lengths
[i
]=spanBackUTF8Lengths
[i
]=
376 (uint8_t)ALL_CP_CONTAINED
;
378 // All spanXYZLengths pointers contain the same address.
379 spanLengths
[i
]=(uint8_t)ALL_CP_CONTAINED
;
386 pSpanNotSet
->freeze();
390 // Copy constructor. Assumes which==ALL for a frozen set.
391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan
&otherStringSpan
,
392 const UVector
&newParentSetStrings
)
393 : spanSet(otherStringSpan
.spanSet
), pSpanNotSet(NULL
), strings(newParentSetStrings
),
394 utf8Lengths(NULL
), spanLengths(NULL
), utf8(NULL
),
395 utf8Length(otherStringSpan
.utf8Length
),
396 maxLength16(otherStringSpan
.maxLength16
), maxLength8(otherStringSpan
.maxLength8
),
398 if(otherStringSpan
.pSpanNotSet
==&otherStringSpan
.spanSet
) {
399 pSpanNotSet
=&spanSet
;
401 pSpanNotSet
=(UnicodeSet
*)otherStringSpan
.pSpanNotSet
->clone();
404 // Allocate a block of meta data.
405 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
406 int32_t stringsLength
=strings
.size();
407 int32_t allocSize
=stringsLength
*(4+1+1+1+1)+utf8Length
;
408 if(allocSize
<=(int32_t)sizeof(staticLengths
)) {
409 utf8Lengths
=staticLengths
;
411 utf8Lengths
=(int32_t *)uprv_malloc(allocSize
);
412 if(utf8Lengths
==NULL
) {
413 maxLength16
=maxLength8
=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
414 return; // Out of memory.
418 spanLengths
=(uint8_t *)(utf8Lengths
+stringsLength
);
419 utf8
=spanLengths
+stringsLength
*4;
420 uprv_memcpy(utf8Lengths
, otherStringSpan
.utf8Lengths
, allocSize
);
423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
424 if(pSpanNotSet
!=NULL
&& pSpanNotSet
!=&spanSet
) {
427 if(utf8Lengths
!=NULL
&& utf8Lengths
!=staticLengths
) {
428 uprv_free(utf8Lengths
);
432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c
) {
433 if(pSpanNotSet
==NULL
|| pSpanNotSet
==&spanSet
) {
434 if(spanSet
.contains(c
)) {
435 return; // Nothing to do.
437 UnicodeSet
*newSet
=(UnicodeSet
*)spanSet
.cloneAsThawed();
439 return; // Out of memory.
447 // Compare strings without any argument checks. Requires length>0.
449 matches16(const UChar
*s
, const UChar
*t
, int32_t length
) {
459 matches8(const uint8_t *s
, const uint8_t *t
, int32_t length
) {
468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
469 // at code point boundaries.
470 // That is, each edge of a match must not be in the middle of a surrogate pair.
472 matches16CPB(const UChar
*s
, int32_t start
, int32_t limit
, const UChar
*t
, int32_t length
) {
475 return matches16(s
, t
, length
) &&
476 !(0<start
&& U16_IS_LEAD(s
[-1]) && U16_IS_TRAIL(s
[0])) &&
477 !(length
<limit
&& U16_IS_LEAD(s
[length
-1]) && U16_IS_TRAIL(s
[length
]));
480 // Does the set contain the next code point?
481 // If so, return its length; otherwise return its negative length.
482 static inline int32_t
483 spanOne(const UnicodeSet
&set
, const UChar
*s
, int32_t length
) {
485 if(c
>=0xd800 && c
<=0xdbff && length
>=2 && U16_IS_TRAIL(c2
=s
[1])) {
486 return set
.contains(U16_GET_SUPPLEMENTARY(c
, c2
)) ? 2 : -2;
488 return set
.contains(c
) ? 1 : -1;
491 static inline int32_t
492 spanOneBack(const UnicodeSet
&set
, const UChar
*s
, int32_t length
) {
493 UChar c
=s
[length
-1], c2
;
494 if(c
>=0xdc00 && c
<=0xdfff && length
>=2 && U16_IS_LEAD(c2
=s
[length
-2])) {
495 return set
.contains(U16_GET_SUPPLEMENTARY(c2
, c
)) ? 2 : -2;
497 return set
.contains(c
) ? 1 : -1;
500 static inline int32_t
501 spanOneUTF8(const UnicodeSet
&set
, const uint8_t *s
, int32_t length
) {
504 return set
.contains(c
) ? 1 : -1;
506 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
508 U8_NEXT_OR_FFFD(s
, i
, length
, c
);
509 return set
.contains(c
) ? i
: -i
;
512 static inline int32_t
513 spanOneBackUTF8(const UnicodeSet
&set
, const uint8_t *s
, int32_t length
) {
514 UChar32 c
=s
[length
-1];
516 return set
.contains(c
) ? 1 : -1;
519 c
=utf8_prevCharSafeBody(s
, 0, &i
, c
, -3);
521 return set
.contains(c
) ? length
: -length
;
525 * Note: In span() when spanLength==0 (after a string match, or at the beginning
526 * after an empty code point span) and in spanNot() and spanNotUTF8(),
527 * string matching could use a binary search
528 * because all string matches are done from the same start index.
530 * For UTF-8, this would require a comparison function that returns UTF-16 order.
532 * This optimization should not be necessary for normal UnicodeSets because
533 * most sets have no strings, and most sets with strings have
534 * very few very short strings.
535 * For cases with many strings, it might be better to use a different API
536 * and implementation with a DFA (state machine).
540 * Algorithm for span(USET_SPAN_CONTAINED)
542 * Theoretical algorithm:
543 * - Iterate through the string, and at each code point boundary:
544 * + If the code point there is in the set, then remember to continue after it.
545 * + If a set string matches at the current position, then remember to continue after it.
546 * + Either recursively span for each code point or string match,
547 * or recursively span for all but the shortest one and
548 * iteratively continue the span with the shortest local match.
549 * + Remember the longest recursive span (the farthest end point).
550 * + If there is no match at the current position, neither for the code point there
551 * nor for any set string, then stop and return the longest recursive span length.
553 * Optimized implementation:
555 * (We assume that most sets will have very few very short strings.
556 * A span using a string-less set is extremely fast.)
558 * Create and cache a spanSet which contains all of the single code points
559 * of the original set but none of its strings.
561 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
563 * + Try to match each set string at the end of the spanLength.
564 * ~ Set strings that start with set-contained code points must be matched
565 * with a partial overlap because the recursive algorithm would have tried
566 * to match them at every position.
567 * ~ Set strings that entirely consist of set-contained code points
568 * are irrelevant for span(USET_SPAN_CONTAINED) because the
569 * recursive algorithm would continue after them anyway
570 * and find the longest recursive match from their end.
571 * ~ Rather than recursing, note each end point of a set string match.
572 * + If no set string matched after spanSet.span(), then return
573 * with where the spanSet.span() ended.
574 * + If at least one set string matched after spanSet.span(), then
575 * pop the shortest string match end point and continue
576 * the loop, trying to match all set strings from there.
577 * + If at least one more set string matched after a previous string match,
578 * then test if the code point after the previous string match is also
579 * contained in the set.
580 * Continue the loop with the shortest end point of either this code point
581 * or a matching set string.
582 * + If no more set string matched after a previous string match,
583 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
584 * Stop if spanLength==0, otherwise continue the loop.
586 * By noting each end point of a set string match,
587 * the function visits each string position at most once and finishes
590 * The recursive algorithm may visit the same string position many times
591 * if multiple paths lead to it and finishes in exponential time.
595 * Algorithm for span(USET_SPAN_SIMPLE)
597 * Theoretical algorithm:
598 * - Iterate through the string, and at each code point boundary:
599 * + If the code point there is in the set, then remember to continue after it.
600 * + If a set string matches at the current position, then remember to continue after it.
601 * + Continue from the farthest match position and ignore all others.
602 * + If there is no match at the current position,
603 * then stop and return the current position.
605 * Optimized implementation:
607 * (Same assumption and spanSet as above.)
609 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
611 * + Try to match each set string at the end of the spanLength.
612 * ~ Set strings that start with set-contained code points must be matched
613 * with a partial overlap because the standard algorithm would have tried
614 * to match them earlier.
615 * ~ Set strings that entirely consist of set-contained code points
616 * must be matched with a full overlap because the longest-match algorithm
617 * would hide set string matches that end earlier.
618 * Such set strings need not be matched earlier inside the code point span
619 * because the standard algorithm would then have continued after
620 * the set string match anyway.
621 * ~ Remember the longest set string match (farthest end point) from the earliest
623 * + If no set string matched after spanSet.span(), then return
624 * with where the spanSet.span() ended.
625 * + If at least one set string matched, then continue the loop after the
626 * longest match from the earliest position.
627 * + If no more set string matched after a previous string match,
628 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
629 * Stop if spanLength==0, otherwise continue the loop.
632 int32_t UnicodeSetStringSpan::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
633 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
634 return spanNot(s
, length
);
636 int32_t spanLength
=spanSet
.span(s
, length
, USET_SPAN_CONTAINED
);
637 if(spanLength
==length
) {
641 // Consider strings; they may overlap with the span.
643 if(spanCondition
==USET_SPAN_CONTAINED
) {
644 // Use offset list to try all possibilities.
645 offsets
.setMaxLength(maxLength16
);
647 int32_t pos
=spanLength
, rest
=length
-pos
;
648 int32_t i
, stringsLength
=strings
.size();
650 if(spanCondition
==USET_SPAN_CONTAINED
) {
651 for(i
=0; i
<stringsLength
; ++i
) {
652 int32_t overlap
=spanLengths
[i
];
653 if(overlap
==ALL_CP_CONTAINED
) {
654 continue; // Irrelevant string.
656 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
657 const UChar
*s16
=string
.getBuffer();
658 int32_t length16
=string
.length();
660 // Try to match this string at pos-overlap..pos.
661 if(overlap
>=LONG_SPAN
) {
663 // While contained: No point matching fully inside the code point span.
664 U16_BACK_1(s16
, 0, overlap
); // Length of the string minus the last code point.
666 if(overlap
>spanLength
) {
669 int32_t inc
=length16
-overlap
; // Keep overlap+inc==length16.
674 // Try to match if the increment is not listed already.
675 if(!offsets
.containsOffset(inc
) && matches16CPB(s
, pos
-overlap
, length
, s16
, length16
)) {
677 return length
; // Reached the end of the string.
679 offsets
.addOffset(inc
);
688 } else /* USET_SPAN_SIMPLE */ {
689 int32_t maxInc
=0, maxOverlap
=0;
690 for(i
=0; i
<stringsLength
; ++i
) {
691 int32_t overlap
=spanLengths
[i
];
692 // For longest match, we do need to try to match even an all-contained string
693 // to find the match from the earliest start.
695 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
696 const UChar
*s16
=string
.getBuffer();
697 int32_t length16
=string
.length();
699 // Try to match this string at pos-overlap..pos.
700 if(overlap
>=LONG_SPAN
) {
702 // Longest match: Need to match fully inside the code point span
703 // to find the match from the earliest start.
705 if(overlap
>spanLength
) {
708 int32_t inc
=length16
-overlap
; // Keep overlap+inc==length16.
710 if(inc
>rest
|| overlap
<maxOverlap
) {
713 // Try to match if the string is longer or starts earlier.
714 if( (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ inc
>maxInc
) &&
715 matches16CPB(s
, pos
-overlap
, length
, s16
, length16
)
717 maxInc
=inc
; // Longest match from earliest start.
726 if(maxInc
!=0 || maxOverlap
!=0) {
727 // Longest-match algorithm, and there was a string match.
728 // Simply continue after it.
732 return length
; // Reached the end of the string.
734 spanLength
=0; // Match strings from after a string match.
738 // Finished trying to match all strings at pos.
740 if(spanLength
!=0 || pos
==0) {
741 // The position is after an unlimited code point span (spanLength!=0),
742 // not after a string match.
743 // The only position where spanLength==0 after a span is pos==0.
744 // Otherwise, an unlimited code point span is only tried again when no
745 // strings match, and if such a non-initial span fails we stop.
746 if(offsets
.isEmpty()) {
747 return pos
; // No strings matched after a span.
749 // Match strings from after the next string match.
751 // The position is after a string match (or a single code point).
752 if(offsets
.isEmpty()) {
753 // No more strings matched after a previous string match.
754 // Try another code point span from after the last string match.
755 spanLength
=spanSet
.span(s
+pos
, rest
, USET_SPAN_CONTAINED
);
756 if( spanLength
==rest
|| // Reached the end of the string, or
757 spanLength
==0 // neither strings nor span progressed.
759 return pos
+spanLength
;
763 continue; // spanLength>0: Match strings from after a span.
765 // Try to match only one code point from after a string match if some
766 // string matched beyond it, so that we try all possible positions
767 // and don't overshoot.
768 spanLength
=spanOne(spanSet
, s
+pos
, rest
);
770 if(spanLength
==rest
) {
771 return length
; // Reached the end of the string.
773 // Match strings after this code point.
774 // There cannot be any increments below it because UnicodeSet strings
775 // contain multiple code points.
778 offsets
.shift(spanLength
);
780 continue; // Match strings from after a single code point.
782 // Match strings from after the next string match.
785 int32_t minOffset
=offsets
.popMinimum();
788 spanLength
=0; // Match strings from after a string match.
792 int32_t UnicodeSetStringSpan::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
793 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
794 return spanNotBack(s
, length
);
796 int32_t pos
=spanSet
.spanBack(s
, length
, USET_SPAN_CONTAINED
);
800 int32_t spanLength
=length
-pos
;
802 // Consider strings; they may overlap with the span.
804 if(spanCondition
==USET_SPAN_CONTAINED
) {
805 // Use offset list to try all possibilities.
806 offsets
.setMaxLength(maxLength16
);
808 int32_t i
, stringsLength
=strings
.size();
809 uint8_t *spanBackLengths
=spanLengths
;
811 spanBackLengths
+=stringsLength
;
814 if(spanCondition
==USET_SPAN_CONTAINED
) {
815 for(i
=0; i
<stringsLength
; ++i
) {
816 int32_t overlap
=spanBackLengths
[i
];
817 if(overlap
==ALL_CP_CONTAINED
) {
818 continue; // Irrelevant string.
820 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
821 const UChar
*s16
=string
.getBuffer();
822 int32_t length16
=string
.length();
824 // Try to match this string at pos-(length16-overlap)..pos-length16.
825 if(overlap
>=LONG_SPAN
) {
827 // While contained: No point matching fully inside the code point span.
829 U16_FWD_1(s16
, len1
, overlap
);
830 overlap
-=len1
; // Length of the string minus the first code point.
832 if(overlap
>spanLength
) {
835 int32_t dec
=length16
-overlap
; // Keep dec+overlap==length16.
840 // Try to match if the decrement is not listed already.
841 if(!offsets
.containsOffset(dec
) && matches16CPB(s
, pos
-dec
, length
, s16
, length16
)) {
843 return 0; // Reached the start of the string.
845 offsets
.addOffset(dec
);
854 } else /* USET_SPAN_SIMPLE */ {
855 int32_t maxDec
=0, maxOverlap
=0;
856 for(i
=0; i
<stringsLength
; ++i
) {
857 int32_t overlap
=spanBackLengths
[i
];
858 // For longest match, we do need to try to match even an all-contained string
859 // to find the match from the latest end.
861 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
862 const UChar
*s16
=string
.getBuffer();
863 int32_t length16
=string
.length();
865 // Try to match this string at pos-(length16-overlap)..pos-length16.
866 if(overlap
>=LONG_SPAN
) {
868 // Longest match: Need to match fully inside the code point span
869 // to find the match from the latest end.
871 if(overlap
>spanLength
) {
874 int32_t dec
=length16
-overlap
; // Keep dec+overlap==length16.
876 if(dec
>pos
|| overlap
<maxOverlap
) {
879 // Try to match if the string is longer or ends later.
880 if( (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ dec
>maxDec
) &&
881 matches16CPB(s
, pos
-dec
, length
, s16
, length16
)
883 maxDec
=dec
; // Longest match from latest end.
892 if(maxDec
!=0 || maxOverlap
!=0) {
893 // Longest-match algorithm, and there was a string match.
894 // Simply continue before it.
897 return 0; // Reached the start of the string.
899 spanLength
=0; // Match strings from before a string match.
903 // Finished trying to match all strings at pos.
905 if(spanLength
!=0 || pos
==length
) {
906 // The position is before an unlimited code point span (spanLength!=0),
907 // not before a string match.
908 // The only position where spanLength==0 before a span is pos==length.
909 // Otherwise, an unlimited code point span is only tried again when no
910 // strings match, and if such a non-initial span fails we stop.
911 if(offsets
.isEmpty()) {
912 return pos
; // No strings matched before a span.
914 // Match strings from before the next string match.
916 // The position is before a string match (or a single code point).
917 if(offsets
.isEmpty()) {
918 // No more strings matched before a previous string match.
919 // Try another code point span from before the last string match.
921 pos
=spanSet
.spanBack(s
, oldPos
, USET_SPAN_CONTAINED
);
922 spanLength
=oldPos
-pos
;
923 if( pos
==0 || // Reached the start of the string, or
924 spanLength
==0 // neither strings nor span progressed.
928 continue; // spanLength>0: Match strings from before a span.
930 // Try to match only one code point from before a string match if some
931 // string matched beyond it, so that we try all possible positions
932 // and don't overshoot.
933 spanLength
=spanOneBack(spanSet
, s
, pos
);
935 if(spanLength
==pos
) {
936 return 0; // Reached the start of the string.
938 // Match strings before this code point.
939 // There cannot be any decrements below it because UnicodeSet strings
940 // contain multiple code points.
942 offsets
.shift(spanLength
);
944 continue; // Match strings from before a single code point.
946 // Match strings from before the next string match.
949 pos
-=offsets
.popMinimum();
950 spanLength
=0; // Match strings from before a string match.
954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
955 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
956 return spanNotUTF8(s
, length
);
958 int32_t spanLength
=spanSet
.spanUTF8((const char *)s
, length
, USET_SPAN_CONTAINED
);
959 if(spanLength
==length
) {
963 // Consider strings; they may overlap with the span.
965 if(spanCondition
==USET_SPAN_CONTAINED
) {
966 // Use offset list to try all possibilities.
967 offsets
.setMaxLength(maxLength8
);
969 int32_t pos
=spanLength
, rest
=length
-pos
;
970 int32_t i
, stringsLength
=strings
.size();
971 uint8_t *spanUTF8Lengths
=spanLengths
;
973 spanUTF8Lengths
+=2*stringsLength
;
976 const uint8_t *s8
=utf8
;
978 if(spanCondition
==USET_SPAN_CONTAINED
) {
979 for(i
=0; i
<stringsLength
; ++i
) {
980 length8
=utf8Lengths
[i
];
982 continue; // String not representable in UTF-8.
984 int32_t overlap
=spanUTF8Lengths
[i
];
985 if(overlap
==ALL_CP_CONTAINED
) {
987 continue; // Irrelevant string.
990 // Try to match this string at pos-overlap..pos.
991 if(overlap
>=LONG_SPAN
) {
993 // While contained: No point matching fully inside the code point span.
994 U8_BACK_1(s8
, 0, overlap
); // Length of the string minus the last code point.
996 if(overlap
>spanLength
) {
999 int32_t inc
=length8
-overlap
; // Keep overlap+inc==length8.
1004 // Try to match if the increment is not listed already.
1005 // Match at code point boundaries. (The UTF-8 strings were converted
1006 // from UTF-16 and are guaranteed to be well-formed.)
1007 if( !U8_IS_TRAIL(s
[pos
-overlap
]) &&
1008 !offsets
.containsOffset(inc
) &&
1009 matches8(s
+pos
-overlap
, s8
, length8
)
1013 return length
; // Reached the end of the string.
1015 offsets
.addOffset(inc
);
1025 } else /* USET_SPAN_SIMPLE */ {
1026 int32_t maxInc
=0, maxOverlap
=0;
1027 for(i
=0; i
<stringsLength
; ++i
) {
1028 length8
=utf8Lengths
[i
];
1030 continue; // String not representable in UTF-8.
1032 int32_t overlap
=spanUTF8Lengths
[i
];
1033 // For longest match, we do need to try to match even an all-contained string
1034 // to find the match from the earliest start.
1036 // Try to match this string at pos-overlap..pos.
1037 if(overlap
>=LONG_SPAN
) {
1039 // Longest match: Need to match fully inside the code point span
1040 // to find the match from the earliest start.
1042 if(overlap
>spanLength
) {
1045 int32_t inc
=length8
-overlap
; // Keep overlap+inc==length8.
1047 if(inc
>rest
|| overlap
<maxOverlap
) {
1050 // Try to match if the string is longer or starts earlier.
1051 // Match at code point boundaries. (The UTF-8 strings were converted
1052 // from UTF-16 and are guaranteed to be well-formed.)
1053 if( !U8_IS_TRAIL(s
[pos
-overlap
]) &&
1054 (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ inc
>maxInc
) &&
1055 matches8(s
+pos
-overlap
, s8
, length8
)
1058 maxInc
=inc
; // Longest match from earliest start.
1068 if(maxInc
!=0 || maxOverlap
!=0) {
1069 // Longest-match algorithm, and there was a string match.
1070 // Simply continue after it.
1074 return length
; // Reached the end of the string.
1076 spanLength
=0; // Match strings from after a string match.
1080 // Finished trying to match all strings at pos.
1082 if(spanLength
!=0 || pos
==0) {
1083 // The position is after an unlimited code point span (spanLength!=0),
1084 // not after a string match.
1085 // The only position where spanLength==0 after a span is pos==0.
1086 // Otherwise, an unlimited code point span is only tried again when no
1087 // strings match, and if such a non-initial span fails we stop.
1088 if(offsets
.isEmpty()) {
1089 return pos
; // No strings matched after a span.
1091 // Match strings from after the next string match.
1093 // The position is after a string match (or a single code point).
1094 if(offsets
.isEmpty()) {
1095 // No more strings matched after a previous string match.
1096 // Try another code point span from after the last string match.
1097 spanLength
=spanSet
.spanUTF8((const char *)s
+pos
, rest
, USET_SPAN_CONTAINED
);
1098 if( spanLength
==rest
|| // Reached the end of the string, or
1099 spanLength
==0 // neither strings nor span progressed.
1101 return pos
+spanLength
;
1105 continue; // spanLength>0: Match strings from after a span.
1107 // Try to match only one code point from after a string match if some
1108 // string matched beyond it, so that we try all possible positions
1109 // and don't overshoot.
1110 spanLength
=spanOneUTF8(spanSet
, s
+pos
, rest
);
1112 if(spanLength
==rest
) {
1113 return length
; // Reached the end of the string.
1115 // Match strings after this code point.
1116 // There cannot be any increments below it because UnicodeSet strings
1117 // contain multiple code points.
1120 offsets
.shift(spanLength
);
1122 continue; // Match strings from after a single code point.
1124 // Match strings from after the next string match.
1127 int32_t minOffset
=offsets
.popMinimum();
1130 spanLength
=0; // Match strings from after a string match.
1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
1135 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
1136 return spanNotBackUTF8(s
, length
);
1138 int32_t pos
=spanSet
.spanBackUTF8((const char *)s
, length
, USET_SPAN_CONTAINED
);
1142 int32_t spanLength
=length
-pos
;
1144 // Consider strings; they may overlap with the span.
1146 if(spanCondition
==USET_SPAN_CONTAINED
) {
1147 // Use offset list to try all possibilities.
1148 offsets
.setMaxLength(maxLength8
);
1150 int32_t i
, stringsLength
=strings
.size();
1151 uint8_t *spanBackUTF8Lengths
=spanLengths
;
1153 spanBackUTF8Lengths
+=3*stringsLength
;
1156 const uint8_t *s8
=utf8
;
1158 if(spanCondition
==USET_SPAN_CONTAINED
) {
1159 for(i
=0; i
<stringsLength
; ++i
) {
1160 length8
=utf8Lengths
[i
];
1162 continue; // String not representable in UTF-8.
1164 int32_t overlap
=spanBackUTF8Lengths
[i
];
1165 if(overlap
==ALL_CP_CONTAINED
) {
1167 continue; // Irrelevant string.
1170 // Try to match this string at pos-(length8-overlap)..pos-length8.
1171 if(overlap
>=LONG_SPAN
) {
1173 // While contained: No point matching fully inside the code point span.
1175 U8_FWD_1(s8
, len1
, overlap
);
1176 overlap
-=len1
; // Length of the string minus the first code point.
1178 if(overlap
>spanLength
) {
1181 int32_t dec
=length8
-overlap
; // Keep dec+overlap==length8.
1186 // Try to match if the decrement is not listed already.
1187 // Match at code point boundaries. (The UTF-8 strings were converted
1188 // from UTF-16 and are guaranteed to be well-formed.)
1189 if( !U8_IS_TRAIL(s
[pos
-dec
]) &&
1190 !offsets
.containsOffset(dec
) &&
1191 matches8(s
+pos
-dec
, s8
, length8
)
1194 return 0; // Reached the start of the string.
1196 offsets
.addOffset(dec
);
1206 } else /* USET_SPAN_SIMPLE */ {
1207 int32_t maxDec
=0, maxOverlap
=0;
1208 for(i
=0; i
<stringsLength
; ++i
) {
1209 length8
=utf8Lengths
[i
];
1211 continue; // String not representable in UTF-8.
1213 int32_t overlap
=spanBackUTF8Lengths
[i
];
1214 // For longest match, we do need to try to match even an all-contained string
1215 // to find the match from the latest end.
1217 // Try to match this string at pos-(length8-overlap)..pos-length8.
1218 if(overlap
>=LONG_SPAN
) {
1220 // Longest match: Need to match fully inside the code point span
1221 // to find the match from the latest end.
1223 if(overlap
>spanLength
) {
1226 int32_t dec
=length8
-overlap
; // Keep dec+overlap==length8.
1228 if(dec
>pos
|| overlap
<maxOverlap
) {
1231 // Try to match if the string is longer or ends later.
1232 // Match at code point boundaries. (The UTF-8 strings were converted
1233 // from UTF-16 and are guaranteed to be well-formed.)
1234 if( !U8_IS_TRAIL(s
[pos
-dec
]) &&
1235 (overlap
>maxOverlap
|| /* redundant overlap==maxOverlap && */ dec
>maxDec
) &&
1236 matches8(s
+pos
-dec
, s8
, length8
)
1238 maxDec
=dec
; // Longest match from latest end.
1248 if(maxDec
!=0 || maxOverlap
!=0) {
1249 // Longest-match algorithm, and there was a string match.
1250 // Simply continue before it.
1253 return 0; // Reached the start of the string.
1255 spanLength
=0; // Match strings from before a string match.
1259 // Finished trying to match all strings at pos.
1261 if(spanLength
!=0 || pos
==length
) {
1262 // The position is before an unlimited code point span (spanLength!=0),
1263 // not before a string match.
1264 // The only position where spanLength==0 before a span is pos==length.
1265 // Otherwise, an unlimited code point span is only tried again when no
1266 // strings match, and if such a non-initial span fails we stop.
1267 if(offsets
.isEmpty()) {
1268 return pos
; // No strings matched before a span.
1270 // Match strings from before the next string match.
1272 // The position is before a string match (or a single code point).
1273 if(offsets
.isEmpty()) {
1274 // No more strings matched before a previous string match.
1275 // Try another code point span from before the last string match.
1277 pos
=spanSet
.spanBackUTF8((const char *)s
, oldPos
, USET_SPAN_CONTAINED
);
1278 spanLength
=oldPos
-pos
;
1279 if( pos
==0 || // Reached the start of the string, or
1280 spanLength
==0 // neither strings nor span progressed.
1284 continue; // spanLength>0: Match strings from before a span.
1286 // Try to match only one code point from before a string match if some
1287 // string matched beyond it, so that we try all possible positions
1288 // and don't overshoot.
1289 spanLength
=spanOneBackUTF8(spanSet
, s
, pos
);
1291 if(spanLength
==pos
) {
1292 return 0; // Reached the start of the string.
1294 // Match strings before this code point.
1295 // There cannot be any decrements below it because UnicodeSet strings
1296 // contain multiple code points.
1298 offsets
.shift(spanLength
);
1300 continue; // Match strings from before a single code point.
1302 // Match strings from before the next string match.
1305 pos
-=offsets
.popMinimum();
1306 spanLength
=0; // Match strings from before a string match.
1311 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1313 * Theoretical algorithm:
1314 * - Iterate through the string, and at each code point boundary:
1315 * + If the code point there is in the set, then return with the current position.
1316 * + If a set string matches at the current position, then return with the current position.
1318 * Optimized implementation:
1320 * (Same assumption as for span() above.)
1322 * Create and cache a spanNotSet which contains all of the single code points
1323 * of the original set but none of its strings.
1324 * For each set string add its initial code point to the spanNotSet.
1325 * (Also add its final code point for spanNotBack().)
1328 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1329 * + If the current code point is in the original set, then
1330 * return the current position.
1331 * + If any set string matches at the current position, then
1332 * return the current position.
1333 * + If there is no match at the current position, neither for the code point there
1334 * nor for any set string, then skip this code point and continue the loop.
1335 * This happens for set-string-initial code points that were added to spanNotSet
1336 * when there is not actually a match for such a set string.
1339 int32_t UnicodeSetStringSpan::spanNot(const UChar
*s
, int32_t length
) const {
1340 int32_t pos
=0, rest
=length
;
1341 int32_t i
, stringsLength
=strings
.size();
1343 // Span until we find a code point from the set,
1344 // or a code point that starts or ends some string.
1345 i
=pSpanNotSet
->span(s
+pos
, rest
, USET_SPAN_NOT_CONTAINED
);
1347 return length
; // Reached the end of the string.
1352 // Check whether the current code point is in the original set,
1353 // without the string starts and ends.
1354 int32_t cpLength
=spanOne(spanSet
, s
+pos
, rest
);
1356 return pos
; // There is a set element at pos.
1359 // Try to match the strings at pos.
1360 for(i
=0; i
<stringsLength
; ++i
) {
1361 if(spanLengths
[i
]==ALL_CP_CONTAINED
) {
1362 continue; // Irrelevant string.
1364 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
1365 const UChar
*s16
=string
.getBuffer();
1366 int32_t length16
=string
.length();
1367 if(length16
<=rest
&& matches16CPB(s
, pos
, length
, s16
, length16
)) {
1368 return pos
; // There is a set element at pos.
1372 // The span(while not contained) ended on a string start/end which is
1373 // not in the original set. Skip this code point and continue.
1378 return length
; // Reached the end of the string.
1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar
*s
, int32_t length
) const {
1383 int32_t i
, stringsLength
=strings
.size();
1385 // Span until we find a code point from the set,
1386 // or a code point that starts or ends some string.
1387 pos
=pSpanNotSet
->spanBack(s
, pos
, USET_SPAN_NOT_CONTAINED
);
1389 return 0; // Reached the start of the string.
1392 // Check whether the current code point is in the original set,
1393 // without the string starts and ends.
1394 int32_t cpLength
=spanOneBack(spanSet
, s
, pos
);
1396 return pos
; // There is a set element at pos.
1399 // Try to match the strings at pos.
1400 for(i
=0; i
<stringsLength
; ++i
) {
1401 // Use spanLengths rather than a spanBackLengths pointer because
1402 // it is easier and we only need to know whether the string is irrelevant
1403 // which is the same in either array.
1404 if(spanLengths
[i
]==ALL_CP_CONTAINED
) {
1405 continue; // Irrelevant string.
1407 const UnicodeString
&string
=*(const UnicodeString
*)strings
.elementAt(i
);
1408 const UChar
*s16
=string
.getBuffer();
1409 int32_t length16
=string
.length();
1410 if(length16
<=pos
&& matches16CPB(s
, pos
-length16
, length
, s16
, length16
)) {
1411 return pos
; // There is a set element at pos.
1415 // The span(while not contained) ended on a string start/end which is
1416 // not in the original set. Skip this code point and continue.
1420 return 0; // Reached the start of the string.
1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s
, int32_t length
) const {
1424 int32_t pos
=0, rest
=length
;
1425 int32_t i
, stringsLength
=strings
.size();
1426 uint8_t *spanUTF8Lengths
=spanLengths
;
1428 spanUTF8Lengths
+=2*stringsLength
;
1431 // Span until we find a code point from the set,
1432 // or a code point that starts or ends some string.
1433 i
=pSpanNotSet
->spanUTF8((const char *)s
+pos
, rest
, USET_SPAN_NOT_CONTAINED
);
1435 return length
; // Reached the end of the string.
1440 // Check whether the current code point is in the original set,
1441 // without the string starts and ends.
1442 int32_t cpLength
=spanOneUTF8(spanSet
, s
+pos
, rest
);
1444 return pos
; // There is a set element at pos.
1447 // Try to match the strings at pos.
1448 const uint8_t *s8
=utf8
;
1450 for(i
=0; i
<stringsLength
; ++i
) {
1451 length8
=utf8Lengths
[i
];
1452 // ALL_CP_CONTAINED: Irrelevant string.
1453 if(length8
!=0 && spanUTF8Lengths
[i
]!=ALL_CP_CONTAINED
&& length8
<=rest
&& matches8(s
+pos
, s8
, length8
)) {
1454 return pos
; // There is a set element at pos.
1459 // The span(while not contained) ended on a string start/end which is
1460 // not in the original set. Skip this code point and continue.
1465 return length
; // Reached the end of the string.
1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s
, int32_t length
) const {
1470 int32_t i
, stringsLength
=strings
.size();
1471 uint8_t *spanBackUTF8Lengths
=spanLengths
;
1473 spanBackUTF8Lengths
+=3*stringsLength
;
1476 // Span until we find a code point from the set,
1477 // or a code point that starts or ends some string.
1478 pos
=pSpanNotSet
->spanBackUTF8((const char *)s
, pos
, USET_SPAN_NOT_CONTAINED
);
1480 return 0; // Reached the start of the string.
1483 // Check whether the current code point is in the original set,
1484 // without the string starts and ends.
1485 int32_t cpLength
=spanOneBackUTF8(spanSet
, s
, pos
);
1487 return pos
; // There is a set element at pos.
1490 // Try to match the strings at pos.
1491 const uint8_t *s8
=utf8
;
1493 for(i
=0; i
<stringsLength
; ++i
) {
1494 length8
=utf8Lengths
[i
];
1495 // ALL_CP_CONTAINED: Irrelevant string.
1496 if(length8
!=0 && spanBackUTF8Lengths
[i
]!=ALL_CP_CONTAINED
&& length8
<=pos
&& matches8(s
+pos
-length8
, s8
, length8
)) {
1497 return pos
; // There is a set element at pos.
1502 // The span(while not contained) ended on a string start/end which is
1503 // not in the original set. Skip this code point and continue.
1507 return 0; // Reached the start of the string.