icuSources/common/unisetspan.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2007, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  unisetspan.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2007mar01
  14 *   created by: Markus W. Scherer
  15 */
  16
  17 #include "unicode/utypes.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/ustring.h"
  20 #include "cmemory.h"
  21 #include "uvector.h"
  22 #include "unisetspan.h"
  23
  24 U_NAMESPACE_BEGIN
  25
  26 /*
  27  * List of offsets from the current position from where to try matching
  28  * a code point or a string.
  29  * Store offsets rather than indexes to simplify the code and use the same list
  30  * for both increments (in span()) and decrements (in spanBack()).
  31  *
  32  * Assumption: The maximum offset is limited, and the offsets that are stored
  33  * at any one time are relatively dense, that is, there are normally no gaps of
  34  * hundreds or thousands of offset values.
  35  *
  36  * The implementation uses a circular buffer of byte flags,
  37  * each indicating whether the corresponding offset is in the list.
  38  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
  39  * physically moving part of the list.
  40  *
  41  * Note: In principle, the caller should setMaxLength() to the maximum of the
  42  * max string length and U16_LENGTH/U8_LENGTH to account for
  43  * "long" single code points.
  44  * However, this implementation uses at least a staticList with more than
  45  * U8_LENGTH entries anyway.
  46  *
  47  * Note: If maxLength were guaranteed to be no more than 32 or 64,
  48  * the list could be stored as bit flags in a single integer.
  49  * Rather than handling a circular buffer with a start list index,
  50  * the integer would simply be shifted when lower offsets are removed.
  51  * UnicodeSet does not have a limit on the lengths of strings.
  52  */
  53 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
  54 public:
  55     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
  56
  57     ~OffsetList() {
  58         if(list!=staticList) {
  59             uprv_free(list);
  60         }
  61     }
  62
  63     // Call exactly once if the list is to be used.
  64     void setMaxLength(int32_t maxLength) {
  65         if(maxLength<=(int32_t)sizeof(staticList)) {
  66             capacity=(int32_t)sizeof(staticList);
  67         } else {
  68             UBool *l=(UBool *)uprv_malloc(maxLength);
  69             if(l!=NULL) {
  70                 list=l;
  71                 capacity=maxLength;
  72             }
  73         }
  74         uprv_memset(list, 0, capacity);
  75     }
  76
  77     void clear() {
  78         uprv_memset(list, 0, capacity);
  79         start=length=0;
  80     }
  81
  82     UBool isEmpty() const {
  83         return (UBool)(length==0);
  84     }
  85
  86     // Reduce all stored offsets by delta, used when the current position
  87     // moves by delta.
  88     // There must not be any offsets lower than delta.
  89     // If there is an offset equal to delta, it is removed.
  90     // delta=[1..maxLength]
  91     void shift(int32_t delta) {
  92         int32_t i=start+delta;
  93         if(i>=capacity) {
  94             i-=capacity;
  95         }
  96         if(list[i]) {
  97             list[i]=FALSE;
  98             --length;
  99         }
 100         start=i;
 101     }
 102
 103     // Add an offset. The list must not contain it yet.
 104     // offset=[1..maxLength]
 105     void addOffset(int32_t offset) {
 106         int32_t i=start+offset;
 107         if(i>=capacity) {
 108             i-=capacity;
 109         }
 110         list[i]=TRUE;
 111         ++length;
 112     }
 113
 114     // offset=[1..maxLength]
 115     UBool containsOffset(int32_t offset) const {
 116         int32_t i=start+offset;
 117         if(i>=capacity) {
 118             i-=capacity;
 119         }
 120         return list[i];
 121     }
 122
 123     // Find the lowest stored offset from a non-empty list, remove it,
 124     // and reduce all other offsets by this minimum.
 125     // Returns [1..maxLength].
 126     int32_t popMinimum() {
 127         // Look for the next offset in list[start+1..capacity-1].
 128         int32_t i=start, result;
 129         while(++i<capacity) {
 130             if(list[i]) {
 131                 list[i]=FALSE;
 132                 --length;
 133                 result=i-start;
 134                 start=i;
 135                 return result;
 136             }
 137         }
 138         // i==capacity
 139
 140         // Wrap around and look for the next offset in list[0..start].
 141         // Since the list is not empty, there will be one.
 142         result=capacity-start;
 143         i=0;
 144         while(!list[i]) {
 145             ++i;
 146         }
 147         list[i]=FALSE;
 148         --length;
 149         start=i;
 150         return result+=i;
 151     }
 152
 153 private:
 154     UBool *list;
 155     int32_t capacity;
 156     int32_t length;
 157     int32_t start;
 158
 159     UBool staticList[16];
 160 };
 161
 162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
 163 static int32_t
 164 getUTF8Length(const UChar *s, int32_t length) {
 165     UErrorCode errorCode=U_ZERO_ERROR;
 166     int32_t length8=0;
 167     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
 168     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
 169         return length8;
 170     } else {
 171         // The string contains an unpaired surrogate.
 172         // Ignore this string.
 173         return 0;
 174     }
 175 }
 176
 177 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
 178 static int32_t
 179 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
 180     UErrorCode errorCode=U_ZERO_ERROR;
 181     int32_t length8=0;
 182     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
 183     if(U_SUCCESS(errorCode)) {
 184         return length8;
 185     } else {
 186         // The string contains an unpaired surrogate.
 187         // Ignore this string.
 188         return 0;
 189     }
 190 }
 191
 192 static inline uint8_t
 193 makeSpanLengthByte(int32_t spanLength) {
 194     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
 195     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
 196 }
 197
 198 // Construct for all variants of span(), or only for any one variant.
 199 // Initialize as little as possible, for single use.
 200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
 201                                            const UVector &setStrings,
 202                                            uint32_t which)
 203         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
 204           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
 205           utf8Length(0),
 206           maxLength16(0), maxLength8(0),
 207           all((UBool)(which==ALL)) {
 208     spanSet.retainAll(set);
 209     if(which&NOT_CONTAINED) {
 210         // Default to the same sets.
 211         // addToSpanNotSet() will create a separate set if necessary.
 212         pSpanNotSet=&spanSet;
 213     }
 214
 215     // Determine if the strings even need to be taken into account at all for span() etc.
 216     // If any string is relevant, then all strings need to be used for
 217     // span(longest match) but only the relevant ones for span(while contained).
 218     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
 219     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
 220     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
 221     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
 222     int32_t stringsLength=strings.size();
 223
 224     int32_t i, spanLength;
 225     UBool someRelevant=FALSE;
 226     for(i=0; i<stringsLength; ++i) {
 227         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 228         const UChar *s16=string.getBuffer();
 229         int32_t length16=string.length();
 230         UBool thisRelevant;
 231         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
 232         if(spanLength<length16) {  // Relevant string.
 233             someRelevant=thisRelevant=TRUE;
 234         } else {
 235             thisRelevant=FALSE;
 236         }
 237         if((which&UTF16) && length16>maxLength16) {
 238             maxLength16=length16;
 239         }
 240         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
 241             int32_t length8=getUTF8Length(s16, length16);
 242             utf8Length+=length8;
 243             if(length8>maxLength8) {
 244                 maxLength8=length8;
 245             }
 246         }
 247     }
 248     if(!someRelevant) {
 249         maxLength16=maxLength8=0;
 250         return;
 251     }
 252
 253     // Freeze after checking for the need to use strings at all because freezing
 254     // a set takes some time and memory which are wasted if there are no relevant strings.
 255     if(all) {
 256         spanSet.freeze();
 257     }
 258
 259     uint8_t *spanBackLengths;
 260     uint8_t *spanUTF8Lengths;
 261     uint8_t *spanBackUTF8Lengths;
 262
 263     // Allocate a block of meta data.
 264     int32_t allocSize;
 265     if(all) {
 266         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
 267         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
 268     } else {
 269         allocSize=stringsLength;  // One set of span lengths.
 270         if(which&UTF8) {
 271             // UTF-8 lengths and UTF-8 strings.
 272             allocSize+=stringsLength*4+utf8Length;
 273         }
 274     }
 275     if(allocSize<=(int32_t)sizeof(staticLengths)) {
 276         utf8Lengths=staticLengths;
 277     } else {
 278         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
 279         if(utf8Lengths==NULL) {
 280             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
 281             return;  // Out of memory.
 282         }
 283     }
 284
 285     if(all) {
 286         // Store span lengths for all span() variants.
 287         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 288         spanBackLengths=spanLengths+stringsLength;
 289         spanUTF8Lengths=spanBackLengths+stringsLength;
 290         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
 291         utf8=spanBackUTF8Lengths+stringsLength;
 292     } else {
 293         // Store span lengths for only one span() variant.
 294         if(which&UTF8) {
 295             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 296             utf8=spanLengths+stringsLength;
 297         } else {
 298             spanLengths=(uint8_t *)utf8Lengths;
 299         }
 300         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
 301     }
 302
 303     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
 304     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
 305
 306     for(i=0; i<stringsLength; ++i) {
 307         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 308         const UChar *s16=string.getBuffer();
 309         int32_t length16=string.length();
 310         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
 311         if(spanLength<length16) {  // Relevant string.
 312             if(which&UTF16) {
 313                 if(which&CONTAINED) {
 314                     if(which&FWD) {
 315                         spanLengths[i]=makeSpanLengthByte(spanLength);
 316                     }
 317                     if(which&BACK) {
 318                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
 319                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
 320                     }
 321                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
 322                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
 323                 }
 324             }
 325             if(which&UTF8) {
 326                 uint8_t *s8=utf8+utf8Count;
 327                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
 328                 utf8Count+=utf8Lengths[i]=length8;
 329                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
 330                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
 331                 } else {  // Relevant for UTF-8.
 332                     if(which&CONTAINED) {
 333                         if(which&FWD) {
 334                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
 335                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
 336                         }
 337                         if(which&BACK) {
 338                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
 339                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
 340                         }
 341                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
 342                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
 343                     }
 344                 }
 345             }
 346             if(which&NOT_CONTAINED) {
 347                 // Add string start and end code points to the spanNotSet so that
 348                 // a span(while not contained) stops before any string.
 349                 UChar32 c;
 350                 if(which&FWD) {
 351                     int32_t len=0;
 352                     U16_NEXT(s16, len, length16, c);
 353                     addToSpanNotSet(c);
 354                 }
 355                 if(which&BACK) {
 356                     int32_t len=length16;
 357                     U16_PREV(s16, 0, len, c);
 358                     addToSpanNotSet(c);
 359                 }
 360             }
 361         } else {  // Irrelevant string.
 362             if(which&UTF8) {
 363                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
 364                     uint8_t *s8=utf8+utf8Count;
 365                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
 366                     utf8Count+=utf8Lengths[i]=length8;
 367                 } else {
 368                     utf8Lengths[i]=0;
 369                 }
 370             }
 371             if(all) {
 372                 spanLengths[i]=spanBackLengths[i]=
 373                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
 374                         (uint8_t)ALL_CP_CONTAINED;
 375             } else {
 376                 // All spanXYZLengths pointers contain the same address.
 377                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
 378             }
 379         }
 380     }
 381
 382     // Finish.
 383     if(all) {
 384         pSpanNotSet->freeze();
 385     }
 386 }
 387
 388 // Copy constructor. Assumes which==ALL for a frozen set.
 389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
 390                                            const UVector &newParentSetStrings)
 391         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
 392           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
 393           utf8Length(otherStringSpan.utf8Length),
 394           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
 395           all(TRUE) {
 396     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
 397         pSpanNotSet=&spanSet;
 398     } else {
 399         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
 400     }
 401
 402     // Allocate a block of meta data.
 403     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
 404     int32_t stringsLength=strings.size();
 405     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
 406     if(allocSize<=(int32_t)sizeof(staticLengths)) {
 407         utf8Lengths=staticLengths;
 408     } else {
 409         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
 410         if(utf8Lengths==NULL) {
 411             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
 412             return;  // Out of memory.
 413         }
 414     }
 415
 416     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 417     utf8=spanLengths+stringsLength*4;
 418     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
 419 }
 420
 421 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
 422     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
 423         delete pSpanNotSet;
 424     }
 425     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
 426         uprv_free(utf8Lengths);
 427     }
 428 }
 429
 430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
 431     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
 432         if(spanSet.contains(c)) {
 433             return;  // Nothing to do.
 434         }
 435         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
 436         if(newSet==NULL) {
 437             return;  // Out of memory.
 438         } else {
 439             pSpanNotSet=newSet;
 440         }
 441     }
 442     pSpanNotSet->add(c);
 443 }
 444
 445 // Compare strings without any argument checks. Requires length>0.
 446 static inline UBool
 447 matches16(const UChar *s, const UChar *t, int32_t length) {
 448     do {
 449         if(*s++!=*t++) {
 450             return FALSE;
 451         }
 452     } while(--length>0);
 453     return TRUE;
 454 }
 455
 456 static inline UBool
 457 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
 458     do {
 459         if(*s++!=*t++) {
 460             return FALSE;
 461         }
 462     } while(--length>0);
 463     return TRUE;
 464 }
 465
 466 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
 467 // at code point boundaries.
 468 // That is, each edge of a match must not be in the middle of a surrogate pair.
 469 static inline UBool
 470 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
 471     s+=start;
 472     limit-=start;
 473     return matches16(s, t, length) &&
 474            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
 475            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
 476 }
 477
 478 // Does the set contain the next code point?
 479 // If so, return its length; otherwise return its negative length.
 480 static inline int32_t
 481 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
 482     UChar c=*s, c2;
 483     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
 484         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
 485     }
 486     return set.contains(c) ? 1 : -1;
 487 }
 488
 489 static inline int32_t
 490 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
 491     UChar c=s[length-1], c2;
 492     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
 493         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
 494     }
 495     return set.contains(c) ? 1 : -1;
 496 }
 497
 498 static inline int32_t
 499 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
 500     UChar32 c=*s;
 501     if((int8_t)c>=0) {
 502         return set.contains(c) ? 1 : -1;
 503     }
 504     // Take advantage of non-ASCII fastpaths in U8_NEXT().
 505     int32_t i=0;
 506     U8_NEXT(s, i, length, c);
 507     return set.contains(c) ? i : -i;
 508 }
 509
 510 static inline int32_t
 511 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
 512     UChar32 c=s[length-1];
 513     if((int8_t)c>=0) {
 514         return set.contains(c) ? 1 : -1;
 515     }
 516     int32_t i=length-1;
 517     c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
 518     length-=i;
 519     return set.contains(c) ? length : -length;
 520 }
 521
 522 /*
 523  * Note: In span() when spanLength==0 (after a string match, or at the beginning
 524  * after an empty code point span) and in spanNot() and spanNotUTF8(),
 525  * string matching could use a binary search
 526  * because all string matches are done from the same start index.
 527  *
 528  * For UTF-8, this would require a comparison function that returns UTF-16 order.
 529  *
 530  * This optimization should not be necessary for normal UnicodeSets because
 531  * most sets have no strings, and most sets with strings have
 532  * very few very short strings.
 533  * For cases with many strings, it might be better to use a different API
 534  * and implementation with a DFA (state machine).
 535  */
 536
 537 /*
 538  * Algorithm for span(USET_SPAN_CONTAINED)
 539  *
 540  * Theoretical algorithm:
 541  * - Iterate through the string, and at each code point boundary:
 542  *   + If the code point there is in the set, then remember to continue after it.
 543  *   + If a set string matches at the current position, then remember to continue after it.
 544  *   + Either recursively span for each code point or string match,
 545  *     or recursively span for all but the shortest one and
 546  *     iteratively continue the span with the shortest local match.
 547  *   + Remember the longest recursive span (the farthest end point).
 548  *   + If there is no match at the current position, neither for the code point there
 549  *     nor for any set string, then stop and return the longest recursive span length.
 550  *
 551  * Optimized implementation:
 552  *
 553  * (We assume that most sets will have very few very short strings.
 554  * A span using a string-less set is extremely fast.)
 555  *
 556  * Create and cache a spanSet which contains all of the single code points
 557  * of the original set but none of its strings.
 558  *
 559  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
 560  * - Loop:
 561  *   + Try to match each set string at the end of the spanLength.
 562  *     ~ Set strings that start with set-contained code points must be matched
 563  *       with a partial overlap because the recursive algorithm would have tried
 564  *       to match them at every position.
 565  *     ~ Set strings that entirely consist of set-contained code points
 566  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
 567  *       recursive algorithm would continue after them anyway
 568  *       and find the longest recursive match from their end.
 569  *     ~ Rather than recursing, note each end point of a set string match.
 570  *   + If no set string matched after spanSet.span(), then return
 571  *     with where the spanSet.span() ended.
 572  *   + If at least one set string matched after spanSet.span(), then
 573  *     pop the shortest string match end point and continue
 574  *     the loop, trying to match all set strings from there.
 575  *   + If at least one more set string matched after a previous string match,
 576  *     then test if the code point after the previous string match is also
 577  *     contained in the set.
 578  *     Continue the loop with the shortest end point of either this code point
 579  *     or a matching set string.
 580  *   + If no more set string matched after a previous string match,
 581  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
 582  *     Stop if spanLength==0, otherwise continue the loop.
 583  *
 584  * By noting each end point of a set string match,
 585  * the function visits each string position at most once and finishes
 586  * in linear time.
 587  *
 588  * The recursive algorithm may visit the same string position many times
 589  * if multiple paths lead to it and finishes in exponential time.
 590  */
 591
 592 /*
 593  * Algorithm for span(USET_SPAN_SIMPLE)
 594  *
 595  * Theoretical algorithm:
 596  * - Iterate through the string, and at each code point boundary:
 597  *   + If the code point there is in the set, then remember to continue after it.
 598  *   + If a set string matches at the current position, then remember to continue after it.
 599  *   + Continue from the farthest match position and ignore all others.
 600  *   + If there is no match at the current position,
 601  *     then stop and return the current position.
 602  *
 603  * Optimized implementation:
 604  *
 605  * (Same assumption and spanSet as above.)
 606  *
 607  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
 608  * - Loop:
 609  *   + Try to match each set string at the end of the spanLength.
 610  *     ~ Set strings that start with set-contained code points must be matched
 611  *       with a partial overlap because the standard algorithm would have tried
 612  *       to match them earlier.
 613  *     ~ Set strings that entirely consist of set-contained code points
 614  *       must be matched with a full overlap because the longest-match algorithm
 615  *       would hide set string matches that end earlier.
 616  *       Such set strings need not be matched earlier inside the code point span
 617  *       because the standard algorithm would then have continued after
 618  *       the set string match anyway.
 619  *     ~ Remember the longest set string match (farthest end point) from the earliest
 620  *       starting point.
 621  *   + If no set string matched after spanSet.span(), then return
 622  *     with where the spanSet.span() ended.
 623  *   + If at least one set string matched, then continue the loop after the
 624  *     longest match from the earliest position.
 625  *   + If no more set string matched after a previous string match,
 626  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
 627  *     Stop if spanLength==0, otherwise continue the loop.
 628  */
 629
 630 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
 631     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 632         return spanNot(s, length);
 633     }
 634     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
 635     if(spanLength==length) {
 636         return length;
 637     }
 638
 639     // Consider strings; they may overlap with the span.
 640     OffsetList offsets;
 641     if(spanCondition==USET_SPAN_CONTAINED) {
 642         // Use offset list to try all possibilities.
 643         offsets.setMaxLength(maxLength16);
 644     }
 645     int32_t pos=spanLength, rest=length-pos;
 646     int32_t i, stringsLength=strings.size();
 647     for(;;) {
 648         if(spanCondition==USET_SPAN_CONTAINED) {
 649             for(i=0; i<stringsLength; ++i) {
 650                 int32_t overlap=spanLengths[i];
 651                 if(overlap==ALL_CP_CONTAINED) {
 652                     continue;  // Irrelevant string.
 653                 }
 654                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 655                 const UChar *s16=string.getBuffer();
 656                 int32_t length16=string.length();
 657
 658                 // Try to match this string at pos-overlap..pos.
 659                 if(overlap>=LONG_SPAN) {
 660                     overlap=length16;
 661                     // While contained: No point matching fully inside the code point span.
 662                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
 663                 }
 664                 if(overlap>spanLength) {
 665                     overlap=spanLength;
 666                 }
 667                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
 668                 for(;;) {
 669                     if(inc>rest) {
 670                         break;
 671                     }
 672                     // Try to match if the increment is not listed already.
 673                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
 674                         if(inc==rest) {
 675                             return length;  // Reached the end of the string.
 676                         }
 677                         offsets.addOffset(inc);
 678                     }
 679                     if(overlap==0) {
 680                         break;
 681                     }
 682                     --overlap;
 683                     ++inc;
 684                 }
 685             }
 686         } else /* USET_SPAN_SIMPLE */ {
 687             int32_t maxInc=0, maxOverlap=0;
 688             for(i=0; i<stringsLength; ++i) {
 689                 int32_t overlap=spanLengths[i];
 690                 // For longest match, we do need to try to match even an all-contained string
 691                 // to find the match from the earliest start.
 692
 693                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 694                 const UChar *s16=string.getBuffer();
 695                 int32_t length16=string.length();
 696
 697                 // Try to match this string at pos-overlap..pos.
 698                 if(overlap>=LONG_SPAN) {
 699                     overlap=length16;
 700                     // Longest match: Need to match fully inside the code point span
 701                     // to find the match from the earliest start.
 702                 }
 703                 if(overlap>spanLength) {
 704                     overlap=spanLength;
 705                 }
 706                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
 707                 for(;;) {
 708                     if(inc>rest || overlap<maxOverlap) {
 709                         break;
 710                     }
 711                     // Try to match if the string is longer or starts earlier.
 712                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
 713                         matches16CPB(s, pos-overlap, length, s16, length16)
 714                     ) {
 715                         maxInc=inc;  // Longest match from earliest start.
 716                         maxOverlap=overlap;
 717                         break;
 718                     }
 719                     --overlap;
 720                     ++inc;
 721                 }
 722             }
 723
 724             if(maxInc!=0 || maxOverlap!=0) {
 725                 // Longest-match algorithm, and there was a string match.
 726                 // Simply continue after it.
 727                 pos+=maxInc;
 728                 rest-=maxInc;
 729                 if(rest==0) {
 730                     return length;  // Reached the end of the string.
 731                 }
 732                 spanLength=0;  // Match strings from after a string match.
 733                 continue;
 734             }
 735         }
 736         // Finished trying to match all strings at pos.
 737
 738         if(spanLength!=0 || pos==0) {
 739             // The position is after an unlimited code point span (spanLength!=0),
 740             // not after a string match.
 741             // The only position where spanLength==0 after a span is pos==0.
 742             // Otherwise, an unlimited code point span is only tried again when no
 743             // strings match, and if such a non-initial span fails we stop.
 744             if(offsets.isEmpty()) {
 745                 return pos;  // No strings matched after a span.
 746             }
 747             // Match strings from after the next string match.
 748         } else {
 749             // The position is after a string match (or a single code point).
 750             if(offsets.isEmpty()) {
 751                 // No more strings matched after a previous string match.
 752                 // Try another code point span from after the last string match.
 753                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
 754                 if( spanLength==rest || // Reached the end of the string, or
 755                     spanLength==0       // neither strings nor span progressed.
 756                 ) {
 757                     return pos+spanLength;
 758                 }
 759                 pos+=spanLength;
 760                 rest-=spanLength;
 761                 continue;  // spanLength>0: Match strings from after a span.
 762             } else {
 763                 // Try to match only one code point from after a string match if some
 764                 // string matched beyond it, so that we try all possible positions
 765                 // and don't overshoot.
 766                 spanLength=spanOne(spanSet, s+pos, rest);
 767                 if(spanLength>0) {
 768                     if(spanLength==rest) {
 769                         return length;  // Reached the end of the string.
 770                     }
 771                     // Match strings after this code point.
 772                     // There cannot be any increments below it because UnicodeSet strings
 773                     // contain multiple code points.
 774                     pos+=spanLength;
 775                     rest-=spanLength;
 776                     offsets.shift(spanLength);
 777                     spanLength=0;
 778                     continue;  // Match strings from after a single code point.
 779                 }
 780                 // Match strings from after the next string match.
 781             }
 782         }
 783         int32_t minOffset=offsets.popMinimum();
 784         pos+=minOffset;
 785         rest-=minOffset;
 786         spanLength=0;  // Match strings from after a string match.
 787     }
 788 }
 789
 790 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
 791     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 792         return spanNotBack(s, length);
 793     }
 794     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
 795     if(pos==0) {
 796         return 0;
 797     }
 798     int32_t spanLength=length-pos;
 799
 800     // Consider strings; they may overlap with the span.
 801     OffsetList offsets;
 802     if(spanCondition==USET_SPAN_CONTAINED) {
 803         // Use offset list to try all possibilities.
 804         offsets.setMaxLength(maxLength16);
 805     }
 806     int32_t i, stringsLength=strings.size();
 807     uint8_t *spanBackLengths=spanLengths;
 808     if(all) {
 809         spanBackLengths+=stringsLength;
 810     }
 811     for(;;) {
 812         if(spanCondition==USET_SPAN_CONTAINED) {
 813             for(i=0; i<stringsLength; ++i) {
 814                 int32_t overlap=spanBackLengths[i];
 815                 if(overlap==ALL_CP_CONTAINED) {
 816                     continue;  // Irrelevant string.
 817                 }
 818                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 819                 const UChar *s16=string.getBuffer();
 820                 int32_t length16=string.length();
 821
 822                 // Try to match this string at pos-(length16-overlap)..pos-length16.
 823                 if(overlap>=LONG_SPAN) {
 824                     overlap=length16;
 825                     // While contained: No point matching fully inside the code point span.
 826                     int32_t len1=0;
 827                     U16_FWD_1(s16, len1, overlap);
 828                     overlap-=len1;  // Length of the string minus the first code point.
 829                 }
 830                 if(overlap>spanLength) {
 831                     overlap=spanLength;
 832                 }
 833                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
 834                 for(;;) {
 835                     if(dec>pos) {
 836                         break;
 837                     }
 838                     // Try to match if the decrement is not listed already.
 839                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
 840                         if(dec==pos) {
 841                             return 0;  // Reached the start of the string.
 842                         }
 843                         offsets.addOffset(dec);
 844                     }
 845                     if(overlap==0) {
 846                         break;
 847                     }
 848                     --overlap;
 849                     ++dec;
 850                 }
 851             }
 852         } else /* USET_SPAN_SIMPLE */ {
 853             int32_t maxDec=0, maxOverlap=0;
 854             for(i=0; i<stringsLength; ++i) {
 855                 int32_t overlap=spanBackLengths[i];
 856                 // For longest match, we do need to try to match even an all-contained string
 857                 // to find the match from the latest end.
 858
 859                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 860                 const UChar *s16=string.getBuffer();
 861                 int32_t length16=string.length();
 862
 863                 // Try to match this string at pos-(length16-overlap)..pos-length16.
 864                 if(overlap>=LONG_SPAN) {
 865                     overlap=length16;
 866                     // Longest match: Need to match fully inside the code point span
 867                     // to find the match from the latest end.
 868                 }
 869                 if(overlap>spanLength) {
 870                     overlap=spanLength;
 871                 }
 872                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
 873                 for(;;) {
 874                     if(dec>pos || overlap<maxOverlap) {
 875                         break;
 876                     }
 877                     // Try to match if the string is longer or ends later.
 878                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
 879                         matches16CPB(s, pos-dec, length, s16, length16)
 880                     ) {
 881                         maxDec=dec;  // Longest match from latest end.
 882                         maxOverlap=overlap;
 883                         break;
 884                     }
 885                     --overlap;
 886                     ++dec;
 887                 }
 888             }
 889
 890             if(maxDec!=0 || maxOverlap!=0) {
 891                 // Longest-match algorithm, and there was a string match.
 892                 // Simply continue before it.
 893                 pos-=maxDec;
 894                 if(pos==0) {
 895                     return 0;  // Reached the start of the string.
 896                 }
 897                 spanLength=0;  // Match strings from before a string match.
 898                 continue;
 899             }
 900         }
 901         // Finished trying to match all strings at pos.
 902
 903         if(spanLength!=0 || pos==length) {
 904             // The position is before an unlimited code point span (spanLength!=0),
 905             // not before a string match.
 906             // The only position where spanLength==0 before a span is pos==length.
 907             // Otherwise, an unlimited code point span is only tried again when no
 908             // strings match, and if such a non-initial span fails we stop.
 909             if(offsets.isEmpty()) {
 910                 return pos;  // No strings matched before a span.
 911             }
 912             // Match strings from before the next string match.
 913         } else {
 914             // The position is before a string match (or a single code point).
 915             if(offsets.isEmpty()) {
 916                 // No more strings matched before a previous string match.
 917                 // Try another code point span from before the last string match.
 918                 int32_t oldPos=pos;
 919                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
 920                 spanLength=oldPos-pos;
 921                 if( pos==0 ||           // Reached the start of the string, or
 922                     spanLength==0       // neither strings nor span progressed.
 923                 ) {
 924                     return pos;
 925                 }
 926                 continue;  // spanLength>0: Match strings from before a span.
 927             } else {
 928                 // Try to match only one code point from before a string match if some
 929                 // string matched beyond it, so that we try all possible positions
 930                 // and don't overshoot.
 931                 spanLength=spanOneBack(spanSet, s, pos);
 932                 if(spanLength>0) {
 933                     if(spanLength==pos) {
 934                         return 0;  // Reached the start of the string.
 935                     }
 936                     // Match strings before this code point.
 937                     // There cannot be any decrements below it because UnicodeSet strings
 938                     // contain multiple code points.
 939                     pos-=spanLength;
 940                     offsets.shift(spanLength);
 941                     spanLength=0;
 942                     continue;  // Match strings from before a single code point.
 943                 }
 944                 // Match strings from before the next string match.
 945             }
 946         }
 947         pos-=offsets.popMinimum();
 948         spanLength=0;  // Match strings from before a string match.
 949     }
 950 }
 951
 952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
 953     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 954         return spanNotUTF8(s, length);
 955     }
 956     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
 957     if(spanLength==length) {
 958         return length;
 959     }
 960
 961     // Consider strings; they may overlap with the span.
 962     OffsetList offsets;
 963     if(spanCondition==USET_SPAN_CONTAINED) {
 964         // Use offset list to try all possibilities.
 965         offsets.setMaxLength(maxLength8);
 966     }
 967     int32_t pos=spanLength, rest=length-pos;
 968     int32_t i, stringsLength=strings.size();
 969     uint8_t *spanUTF8Lengths=spanLengths;
 970     if(all) {
 971         spanUTF8Lengths+=2*stringsLength;
 972     }
 973     for(;;) {
 974         const uint8_t *s8=utf8;
 975         int32_t length8;
 976         if(spanCondition==USET_SPAN_CONTAINED) {
 977             for(i=0; i<stringsLength; ++i) {
 978                 length8=utf8Lengths[i];
 979                 if(length8==0) {
 980                     continue;  // String not representable in UTF-8.
 981                 }
 982                 int32_t overlap=spanUTF8Lengths[i];
 983                 if(overlap==ALL_CP_CONTAINED) {
 984                     s8+=length8;
 985                     continue;  // Irrelevant string.
 986                 }
 987
 988                 // Try to match this string at pos-overlap..pos.
 989                 if(overlap>=LONG_SPAN) {
 990                     overlap=length8;
 991                     // While contained: No point matching fully inside the code point span.
 992                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
 993                 }
 994                 if(overlap>spanLength) {
 995                     overlap=spanLength;
 996                 }
 997                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
 998                 for(;;) {
 999                     if(inc>rest) {
1000                         break;
1001                     }
1002                     // Try to match if the increment is not listed already.
1003                     // Match at code point boundaries. (The UTF-8 strings were converted
1004                     // from UTF-16 and are guaranteed to be well-formed.)
1005                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1006                         !offsets.containsOffset(inc) &&
1007                         matches8(s+pos-overlap, s8, length8)
1008
1009                     ) {
1010                         if(inc==rest) {
1011                             return length;  // Reached the end of the string.
1012                         }
1013                         offsets.addOffset(inc);
1014                     }
1015                     if(overlap==0) {
1016                         break;
1017                     }
1018                     --overlap;
1019                     ++inc;
1020                 }
1021                 s8+=length8;
1022             }
1023         } else /* USET_SPAN_SIMPLE */ {
1024             int32_t maxInc=0, maxOverlap=0;
1025             for(i=0; i<stringsLength; ++i) {
1026                 length8=utf8Lengths[i];
1027                 if(length8==0) {
1028                     continue;  // String not representable in UTF-8.
1029                 }
1030                 int32_t overlap=spanUTF8Lengths[i];
1031                 // For longest match, we do need to try to match even an all-contained string
1032                 // to find the match from the earliest start.
1033
1034                 // Try to match this string at pos-overlap..pos.
1035                 if(overlap>=LONG_SPAN) {
1036                     overlap=length8;
1037                     // Longest match: Need to match fully inside the code point span
1038                     // to find the match from the earliest start.
1039                 }
1040                 if(overlap>spanLength) {
1041                     overlap=spanLength;
1042                 }
1043                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1044                 for(;;) {
1045                     if(inc>rest || overlap<maxOverlap) {
1046                         break;
1047                     }
1048                     // Try to match if the string is longer or starts earlier.
1049                     // Match at code point boundaries. (The UTF-8 strings were converted
1050                     // from UTF-16 and are guaranteed to be well-formed.)
1051                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1052                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1053                         matches8(s+pos-overlap, s8, length8)
1054
1055                     ) {
1056                         maxInc=inc;  // Longest match from earliest start.
1057                         maxOverlap=overlap;
1058                         break;
1059                     }
1060                     --overlap;
1061                     ++inc;
1062                 }
1063                 s8+=length8;
1064             }
1065
1066             if(maxInc!=0 || maxOverlap!=0) {
1067                 // Longest-match algorithm, and there was a string match.
1068                 // Simply continue after it.
1069                 pos+=maxInc;
1070                 rest-=maxInc;
1071                 if(rest==0) {
1072                     return length;  // Reached the end of the string.
1073                 }
1074                 spanLength=0;  // Match strings from after a string match.
1075                 continue;
1076             }
1077         }
1078         // Finished trying to match all strings at pos.
1079
1080         if(spanLength!=0 || pos==0) {
1081             // The position is after an unlimited code point span (spanLength!=0),
1082             // not after a string match.
1083             // The only position where spanLength==0 after a span is pos==0.
1084             // Otherwise, an unlimited code point span is only tried again when no
1085             // strings match, and if such a non-initial span fails we stop.
1086             if(offsets.isEmpty()) {
1087                 return pos;  // No strings matched after a span.
1088             }
1089             // Match strings from after the next string match.
1090         } else {
1091             // The position is after a string match (or a single code point).
1092             if(offsets.isEmpty()) {
1093                 // No more strings matched after a previous string match.
1094                 // Try another code point span from after the last string match.
1095                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1096                 if( spanLength==rest || // Reached the end of the string, or
1097                     spanLength==0       // neither strings nor span progressed.
1098                 ) {
1099                     return pos+spanLength;
1100                 }
1101                 pos+=spanLength;
1102                 rest-=spanLength;
1103                 continue;  // spanLength>0: Match strings from after a span.
1104             } else {
1105                 // Try to match only one code point from after a string match if some
1106                 // string matched beyond it, so that we try all possible positions
1107                 // and don't overshoot.
1108                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1109                 if(spanLength>0) {
1110                     if(spanLength==rest) {
1111                         return length;  // Reached the end of the string.
1112                     }
1113                     // Match strings after this code point.
1114                     // There cannot be any increments below it because UnicodeSet strings
1115                     // contain multiple code points.
1116                     pos+=spanLength;
1117                     rest-=spanLength;
1118                     offsets.shift(spanLength);
1119                     spanLength=0;
1120                     continue;  // Match strings from after a single code point.
1121                 }
1122                 // Match strings from after the next string match.
1123             }
1124         }
1125         int32_t minOffset=offsets.popMinimum();
1126         pos+=minOffset;
1127         rest-=minOffset;
1128         spanLength=0;  // Match strings from after a string match.
1129     }
1130 }
1131
1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1133     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1134         return spanNotBackUTF8(s, length);
1135     }
1136     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1137     if(pos==0) {
1138         return 0;
1139     }
1140     int32_t spanLength=length-pos;
1141
1142     // Consider strings; they may overlap with the span.
1143     OffsetList offsets;
1144     if(spanCondition==USET_SPAN_CONTAINED) {
1145         // Use offset list to try all possibilities.
1146         offsets.setMaxLength(maxLength8);
1147     }
1148     int32_t i, stringsLength=strings.size();
1149     uint8_t *spanBackUTF8Lengths=spanLengths;
1150     if(all) {
1151         spanBackUTF8Lengths+=3*stringsLength;
1152     }
1153     for(;;) {
1154         const uint8_t *s8=utf8;
1155         int32_t length8;
1156         if(spanCondition==USET_SPAN_CONTAINED) {
1157             for(i=0; i<stringsLength; ++i) {
1158                 length8=utf8Lengths[i];
1159                 if(length8==0) {
1160                     continue;  // String not representable in UTF-8.
1161                 }
1162                 int32_t overlap=spanBackUTF8Lengths[i];
1163                 if(overlap==ALL_CP_CONTAINED) {
1164                     s8+=length8;
1165                     continue;  // Irrelevant string.
1166                 }
1167
1168                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1169                 if(overlap>=LONG_SPAN) {
1170                     overlap=length8;
1171                     // While contained: No point matching fully inside the code point span.
1172                     int32_t len1=0;
1173                     U8_FWD_1(s8, len1, overlap);
1174                     overlap-=len1;  // Length of the string minus the first code point.
1175                 }
1176                 if(overlap>spanLength) {
1177                     overlap=spanLength;
1178                 }
1179                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1180                 for(;;) {
1181                     if(dec>pos) {
1182                         break;
1183                     }
1184                     // Try to match if the decrement is not listed already.
1185                     // Match at code point boundaries. (The UTF-8 strings were converted
1186                     // from UTF-16 and are guaranteed to be well-formed.)
1187                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1188                         !offsets.containsOffset(dec) &&
1189                         matches8(s+pos-dec, s8, length8)
1190                     ) {
1191                         if(dec==pos) {
1192                             return 0;  // Reached the start of the string.
1193                         }
1194                         offsets.addOffset(dec);
1195                     }
1196                     if(overlap==0) {
1197                         break;
1198                     }
1199                     --overlap;
1200                     ++dec;
1201                 }
1202                 s8+=length8;
1203             }
1204         } else /* USET_SPAN_SIMPLE */ {
1205             int32_t maxDec=0, maxOverlap=0;
1206             for(i=0; i<stringsLength; ++i) {
1207                 length8=utf8Lengths[i];
1208                 if(length8==0) {
1209                     continue;  // String not representable in UTF-8.
1210                 }
1211                 int32_t overlap=spanBackUTF8Lengths[i];
1212                 // For longest match, we do need to try to match even an all-contained string
1213                 // to find the match from the latest end.
1214
1215                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1216                 if(overlap>=LONG_SPAN) {
1217                     overlap=length8;
1218                     // Longest match: Need to match fully inside the code point span
1219                     // to find the match from the latest end.
1220                 }
1221                 if(overlap>spanLength) {
1222                     overlap=spanLength;
1223                 }
1224                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1225                 for(;;) {
1226                     if(dec>pos || overlap<maxOverlap) {
1227                         break;
1228                     }
1229                     // Try to match if the string is longer or ends later.
1230                     // Match at code point boundaries. (The UTF-8 strings were converted
1231                     // from UTF-16 and are guaranteed to be well-formed.)
1232                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1233                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1234                         matches8(s+pos-dec, s8, length8)
1235                     ) {
1236                         maxDec=dec;  // Longest match from latest end.
1237                         maxOverlap=overlap;
1238                         break;
1239                     }
1240                     --overlap;
1241                     ++dec;
1242                 }
1243                 s8+=length8;
1244             }
1245
1246             if(maxDec!=0 || maxOverlap!=0) {
1247                 // Longest-match algorithm, and there was a string match.
1248                 // Simply continue before it.
1249                 pos-=maxDec;
1250                 if(pos==0) {
1251                     return 0;  // Reached the start of the string.
1252                 }
1253                 spanLength=0;  // Match strings from before a string match.
1254                 continue;
1255             }
1256         }
1257         // Finished trying to match all strings at pos.
1258
1259         if(spanLength!=0 || pos==length) {
1260             // The position is before an unlimited code point span (spanLength!=0),
1261             // not before a string match.
1262             // The only position where spanLength==0 before a span is pos==length.
1263             // Otherwise, an unlimited code point span is only tried again when no
1264             // strings match, and if such a non-initial span fails we stop.
1265             if(offsets.isEmpty()) {
1266                 return pos;  // No strings matched before a span.
1267             }
1268             // Match strings from before the next string match.
1269         } else {
1270             // The position is before a string match (or a single code point).
1271             if(offsets.isEmpty()) {
1272                 // No more strings matched before a previous string match.
1273                 // Try another code point span from before the last string match.
1274                 int32_t oldPos=pos;
1275                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1276                 spanLength=oldPos-pos;
1277                 if( pos==0 ||           // Reached the start of the string, or
1278                     spanLength==0       // neither strings nor span progressed.
1279                 ) {
1280                     return pos;
1281                 }
1282                 continue;  // spanLength>0: Match strings from before a span.
1283             } else {
1284                 // Try to match only one code point from before a string match if some
1285                 // string matched beyond it, so that we try all possible positions
1286                 // and don't overshoot.
1287                 spanLength=spanOneBackUTF8(spanSet, s, pos);
1288                 if(spanLength>0) {
1289                     if(spanLength==pos) {
1290                         return 0;  // Reached the start of the string.
1291                     }
1292                     // Match strings before this code point.
1293                     // There cannot be any decrements below it because UnicodeSet strings
1294                     // contain multiple code points.
1295                     pos-=spanLength;
1296                     offsets.shift(spanLength);
1297                     spanLength=0;
1298                     continue;  // Match strings from before a single code point.
1299                 }
1300                 // Match strings from before the next string match.
1301             }
1302         }
1303         pos-=offsets.popMinimum();
1304         spanLength=0;  // Match strings from before a string match.
1305     }
1306 }
1307
1308 /*
1309  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1310  *
1311  * Theoretical algorithm:
1312  * - Iterate through the string, and at each code point boundary:
1313  *   + If the code point there is in the set, then return with the current position.
1314  *   + If a set string matches at the current position, then return with the current position.
1315  *
1316  * Optimized implementation:
1317  *
1318  * (Same assumption as for span() above.)
1319  *
1320  * Create and cache a spanNotSet which contains all of the single code points
1321  * of the original set but none of its strings.
1322  * For each set string add its initial code point to the spanNotSet.
1323  * (Also add its final code point for spanNotBack().)
1324  *
1325  * - Loop:
1326  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1327  *   + If the current code point is in the original set, then
1328  *     return the current position.
1329  *   + If any set string matches at the current position, then
1330  *     return the current position.
1331  *   + If there is no match at the current position, neither for the code point there
1332  *     nor for any set string, then skip this code point and continue the loop.
1333  *     This happens for set-string-initial code points that were added to spanNotSet
1334  *     when there is not actually a match for such a set string.
1335  */
1336
1337 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1338     int32_t pos=0, rest=length;
1339     int32_t i, stringsLength=strings.size();
1340     do {
1341         // Span until we find a code point from the set,
1342         // or a code point that starts or ends some string.
1343         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1344         if(i==rest) {
1345             return length;  // Reached the end of the string.
1346         }
1347         pos+=i;
1348         rest-=i;
1349
1350         // Check whether the current code point is in the original set,
1351         // without the string starts and ends.
1352         int32_t cpLength=spanOne(spanSet, s+pos, rest);
1353         if(cpLength>0) {
1354             return pos;  // There is a set element at pos.
1355         }
1356
1357         // Try to match the strings at pos.
1358         for(i=0; i<stringsLength; ++i) {
1359             if(spanLengths[i]==ALL_CP_CONTAINED) {
1360                 continue;  // Irrelevant string.
1361             }
1362             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1363             const UChar *s16=string.getBuffer();
1364             int32_t length16=string.length();
1365             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1366                 return pos;  // There is a set element at pos.
1367             }
1368         }
1369
1370         // The span(while not contained) ended on a string start/end which is
1371         // not in the original set. Skip this code point and continue.
1372         // cpLength<0
1373         pos-=cpLength;
1374         rest+=cpLength;
1375     } while(rest!=0);
1376     return length;  // Reached the end of the string.
1377 }
1378
1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1380     int32_t pos=length;
1381     int32_t i, stringsLength=strings.size();
1382     do {
1383         // Span until we find a code point from the set,
1384         // or a code point that starts or ends some string.
1385         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1386         if(pos==0) {
1387             return 0;  // Reached the start of the string.
1388         }
1389
1390         // Check whether the current code point is in the original set,
1391         // without the string starts and ends.
1392         int32_t cpLength=spanOneBack(spanSet, s, pos);
1393         if(cpLength>0) {
1394             return pos;  // There is a set element at pos.
1395         }
1396
1397         // Try to match the strings at pos.
1398         for(i=0; i<stringsLength; ++i) {
1399             // Use spanLengths rather than a spanBackLengths pointer because
1400             // it is easier and we only need to know whether the string is irrelevant
1401             // which is the same in either array.
1402             if(spanLengths[i]==ALL_CP_CONTAINED) {
1403                 continue;  // Irrelevant string.
1404             }
1405             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1406             const UChar *s16=string.getBuffer();
1407             int32_t length16=string.length();
1408             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1409                 return pos;  // There is a set element at pos.
1410             }
1411         }
1412
1413         // The span(while not contained) ended on a string start/end which is
1414         // not in the original set. Skip this code point and continue.
1415         // cpLength<0
1416         pos+=cpLength;
1417     } while(pos!=0);
1418     return 0;  // Reached the start of the string.
1419 }
1420
1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1422     int32_t pos=0, rest=length;
1423     int32_t i, stringsLength=strings.size();
1424     uint8_t *spanUTF8Lengths=spanLengths;
1425     if(all) {
1426         spanUTF8Lengths+=2*stringsLength;
1427     }
1428     do {
1429         // Span until we find a code point from the set,
1430         // or a code point that starts or ends some string.
1431         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1432         if(i==rest) {
1433             return length;  // Reached the end of the string.
1434         }
1435         pos+=i;
1436         rest-=i;
1437
1438         // Check whether the current code point is in the original set,
1439         // without the string starts and ends.
1440         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1441         if(cpLength>0) {
1442             return pos;  // There is a set element at pos.
1443         }
1444
1445         // Try to match the strings at pos.
1446         const uint8_t *s8=utf8;
1447         int32_t length8;
1448         for(i=0; i<stringsLength; ++i) {
1449             length8=utf8Lengths[i];
1450             // ALL_CP_CONTAINED: Irrelevant string.
1451             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1452                 return pos;  // There is a set element at pos.
1453             }
1454             s8+=length8;
1455         }
1456
1457         // The span(while not contained) ended on a string start/end which is
1458         // not in the original set. Skip this code point and continue.
1459         // cpLength<0
1460         pos-=cpLength;
1461         rest+=cpLength;
1462     } while(rest!=0);
1463     return length;  // Reached the end of the string.
1464 }
1465
1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1467     int32_t pos=length;
1468     int32_t i, stringsLength=strings.size();
1469     uint8_t *spanBackUTF8Lengths=spanLengths;
1470     if(all) {
1471         spanBackUTF8Lengths+=3*stringsLength;
1472     }
1473     do {
1474         // Span until we find a code point from the set,
1475         // or a code point that starts or ends some string.
1476         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1477         if(pos==0) {
1478             return 0;  // Reached the start of the string.
1479         }
1480
1481         // Check whether the current code point is in the original set,
1482         // without the string starts and ends.
1483         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1484         if(cpLength>0) {
1485             return pos;  // There is a set element at pos.
1486         }
1487
1488         // Try to match the strings at pos.
1489         const uint8_t *s8=utf8;
1490         int32_t length8;
1491         for(i=0; i<stringsLength; ++i) {
1492             length8=utf8Lengths[i];
1493             // ALL_CP_CONTAINED: Irrelevant string.
1494             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1495                 return pos;  // There is a set element at pos.
1496             }
1497             s8+=length8;
1498         }
1499
1500         // The span(while not contained) ended on a string start/end which is
1501         // not in the original set. Skip this code point and continue.
1502         // cpLength<0
1503         pos+=cpLength;
1504     } while(pos!=0);
1505     return 0;  // Reached the start of the string.
1506 }
1507
1508 U_NAMESPACE_END