icuSources/common/unisetspan.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2007-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  unisetspan.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2007mar01
  14 *   created by: Markus W. Scherer
  15 */
  16
  17 #include "unicode/utypes.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/ustring.h"
  20 #include "unicode/utf8.h"
  21 #include "unicode/utf16.h"
  22 #include "cmemory.h"
  23 #include "uvector.h"
  24 #include "unisetspan.h"
  25
  26 U_NAMESPACE_BEGIN
  27
  28 /*
  29  * List of offsets from the current position from where to try matching
  30  * a code point or a string.
  31  * Store offsets rather than indexes to simplify the code and use the same list
  32  * for both increments (in span()) and decrements (in spanBack()).
  33  *
  34  * Assumption: The maximum offset is limited, and the offsets that are stored
  35  * at any one time are relatively dense, that is, there are normally no gaps of
  36  * hundreds or thousands of offset values.
  37  *
  38  * The implementation uses a circular buffer of byte flags,
  39  * each indicating whether the corresponding offset is in the list.
  40  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
  41  * physically moving part of the list.
  42  *
  43  * Note: In principle, the caller should setMaxLength() to the maximum of the
  44  * max string length and U16_LENGTH/U8_LENGTH to account for
  45  * "long" single code points.
  46  * However, this implementation uses at least a staticList with more than
  47  * U8_LENGTH entries anyway.
  48  *
  49  * Note: If maxLength were guaranteed to be no more than 32 or 64,
  50  * the list could be stored as bit flags in a single integer.
  51  * Rather than handling a circular buffer with a start list index,
  52  * the integer would simply be shifted when lower offsets are removed.
  53  * UnicodeSet does not have a limit on the lengths of strings.
  54  */
  55 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
  56 public:
  57     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
  58
  59     ~OffsetList() {
  60         if(list!=staticList) {
  61             uprv_free(list);
  62         }
  63     }
  64
  65     // Call exactly once if the list is to be used.
  66     void setMaxLength(int32_t maxLength) {
  67         if(maxLength<=(int32_t)sizeof(staticList)) {
  68             capacity=(int32_t)sizeof(staticList);
  69         } else {
  70             UBool *l=(UBool *)uprv_malloc(maxLength);
  71             if(l!=NULL) {
  72                 list=l;
  73                 capacity=maxLength;
  74             }
  75         }
  76         uprv_memset(list, 0, capacity);
  77     }
  78
  79     void clear() {
  80         uprv_memset(list, 0, capacity);
  81         start=length=0;
  82     }
  83
  84     UBool isEmpty() const {
  85         return (UBool)(length==0);
  86     }
  87
  88     // Reduce all stored offsets by delta, used when the current position
  89     // moves by delta.
  90     // There must not be any offsets lower than delta.
  91     // If there is an offset equal to delta, it is removed.
  92     // delta=[1..maxLength]
  93     void shift(int32_t delta) {
  94         int32_t i=start+delta;
  95         if(i>=capacity) {
  96             i-=capacity;
  97         }
  98         if(list[i]) {
  99             list[i]=FALSE;
 100             --length;
 101         }
 102         start=i;
 103     }
 104
 105     // Add an offset. The list must not contain it yet.
 106     // offset=[1..maxLength]
 107     void addOffset(int32_t offset) {
 108         int32_t i=start+offset;
 109         if(i>=capacity) {
 110             i-=capacity;
 111         }
 112         list[i]=TRUE;
 113         ++length;
 114     }
 115
 116     // offset=[1..maxLength]
 117     UBool containsOffset(int32_t offset) const {
 118         int32_t i=start+offset;
 119         if(i>=capacity) {
 120             i-=capacity;
 121         }
 122         return list[i];
 123     }
 124
 125     // Find the lowest stored offset from a non-empty list, remove it,
 126     // and reduce all other offsets by this minimum.
 127     // Returns [1..maxLength].
 128     int32_t popMinimum() {
 129         // Look for the next offset in list[start+1..capacity-1].
 130         int32_t i=start, result;
 131         while(++i<capacity) {
 132             if(list[i]) {
 133                 list[i]=FALSE;
 134                 --length;
 135                 result=i-start;
 136                 start=i;
 137                 return result;
 138             }
 139         }
 140         // i==capacity
 141
 142         // Wrap around and look for the next offset in list[0..start].
 143         // Since the list is not empty, there will be one.
 144         result=capacity-start;
 145         i=0;
 146         while(!list[i]) {
 147             ++i;
 148         }
 149         list[i]=FALSE;
 150         --length;
 151         start=i;
 152         return result+=i;
 153     }
 154
 155 private:
 156     UBool *list;
 157     int32_t capacity;
 158     int32_t length;
 159     int32_t start;
 160
 161     UBool staticList[16];
 162 };
 163
 164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
 165 static int32_t
 166 getUTF8Length(const UChar *s, int32_t length) {
 167     UErrorCode errorCode=U_ZERO_ERROR;
 168     int32_t length8=0;
 169     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
 170     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
 171         return length8;
 172     } else {
 173         // The string contains an unpaired surrogate.
 174         // Ignore this string.
 175         return 0;
 176     }
 177 }
 178
 179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
 180 static int32_t
 181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
 182     UErrorCode errorCode=U_ZERO_ERROR;
 183     int32_t length8=0;
 184     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
 185     if(U_SUCCESS(errorCode)) {
 186         return length8;
 187     } else {
 188         // The string contains an unpaired surrogate.
 189         // Ignore this string.
 190         return 0;
 191     }
 192 }
 193
 194 static inline uint8_t
 195 makeSpanLengthByte(int32_t spanLength) {
 196     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
 197     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
 198 }
 199
 200 // Construct for all variants of span(), or only for any one variant.
 201 // Initialize as little as possible, for single use.
 202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
 203                                            const UVector &setStrings,
 204                                            uint32_t which)
 205         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
 206           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
 207           utf8Length(0),
 208           maxLength16(0), maxLength8(0),
 209           all((UBool)(which==ALL)) {
 210     spanSet.retainAll(set);
 211     if(which&NOT_CONTAINED) {
 212         // Default to the same sets.
 213         // addToSpanNotSet() will create a separate set if necessary.
 214         pSpanNotSet=&spanSet;
 215     }
 216
 217     // Determine if the strings even need to be taken into account at all for span() etc.
 218     // If any string is relevant, then all strings need to be used for
 219     // span(longest match) but only the relevant ones for span(while contained).
 220     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
 221     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
 222     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
 223     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
 224     int32_t stringsLength=strings.size();
 225
 226     int32_t i, spanLength;
 227     UBool someRelevant=FALSE;
 228     for(i=0; i<stringsLength; ++i) {
 229         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 230         const UChar *s16=string.getBuffer();
 231         int32_t length16=string.length();
 232         UBool thisRelevant;
 233         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
 234         if(spanLength<length16) {  // Relevant string.
 235             someRelevant=thisRelevant=TRUE;
 236         } else {
 237             thisRelevant=FALSE;
 238         }
 239         if((which&UTF16) && length16>maxLength16) {
 240             maxLength16=length16;
 241         }
 242         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
 243             int32_t length8=getUTF8Length(s16, length16);
 244             utf8Length+=length8;
 245             if(length8>maxLength8) {
 246                 maxLength8=length8;
 247             }
 248         }
 249     }
 250     if(!someRelevant) {
 251         maxLength16=maxLength8=0;
 252         return;
 253     }
 254
 255     // Freeze after checking for the need to use strings at all because freezing
 256     // a set takes some time and memory which are wasted if there are no relevant strings.
 257     if(all) {
 258         spanSet.freeze();
 259     }
 260
 261     uint8_t *spanBackLengths;
 262     uint8_t *spanUTF8Lengths;
 263     uint8_t *spanBackUTF8Lengths;
 264
 265     // Allocate a block of meta data.
 266     int32_t allocSize;
 267     if(all) {
 268         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
 269         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
 270     } else {
 271         allocSize=stringsLength;  // One set of span lengths.
 272         if(which&UTF8) {
 273             // UTF-8 lengths and UTF-8 strings.
 274             allocSize+=stringsLength*4+utf8Length;
 275         }
 276     }
 277     if(allocSize<=(int32_t)sizeof(staticLengths)) {
 278         utf8Lengths=staticLengths;
 279     } else {
 280         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
 281         if(utf8Lengths==NULL) {
 282             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
 283             return;  // Out of memory.
 284         }
 285     }
 286
 287     if(all) {
 288         // Store span lengths for all span() variants.
 289         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 290         spanBackLengths=spanLengths+stringsLength;
 291         spanUTF8Lengths=spanBackLengths+stringsLength;
 292         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
 293         utf8=spanBackUTF8Lengths+stringsLength;
 294     } else {
 295         // Store span lengths for only one span() variant.
 296         if(which&UTF8) {
 297             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 298             utf8=spanLengths+stringsLength;
 299         } else {
 300             spanLengths=(uint8_t *)utf8Lengths;
 301         }
 302         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
 303     }
 304
 305     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
 306     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
 307
 308     for(i=0; i<stringsLength; ++i) {
 309         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 310         const UChar *s16=string.getBuffer();
 311         int32_t length16=string.length();
 312         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
 313         if(spanLength<length16) {  // Relevant string.
 314             if(which&UTF16) {
 315                 if(which&CONTAINED) {
 316                     if(which&FWD) {
 317                         spanLengths[i]=makeSpanLengthByte(spanLength);
 318                     }
 319                     if(which&BACK) {
 320                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
 321                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
 322                     }
 323                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
 324                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
 325                 }
 326             }
 327             if(which&UTF8) {
 328                 uint8_t *s8=utf8+utf8Count;
 329                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
 330                 utf8Count+=utf8Lengths[i]=length8;
 331                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
 332                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
 333                 } else {  // Relevant for UTF-8.
 334                     if(which&CONTAINED) {
 335                         if(which&FWD) {
 336                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
 337                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
 338                         }
 339                         if(which&BACK) {
 340                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
 341                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
 342                         }
 343                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
 344                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
 345                     }
 346                 }
 347             }
 348             if(which&NOT_CONTAINED) {
 349                 // Add string start and end code points to the spanNotSet so that
 350                 // a span(while not contained) stops before any string.
 351                 UChar32 c;
 352                 if(which&FWD) {
 353                     int32_t len=0;
 354                     U16_NEXT(s16, len, length16, c);
 355                     addToSpanNotSet(c);
 356                 }
 357                 if(which&BACK) {
 358                     int32_t len=length16;
 359                     U16_PREV(s16, 0, len, c);
 360                     addToSpanNotSet(c);
 361                 }
 362             }
 363         } else {  // Irrelevant string.
 364             if(which&UTF8) {
 365                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
 366                     uint8_t *s8=utf8+utf8Count;
 367                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
 368                     utf8Count+=utf8Lengths[i]=length8;
 369                 } else {
 370                     utf8Lengths[i]=0;
 371                 }
 372             }
 373             if(all) {
 374                 spanLengths[i]=spanBackLengths[i]=
 375                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
 376                         (uint8_t)ALL_CP_CONTAINED;
 377             } else {
 378                 // All spanXYZLengths pointers contain the same address.
 379                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
 380             }
 381         }
 382     }
 383
 384     // Finish.
 385     if(all) {
 386         pSpanNotSet->freeze();
 387     }
 388 }
 389
 390 // Copy constructor. Assumes which==ALL for a frozen set.
 391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
 392                                            const UVector &newParentSetStrings)
 393         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
 394           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
 395           utf8Length(otherStringSpan.utf8Length),
 396           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
 397           all(TRUE) {
 398     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
 399         pSpanNotSet=&spanSet;
 400     } else {
 401         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
 402     }
 403
 404     // Allocate a block of meta data.
 405     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
 406     int32_t stringsLength=strings.size();
 407     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
 408     if(allocSize<=(int32_t)sizeof(staticLengths)) {
 409         utf8Lengths=staticLengths;
 410     } else {
 411         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
 412         if(utf8Lengths==NULL) {
 413             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
 414             return;  // Out of memory.
 415         }
 416     }
 417
 418     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 419     utf8=spanLengths+stringsLength*4;
 420     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
 421 }
 422
 423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
 424     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
 425         delete pSpanNotSet;
 426     }
 427     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
 428         uprv_free(utf8Lengths);
 429     }
 430 }
 431
 432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
 433     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
 434         if(spanSet.contains(c)) {
 435             return;  // Nothing to do.
 436         }
 437         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
 438         if(newSet==NULL) {
 439             return;  // Out of memory.
 440         } else {
 441             pSpanNotSet=newSet;
 442         }
 443     }
 444     pSpanNotSet->add(c);
 445 }
 446
 447 // Compare strings without any argument checks. Requires length>0.
 448 static inline UBool
 449 matches16(const UChar *s, const UChar *t, int32_t length) {
 450     do {
 451         if(*s++!=*t++) {
 452             return FALSE;
 453         }
 454     } while(--length>0);
 455     return TRUE;
 456 }
 457
 458 static inline UBool
 459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
 460     do {
 461         if(*s++!=*t++) {
 462             return FALSE;
 463         }
 464     } while(--length>0);
 465     return TRUE;
 466 }
 467
 468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
 469 // at code point boundaries.
 470 // That is, each edge of a match must not be in the middle of a surrogate pair.
 471 static inline UBool
 472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
 473     s+=start;
 474     limit-=start;
 475     return matches16(s, t, length) &&
 476            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
 477            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
 478 }
 479
 480 // Does the set contain the next code point?
 481 // If so, return its length; otherwise return its negative length.
 482 static inline int32_t
 483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
 484     UChar c=*s, c2;
 485     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
 486         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
 487     }
 488     return set.contains(c) ? 1 : -1;
 489 }
 490
 491 static inline int32_t
 492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
 493     UChar c=s[length-1], c2;
 494     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
 495         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
 496     }
 497     return set.contains(c) ? 1 : -1;
 498 }
 499
 500 static inline int32_t
 501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
 502     UChar32 c=*s;
 503     if((int8_t)c>=0) {
 504         return set.contains(c) ? 1 : -1;
 505     }
 506     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
 507     int32_t i=0;
 508     U8_NEXT_OR_FFFD(s, i, length, c);
 509     return set.contains(c) ? i : -i;
 510 }
 511
 512 static inline int32_t
 513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
 514     UChar32 c=s[length-1];
 515     if((int8_t)c>=0) {
 516         return set.contains(c) ? 1 : -1;
 517     }
 518     int32_t i=length-1;
 519     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
 520     length-=i;
 521     return set.contains(c) ? length : -length;
 522 }
 523
 524 /*
 525  * Note: In span() when spanLength==0 (after a string match, or at the beginning
 526  * after an empty code point span) and in spanNot() and spanNotUTF8(),
 527  * string matching could use a binary search
 528  * because all string matches are done from the same start index.
 529  *
 530  * For UTF-8, this would require a comparison function that returns UTF-16 order.
 531  *
 532  * This optimization should not be necessary for normal UnicodeSets because
 533  * most sets have no strings, and most sets with strings have
 534  * very few very short strings.
 535  * For cases with many strings, it might be better to use a different API
 536  * and implementation with a DFA (state machine).
 537  */
 538
 539 /*
 540  * Algorithm for span(USET_SPAN_CONTAINED)
 541  *
 542  * Theoretical algorithm:
 543  * - Iterate through the string, and at each code point boundary:
 544  *   + If the code point there is in the set, then remember to continue after it.
 545  *   + If a set string matches at the current position, then remember to continue after it.
 546  *   + Either recursively span for each code point or string match,
 547  *     or recursively span for all but the shortest one and
 548  *     iteratively continue the span with the shortest local match.
 549  *   + Remember the longest recursive span (the farthest end point).
 550  *   + If there is no match at the current position, neither for the code point there
 551  *     nor for any set string, then stop and return the longest recursive span length.
 552  *
 553  * Optimized implementation:
 554  *
 555  * (We assume that most sets will have very few very short strings.
 556  * A span using a string-less set is extremely fast.)
 557  *
 558  * Create and cache a spanSet which contains all of the single code points
 559  * of the original set but none of its strings.
 560  *
 561  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
 562  * - Loop:
 563  *   + Try to match each set string at the end of the spanLength.
 564  *     ~ Set strings that start with set-contained code points must be matched
 565  *       with a partial overlap because the recursive algorithm would have tried
 566  *       to match them at every position.
 567  *     ~ Set strings that entirely consist of set-contained code points
 568  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
 569  *       recursive algorithm would continue after them anyway
 570  *       and find the longest recursive match from their end.
 571  *     ~ Rather than recursing, note each end point of a set string match.
 572  *   + If no set string matched after spanSet.span(), then return
 573  *     with where the spanSet.span() ended.
 574  *   + If at least one set string matched after spanSet.span(), then
 575  *     pop the shortest string match end point and continue
 576  *     the loop, trying to match all set strings from there.
 577  *   + If at least one more set string matched after a previous string match,
 578  *     then test if the code point after the previous string match is also
 579  *     contained in the set.
 580  *     Continue the loop with the shortest end point of either this code point
 581  *     or a matching set string.
 582  *   + If no more set string matched after a previous string match,
 583  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
 584  *     Stop if spanLength==0, otherwise continue the loop.
 585  *
 586  * By noting each end point of a set string match,
 587  * the function visits each string position at most once and finishes
 588  * in linear time.
 589  *
 590  * The recursive algorithm may visit the same string position many times
 591  * if multiple paths lead to it and finishes in exponential time.
 592  */
 593
 594 /*
 595  * Algorithm for span(USET_SPAN_SIMPLE)
 596  *
 597  * Theoretical algorithm:
 598  * - Iterate through the string, and at each code point boundary:
 599  *   + If the code point there is in the set, then remember to continue after it.
 600  *   + If a set string matches at the current position, then remember to continue after it.
 601  *   + Continue from the farthest match position and ignore all others.
 602  *   + If there is no match at the current position,
 603  *     then stop and return the current position.
 604  *
 605  * Optimized implementation:
 606  *
 607  * (Same assumption and spanSet as above.)
 608  *
 609  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
 610  * - Loop:
 611  *   + Try to match each set string at the end of the spanLength.
 612  *     ~ Set strings that start with set-contained code points must be matched
 613  *       with a partial overlap because the standard algorithm would have tried
 614  *       to match them earlier.
 615  *     ~ Set strings that entirely consist of set-contained code points
 616  *       must be matched with a full overlap because the longest-match algorithm
 617  *       would hide set string matches that end earlier.
 618  *       Such set strings need not be matched earlier inside the code point span
 619  *       because the standard algorithm would then have continued after
 620  *       the set string match anyway.
 621  *     ~ Remember the longest set string match (farthest end point) from the earliest
 622  *       starting point.
 623  *   + If no set string matched after spanSet.span(), then return
 624  *     with where the spanSet.span() ended.
 625  *   + If at least one set string matched, then continue the loop after the
 626  *     longest match from the earliest position.
 627  *   + If no more set string matched after a previous string match,
 628  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
 629  *     Stop if spanLength==0, otherwise continue the loop.
 630  */
 631
 632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
 633     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 634         return spanNot(s, length);
 635     }
 636     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
 637     if(spanLength==length) {
 638         return length;
 639     }
 640
 641     // Consider strings; they may overlap with the span.
 642     OffsetList offsets;
 643     if(spanCondition==USET_SPAN_CONTAINED) {
 644         // Use offset list to try all possibilities.
 645         offsets.setMaxLength(maxLength16);
 646     }
 647     int32_t pos=spanLength, rest=length-pos;
 648     int32_t i, stringsLength=strings.size();
 649     for(;;) {
 650         if(spanCondition==USET_SPAN_CONTAINED) {
 651             for(i=0; i<stringsLength; ++i) {
 652                 int32_t overlap=spanLengths[i];
 653                 if(overlap==ALL_CP_CONTAINED) {
 654                     continue;  // Irrelevant string.
 655                 }
 656                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 657                 const UChar *s16=string.getBuffer();
 658                 int32_t length16=string.length();
 659
 660                 // Try to match this string at pos-overlap..pos.
 661                 if(overlap>=LONG_SPAN) {
 662                     overlap=length16;
 663                     // While contained: No point matching fully inside the code point span.
 664                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
 665                 }
 666                 if(overlap>spanLength) {
 667                     overlap=spanLength;
 668                 }
 669                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
 670                 for(;;) {
 671                     if(inc>rest) {
 672                         break;
 673                     }
 674                     // Try to match if the increment is not listed already.
 675                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
 676                         if(inc==rest) {
 677                             return length;  // Reached the end of the string.
 678                         }
 679                         offsets.addOffset(inc);
 680                     }
 681                     if(overlap==0) {
 682                         break;
 683                     }
 684                     --overlap;
 685                     ++inc;
 686                 }
 687             }
 688         } else /* USET_SPAN_SIMPLE */ {
 689             int32_t maxInc=0, maxOverlap=0;
 690             for(i=0; i<stringsLength; ++i) {
 691                 int32_t overlap=spanLengths[i];
 692                 // For longest match, we do need to try to match even an all-contained string
 693                 // to find the match from the earliest start.
 694
 695                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 696                 const UChar *s16=string.getBuffer();
 697                 int32_t length16=string.length();
 698
 699                 // Try to match this string at pos-overlap..pos.
 700                 if(overlap>=LONG_SPAN) {
 701                     overlap=length16;
 702                     // Longest match: Need to match fully inside the code point span
 703                     // to find the match from the earliest start.
 704                 }
 705                 if(overlap>spanLength) {
 706                     overlap=spanLength;
 707                 }
 708                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
 709                 for(;;) {
 710                     if(inc>rest || overlap<maxOverlap) {
 711                         break;
 712                     }
 713                     // Try to match if the string is longer or starts earlier.
 714                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
 715                         matches16CPB(s, pos-overlap, length, s16, length16)
 716                     ) {
 717                         maxInc=inc;  // Longest match from earliest start.
 718                         maxOverlap=overlap;
 719                         break;
 720                     }
 721                     --overlap;
 722                     ++inc;
 723                 }
 724             }
 725
 726             if(maxInc!=0 || maxOverlap!=0) {
 727                 // Longest-match algorithm, and there was a string match.
 728                 // Simply continue after it.
 729                 pos+=maxInc;
 730                 rest-=maxInc;
 731                 if(rest==0) {
 732                     return length;  // Reached the end of the string.
 733                 }
 734                 spanLength=0;  // Match strings from after a string match.
 735                 continue;
 736             }
 737         }
 738         // Finished trying to match all strings at pos.
 739
 740         if(spanLength!=0 || pos==0) {
 741             // The position is after an unlimited code point span (spanLength!=0),
 742             // not after a string match.
 743             // The only position where spanLength==0 after a span is pos==0.
 744             // Otherwise, an unlimited code point span is only tried again when no
 745             // strings match, and if such a non-initial span fails we stop.
 746             if(offsets.isEmpty()) {
 747                 return pos;  // No strings matched after a span.
 748             }
 749             // Match strings from after the next string match.
 750         } else {
 751             // The position is after a string match (or a single code point).
 752             if(offsets.isEmpty()) {
 753                 // No more strings matched after a previous string match.
 754                 // Try another code point span from after the last string match.
 755                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
 756                 if( spanLength==rest || // Reached the end of the string, or
 757                     spanLength==0       // neither strings nor span progressed.
 758                 ) {
 759                     return pos+spanLength;
 760                 }
 761                 pos+=spanLength;
 762                 rest-=spanLength;
 763                 continue;  // spanLength>0: Match strings from after a span.
 764             } else {
 765                 // Try to match only one code point from after a string match if some
 766                 // string matched beyond it, so that we try all possible positions
 767                 // and don't overshoot.
 768                 spanLength=spanOne(spanSet, s+pos, rest);
 769                 if(spanLength>0) {
 770                     if(spanLength==rest) {
 771                         return length;  // Reached the end of the string.
 772                     }
 773                     // Match strings after this code point.
 774                     // There cannot be any increments below it because UnicodeSet strings
 775                     // contain multiple code points.
 776                     pos+=spanLength;
 777                     rest-=spanLength;
 778                     offsets.shift(spanLength);
 779                     spanLength=0;
 780                     continue;  // Match strings from after a single code point.
 781                 }
 782                 // Match strings from after the next string match.
 783             }
 784         }
 785         int32_t minOffset=offsets.popMinimum();
 786         pos+=minOffset;
 787         rest-=minOffset;
 788         spanLength=0;  // Match strings from after a string match.
 789     }
 790 }
 791
 792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
 793     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 794         return spanNotBack(s, length);
 795     }
 796     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
 797     if(pos==0) {
 798         return 0;
 799     }
 800     int32_t spanLength=length-pos;
 801
 802     // Consider strings; they may overlap with the span.
 803     OffsetList offsets;
 804     if(spanCondition==USET_SPAN_CONTAINED) {
 805         // Use offset list to try all possibilities.
 806         offsets.setMaxLength(maxLength16);
 807     }
 808     int32_t i, stringsLength=strings.size();
 809     uint8_t *spanBackLengths=spanLengths;
 810     if(all) {
 811         spanBackLengths+=stringsLength;
 812     }
 813     for(;;) {
 814         if(spanCondition==USET_SPAN_CONTAINED) {
 815             for(i=0; i<stringsLength; ++i) {
 816                 int32_t overlap=spanBackLengths[i];
 817                 if(overlap==ALL_CP_CONTAINED) {
 818                     continue;  // Irrelevant string.
 819                 }
 820                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 821                 const UChar *s16=string.getBuffer();
 822                 int32_t length16=string.length();
 823
 824                 // Try to match this string at pos-(length16-overlap)..pos-length16.
 825                 if(overlap>=LONG_SPAN) {
 826                     overlap=length16;
 827                     // While contained: No point matching fully inside the code point span.
 828                     int32_t len1=0;
 829                     U16_FWD_1(s16, len1, overlap);
 830                     overlap-=len1;  // Length of the string minus the first code point.
 831                 }
 832                 if(overlap>spanLength) {
 833                     overlap=spanLength;
 834                 }
 835                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
 836                 for(;;) {
 837                     if(dec>pos) {
 838                         break;
 839                     }
 840                     // Try to match if the decrement is not listed already.
 841                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
 842                         if(dec==pos) {
 843                             return 0;  // Reached the start of the string.
 844                         }
 845                         offsets.addOffset(dec);
 846                     }
 847                     if(overlap==0) {
 848                         break;
 849                     }
 850                     --overlap;
 851                     ++dec;
 852                 }
 853             }
 854         } else /* USET_SPAN_SIMPLE */ {
 855             int32_t maxDec=0, maxOverlap=0;
 856             for(i=0; i<stringsLength; ++i) {
 857                 int32_t overlap=spanBackLengths[i];
 858                 // For longest match, we do need to try to match even an all-contained string
 859                 // to find the match from the latest end.
 860
 861                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 862                 const UChar *s16=string.getBuffer();
 863                 int32_t length16=string.length();
 864
 865                 // Try to match this string at pos-(length16-overlap)..pos-length16.
 866                 if(overlap>=LONG_SPAN) {
 867                     overlap=length16;
 868                     // Longest match: Need to match fully inside the code point span
 869                     // to find the match from the latest end.
 870                 }
 871                 if(overlap>spanLength) {
 872                     overlap=spanLength;
 873                 }
 874                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
 875                 for(;;) {
 876                     if(dec>pos || overlap<maxOverlap) {
 877                         break;
 878                     }
 879                     // Try to match if the string is longer or ends later.
 880                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
 881                         matches16CPB(s, pos-dec, length, s16, length16)
 882                     ) {
 883                         maxDec=dec;  // Longest match from latest end.
 884                         maxOverlap=overlap;
 885                         break;
 886                     }
 887                     --overlap;
 888                     ++dec;
 889                 }
 890             }
 891
 892             if(maxDec!=0 || maxOverlap!=0) {
 893                 // Longest-match algorithm, and there was a string match.
 894                 // Simply continue before it.
 895                 pos-=maxDec;
 896                 if(pos==0) {
 897                     return 0;  // Reached the start of the string.
 898                 }
 899                 spanLength=0;  // Match strings from before a string match.
 900                 continue;
 901             }
 902         }
 903         // Finished trying to match all strings at pos.
 904
 905         if(spanLength!=0 || pos==length) {
 906             // The position is before an unlimited code point span (spanLength!=0),
 907             // not before a string match.
 908             // The only position where spanLength==0 before a span is pos==length.
 909             // Otherwise, an unlimited code point span is only tried again when no
 910             // strings match, and if such a non-initial span fails we stop.
 911             if(offsets.isEmpty()) {
 912                 return pos;  // No strings matched before a span.
 913             }
 914             // Match strings from before the next string match.
 915         } else {
 916             // The position is before a string match (or a single code point).
 917             if(offsets.isEmpty()) {
 918                 // No more strings matched before a previous string match.
 919                 // Try another code point span from before the last string match.
 920                 int32_t oldPos=pos;
 921                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
 922                 spanLength=oldPos-pos;
 923                 if( pos==0 ||           // Reached the start of the string, or
 924                     spanLength==0       // neither strings nor span progressed.
 925                 ) {
 926                     return pos;
 927                 }
 928                 continue;  // spanLength>0: Match strings from before a span.
 929             } else {
 930                 // Try to match only one code point from before a string match if some
 931                 // string matched beyond it, so that we try all possible positions
 932                 // and don't overshoot.
 933                 spanLength=spanOneBack(spanSet, s, pos);
 934                 if(spanLength>0) {
 935                     if(spanLength==pos) {
 936                         return 0;  // Reached the start of the string.
 937                     }
 938                     // Match strings before this code point.
 939                     // There cannot be any decrements below it because UnicodeSet strings
 940                     // contain multiple code points.
 941                     pos-=spanLength;
 942                     offsets.shift(spanLength);
 943                     spanLength=0;
 944                     continue;  // Match strings from before a single code point.
 945                 }
 946                 // Match strings from before the next string match.
 947             }
 948         }
 949         pos-=offsets.popMinimum();
 950         spanLength=0;  // Match strings from before a string match.
 951     }
 952 }
 953
 954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
 955     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 956         return spanNotUTF8(s, length);
 957     }
 958     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
 959     if(spanLength==length) {
 960         return length;
 961     }
 962
 963     // Consider strings; they may overlap with the span.
 964     OffsetList offsets;
 965     if(spanCondition==USET_SPAN_CONTAINED) {
 966         // Use offset list to try all possibilities.
 967         offsets.setMaxLength(maxLength8);
 968     }
 969     int32_t pos=spanLength, rest=length-pos;
 970     int32_t i, stringsLength=strings.size();
 971     uint8_t *spanUTF8Lengths=spanLengths;
 972     if(all) {
 973         spanUTF8Lengths+=2*stringsLength;
 974     }
 975     for(;;) {
 976         const uint8_t *s8=utf8;
 977         int32_t length8;
 978         if(spanCondition==USET_SPAN_CONTAINED) {
 979             for(i=0; i<stringsLength; ++i) {
 980                 length8=utf8Lengths[i];
 981                 if(length8==0) {
 982                     continue;  // String not representable in UTF-8.
 983                 }
 984                 int32_t overlap=spanUTF8Lengths[i];
 985                 if(overlap==ALL_CP_CONTAINED) {
 986                     s8+=length8;
 987                     continue;  // Irrelevant string.
 988                 }
 989
 990                 // Try to match this string at pos-overlap..pos.
 991                 if(overlap>=LONG_SPAN) {
 992                     overlap=length8;
 993                     // While contained: No point matching fully inside the code point span.
 994                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
 995                 }
 996                 if(overlap>spanLength) {
 997                     overlap=spanLength;
 998                 }
 999                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1000                 for(;;) {
1001                     if(inc>rest) {
1002                         break;
1003                     }
1004                     // Try to match if the increment is not listed already.
1005                     // Match at code point boundaries. (The UTF-8 strings were converted
1006                     // from UTF-16 and are guaranteed to be well-formed.)
1007                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1008                         !offsets.containsOffset(inc) &&
1009                         matches8(s+pos-overlap, s8, length8)
1010
1011                     ) {
1012                         if(inc==rest) {
1013                             return length;  // Reached the end of the string.
1014                         }
1015                         offsets.addOffset(inc);
1016                     }
1017                     if(overlap==0) {
1018                         break;
1019                     }
1020                     --overlap;
1021                     ++inc;
1022                 }
1023                 s8+=length8;
1024             }
1025         } else /* USET_SPAN_SIMPLE */ {
1026             int32_t maxInc=0, maxOverlap=0;
1027             for(i=0; i<stringsLength; ++i) {
1028                 length8=utf8Lengths[i];
1029                 if(length8==0) {
1030                     continue;  // String not representable in UTF-8.
1031                 }
1032                 int32_t overlap=spanUTF8Lengths[i];
1033                 // For longest match, we do need to try to match even an all-contained string
1034                 // to find the match from the earliest start.
1035
1036                 // Try to match this string at pos-overlap..pos.
1037                 if(overlap>=LONG_SPAN) {
1038                     overlap=length8;
1039                     // Longest match: Need to match fully inside the code point span
1040                     // to find the match from the earliest start.
1041                 }
1042                 if(overlap>spanLength) {
1043                     overlap=spanLength;
1044                 }
1045                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1046                 for(;;) {
1047                     if(inc>rest || overlap<maxOverlap) {
1048                         break;
1049                     }
1050                     // Try to match if the string is longer or starts earlier.
1051                     // Match at code point boundaries. (The UTF-8 strings were converted
1052                     // from UTF-16 and are guaranteed to be well-formed.)
1053                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1054                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1055                         matches8(s+pos-overlap, s8, length8)
1056
1057                     ) {
1058                         maxInc=inc;  // Longest match from earliest start.
1059                         maxOverlap=overlap;
1060                         break;
1061                     }
1062                     --overlap;
1063                     ++inc;
1064                 }
1065                 s8+=length8;
1066             }
1067
1068             if(maxInc!=0 || maxOverlap!=0) {
1069                 // Longest-match algorithm, and there was a string match.
1070                 // Simply continue after it.
1071                 pos+=maxInc;
1072                 rest-=maxInc;
1073                 if(rest==0) {
1074                     return length;  // Reached the end of the string.
1075                 }
1076                 spanLength=0;  // Match strings from after a string match.
1077                 continue;
1078             }
1079         }
1080         // Finished trying to match all strings at pos.
1081
1082         if(spanLength!=0 || pos==0) {
1083             // The position is after an unlimited code point span (spanLength!=0),
1084             // not after a string match.
1085             // The only position where spanLength==0 after a span is pos==0.
1086             // Otherwise, an unlimited code point span is only tried again when no
1087             // strings match, and if such a non-initial span fails we stop.
1088             if(offsets.isEmpty()) {
1089                 return pos;  // No strings matched after a span.
1090             }
1091             // Match strings from after the next string match.
1092         } else {
1093             // The position is after a string match (or a single code point).
1094             if(offsets.isEmpty()) {
1095                 // No more strings matched after a previous string match.
1096                 // Try another code point span from after the last string match.
1097                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1098                 if( spanLength==rest || // Reached the end of the string, or
1099                     spanLength==0       // neither strings nor span progressed.
1100                 ) {
1101                     return pos+spanLength;
1102                 }
1103                 pos+=spanLength;
1104                 rest-=spanLength;
1105                 continue;  // spanLength>0: Match strings from after a span.
1106             } else {
1107                 // Try to match only one code point from after a string match if some
1108                 // string matched beyond it, so that we try all possible positions
1109                 // and don't overshoot.
1110                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1111                 if(spanLength>0) {
1112                     if(spanLength==rest) {
1113                         return length;  // Reached the end of the string.
1114                     }
1115                     // Match strings after this code point.
1116                     // There cannot be any increments below it because UnicodeSet strings
1117                     // contain multiple code points.
1118                     pos+=spanLength;
1119                     rest-=spanLength;
1120                     offsets.shift(spanLength);
1121                     spanLength=0;
1122                     continue;  // Match strings from after a single code point.
1123                 }
1124                 // Match strings from after the next string match.
1125             }
1126         }
1127         int32_t minOffset=offsets.popMinimum();
1128         pos+=minOffset;
1129         rest-=minOffset;
1130         spanLength=0;  // Match strings from after a string match.
1131     }
1132 }
1133
1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1135     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1136         return spanNotBackUTF8(s, length);
1137     }
1138     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1139     if(pos==0) {
1140         return 0;
1141     }
1142     int32_t spanLength=length-pos;
1143
1144     // Consider strings; they may overlap with the span.
1145     OffsetList offsets;
1146     if(spanCondition==USET_SPAN_CONTAINED) {
1147         // Use offset list to try all possibilities.
1148         offsets.setMaxLength(maxLength8);
1149     }
1150     int32_t i, stringsLength=strings.size();
1151     uint8_t *spanBackUTF8Lengths=spanLengths;
1152     if(all) {
1153         spanBackUTF8Lengths+=3*stringsLength;
1154     }
1155     for(;;) {
1156         const uint8_t *s8=utf8;
1157         int32_t length8;
1158         if(spanCondition==USET_SPAN_CONTAINED) {
1159             for(i=0; i<stringsLength; ++i) {
1160                 length8=utf8Lengths[i];
1161                 if(length8==0) {
1162                     continue;  // String not representable in UTF-8.
1163                 }
1164                 int32_t overlap=spanBackUTF8Lengths[i];
1165                 if(overlap==ALL_CP_CONTAINED) {
1166                     s8+=length8;
1167                     continue;  // Irrelevant string.
1168                 }
1169
1170                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1171                 if(overlap>=LONG_SPAN) {
1172                     overlap=length8;
1173                     // While contained: No point matching fully inside the code point span.
1174                     int32_t len1=0;
1175                     U8_FWD_1(s8, len1, overlap);
1176                     overlap-=len1;  // Length of the string minus the first code point.
1177                 }
1178                 if(overlap>spanLength) {
1179                     overlap=spanLength;
1180                 }
1181                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1182                 for(;;) {
1183                     if(dec>pos) {
1184                         break;
1185                     }
1186                     // Try to match if the decrement is not listed already.
1187                     // Match at code point boundaries. (The UTF-8 strings were converted
1188                     // from UTF-16 and are guaranteed to be well-formed.)
1189                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1190                         !offsets.containsOffset(dec) &&
1191                         matches8(s+pos-dec, s8, length8)
1192                     ) {
1193                         if(dec==pos) {
1194                             return 0;  // Reached the start of the string.
1195                         }
1196                         offsets.addOffset(dec);
1197                     }
1198                     if(overlap==0) {
1199                         break;
1200                     }
1201                     --overlap;
1202                     ++dec;
1203                 }
1204                 s8+=length8;
1205             }
1206         } else /* USET_SPAN_SIMPLE */ {
1207             int32_t maxDec=0, maxOverlap=0;
1208             for(i=0; i<stringsLength; ++i) {
1209                 length8=utf8Lengths[i];
1210                 if(length8==0) {
1211                     continue;  // String not representable in UTF-8.
1212                 }
1213                 int32_t overlap=spanBackUTF8Lengths[i];
1214                 // For longest match, we do need to try to match even an all-contained string
1215                 // to find the match from the latest end.
1216
1217                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1218                 if(overlap>=LONG_SPAN) {
1219                     overlap=length8;
1220                     // Longest match: Need to match fully inside the code point span
1221                     // to find the match from the latest end.
1222                 }
1223                 if(overlap>spanLength) {
1224                     overlap=spanLength;
1225                 }
1226                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1227                 for(;;) {
1228                     if(dec>pos || overlap<maxOverlap) {
1229                         break;
1230                     }
1231                     // Try to match if the string is longer or ends later.
1232                     // Match at code point boundaries. (The UTF-8 strings were converted
1233                     // from UTF-16 and are guaranteed to be well-formed.)
1234                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1235                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1236                         matches8(s+pos-dec, s8, length8)
1237                     ) {
1238                         maxDec=dec;  // Longest match from latest end.
1239                         maxOverlap=overlap;
1240                         break;
1241                     }
1242                     --overlap;
1243                     ++dec;
1244                 }
1245                 s8+=length8;
1246             }
1247
1248             if(maxDec!=0 || maxOverlap!=0) {
1249                 // Longest-match algorithm, and there was a string match.
1250                 // Simply continue before it.
1251                 pos-=maxDec;
1252                 if(pos==0) {
1253                     return 0;  // Reached the start of the string.
1254                 }
1255                 spanLength=0;  // Match strings from before a string match.
1256                 continue;
1257             }
1258         }
1259         // Finished trying to match all strings at pos.
1260
1261         if(spanLength!=0 || pos==length) {
1262             // The position is before an unlimited code point span (spanLength!=0),
1263             // not before a string match.
1264             // The only position where spanLength==0 before a span is pos==length.
1265             // Otherwise, an unlimited code point span is only tried again when no
1266             // strings match, and if such a non-initial span fails we stop.
1267             if(offsets.isEmpty()) {
1268                 return pos;  // No strings matched before a span.
1269             }
1270             // Match strings from before the next string match.
1271         } else {
1272             // The position is before a string match (or a single code point).
1273             if(offsets.isEmpty()) {
1274                 // No more strings matched before a previous string match.
1275                 // Try another code point span from before the last string match.
1276                 int32_t oldPos=pos;
1277                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1278                 spanLength=oldPos-pos;
1279                 if( pos==0 ||           // Reached the start of the string, or
1280                     spanLength==0       // neither strings nor span progressed.
1281                 ) {
1282                     return pos;
1283                 }
1284                 continue;  // spanLength>0: Match strings from before a span.
1285             } else {
1286                 // Try to match only one code point from before a string match if some
1287                 // string matched beyond it, so that we try all possible positions
1288                 // and don't overshoot.
1289                 spanLength=spanOneBackUTF8(spanSet, s, pos);
1290                 if(spanLength>0) {
1291                     if(spanLength==pos) {
1292                         return 0;  // Reached the start of the string.
1293                     }
1294                     // Match strings before this code point.
1295                     // There cannot be any decrements below it because UnicodeSet strings
1296                     // contain multiple code points.
1297                     pos-=spanLength;
1298                     offsets.shift(spanLength);
1299                     spanLength=0;
1300                     continue;  // Match strings from before a single code point.
1301                 }
1302                 // Match strings from before the next string match.
1303             }
1304         }
1305         pos-=offsets.popMinimum();
1306         spanLength=0;  // Match strings from before a string match.
1307     }
1308 }
1309
1310 /*
1311  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1312  *
1313  * Theoretical algorithm:
1314  * - Iterate through the string, and at each code point boundary:
1315  *   + If the code point there is in the set, then return with the current position.
1316  *   + If a set string matches at the current position, then return with the current position.
1317  *
1318  * Optimized implementation:
1319  *
1320  * (Same assumption as for span() above.)
1321  *
1322  * Create and cache a spanNotSet which contains all of the single code points
1323  * of the original set but none of its strings.
1324  * For each set string add its initial code point to the spanNotSet.
1325  * (Also add its final code point for spanNotBack().)
1326  *
1327  * - Loop:
1328  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1329  *   + If the current code point is in the original set, then
1330  *     return the current position.
1331  *   + If any set string matches at the current position, then
1332  *     return the current position.
1333  *   + If there is no match at the current position, neither for the code point there
1334  *     nor for any set string, then skip this code point and continue the loop.
1335  *     This happens for set-string-initial code points that were added to spanNotSet
1336  *     when there is not actually a match for such a set string.
1337  */
1338
1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1340     int32_t pos=0, rest=length;
1341     int32_t i, stringsLength=strings.size();
1342     do {
1343         // Span until we find a code point from the set,
1344         // or a code point that starts or ends some string.
1345         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1346         if(i==rest) {
1347             return length;  // Reached the end of the string.
1348         }
1349         pos+=i;
1350         rest-=i;
1351
1352         // Check whether the current code point is in the original set,
1353         // without the string starts and ends.
1354         int32_t cpLength=spanOne(spanSet, s+pos, rest);
1355         if(cpLength>0) {
1356             return pos;  // There is a set element at pos.
1357         }
1358
1359         // Try to match the strings at pos.
1360         for(i=0; i<stringsLength; ++i) {
1361             if(spanLengths[i]==ALL_CP_CONTAINED) {
1362                 continue;  // Irrelevant string.
1363             }
1364             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1365             const UChar *s16=string.getBuffer();
1366             int32_t length16=string.length();
1367             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1368                 return pos;  // There is a set element at pos.
1369             }
1370         }
1371
1372         // The span(while not contained) ended on a string start/end which is
1373         // not in the original set. Skip this code point and continue.
1374         // cpLength<0
1375         pos-=cpLength;
1376         rest+=cpLength;
1377     } while(rest!=0);
1378     return length;  // Reached the end of the string.
1379 }
1380
1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1382     int32_t pos=length;
1383     int32_t i, stringsLength=strings.size();
1384     do {
1385         // Span until we find a code point from the set,
1386         // or a code point that starts or ends some string.
1387         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1388         if(pos==0) {
1389             return 0;  // Reached the start of the string.
1390         }
1391
1392         // Check whether the current code point is in the original set,
1393         // without the string starts and ends.
1394         int32_t cpLength=spanOneBack(spanSet, s, pos);
1395         if(cpLength>0) {
1396             return pos;  // There is a set element at pos.
1397         }
1398
1399         // Try to match the strings at pos.
1400         for(i=0; i<stringsLength; ++i) {
1401             // Use spanLengths rather than a spanBackLengths pointer because
1402             // it is easier and we only need to know whether the string is irrelevant
1403             // which is the same in either array.
1404             if(spanLengths[i]==ALL_CP_CONTAINED) {
1405                 continue;  // Irrelevant string.
1406             }
1407             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1408             const UChar *s16=string.getBuffer();
1409             int32_t length16=string.length();
1410             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1411                 return pos;  // There is a set element at pos.
1412             }
1413         }
1414
1415         // The span(while not contained) ended on a string start/end which is
1416         // not in the original set. Skip this code point and continue.
1417         // cpLength<0
1418         pos+=cpLength;
1419     } while(pos!=0);
1420     return 0;  // Reached the start of the string.
1421 }
1422
1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1424     int32_t pos=0, rest=length;
1425     int32_t i, stringsLength=strings.size();
1426     uint8_t *spanUTF8Lengths=spanLengths;
1427     if(all) {
1428         spanUTF8Lengths+=2*stringsLength;
1429     }
1430     do {
1431         // Span until we find a code point from the set,
1432         // or a code point that starts or ends some string.
1433         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1434         if(i==rest) {
1435             return length;  // Reached the end of the string.
1436         }
1437         pos+=i;
1438         rest-=i;
1439
1440         // Check whether the current code point is in the original set,
1441         // without the string starts and ends.
1442         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1443         if(cpLength>0) {
1444             return pos;  // There is a set element at pos.
1445         }
1446
1447         // Try to match the strings at pos.
1448         const uint8_t *s8=utf8;
1449         int32_t length8;
1450         for(i=0; i<stringsLength; ++i) {
1451             length8=utf8Lengths[i];
1452             // ALL_CP_CONTAINED: Irrelevant string.
1453             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1454                 return pos;  // There is a set element at pos.
1455             }
1456             s8+=length8;
1457         }
1458
1459         // The span(while not contained) ended on a string start/end which is
1460         // not in the original set. Skip this code point and continue.
1461         // cpLength<0
1462         pos-=cpLength;
1463         rest+=cpLength;
1464     } while(rest!=0);
1465     return length;  // Reached the end of the string.
1466 }
1467
1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1469     int32_t pos=length;
1470     int32_t i, stringsLength=strings.size();
1471     uint8_t *spanBackUTF8Lengths=spanLengths;
1472     if(all) {
1473         spanBackUTF8Lengths+=3*stringsLength;
1474     }
1475     do {
1476         // Span until we find a code point from the set,
1477         // or a code point that starts or ends some string.
1478         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1479         if(pos==0) {
1480             return 0;  // Reached the start of the string.
1481         }
1482
1483         // Check whether the current code point is in the original set,
1484         // without the string starts and ends.
1485         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1486         if(cpLength>0) {
1487             return pos;  // There is a set element at pos.
1488         }
1489
1490         // Try to match the strings at pos.
1491         const uint8_t *s8=utf8;
1492         int32_t length8;
1493         for(i=0; i<stringsLength; ++i) {
1494             length8=utf8Lengths[i];
1495             // ALL_CP_CONTAINED: Irrelevant string.
1496             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1497                 return pos;  // There is a set element at pos.
1498             }
1499             s8+=length8;
1500         }
1501
1502         // The span(while not contained) ended on a string start/end which is
1503         // not in the original set. Skip this code point and continue.
1504         // cpLength<0
1505         pos+=cpLength;
1506     } while(pos!=0);
1507     return 0;  // Reached the start of the string.
1508 }
1509
1510 U_NAMESPACE_END