icuSources/i18n/utf8collationiterator.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 * Copyright (C) 2012-2014, International Business Machines
   6 * Corporation and others.  All Rights Reserved.
   7 *******************************************************************************
   8 * utf8collationiterator.cpp
   9 *
  10 * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
  11 * created by: Markus W. Scherer
  12 */
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_COLLATION
  17
  18 #include "unicode/utf8.h"
  19 #include "charstr.h"
  20 #include "cmemory.h"
  21 #include "collation.h"
  22 #include "collationdata.h"
  23 #include "collationfcd.h"
  24 #include "collationiterator.h"
  25 #include "normalizer2impl.h"
  26 #include "uassert.h"
  27 #include "utf8collationiterator.h"
  28
  29 U_NAMESPACE_BEGIN
  30
  31 UTF8CollationIterator::~UTF8CollationIterator() {}
  32
  33 void
  34 UTF8CollationIterator::resetToOffset(int32_t newOffset) {
  35     reset();
  36     pos = newOffset;
  37 }
  38
  39 int32_t
  40 UTF8CollationIterator::getOffset() const {
  41     return pos;
  42 }
  43
  44 uint32_t
  45 UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
  46     if(pos == length) {
  47         c = U_SENTINEL;
  48         return Collation::FALLBACK_CE32;
  49     }
  50     // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
  51     c = u8[pos++];
  52     if(c < 0xc0) {
  53         // ASCII 00..7F; trail bytes 80..BF map to error values.
  54         return trie->data32[c];
  55     }
  56     uint8_t t1, t2;
  57     if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
  58         // U+0080..U+07FF; 00..7F map to error values.
  59         uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
  60         c = ((c & 0x1f) << 6) | t1;
  61         ++pos;
  62         return ce32;
  63     } else if(c <= 0xef &&
  64               ((pos + 1) < length || length < 0) &&
  65               (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
  66               (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
  67     ) {
  68         // U+0800..U+FFFF; caller maps surrogates to error values.
  69         c = (UChar)((c << 12) | (t1 << 6) | t2);
  70         pos += 2;
  71         return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
  72     } else {
  73         // Function call for supplementary code points and error cases.
  74         // Illegal byte sequences yield U+FFFD.
  75         c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
  76         return data->getCE32(c);
  77     }
  78 }
  79
  80 UBool
  81 UTF8CollationIterator::foundNULTerminator() {
  82     if(length < 0) {
  83         length = --pos;
  84         return TRUE;
  85     } else {
  86         return FALSE;
  87     }
  88 }
  89
  90 UBool
  91 UTF8CollationIterator::forbidSurrogateCodePoints() const {
  92     return TRUE;
  93 }
  94
  95 UChar32
  96 UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
  97     if(pos == length) {
  98         return U_SENTINEL;
  99     }
 100     if(u8[pos] == 0 && length < 0) {
 101         length = pos;
 102         return U_SENTINEL;
 103     }
 104     UChar32 c;
 105     U8_NEXT_OR_FFFD(u8, pos, length, c);
 106     return c;
 107 }
 108
 109 UChar32
 110 UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
 111     if(pos == 0) {
 112         return U_SENTINEL;
 113     }
 114     UChar32 c;
 115     U8_PREV_OR_FFFD(u8, 0, pos, c);
 116     return c;
 117 }
 118
 119 void
 120 UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
 121     U8_FWD_N(u8, pos, length, num);
 122 }
 123
 124 void
 125 UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
 126     U8_BACK_N(u8, 0, pos, num);
 127 }
 128
 129 // FCDUTF8CollationIterator ------------------------------------------------ ***
 130
 131 FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
 132
 133 void
 134 FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
 135     reset();
 136     start = pos = newOffset;
 137     state = CHECK_FWD;
 138 }
 139
 140 int32_t
 141 FCDUTF8CollationIterator::getOffset() const {
 142     if(state != IN_NORMALIZED) {
 143         return pos;
 144     } else if(pos == 0) {
 145         return start;
 146     } else {
 147         return limit;
 148     }
 149 }
 150
 151 uint32_t
 152 FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
 153     for(;;) {
 154         if(state == CHECK_FWD) {
 155             // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
 156             if(pos == length) {
 157                 c = U_SENTINEL;
 158                 return Collation::FALLBACK_CE32;
 159             }
 160             c = u8[pos++];
 161             if(c < 0xc0) {
 162                 // ASCII 00..7F; trail bytes 80..BF map to error values.
 163                 return trie->data32[c];
 164             }
 165             uint8_t t1, t2;
 166             if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
 167                 // U+0080..U+07FF; 00..7F map to error values.
 168                 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
 169                 c = ((c & 0x1f) << 6) | t1;
 170                 ++pos;
 171                 if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
 172                     pos -= 2;
 173                 } else {
 174                     return ce32;
 175                 }
 176             } else if(c <= 0xef &&
 177                       ((pos + 1) < length || length < 0) &&
 178                       (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
 179                       (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
 180             ) {
 181                 // U+0800..U+FFFF; caller maps surrogates to error values.
 182                 c = (UChar)((c << 12) | (t1 << 6) | t2);
 183                 pos += 2;
 184                 if(CollationFCD::hasTccc(c) &&
 185                         (CollationFCD::maybeTibetanCompositeVowel(c) ||
 186                             (pos != length && nextHasLccc()))) {
 187                     pos -= 3;
 188                 } else {
 189                     break;  // return CE32(BMP)
 190                 }
 191             } else {
 192                 // Function call for supplementary code points and error cases.
 193                 // Illegal byte sequences yield U+FFFD.
 194                 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
 195                 if(c == 0xfffd) {
 196                     return Collation::FFFD_CE32;
 197                 } else {
 198                     U_ASSERT(c > 0xffff);
 199                     if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
 200                         pos -= 4;
 201                     } else {
 202                         return data->getCE32FromSupplementary(c);
 203                     }
 204                 }
 205             }
 206             if(!nextSegment(errorCode)) {
 207                 c = U_SENTINEL;
 208                 return Collation::FALLBACK_CE32;
 209             }
 210             continue;
 211         } else if(state == IN_FCD_SEGMENT && pos != limit) {
 212             return UTF8CollationIterator::handleNextCE32(c, errorCode);
 213         } else if(state == IN_NORMALIZED && pos != normalized.length()) {
 214             c = normalized[pos++];
 215             break;
 216         } else {
 217             switchToForward();
 218         }
 219     }
 220     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
 221 }
 222
 223 UBool
 224 FCDUTF8CollationIterator::nextHasLccc() const {
 225     U_ASSERT(state == CHECK_FWD && pos != length);
 226     // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
 227     // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
 228     UChar32 c = u8[pos];
 229     if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; }
 230     int32_t i = pos;
 231     U8_NEXT_OR_FFFD(u8, i, length, c);
 232     if(c > 0xffff) { c = U16_LEAD(c); }
 233     return CollationFCD::hasLccc(c);
 234 }
 235
 236 UBool
 237 FCDUTF8CollationIterator::previousHasTccc() const {
 238     U_ASSERT(state == CHECK_BWD && pos != 0);
 239     UChar32 c = u8[pos - 1];
 240     if(c < 0x80) { return FALSE; }
 241     int32_t i = pos;
 242     U8_PREV_OR_FFFD(u8, 0, i, c);
 243     if(c > 0xffff) { c = U16_LEAD(c); }
 244     return CollationFCD::hasTccc(c);
 245 }
 246
 247 UChar
 248 FCDUTF8CollationIterator::handleGetTrailSurrogate() {
 249     if(state != IN_NORMALIZED) { return 0; }
 250     U_ASSERT(pos < normalized.length());
 251     UChar trail;
 252     if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
 253     return trail;
 254 }
 255
 256 UBool
 257 FCDUTF8CollationIterator::foundNULTerminator() {
 258     if(state == CHECK_FWD && length < 0) {
 259         length = --pos;
 260         return TRUE;
 261     } else {
 262         return FALSE;
 263     }
 264 }
 265
 266 UChar32
 267 FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
 268     UChar32 c;
 269     for(;;) {
 270         if(state == CHECK_FWD) {
 271             if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
 272                 return U_SENTINEL;
 273             }
 274             if(c < 0x80) {
 275                 ++pos;
 276                 return c;
 277             }
 278             U8_NEXT_OR_FFFD(u8, pos, length, c);
 279             if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
 280                     (CollationFCD::maybeTibetanCompositeVowel(c) ||
 281                         (pos != length && nextHasLccc()))) {
 282                 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
 283                 // and we can use U8_LENGTH() rather than a previous-position variable.
 284                 pos -= U8_LENGTH(c);
 285                 if(!nextSegment(errorCode)) {
 286                     return U_SENTINEL;
 287                 }
 288                 continue;
 289             }
 290             return c;
 291         } else if(state == IN_FCD_SEGMENT && pos != limit) {
 292             U8_NEXT_OR_FFFD(u8, pos, length, c);
 293             return c;
 294         } else if(state == IN_NORMALIZED && pos != normalized.length()) {
 295             c = normalized.char32At(pos);
 296             pos += U16_LENGTH(c);
 297             return c;
 298         } else {
 299             switchToForward();
 300         }
 301     }
 302 }
 303
 304 UChar32
 305 FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
 306     UChar32 c;
 307     for(;;) {
 308         if(state == CHECK_BWD) {
 309             if(pos == 0) {
 310                 return U_SENTINEL;
 311             }
 312             if((c = u8[pos - 1]) < 0x80) {
 313                 --pos;
 314                 return c;
 315             }
 316             U8_PREV_OR_FFFD(u8, 0, pos, c);
 317             if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
 318                     (CollationFCD::maybeTibetanCompositeVowel(c) ||
 319                         (pos != 0 && previousHasTccc()))) {
 320                 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
 321                 // and we can use U8_LENGTH() rather than a previous-position variable.
 322                 pos += U8_LENGTH(c);
 323                 if(!previousSegment(errorCode)) {
 324                     return U_SENTINEL;
 325                 }
 326                 continue;
 327             }
 328             return c;
 329         } else if(state == IN_FCD_SEGMENT && pos != start) {
 330             U8_PREV_OR_FFFD(u8, 0, pos, c);
 331             return c;
 332         } else if(state >= IN_NORMALIZED && pos != 0) {
 333             c = normalized.char32At(pos - 1);
 334             pos -= U16_LENGTH(c);
 335             return c;
 336         } else {
 337             switchToBackward();
 338         }
 339     }
 340 }
 341
 342 void
 343 FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
 344     // Specify the class to avoid a virtual-function indirection.
 345     // In Java, we would declare this class final.
 346     while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {
 347         --num;
 348     }
 349 }
 350
 351 void
 352 FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
 353     // Specify the class to avoid a virtual-function indirection.
 354     // In Java, we would declare this class final.
 355     while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {
 356         --num;
 357     }
 358 }
 359
 360 void
 361 FCDUTF8CollationIterator::switchToForward() {
 362     U_ASSERT(state == CHECK_BWD ||
 363              (state == IN_FCD_SEGMENT && pos == limit) ||
 364              (state == IN_NORMALIZED && pos == normalized.length()));
 365     if(state == CHECK_BWD) {
 366         // Turn around from backward checking.
 367         start = pos;
 368         if(pos == limit) {
 369             state = CHECK_FWD;  // Check forward.
 370         } else {  // pos < limit
 371             state = IN_FCD_SEGMENT;  // Stay in FCD segment.
 372         }
 373     } else {
 374         // Reached the end of the FCD segment.
 375         if(state == IN_FCD_SEGMENT) {
 376             // The input text segment is FCD, extend it forward.
 377         } else {
 378             // The input text segment needed to be normalized.
 379             // Switch to checking forward from it.
 380             start = pos = limit;
 381         }
 382         state = CHECK_FWD;
 383     }
 384 }
 385
 386 UBool
 387 FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
 388     if(U_FAILURE(errorCode)) { return FALSE; }
 389     U_ASSERT(state == CHECK_FWD && pos != length);
 390     // The input text [start..pos[ passes the FCD check.
 391     int32_t segmentStart = pos;
 392     // Collect the characters being checked, in case they need to be normalized.
 393     UnicodeString s;
 394     uint8_t prevCC = 0;
 395     for(;;) {
 396         // Fetch the next character and its fcd16 value.
 397         int32_t cpStart = pos;
 398         UChar32 c;
 399         U8_NEXT_OR_FFFD(u8, pos, length, c);
 400         uint16_t fcd16 = nfcImpl.getFCD16(c);
 401         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
 402         if(leadCC == 0 && cpStart != segmentStart) {
 403             // FCD boundary before this character.
 404             pos = cpStart;
 405             break;
 406         }
 407         s.append(c);
 408         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
 409             // Fails FCD check. Find the next FCD boundary and normalize.
 410             while(pos != length) {
 411                 cpStart = pos;
 412                 U8_NEXT_OR_FFFD(u8, pos, length, c);
 413                 if(nfcImpl.getFCD16(c) <= 0xff) {
 414                     pos = cpStart;
 415                     break;
 416                 }
 417                 s.append(c);
 418             }
 419             if(!normalize(s, errorCode)) { return FALSE; }
 420             start = segmentStart;
 421             limit = pos;
 422             state = IN_NORMALIZED;
 423             pos = 0;
 424             return TRUE;
 425         }
 426         prevCC = (uint8_t)fcd16;
 427         if(pos == length || prevCC == 0) {
 428             // FCD boundary after the last character.
 429             break;
 430         }
 431     }
 432     limit = pos;
 433     pos = segmentStart;
 434     U_ASSERT(pos != limit);
 435     state = IN_FCD_SEGMENT;
 436     return TRUE;
 437 }
 438
 439 void
 440 FCDUTF8CollationIterator::switchToBackward() {
 441     U_ASSERT(state == CHECK_FWD ||
 442              (state == IN_FCD_SEGMENT && pos == start) ||
 443              (state >= IN_NORMALIZED && pos == 0));
 444     if(state == CHECK_FWD) {
 445         // Turn around from forward checking.
 446         limit = pos;
 447         if(pos == start) {
 448             state = CHECK_BWD;  // Check backward.
 449         } else {  // pos > start
 450             state = IN_FCD_SEGMENT;  // Stay in FCD segment.
 451         }
 452     } else {
 453         // Reached the start of the FCD segment.
 454         if(state == IN_FCD_SEGMENT) {
 455             // The input text segment is FCD, extend it backward.
 456         } else {
 457             // The input text segment needed to be normalized.
 458             // Switch to checking backward from it.
 459             limit = pos = start;
 460         }
 461         state = CHECK_BWD;
 462     }
 463 }
 464
 465 UBool
 466 FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
 467     if(U_FAILURE(errorCode)) { return FALSE; }
 468     U_ASSERT(state == CHECK_BWD && pos != 0);
 469     // The input text [pos..limit[ passes the FCD check.
 470     int32_t segmentLimit = pos;
 471     // Collect the characters being checked, in case they need to be normalized.
 472     UnicodeString s;
 473     uint8_t nextCC = 0;
 474     for(;;) {
 475         // Fetch the previous character and its fcd16 value.
 476         int32_t cpLimit = pos;
 477         UChar32 c;
 478         U8_PREV_OR_FFFD(u8, 0, pos, c);
 479         uint16_t fcd16 = nfcImpl.getFCD16(c);
 480         uint8_t trailCC = (uint8_t)fcd16;
 481         if(trailCC == 0 && cpLimit != segmentLimit) {
 482             // FCD boundary after this character.
 483             pos = cpLimit;
 484             break;
 485         }
 486         s.append(c);
 487         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
 488                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
 489             // Fails FCD check. Find the previous FCD boundary and normalize.
 490             while(fcd16 > 0xff && pos != 0) {
 491                 cpLimit = pos;
 492                 U8_PREV_OR_FFFD(u8, 0, pos, c);
 493                 fcd16 = nfcImpl.getFCD16(c);
 494                 if(fcd16 == 0) {
 495                     pos = cpLimit;
 496                     break;
 497                 }
 498                 s.append(c);
 499             }
 500             s.reverse();
 501             if(!normalize(s, errorCode)) { return FALSE; }
 502             limit = segmentLimit;
 503             start = pos;
 504             state = IN_NORMALIZED;
 505             pos = normalized.length();
 506             return TRUE;
 507         }
 508         nextCC = (uint8_t)(fcd16 >> 8);
 509         if(pos == 0 || nextCC == 0) {
 510             // FCD boundary before the following character.
 511             break;
 512         }
 513     }
 514     start = pos;
 515     pos = segmentLimit;
 516     U_ASSERT(pos != start);
 517     state = IN_FCD_SEGMENT;
 518     return TRUE;
 519 }
 520
 521 UBool
 522 FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
 523     // NFD without argument checking.
 524     U_ASSERT(U_SUCCESS(errorCode));
 525     nfcImpl.decompose(s, normalized, errorCode);
 526     return U_SUCCESS(errorCode);
 527 }
 528
 529 U_NAMESPACE_END
 530
 531 #endif  // !UCONFIG_NO_COLLATION