1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2012-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * utf8collationiterator.cpp
10 * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
11 * created by: Markus W. Scherer
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/utf8.h"
21 #include "collation.h"
22 #include "collationdata.h"
23 #include "collationfcd.h"
24 #include "collationiterator.h"
25 #include "normalizer2impl.h"
27 #include "utf8collationiterator.h"
// Destructor. The body is empty: nothing is released here.
31 UTF8CollationIterator::~UTF8CollationIterator() {}
// Takes the offset that the iterator should be reset to.
// NOTE(review): only the signature is visible in this excerpt; confirm against the
// full source which members are reset besides the position.
34 UTF8CollationIterator::resetToOffset(int32_t newOffset
) {
// Const accessor for the iterator's offset.
// NOTE(review): the body is not visible in this excerpt; presumably it returns pos — confirm.
40 UTF8CollationIterator::getOffset() const {
// Reads the next code point from the UTF-8 text u8 at pos, stores it in c, and
// returns a 32-bit collation element (CE32) for it.
// The UErrorCode parameter is unnamed and is not used by the visible code.
// Visible return paths: Collation::FALLBACK_CE32, a direct trie->data32[c] lookup,
// a 3-byte fast path, and a generic path through utf8_nextCharSafeBody().
45 UTF8CollationIterator::handleNextCE32(UChar32
&c
, UErrorCode
& /*errorCode*/) {
// NOTE(review): the condition guarding this early return is not visible in this
// excerpt; presumably it is the end-of-input check — confirm.
48 return Collation::FALLBACK_CE32
;
50 // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
// Direct table lookup indexed by c itself.
// NOTE(review): presumably the single-byte case; its guard is not visible here.
54 return trie
->data32
[c
];
// 3-byte sequence: lead byte in E0..EF, both trail bytes inside the text, and both
// trail bytes valid. The bounds check is skipped when length is negative
// (NOTE(review): looks like length < 0 means NUL-terminated input — confirm).
57 if(0xe0 <= c
&& c
< 0xf0 &&
58 ((pos
+ 1) < length
|| length
< 0) &&
// t1 is assigned the raw first trail byte here; it is masked with 0x3f below.
59 U8_IS_VALID_LEAD3_AND_T1(c
, t1
= u8
[pos
]) &&
// t2 is the second trail byte minus 0x80; it is valid only if the result is <= 0x3f.
60 (t2
= (u8
[pos
+ 1] - 0x80)) <= 0x3f) {
61 // U+0800..U+FFFF except surrogates
// Assemble the code point from the lead byte's low 4 bits and the two 6-bit trail values.
62 c
= (((c
& 0xf) << 12) | ((t1
& 0x3f) << 6) | t2
);
64 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie
, c
);
// 2-byte sequence: lead byte in C2..DF, not at the end of the text, and a valid
// trail byte. Here t1 already has 0x80 subtracted.
65 } else if(c
< 0xe0 && c
>= 0xc2 && pos
!= length
&& (t1
= (u8
[pos
] - 0x80)) <= 0x3f) {
// The CE32 is fetched through the trie's UTF-8 2-byte index using the lead byte
// (still in c) and t1, before c is overwritten with the code point.
67 uint32_t ce32
= trie
->data32
[trie
->index
[(UTRIE2_UTF8_2B_INDEX_2_OFFSET
- 0xc0) + c
] + t1
];
// Assemble the code point from the lead byte's low 5 bits and t1.
68 c
= ((c
& 0x1f) << 6) | t1
;
72 // Function call for supplementary code points and error cases.
73 // Illegal byte sequences yield U+FFFD.
// pos is passed by address, so the helper can advance it.
74 c
= utf8_nextCharSafeBody(u8
, &pos
, length
, c
, -3);
75 return data
->getCE32(c
);
// NOTE(review): only the signature is visible in this excerpt. Judging by the name,
// this is invoked when a NUL terminator is encountered in the text — confirm the
// behavior and return value against the full source.
80 UTF8CollationIterator::foundNULTerminator() {
// Const query with no parameters.
// NOTE(review): the body and return value are not visible in this excerpt; the name
// suggests it reports whether surrogate code points are disallowed — confirm.
90 UTF8CollationIterator::forbidSurrogateCodePoints() const {
// Decodes the next code point from u8 at pos with U8_NEXT_OR_FFFD.
// The UErrorCode parameter is unnamed and is not used by the visible code.
95 UTF8CollationIterator::nextCodePoint(UErrorCode
& /*errorCode*/) {
// Special case: a zero byte at pos while length is negative.
// NOTE(review): presumably the NUL terminator of text with unknown length; what is
// done in this branch is not visible in this excerpt.
99 if(u8
[pos
] == 0 && length
< 0) {
// Decode one code point into c and advance pos past it; ill-formed sequences
// decode as U+FFFD (that is what the _OR_FFFD macro variant provides).
104 U8_NEXT_OR_FFFD(u8
, pos
, length
, c
);
// Decodes the code point that ends just before pos with U8_PREV_OR_FFFD.
// The UErrorCode parameter is unnamed and is not used by the visible code.
109 UTF8CollationIterator::previousCodePoint(UErrorCode
& /*errorCode*/) {
// Move pos back over one code point (never below index 0) and store it in c;
// ill-formed sequences decode as U+FFFD.
114 U8_PREV_OR_FFFD(u8
, 0, pos
, c
);
// Advances pos forward by num code points in u8, bounded by length.
// The UErrorCode parameter is unnamed and is not used by the visible code.
119 UTF8CollationIterator::forwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
120 U8_FWD_N(u8
, pos
, length
, num
);
// Moves pos backward by num code points in u8, never below index 0.
// The UErrorCode parameter is unnamed and is not used by the visible code.
124 UTF8CollationIterator::backwardNumCodePoints(int32_t num
, UErrorCode
& /*errorCode*/) {
125 U8_BACK_N(u8
, 0, pos
, num
);
128 // FCDUTF8CollationIterator ------------------------------------------------ ***
// Destructor. The body is empty: nothing is released here.
130 FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
// Resets the iterator to newOffset: both the segment start and the current
// position are set to it.
// NOTE(review): other statements of this function (e.g. any state reset) are not
// visible in this excerpt.
133 FCDUTF8CollationIterator::resetToOffset(int32_t newOffset
) {
135 start
= pos
= newOffset
;
// Const accessor for the iterator's offset. The result depends on whether the
// iterator is currently reading from the normalized buffer (state IN_NORMALIZED)
// and, if so, on whether pos is 0.
// NOTE(review): the values returned by each branch are not visible in this excerpt.
140 FCDUTF8CollationIterator::getOffset() const {
// Not in the normalized buffer.
141 if(state
!= IN_NORMALIZED
) {
// In the normalized buffer, at its very beginning.
143 } else if(pos
== 0) {
// Reads the next code point into c and returns a CE32 for it, dispatching on state:
//  - CHECK_FWD: decode UTF-8 inline and apply a fast FCD check; characters that may
//    break FCD order are handed to nextSegment().
//  - IN_FCD_SEGMENT (pos != limit): delegate to UTF8CollationIterator::handleNextCE32().
//  - IN_NORMALIZED (pos != normalized.length()): read the next UTF-16 unit from normalized.
// errorCode is forwarded to nextSegment() and to the base-class implementation.
// NOTE(review): several lines of this function (loop header, pos adjustments, the
// remaining else branch) are not visible in this excerpt.
151 FCDUTF8CollationIterator::handleNextCE32(UChar32
&c
, UErrorCode
&errorCode
) {
153 if(state
== CHECK_FWD
) {
154 // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
// NOTE(review): the guard for this early return is not visible; presumably end of input.
157 return Collation::FALLBACK_CE32
;
// Single-byte code unit: direct table lookup, no FCD check is visible on this path.
160 if(U8_IS_SINGLE(c
)) {
162 return trie
->data32
[c
];
// 3-byte sequence: lead byte in E0..EF, both trail bytes inside the text (the bounds
// check is skipped when length is negative), and both trail bytes valid.
165 if(0xe0 <= c
&& c
< 0xf0 &&
166 ((pos
+ 1) < length
|| length
< 0) &&
167 U8_IS_VALID_LEAD3_AND_T1(c
, t1
= u8
[pos
]) &&
168 (t2
= (u8
[pos
+ 1] - 0x80)) <= 0x3f) {
169 // U+0800..U+FFFF except surrogates
// Assemble the code point from the lead byte's low 4 bits and the two trail bytes.
170 c
= (((c
& 0xf) << 12) | ((t1
& 0x3f) << 6) | t2
);
// FCD fast check: c needs segment processing only if hasTccc(c) holds and either c
// may be a Tibetan composite vowel or the following character has lccc.
172 if(CollationFCD::hasTccc(c
) &&
173 (CollationFCD::maybeTibetanCompositeVowel(c
) ||
174 (pos
!= length
&& nextHasLccc()))) {
177 break; // return CE32(BMP)
// 2-byte sequence: lead byte in C2..DF, not at the end of the text, valid trail byte
// (t1 already has 0x80 subtracted).
179 } else if(c
< 0xe0 && c
>= 0xc2 && pos
!= length
&& (t1
= (u8
[pos
] - 0x80)) <= 0x3f) {
// Fetch the CE32 through the trie's UTF-8 2-byte index while c still holds the lead byte.
181 uint32_t ce32
= trie
->data32
[trie
->index
[(UTRIE2_UTF8_2B_INDEX_2_OFFSET
- 0xc0) + c
] + t1
];
// Assemble the code point from the lead byte's low 5 bits and t1.
182 c
= ((c
& 0x1f) << 6) | t1
;
// FCD fast check for the 2-byte case; no Tibetan composite vowel test is made here.
184 if(CollationFCD::hasTccc(c
) && pos
!= length
&& nextHasLccc()) {
190 // Function call for supplementary code points and error cases.
191 // Illegal byte sequences yield U+FFFD.
192 c
= utf8_nextCharSafeBody(u8
, &pos
, length
, c
, -3);
// NOTE(review): the guard is not visible; presumably taken when c decoded to U+FFFD.
194 return Collation::FFFD_CE32
;
// Only supplementary code points are expected to reach this point.
196 U_ASSERT(c
> 0xffff);
// The FCD data is queried with the lead surrogate of the supplementary code point.
197 if(CollationFCD::hasTccc(U16_LEAD(c
)) && pos
!= length
&& nextHasLccc()) {
200 return data
->getCE32FromSupplementary(c
);
// Slow path: find (and if necessary normalize) the next segment; on failure
// return the fallback CE32.
204 if(!nextSegment(errorCode
)) {
206 return Collation::FALLBACK_CE32
;
// Inside a segment already known to be FCD: the plain UTF-8 reader suffices.
209 } else if(state
== IN_FCD_SEGMENT
&& pos
!= limit
) {
210 return UTF8CollationIterator::handleNextCE32(c
, errorCode
);
// Inside the normalized buffer: pos indexes UTF-16 code units of normalized.
211 } else if(state
== IN_NORMALIZED
&& pos
!= normalized
.length()) {
212 c
= normalized
[pos
++];
218 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie
, c
);
// Reports whether the code point that starts at pos satisfies CollationFCD::hasLccc().
// Const: the visible decode uses a separate index i rather than pos.
// Precondition (asserted): state == CHECK_FWD and pos != length.
// NOTE(review): the declarations/initializers of c and i are not visible in this
// excerpt; presumably c starts as the byte at pos and i as a copy of pos.
222 FCDUTF8CollationIterator::nextHasLccc() const {
223 U_ASSERT(state
== CHECK_FWD
&& pos
!= length
);
224 // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
225 // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
// Quick rejection based on the byte value alone, before any decoding.
227 if(c
< 0xcc || (0xe4 <= c
&& c
<= 0xed && c
!= 0xea)) { return FALSE
; }
// Decode the full code point (U+FFFD for ill-formed input).
229 U8_NEXT_OR_FFFD(u8
, i
, length
, c
);
// For supplementary code points the lookup is done with the lead surrogate.
230 if(c
> 0xffff) { c
= U16_LEAD(c
); }
231 return CollationFCD::hasLccc(c
);
// Reports whether the code point that ends just before pos satisfies
// CollationFCD::hasTccc(). Const: the visible decode uses a separate index i.
// Precondition (asserted): state == CHECK_BWD and pos != 0.
// NOTE(review): the declaration/initializer of i is not visible in this excerpt;
// presumably it starts as a copy of pos.
235 FCDUTF8CollationIterator::previousHasTccc() const {
236 U_ASSERT(state
== CHECK_BWD
&& pos
!= 0);
// Peek at the byte immediately before pos.
237 UChar32 c
= u8
[pos
- 1];
// A single-byte code unit is rejected immediately, without decoding.
238 if(U8_IS_SINGLE(c
)) { return FALSE
; }
// Decode the full preceding code point (U+FFFD for ill-formed input).
240 U8_PREV_OR_FFFD(u8
, 0, i
, c
);
// For supplementary code points the lookup is done with the lead surrogate.
241 if(c
> 0xffff) { c
= U16_LEAD(c
); }
242 return CollationFCD::hasTccc(c
);
// Fetches a trail surrogate from the normalized buffer.
// Returns 0 when the iterator is not reading from normalized (UTF-8 input has no
// surrogate code units to hand out). Otherwise reads the UTF-16 unit at pos and
// advances pos only if that unit is a trail surrogate.
// NOTE(review): the declaration of trail and the final return are not visible here.
246 FCDUTF8CollationIterator::handleGetTrailSurrogate() {
247 if(state
!= IN_NORMALIZED
) { return 0; }
248 U_ASSERT(pos
< normalized
.length());
250 if(U16_IS_TRAIL(trail
= normalized
[pos
])) { ++pos
; }
// Acts only while checking forward (CHECK_FWD) on text whose length is negative.
// NOTE(review): what is done inside the branch and what is returned are not visible
// in this excerpt; presumably the now-known length is recorded — confirm.
255 FCDUTF8CollationIterator::foundNULTerminator() {
256 if(state
== CHECK_FWD
&& length
< 0) {
// Returns the next code point, dispatching on state:
//  - CHECK_FWD: decode from u8 and run the fast FCD check; characters that may break
//    FCD order are handed to nextSegment().
//  - IN_FCD_SEGMENT (pos != limit): decode directly from u8.
//  - IN_NORMALIZED (pos != normalized.length()): read from the normalized buffer.
// errorCode is forwarded to nextSegment().
// NOTE(review): the loop header, return statements and remaining branch are not
// visible in this excerpt.
265 FCDUTF8CollationIterator::nextCodePoint(UErrorCode
&errorCode
) {
268 if(state
== CHECK_FWD
) {
// End of input: either pos reached length, or a zero byte was read while length
// is negative. c is assigned the byte at pos as a side effect of the test.
269 if(pos
== length
|| ((c
= u8
[pos
]) == 0 && length
< 0)) {
// Single-byte code unit: handled without the FCD check below.
272 if(U8_IS_SINGLE(c
)) {
276 U8_NEXT_OR_FFFD(u8
, pos
, length
, c
);
// FCD fast check; supplementary code points are looked up via their lead surrogate.
277 if(CollationFCD::hasTccc(c
<= 0xffff ? c
: U16_LEAD(c
)) &&
278 (CollationFCD::maybeTibetanCompositeVowel(c
) ||
279 (pos
!= length
&& nextHasLccc()))) {
280 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
281 // and we can use U8_LENGTH() rather than a previous-position variable.
283 if(!nextSegment(errorCode
)) {
// Inside a segment already known to be FCD: plain UTF-8 decoding.
289 } else if(state
== IN_FCD_SEGMENT
&& pos
!= limit
) {
290 U8_NEXT_OR_FFFD(u8
, pos
, length
, c
);
// Inside the normalized buffer: pos counts UTF-16 code units, so it advances by
// one or two units depending on c.
292 } else if(state
== IN_NORMALIZED
&& pos
!= normalized
.length()) {
293 c
= normalized
.char32At(pos
);
294 pos
+= U16_LENGTH(c
);
// Returns the previous code point, dispatching on state:
//  - CHECK_BWD: decode backward from u8 and run the fast FCD check; characters that
//    may break FCD order are handed to previousSegment().
//  - IN_FCD_SEGMENT (pos != start): decode backward directly from u8.
//  - state >= IN_NORMALIZED (pos != 0): read backward from the normalized buffer.
// errorCode is forwarded to previousSegment().
// NOTE(review): the loop header, return statements and remaining branch are not
// visible in this excerpt.
303 FCDUTF8CollationIterator::previousCodePoint(UErrorCode
&errorCode
) {
306 if(state
== CHECK_BWD
) {
// Single-byte code unit just before pos: handled without the FCD check below.
310 if(U8_IS_SINGLE(c
= u8
[pos
- 1])) {
314 U8_PREV_OR_FFFD(u8
, 0, pos
, c
);
// Mirror image of the forward check: lccc of c against tccc of the character before it.
// Supplementary code points are looked up via their lead surrogate.
315 if(CollationFCD::hasLccc(c
<= 0xffff ? c
: U16_LEAD(c
)) &&
316 (CollationFCD::maybeTibetanCompositeVowel(c
) ||
317 (pos
!= 0 && previousHasTccc()))) {
318 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
319 // and we can use U8_LENGTH() rather than a previous-position variable.
321 if(!previousSegment(errorCode
)) {
// Inside a segment already known to be FCD: plain backward UTF-8 decoding.
327 } else if(state
== IN_FCD_SEGMENT
&& pos
!= start
) {
328 U8_PREV_OR_FFFD(u8
, 0, pos
, c
);
// Inside the normalized buffer: pos counts UTF-16 code units, so it retreats by
// one or two units depending on c.
// NOTE(review): this uses >= where the forward functions use == IN_NORMALIZED;
// the enum definition is not visible here, so the difference cannot be assessed.
330 } else if(state
>= IN_NORMALIZED
&& pos
!= 0) {
331 c
= normalized
.char32At(pos
- 1);
332 pos
-= U16_LENGTH(c
);
// Moves forward by up to num code points by calling nextCodePoint() repeatedly.
// The loop stops early when nextCodePoint() returns a negative value.
// NOTE(review): the loop body (presumably the decrement of num) is not visible here.
341 FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
) {
342 // Specify the class to avoid a virtual-function indirection.
343 // In Java, we would declare this class final.
344 while(num
> 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode
) >= 0) {
// Moves backward by up to num code points by calling previousCodePoint() repeatedly.
// The loop stops early when previousCodePoint() returns a negative value.
// NOTE(review): the loop body (presumably the decrement of num) is not visible here.
350 FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
) {
351 // Specify the class to avoid a virtual-function indirection.
352 // In Java, we would declare this class final.
353 while(num
> 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode
) >= 0) {
// Switches the iterator to forward iteration.
// Allowed entry states (asserted): checking backward, at the end of an FCD segment
// (pos == limit), or at the end of the normalized buffer.
// NOTE(review): the assignments to start/limit/pos and the inner conditions are not
// visible in this excerpt; only the state transitions are.
359 FCDUTF8CollationIterator::switchToForward() {
360 U_ASSERT(state
== CHECK_BWD
||
361 (state
== IN_FCD_SEGMENT
&& pos
== limit
) ||
362 (state
== IN_NORMALIZED
&& pos
== normalized
.length()));
363 if(state
== CHECK_BWD
) {
364 // Turn around from backward checking.
367 state
= CHECK_FWD
; // Check forward.
368 } else { // pos < limit
369 state
= IN_FCD_SEGMENT
; // Stay in FCD segment.
372 // Reached the end of the FCD segment.
373 if(state
== IN_FCD_SEGMENT
) {
374 // The input text segment is FCD, extend it forward.
376 // The input text segment needed to be normalized.
377 // Switch to checking forward from it.
// Scans forward from pos to delimit the next segment of text.
// Each code point's fcd16 value is fetched from nfcImpl: its high byte is used as the
// lead combining class and its low byte as the trail combining class.
// Outcomes visible here:
//  - the scanned text passes the FCD check: state becomes IN_FCD_SEGMENT;
//  - the check fails: the scan continues to the next FCD boundary, the collected
//    string s is normalized via normalize(), and state becomes IN_NORMALIZED.
// Returns FALSE if errorCode already holds a failure or if normalize() fails.
// Precondition (asserted): state == CHECK_FWD and pos != length.
// NOTE(review): the declarations of s and prevCC, the loop header, the appends to s,
// the limit/pos updates and the success returns are not visible in this excerpt.
385 FCDUTF8CollationIterator::nextSegment(UErrorCode
&errorCode
) {
386 if(U_FAILURE(errorCode
)) { return FALSE
; }
387 U_ASSERT(state
== CHECK_FWD
&& pos
!= length
);
388 // The input text [start..pos[ passes the FCD check.
389 int32_t segmentStart
= pos
;
390 // Collect the characters being checked, in case they need to be normalized.
394 // Fetch the next character and its fcd16 value.
// Remember where this code point starts so that a boundary can be placed before it.
395 int32_t cpStart
= pos
;
397 U8_NEXT_OR_FFFD(u8
, pos
, length
, c
);
398 uint16_t fcd16
= nfcImpl
.getFCD16(c
);
399 uint8_t leadCC
= (uint8_t)(fcd16
>> 8);
// A lead class of 0 ends the segment, except for the very first character scanned.
400 if(leadCC
== 0 && cpStart
!= segmentStart
) {
401 // FCD boundary before this character.
// Out of order: the previous trail class exceeds this lead class, or this is the
// fcd16 value of a Tibetan composite vowel.
406 if(leadCC
!= 0 && (prevCC
> leadCC
|| CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16
))) {
407 // Fails FCD check. Find the next FCD boundary and normalize.
408 while(pos
!= length
) {
410 U8_NEXT_OR_FFFD(u8
, pos
, length
, c
);
// fcd16 <= 0xff means the high byte (lead combining class) is zero.
411 if(nfcImpl
.getFCD16(c
) <= 0xff) {
417 if(!normalize(s
, errorCode
)) { return FALSE
; }
418 start
= segmentStart
;
420 state
= IN_NORMALIZED
;
// Keep this character's trail class for comparison with the next lead class.
424 prevCC
= (uint8_t)fcd16
;
425 if(pos
== length
|| prevCC
== 0) {
426 // FCD boundary after the last character.
432 U_ASSERT(pos
!= limit
);
433 state
= IN_FCD_SEGMENT
;
// Switches the iterator to backward iteration.
// Allowed entry states (asserted): checking forward, at the start of an FCD segment
// (pos == start), or at the beginning of the normalized buffer (pos == 0).
// NOTE(review): the assignments to start/limit/pos and the inner conditions are not
// visible in this excerpt; only the state transitions are.
438 FCDUTF8CollationIterator::switchToBackward() {
439 U_ASSERT(state
== CHECK_FWD
||
440 (state
== IN_FCD_SEGMENT
&& pos
== start
) ||
441 (state
>= IN_NORMALIZED
&& pos
== 0));
442 if(state
== CHECK_FWD
) {
443 // Turn around from forward checking.
446 state
= CHECK_BWD
; // Check backward.
447 } else { // pos > start
448 state
= IN_FCD_SEGMENT
; // Stay in FCD segment.
451 // Reached the start of the FCD segment.
452 if(state
== IN_FCD_SEGMENT
) {
453 // The input text segment is FCD, extend it backward.
455 // The input text segment needed to be normalized.
456 // Switch to checking backward from it.
// Scans backward from pos to delimit the previous segment of text.
// Each code point's fcd16 value is fetched from nfcImpl: its low byte is used as the
// trail combining class and its high byte as the lead combining class.
// Outcomes visible here:
//  - the scanned text passes the FCD check: state becomes IN_FCD_SEGMENT;
//  - the check fails: the scan continues back to the previous FCD boundary, the
//    collected string s is normalized via normalize(), state becomes IN_NORMALIZED,
//    and pos is placed at the end of the normalized buffer.
// Returns FALSE if errorCode already holds a failure or if normalize() fails.
// Precondition (asserted): state == CHECK_BWD and pos != 0.
// NOTE(review): the declarations of s and nextCC, the loop header, the appends to s,
// the start/pos updates and the success returns are not visible in this excerpt.
464 FCDUTF8CollationIterator::previousSegment(UErrorCode
&errorCode
) {
465 if(U_FAILURE(errorCode
)) { return FALSE
; }
466 U_ASSERT(state
== CHECK_BWD
&& pos
!= 0);
467 // The input text [pos..limit[ passes the FCD check.
468 int32_t segmentLimit
= pos
;
469 // Collect the characters being checked, in case they need to be normalized.
473 // Fetch the previous character and its fcd16 value.
// Remember where this code point ends so that a boundary can be placed after it.
474 int32_t cpLimit
= pos
;
476 U8_PREV_OR_FFFD(u8
, 0, pos
, c
);
477 uint16_t fcd16
= nfcImpl
.getFCD16(c
);
478 uint8_t trailCC
= (uint8_t)fcd16
;
// A trail class of 0 ends the segment, except for the very first character scanned.
479 if(trailCC
== 0 && cpLimit
!= segmentLimit
) {
480 // FCD boundary after this character.
// Out of order: this trail class exceeds the (nonzero) lead class of the following
// character, or this is the fcd16 value of a Tibetan composite vowel.
485 if(trailCC
!= 0 && ((nextCC
!= 0 && trailCC
> nextCC
) ||
486 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16
))) {
487 // Fails FCD check. Find the previous FCD boundary and normalize.
// fcd16 > 0xff means the high byte (lead combining class) is nonzero, so keep going back.
488 while(fcd16
> 0xff && pos
!= 0) {
490 U8_PREV_OR_FFFD(u8
, 0, pos
, c
);
491 fcd16
= nfcImpl
.getFCD16(c
);
499 if(!normalize(s
, errorCode
)) { return FALSE
; }
500 limit
= segmentLimit
;
502 state
= IN_NORMALIZED
;
// Backward iteration continues from the end of the normalized buffer.
503 pos
= normalized
.length();
// Keep this character's lead class for comparison with the preceding trail class.
506 nextCC
= (uint8_t)(fcd16
>> 8);
507 if(pos
== 0 || nextCC
== 0) {
508 // FCD boundary before the following character.
514 U_ASSERT(pos
!= start
);
515 state
= IN_FCD_SEGMENT
;
// Decomposes s into the member buffer normalized using nfcImpl.
// s: the text to decompose. errorCode: must indicate success on entry (asserted);
// receives any failure reported by decompose().
// Returns whether errorCode still indicates success after the decomposition.
520 FCDUTF8CollationIterator::normalize(const UnicodeString
&s
, UErrorCode
&errorCode
) {
521 // NFD without argument checking.
522 U_ASSERT(U_SUCCESS(errorCode
));
523 nfcImpl
.decompose(s
, normalized
, errorCode
);
524 return U_SUCCESS(errorCode
);
529 #endif // !UCONFIG_NO_COLLATION