git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/utf8collationiterator.cpp

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
57a6839d A	3	/*
	4	*******************************************************************************
	5	* Copyright (C) 2012-2014, International Business Machines
	6	* Corporation and others. All Rights Reserved.
	7	*******************************************************************************
	8	* utf8collationiterator.cpp
	9	*
	10	* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
	11	* created by: Markus W. Scherer
	12	*/
	13
	14	#include "unicode/utypes.h"
	15
	16	#if !UCONFIG_NO_COLLATION
	17
	18	#include "unicode/utf8.h"
	19	#include "charstr.h"
	20	#include "cmemory.h"
	21	#include "collation.h"
	22	#include "collationdata.h"
	23	#include "collationfcd.h"
	24	#include "collationiterator.h"
	25	#include "normalizer2impl.h"
	26	#include "uassert.h"
	27	#include "utf8collationiterator.h"
	28
	29	U_NAMESPACE_BEGIN
	30
	31	UTF8CollationIterator::~UTF8CollationIterator() {}
	32
	33	void
	34	UTF8CollationIterator::resetToOffset(int32_t newOffset) {
	35	reset();
	36	pos = newOffset;
	37	}
	38
	39	int32_t
	40	UTF8CollationIterator::getOffset() const {
	41	return pos;
	42	}
	43
	44	uint32_t
	45	UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /errorCode/) {
	46	if(pos == length) {
	47	c = U_SENTINEL;
	48	return Collation::FALLBACK_CE32;
	49	}
	50	// Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
	51	c = u8[pos++];
0f5d89e8 A	52	if(U8_IS_SINGLE(c)) {
0f5d89e8 A	53	// ASCII 00..7F
57a6839d A	54	return trie->data32[c];
	55	}
	56	uint8_t t1, t2;
0f5d89e8 A	57	if(0xe0 <= c && c < 0xf0 &&
	58	((pos + 1) < length \|\| length < 0) &&
	59	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
	60	(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
	61	// U+0800..U+FFFF except surrogates
	62	c = (((c & 0xf) << 12) \| ((t1 & 0x3f) << 6) \| t2);
	63	pos += 2;
	64	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
	65	} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
	66	// U+0080..U+07FF
57a6839d A	67	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
	68	c = ((c & 0x1f) << 6) \| t1;
	69	++pos;
	70	return ce32;
57a6839d A	71	} else {
	72	// Function call for supplementary code points and error cases.
	73	// Illegal byte sequences yield U+FFFD.
	74	c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
	75	return data->getCE32(c);
	76	}
	77	}
	78
	79	UBool
	80	UTF8CollationIterator::foundNULTerminator() {
	81	if(length < 0) {
	82	length = --pos;
	83	return TRUE;
	84	} else {
	85	return FALSE;
	86	}
	87	}
	88
	89	UBool
	90	UTF8CollationIterator::forbidSurrogateCodePoints() const {
	91	return TRUE;
	92	}
	93
	94	UChar32
	95	UTF8CollationIterator::nextCodePoint(UErrorCode & /errorCode/) {
	96	if(pos == length) {
	97	return U_SENTINEL;
	98	}
	99	if(u8[pos] == 0 && length < 0) {
	100	length = pos;
	101	return U_SENTINEL;
	102	}
	103	UChar32 c;
	104	U8_NEXT_OR_FFFD(u8, pos, length, c);
	105	return c;
	106	}
	107
	108	UChar32
	109	UTF8CollationIterator::previousCodePoint(UErrorCode & /errorCode/) {
	110	if(pos == 0) {
	111	return U_SENTINEL;
	112	}
	113	UChar32 c;
	114	U8_PREV_OR_FFFD(u8, 0, pos, c);
	115	return c;
	116	}
	117
	118	void
	119	UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /errorCode/) {
	120	U8_FWD_N(u8, pos, length, num);
	121	}
	122
	123	void
	124	UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /errorCode/) {
	125	U8_BACK_N(u8, 0, pos, num);
	126	}
	127
	128	// FCDUTF8CollationIterator ------------------------------------------------ ***
	129
	130	FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
	131
	132	void
	133	FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
	134	reset();
135	start = pos = newOffset;
136	state = CHECK_FWD;
137	}
138
139	int32_t
140	FCDUTF8CollationIterator::getOffset() const {
141	if(state != IN_NORMALIZED) {
142	return pos;
143	} else if(pos == 0) {
144	return start;
145	} else {
146	return limit;
147	}
148	}
149
150	uint32_t
151	FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
152	for(;;) {
153	if(state == CHECK_FWD) {
154	// Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
155	if(pos == length) {
156	c = U_SENTINEL;
157	return Collation::FALLBACK_CE32;
158	}
159	c = u8[pos++];
0f5d89e8 A	160	if(U8_IS_SINGLE(c)) {
0f5d89e8 A	161	// ASCII 00..7F
57a6839d A	162	return trie->data32[c];
	163	}
	164	uint8_t t1, t2;
0f5d89e8 A	165	if(0xe0 <= c && c < 0xf0 &&
	166	((pos + 1) < length \|\| length < 0) &&
	167	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
	168	(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
	169	// U+0800..U+FFFF except surrogates
	170	c = (((c & 0xf) << 12) \| ((t1 & 0x3f) << 6) \| t2);
57a6839d A	171	pos += 2;
	172	if(CollationFCD::hasTccc(c) &&
	173	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
	174	(pos != length && nextHasLccc()))) {
	175	pos -= 3;
	176	} else {
	177	break; // return CE32(BMP)
	178	}
0f5d89e8 A	179	} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
	180	// U+0080..U+07FF
	181	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
	182	c = ((c & 0x1f) << 6) \| t1;
	183	++pos;
	184	if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
	185	pos -= 2;
	186	} else {
	187	return ce32;
	188	}
57a6839d A	189	} else {
	190	// Function call for supplementary code points and error cases.
	191	// Illegal byte sequences yield U+FFFD.
	192	c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
	193	if(c == 0xfffd) {
	194	return Collation::FFFD_CE32;
	195	} else {
	196	U_ASSERT(c > 0xffff);
	197	if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
	198	pos -= 4;
	199	} else {
	200	return data->getCE32FromSupplementary(c);
	201	}
	202	}
	203	}
	204	if(!nextSegment(errorCode)) {
	205	c = U_SENTINEL;
	206	return Collation::FALLBACK_CE32;
	207	}
	208	continue;
	209	} else if(state == IN_FCD_SEGMENT && pos != limit) {
	210	return UTF8CollationIterator::handleNextCE32(c, errorCode);
	211	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
	212	c = normalized[pos++];
	213	break;
	214	} else {
	215	switchToForward();
	216	}
	217	}
	218	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
	219	}
	220
	221	UBool
	222	FCDUTF8CollationIterator::nextHasLccc() const {
	223	U_ASSERT(state == CHECK_FWD && pos != length);
	224	// The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
	225	// CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
	226	UChar32 c = u8[pos];
	227	if(c < 0xcc \|\| (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; }
	228	int32_t i = pos;
	229	U8_NEXT_OR_FFFD(u8, i, length, c);
	230	if(c > 0xffff) { c = U16_LEAD(c); }
	231	return CollationFCD::hasLccc(c);
	232	}
	233
	234	UBool
	235	FCDUTF8CollationIterator::previousHasTccc() const {
	236	U_ASSERT(state == CHECK_BWD && pos != 0);
	237	UChar32 c = u8[pos - 1];
0f5d89e8	238	if(U8_IS_SINGLE(c)) { return FALSE; }
57a6839d A	239	int32_t i = pos;
	240	U8_PREV_OR_FFFD(u8, 0, i, c);
	241	if(c > 0xffff) { c = U16_LEAD(c); }
	242	return CollationFCD::hasTccc(c);
	243	}
	244
	245	UChar
	246	FCDUTF8CollationIterator::handleGetTrailSurrogate() {
	247	if(state != IN_NORMALIZED) { return 0; }
	248	U_ASSERT(pos < normalized.length());
	249	UChar trail;
	250	if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
	251	return trail;
	252	}
	253
	254	UBool
	255	FCDUTF8CollationIterator::foundNULTerminator() {
	256	if(state == CHECK_FWD && length < 0) {
	257	length = --pos;
	258	return TRUE;
	259	} else {
	260	return FALSE;
	261	}
	262	}
	263
	264	UChar32
	265	FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
	266	UChar32 c;
	267	for(;;) {
	268	if(state == CHECK_FWD) {
	269	if(pos == length \|\| ((c = u8[pos]) == 0 && length < 0)) {
	270	return U_SENTINEL;
	271	}
0f5d89e8	272	if(U8_IS_SINGLE(c)) {
57a6839d A	273	++pos;
	274	return c;
	275	}
	276	U8_NEXT_OR_FFFD(u8, pos, length, c);
	277	if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
	278	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
	279	(pos != length && nextHasLccc()))) {
	280	// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
	281	// and we can use U8_LENGTH() rather than a previous-position variable.
	282	pos -= U8_LENGTH(c);
	283	if(!nextSegment(errorCode)) {
	284	return U_SENTINEL;
	285	}
	286	continue;
	287	}
	288	return c;
	289	} else if(state == IN_FCD_SEGMENT && pos != limit) {
	290	U8_NEXT_OR_FFFD(u8, pos, length, c);
	291	return c;
	292	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
	293	c = normalized.char32At(pos);
	294	pos += U16_LENGTH(c);
	295	return c;
	296	} else {
	297	switchToForward();
	298	}
	299	}
	300	}
	301
	302	UChar32
	303	FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
	304	UChar32 c;
	305	for(;;) {
	306	if(state == CHECK_BWD) {
	307	if(pos == 0) {
	308	return U_SENTINEL;
	309	}
0f5d89e8	310	if(U8_IS_SINGLE(c = u8[pos - 1])) {
57a6839d A	311	--pos;
	312	return c;
	313	}
	314	U8_PREV_OR_FFFD(u8, 0, pos, c);
	315	if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
	316	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
	317	(pos != 0 && previousHasTccc()))) {
	318	// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
	319	// and we can use U8_LENGTH() rather than a previous-position variable.
	320	pos += U8_LENGTH(c);
	321	if(!previousSegment(errorCode)) {
	322	return U_SENTINEL;
	323	}
	324	continue;
	325	}
	326	return c;
	327	} else if(state == IN_FCD_SEGMENT && pos != start) {
	328	U8_PREV_OR_FFFD(u8, 0, pos, c);
	329	return c;
	330	} else if(state >= IN_NORMALIZED && pos != 0) {
	331	c = normalized.char32At(pos - 1);
	332	pos -= U16_LENGTH(c);
	333	return c;
	334	} else {
	335	switchToBackward();
	336	}
	337	}
	338	}
	339
	340	void
	341	FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
	342	// Specify the class to avoid a virtual-function indirection.
	343	// In Java, we would declare this class final.
	344	while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {
	345	--num;
	346	}
	347	}
	348
	349	void
	350	FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
	351	// Specify the class to avoid a virtual-function indirection.
	352	// In Java, we would declare this class final.
	353	while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {
	354	--num;
	355	}
	356	}
	357
	358	void
	359	FCDUTF8CollationIterator::switchToForward() {
	360	U_ASSERT(state == CHECK_BWD \|\|
	361	(state == IN_FCD_SEGMENT && pos == limit) \|\|
	362	(state == IN_NORMALIZED && pos == normalized.length()));
	363	if(state == CHECK_BWD) {
	364	// Turn around from backward checking.
	365	start = pos;
	366	if(pos == limit) {
	367	state = CHECK_FWD; // Check forward.
	368	} else { // pos < limit
	369	state = IN_FCD_SEGMENT; // Stay in FCD segment.
	370	}
	371	} else {
	372	// Reached the end of the FCD segment.
	373	if(state == IN_FCD_SEGMENT) {
	374	// The input text segment is FCD, extend it forward.
375	} else {
376	// The input text segment needed to be normalized.
377	// Switch to checking forward from it.
378	start = pos = limit;
379	}
380	state = CHECK_FWD;
381	}
382	}
383
384	UBool
385	FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
386	if(U_FAILURE(errorCode)) { return FALSE; }
387	U_ASSERT(state == CHECK_FWD && pos != length);
388	// The input text [start..pos[ passes the FCD check.
389	int32_t segmentStart = pos;
390	// Collect the characters being checked, in case they need to be normalized.
391	UnicodeString s;
392	uint8_t prevCC = 0;
393	for(;;) {
394	// Fetch the next character and its fcd16 value.
395	int32_t cpStart = pos;
396	UChar32 c;
397	U8_NEXT_OR_FFFD(u8, pos, length, c);
398	uint16_t fcd16 = nfcImpl.getFCD16(c);
399	uint8_t leadCC = (uint8_t)(fcd16 >> 8);
400	if(leadCC == 0 && cpStart != segmentStart) {
401	// FCD boundary before this character.
402	pos = cpStart;
403	break;
404	}
405	s.append(c);
406	if(leadCC != 0 && (prevCC > leadCC \|\| CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
407	// Fails FCD check. Find the next FCD boundary and normalize.
408	while(pos != length) {
409	cpStart = pos;
410	U8_NEXT_OR_FFFD(u8, pos, length, c);
411	if(nfcImpl.getFCD16(c) <= 0xff) {
412	pos = cpStart;
413	break;
414	}
415	s.append(c);
416	}
417	if(!normalize(s, errorCode)) { return FALSE; }
418	start = segmentStart;
419	limit = pos;
420	state = IN_NORMALIZED;
421	pos = 0;
422	return TRUE;
423	}
424	prevCC = (uint8_t)fcd16;
425	if(pos == length \|\| prevCC == 0) {
426	// FCD boundary after the last character.
427	break;
428	}
429	}
430	limit = pos;
431	pos = segmentStart;
432	U_ASSERT(pos != limit);
433	state = IN_FCD_SEGMENT;
434	return TRUE;
435	}
436
437	void
438	FCDUTF8CollationIterator::switchToBackward() {
439	U_ASSERT(state == CHECK_FWD \|\|
440	(state == IN_FCD_SEGMENT && pos == start) \|\|
441	(state >= IN_NORMALIZED && pos == 0));
442	if(state == CHECK_FWD) {
443	// Turn around from forward checking.
444	limit = pos;
445	if(pos == start) {
446	state = CHECK_BWD; // Check backward.
447	} else { // pos > start
448	state = IN_FCD_SEGMENT; // Stay in FCD segment.
449	}
450	} else {
451	// Reached the start of the FCD segment.
452	if(state == IN_FCD_SEGMENT) {
453	// The input text segment is FCD, extend it backward.
454	} else {
455	// The input text segment needed to be normalized.
456	// Switch to checking backward from it.
457	limit = pos = start;
458	}
459	state = CHECK_BWD;
460	}
461	}
462
463	UBool
464	FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
465	if(U_FAILURE(errorCode)) { return FALSE; }
466	U_ASSERT(state == CHECK_BWD && pos != 0);
467	// The input text [pos..limit[ passes the FCD check.
468	int32_t segmentLimit = pos;
469	// Collect the characters being checked, in case they need to be normalized.
470	UnicodeString s;
471	uint8_t nextCC = 0;
472	for(;;) {
473	// Fetch the previous character and its fcd16 value.
474	int32_t cpLimit = pos;
475	UChar32 c;
476	U8_PREV_OR_FFFD(u8, 0, pos, c);
477	uint16_t fcd16 = nfcImpl.getFCD16(c);
478	uint8_t trailCC = (uint8_t)fcd16;
479	if(trailCC == 0 && cpLimit != segmentLimit) {
480	// FCD boundary after this character.
481	pos = cpLimit;
482	break;
483	}
484	s.append(c);
485	if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) \|\|
486	CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
487	// Fails FCD check. Find the previous FCD boundary and normalize.
488	while(fcd16 > 0xff && pos != 0) {
489	cpLimit = pos;
490	U8_PREV_OR_FFFD(u8, 0, pos, c);
491	fcd16 = nfcImpl.getFCD16(c);
492	if(fcd16 == 0) {
493	pos = cpLimit;
494	break;
495	}
496	s.append(c);
497	}
498	s.reverse();
499	if(!normalize(s, errorCode)) { return FALSE; }
500	limit = segmentLimit;
501	start = pos;
502	state = IN_NORMALIZED;
503	pos = normalized.length();
504	return TRUE;
505	}
506	nextCC = (uint8_t)(fcd16 >> 8);
507	if(pos == 0 \|\| nextCC == 0) {
508	// FCD boundary before the following character.
509	break;
510	}
511	}
512	start = pos;
513	pos = segmentLimit;
514	U_ASSERT(pos != start);
515	state = IN_FCD_SEGMENT;
516	return TRUE;
517	}
518
519	UBool
520	FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
521	// NFD without argument checking.
522	U_ASSERT(U_SUCCESS(errorCode));
523	nfcImpl.decompose(s, normalized, errorCode);
524	return U_SUCCESS(errorCode);
525	}
526
527	U_NAMESPACE_END
528
529	#endif // !UCONFIG_NO_COLLATION