1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2012-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * utf8collationiterator.h
10 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
11 * created by: Markus W. Scherer
14 #ifndef __UTF8COLLATIONITERATOR_H__
15 #define __UTF8COLLATIONITERATOR_H__
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_COLLATION
22 #include "collation.h"
23 #include "collationdata.h"
24 #include "collationiterator.h"
25 #include "normalizer2impl.h"
/**
 * UTF-8 collation element and character iterator.
 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
 * Unnormalized text is handled by a subclass.
 */
34 class U_I18N_API UTF8CollationIterator
: public CollationIterator
{
36 UTF8CollationIterator(const CollationData
*d
, UBool numeric
,
37 const uint8_t *s
, int32_t p
, int32_t len
)
38 : CollationIterator(d
, numeric
),
39 u8(s
), pos(p
), length(len
) {}
41 virtual ~UTF8CollationIterator();
43 virtual void resetToOffset(int32_t newOffset
);
45 virtual int32_t getOffset() const;
47 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
49 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
53 * For byte sequences that are illegal in UTF-8, an error value may be returned
54 * together with a bogus code point. The caller will ignore that code point.
56 * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
59 * Valid lead surrogates are returned from inside a normalized text segment,
60 * where handleGetTrailSurrogate() will return the matching trail surrogate.
62 virtual uint32_t handleNextCE32(UChar32
&c
, UErrorCode
&errorCode
);
64 virtual UBool
foundNULTerminator();
66 virtual UBool
forbidSurrogateCodePoints() const;
68 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
70 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
/** Set from the constructor's len argument; <0 for NUL-terminated strings. */
int32_t length;
/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 */
80 class U_I18N_API FCDUTF8CollationIterator
: public UTF8CollationIterator
{
82 FCDUTF8CollationIterator(const CollationData
*data
, UBool numeric
,
83 const uint8_t *s
, int32_t p
, int32_t len
)
84 : UTF8CollationIterator(data
, numeric
, s
, p
, len
),
85 state(CHECK_FWD
), start(p
),
86 nfcImpl(data
->nfcImpl
) {}
88 virtual ~FCDUTF8CollationIterator();
90 virtual void resetToOffset(int32_t newOffset
);
92 virtual int32_t getOffset() const;
94 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
96 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
99 virtual uint32_t handleNextCE32(UChar32
&c
, UErrorCode
&errorCode
);
101 virtual UChar
handleGetTrailSurrogate();
103 virtual UBool
foundNULTerminator();
105 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
107 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
110 UBool
nextHasLccc() const;
111 UBool
previousHasTccc() const;
/**
 * Switches to forward checking if possible.
 */
void switchToForward();
119 * Extends the FCD text segment forward or normalizes around pos.
120 * @return TRUE if success
122 UBool
nextSegment(UErrorCode
&errorCode
);
/**
 * Switches to backward checking.
 */
void switchToBackward();
130 * Extends the FCD text segment backward or normalizes around pos.
131 * @return TRUE if success
133 UBool
previousSegment(UErrorCode
&errorCode
);
135 UBool
normalize(const UnicodeString
&s
, UErrorCode
&errorCode
);
/*
 * The input text [start..pos[ passes the FCD check.
 * Moving forward checks incrementally.
 * limit is undefined.
 */
/*
 * The input text [pos..limit[ passes the FCD check.
 * Moving backward checks incrementally.
 * start is undefined.
 */
/*
 * The input text [start..limit[ passes the FCD check.
 * pos tracks the current text index.
 */
/*
 * The input text [start..limit[ failed the FCD check and was normalized.
 * pos tracks the current index in the normalized string.
 */
167 const Normalizer2Impl
&nfcImpl
;
168 UnicodeString normalized
;
173 #endif // !UCONFIG_NO_COLLATION
174 #endif // __UTF8COLLATIONITERATOR_H__