/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* utf8collationiterator.h
*
* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
* created by: Markus W. Scherer
*/
12 #ifndef __UTF8COLLATIONITERATOR_H__
13 #define __UTF8COLLATIONITERATOR_H__
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_COLLATION
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "normalizer2impl.h"
27 * UTF-8 collation element and character iterator.
28 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
29 * Unnormalized text is handled by a subclass.
31 class U_I18N_API UTF8CollationIterator
: public CollationIterator
{
33 UTF8CollationIterator(const CollationData
*d
, UBool numeric
,
34 const uint8_t *s
, int32_t p
, int32_t len
)
35 : CollationIterator(d
, numeric
),
36 u8(s
), pos(p
), length(len
) {}
38 virtual ~UTF8CollationIterator();
40 virtual void resetToOffset(int32_t newOffset
);
42 virtual int32_t getOffset() const;
44 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
46 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
50 * For byte sequences that are illegal in UTF-8, an error value may be returned
51 * together with a bogus code point. The caller will ignore that code point.
53 * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
54 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
56 * Valid lead surrogates are returned from inside a normalized text segment,
57 * where handleGetTrailSurrogate() will return the matching trail surrogate.
59 virtual uint32_t handleNextCE32(UChar32
&c
, UErrorCode
&errorCode
);
61 virtual UBool
foundNULTerminator();
63 virtual UBool
forbidSurrogateCodePoints() const;
65 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
67 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
71 int32_t length
; // <0 for NUL-terminated strings
75 * Incrementally checks the input text for FCD and normalizes where necessary.
77 class U_I18N_API FCDUTF8CollationIterator
: public UTF8CollationIterator
{
79 FCDUTF8CollationIterator(const CollationData
*data
, UBool numeric
,
80 const uint8_t *s
, int32_t p
, int32_t len
)
81 : UTF8CollationIterator(data
, numeric
, s
, p
, len
),
82 state(CHECK_FWD
), start(p
),
83 nfcImpl(data
->nfcImpl
) {}
85 virtual ~FCDUTF8CollationIterator();
87 virtual void resetToOffset(int32_t newOffset
);
89 virtual int32_t getOffset() const;
91 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
93 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
96 virtual uint32_t handleNextCE32(UChar32
&c
, UErrorCode
&errorCode
);
98 virtual UChar
handleGetTrailSurrogate();
100 virtual UBool
foundNULTerminator();
102 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
104 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
107 UBool
nextHasLccc() const;
108 UBool
previousHasTccc() const;
111 * Switches to forward checking if possible.
113 void switchToForward();
116 * Extends the FCD text segment forward or normalizes around pos.
117 * @return TRUE if success
119 UBool
nextSegment(UErrorCode
&errorCode
);
122 * Switches to backward checking.
124 void switchToBackward();
127 * Extends the FCD text segment backward or normalizes around pos.
128 * @return TRUE if success
130 UBool
previousSegment(UErrorCode
&errorCode
);
132 UBool
normalize(const UnicodeString
&s
, UErrorCode
&errorCode
);
136 * The input text [start..pos[ passes the FCD check.
137 * Moving forward checks incrementally.
138 * limit is undefined.
142 * The input text [pos..limit[ passes the FCD check.
143 * Moving backward checks incrementally.
144 * start is undefined.
148 * The input text [start..limit[ passes the FCD check.
149 * pos tracks the current text index.
153 * The input text [start..limit[ failed the FCD check and was normalized.
154 * pos tracks the current index in the normalized string.
164 const Normalizer2Impl
&nfcImpl
;
165 UnicodeString normalized
;
170 #endif // !UCONFIG_NO_COLLATION
171 #endif // __UTF8COLLATIONITERATOR_H__