2 *******************************************************************************
3 * Copyright (C) 2012-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * utf8collationiterator.h
8 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
9 * created by: Markus W. Scherer
12 #ifndef __UTF8COLLATIONITERATOR_H__
13 #define __UTF8COLLATIONITERATOR_H__
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_COLLATION
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationiterator.h"
23 #include "normalizer2impl.h"
/**
 * UTF-8 collation element and character iterator.
 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
 * Unnormalized text is handled by a subclass.
 */
// UTF-8 collation iterator class; publicly derives from CollationIterator.
// NOTE(review): this excerpt is incomplete -- the access specifiers, the
// declarations of the members `u8` and `pos` (both initialized by the
// constructor below) and the closing brace of the class are not visible here.
32 class U_I18N_API UTF8CollationIterator
: public CollationIterator
{
// Constructor.
//   d, numeric - forwarded unchanged to the CollationIterator base constructor.
//   s          - pointer to the input bytes; stored in the member u8.
//   p          - stored in the member pos.
//   len        - stored in the member length; a value <0 means the text is
//                NUL-terminated (see the comment on `length`).
// The constructor body is empty: it only initializes the base and the members.
34 UTF8CollationIterator(const CollationData
*d
, UBool numeric
,
35 const uint8_t *s
, int32_t p
, int32_t len
)
36 : CollationIterator(d
, numeric
),
37 u8(s
), pos(p
), length(len
) {}
// Virtual destructor; only declared here, the definition is not in this excerpt.
39 virtual ~UTF8CollationIterator();
// Declaration only.
// NOTE(review): presumably moves the iteration position to newOffset -- confirm
// against the implementation.
41 virtual void resetToOffset(int32_t newOffset
);
// Declaration only; const member returning an int32_t offset.
43 virtual int32_t getOffset() const;
// Declaration only; returns a UChar32 and can report failure via errorCode.
45 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
// Declaration only; backward counterpart of nextCodePoint() by name and signature.
47 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
// NOTE(review): the six prose lines below are the text of a doc comment whose
// delimiters were lost; by position they appear to document handleNextCE32().
51 * For byte sequences that are illegal in UTF-8, an error value may be returned
52 * together with a bogus code point. The caller will ignore that code point.
54 * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
55 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
57 * Valid lead surrogates are returned from inside a normalized text segment,
58 * where handleGetTrailSurrogate() will return the matching trail surrogate.
// Declaration only; returns a uint32_t and takes the code point c by non-const
// reference, so the callee can write to it.
60 virtual uint32_t handleNextCE32(UChar32
&c
, UErrorCode
&errorCode
);
// Declaration only; returns a UBool.
// NOTE(review): presumably relates to NUL-terminated input (length <0) -- confirm.
62 virtual UBool
foundNULTerminator();
// Declaration only; the prose above states that this returns TRUE for this class.
64 virtual UBool
forbidSurrogateCodePoints() const;
// Declaration only; takes a code point count and an error code.
66 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
// Declaration only; backward counterpart of forwardNumCodePoints() by name and signature.
68 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
// Text length as passed to the constructor (parameter len).
72 int32_t length
; // <0 for NUL-terminated strings
/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 */
// UTF-8 collation iterator subclass; publicly derives from UTF8CollationIterator.
// NOTE(review): this excerpt is incomplete -- the access specifiers, the
// declaration of the enum that provides CHECK_FWD, the declarations of the
// members `state` and `start` (both initialized by the constructor below) and
// the closing brace of the class are not visible here.
78 class U_I18N_API FCDUTF8CollationIterator
: public UTF8CollationIterator
{
// Constructor.
//   data, numeric, s, p, len - forwarded unchanged to the UTF8CollationIterator
//                              constructor.
//   state   is initialized to CHECK_FWD.
//   start   is initialized to p.
//   nfcImpl (a const Normalizer2Impl reference member) is bound to data->nfcImpl;
//           data is dereferenced here, so it must not be null.
// The constructor body is empty.
80 FCDUTF8CollationIterator(const CollationData
*data
, UBool numeric
,
81 const uint8_t *s
, int32_t p
, int32_t len
)
82 : UTF8CollationIterator(data
, numeric
, s
, p
, len
),
83 state(CHECK_FWD
), start(p
),
84 nfcImpl(data
->nfcImpl
) {}
// Virtual destructor; only declared here, the definition is not in this excerpt.
86 virtual ~FCDUTF8CollationIterator();
// The following virtuals repeat, with identical signatures, the declarations
// found in UTF8CollationIterator: resetToOffset, getOffset, nextCodePoint,
// previousCodePoint, handleNextCE32, foundNULTerminator, forwardNumCodePoints
// and backwardNumCodePoints. All are declarations only.
88 virtual void resetToOffset(int32_t newOffset
);
90 virtual int32_t getOffset() const;
92 virtual UChar32
nextCodePoint(UErrorCode
&errorCode
);
94 virtual UChar32
previousCodePoint(UErrorCode
&errorCode
);
97 virtual uint32_t handleNextCE32(UChar32
&c
, UErrorCode
&errorCode
);
// Declaration only; returns a UChar. Not declared in UTF8CollationIterator
// within this excerpt.
// NOTE(review): presumably inherited from CollationIterator -- confirm.
99 virtual UChar
handleGetTrailSurrogate();
101 virtual UBool
foundNULTerminator();
103 virtual void forwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
105 virtual void backwardNumCodePoints(int32_t num
, UErrorCode
&errorCode
);
// Non-virtual const helpers returning UBool; declarations only.
// NOTE(review): names suggest lead/trail canonical-combining-class checks on the
// next/previous code point -- confirm against the implementation.
108 UBool
nextHasLccc() const;
109 UBool
previousHasTccc() const;
// NOTE(review): the numbered "*" prose lines in the rest of this class are the
// text of doc comments whose delimiters were lost; by position each group
// appears to document the declaration that follows it.
112 * Switches to forward checking if possible.
114 void switchToForward();
117 * Extends the FCD text segment forward or normalizes around pos.
118 * @return TRUE if success
120 UBool
nextSegment(UErrorCode
&errorCode
);
123 * Switches to backward checking.
125 void switchToBackward();
128 * Extends the FCD text segment backward or normalizes around pos.
129 * @return TRUE if success
131 UBool
previousSegment(UErrorCode
&errorCode
);
// Declaration only; takes a UnicodeString by const reference plus an error code
// and returns a UBool.
133 UBool
normalize(const UnicodeString
&s
, UErrorCode
&errorCode
);
// NOTE(review): the four prose paragraphs below appear to describe the possible
// values of the `state` member, one paragraph per value. Only CHECK_FWD is named
// in this excerpt (in the constructor); the enum itself is not visible.
137 * The input text [start..pos[ passes the FCD check.
138 * Moving forward checks incrementally.
139 * limit is undefined.
143 * The input text [pos..limit[ passes the FCD check.
144 * Moving backward checks incrementally.
145 * start is undefined.
149 * The input text [start..limit[ passes the FCD check.
150 * pos tracks the current text index.
154 * The input text [start..limit[ failed the FCD check and was normalized.
155 * pos tracks the current index in the normalized string.
// Reference to the normalizer implementation; bound to data->nfcImpl by the
// constructor. As a reference member it cannot be rebound afterwards.
165 const Normalizer2Impl
&nfcImpl
;
// UnicodeString member. The state prose above says pos indexes into "the
// normalized string" after a failed FCD check.
// NOTE(review): presumably that string is this member -- confirm.
166 UnicodeString normalized
;
171 #endif // !UCONFIG_NO_COLLATION
172 #endif // __UTF8COLLATIONITERATOR_H__