]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/utf8collationiterator.h
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / utf8collationiterator.h
CommitLineData
/*
*******************************************************************************
* Copyright (C) 2012-2016, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf8collationiterator.h
*
* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
* created by: Markus W. Scherer
*/

12#ifndef __UTF8COLLATIONITERATOR_H__
13#define __UTF8COLLATIONITERATOR_H__
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_COLLATION
18
19#include "cmemory.h"
20#include "collation.h"
21#include "collationdata.h"
2ca993e8 22#include "collationiterator.h"
57a6839d
A
23#include "normalizer2impl.h"
24
25U_NAMESPACE_BEGIN
26
27/**
28 * UTF-8 collation element and character iterator.
29 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
30 * Unnormalized text is handled by a subclass.
31 */
32class U_I18N_API UTF8CollationIterator : public CollationIterator {
33public:
34 UTF8CollationIterator(const CollationData *d, UBool numeric,
35 const uint8_t *s, int32_t p, int32_t len)
36 : CollationIterator(d, numeric),
37 u8(s), pos(p), length(len) {}
38
39 virtual ~UTF8CollationIterator();
40
41 virtual void resetToOffset(int32_t newOffset);
42
43 virtual int32_t getOffset() const;
44
45 virtual UChar32 nextCodePoint(UErrorCode &errorCode);
46
47 virtual UChar32 previousCodePoint(UErrorCode &errorCode);
48
49protected:
50 /**
51 * For byte sequences that are illegal in UTF-8, an error value may be returned
52 * together with a bogus code point. The caller will ignore that code point.
53 *
54 * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
55 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
56 *
57 * Valid lead surrogates are returned from inside a normalized text segment,
58 * where handleGetTrailSurrogate() will return the matching trail surrogate.
59 */
60 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
61
62 virtual UBool foundNULTerminator();
63
64 virtual UBool forbidSurrogateCodePoints() const;
65
66 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
67
68 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
69
70 const uint8_t *u8;
71 int32_t pos;
72 int32_t length; // <0 for NUL-terminated strings
73};
74
75/**
76 * Incrementally checks the input text for FCD and normalizes where necessary.
77 */
78class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
79public:
80 FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
81 const uint8_t *s, int32_t p, int32_t len)
82 : UTF8CollationIterator(data, numeric, s, p, len),
83 state(CHECK_FWD), start(p),
84 nfcImpl(data->nfcImpl) {}
85
86 virtual ~FCDUTF8CollationIterator();
87
88 virtual void resetToOffset(int32_t newOffset);
89
90 virtual int32_t getOffset() const;
91
92 virtual UChar32 nextCodePoint(UErrorCode &errorCode);
93
94 virtual UChar32 previousCodePoint(UErrorCode &errorCode);
95
96protected:
97 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
98
99 virtual UChar handleGetTrailSurrogate();
100
101 virtual UBool foundNULTerminator();
102
103 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
104
105 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
106
107private:
108 UBool nextHasLccc() const;
109 UBool previousHasTccc() const;
110
111 /**
112 * Switches to forward checking if possible.
113 */
114 void switchToForward();
115
116 /**
117 * Extends the FCD text segment forward or normalizes around pos.
118 * @return TRUE if success
119 */
120 UBool nextSegment(UErrorCode &errorCode);
121
122 /**
123 * Switches to backward checking.
124 */
125 void switchToBackward();
126
127 /**
128 * Extends the FCD text segment backward or normalizes around pos.
129 * @return TRUE if success
130 */
131 UBool previousSegment(UErrorCode &errorCode);
132
133 UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
134
135 enum State {
136 /**
137 * The input text [start..pos[ passes the FCD check.
138 * Moving forward checks incrementally.
139 * limit is undefined.
140 */
141 CHECK_FWD,
142 /**
143 * The input text [pos..limit[ passes the FCD check.
144 * Moving backward checks incrementally.
145 * start is undefined.
146 */
147 CHECK_BWD,
148 /**
149 * The input text [start..limit[ passes the FCD check.
150 * pos tracks the current text index.
151 */
152 IN_FCD_SEGMENT,
153 /**
154 * The input text [start..limit[ failed the FCD check and was normalized.
155 * pos tracks the current index in the normalized string.
156 */
157 IN_NORMALIZED
158 };
159
160 State state;
161
162 int32_t start;
163 int32_t limit;
164
165 const Normalizer2Impl &nfcImpl;
166 UnicodeString normalized;
167};
168
169U_NAMESPACE_END
170
171#endif // !UCONFIG_NO_COLLATION
172#endif // __UTF8COLLATIONITERATOR_H__