]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/utf8collationiterator.h
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / i18n / utf8collationiterator.h
CommitLineData
57a6839d
A
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf8collationiterator.h
*
* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
* created by: Markus W. Scherer
*/

12#ifndef __UTF8COLLATIONITERATOR_H__
13#define __UTF8COLLATIONITERATOR_H__
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_COLLATION
18
19#include "cmemory.h"
20#include "collation.h"
21#include "collationdata.h"
22#include "normalizer2impl.h"
23
24U_NAMESPACE_BEGIN
25
26/**
27 * UTF-8 collation element and character iterator.
28 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
29 * Unnormalized text is handled by a subclass.
30 */
31class U_I18N_API UTF8CollationIterator : public CollationIterator {
32public:
33 UTF8CollationIterator(const CollationData *d, UBool numeric,
34 const uint8_t *s, int32_t p, int32_t len)
35 : CollationIterator(d, numeric),
36 u8(s), pos(p), length(len) {}
37
38 virtual ~UTF8CollationIterator();
39
40 virtual void resetToOffset(int32_t newOffset);
41
42 virtual int32_t getOffset() const;
43
44 virtual UChar32 nextCodePoint(UErrorCode &errorCode);
45
46 virtual UChar32 previousCodePoint(UErrorCode &errorCode);
47
48protected:
49 /**
50 * For byte sequences that are illegal in UTF-8, an error value may be returned
51 * together with a bogus code point. The caller will ignore that code point.
52 *
53 * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
54 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
55 *
56 * Valid lead surrogates are returned from inside a normalized text segment,
57 * where handleGetTrailSurrogate() will return the matching trail surrogate.
58 */
59 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
60
61 virtual UBool foundNULTerminator();
62
63 virtual UBool forbidSurrogateCodePoints() const;
64
65 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
66
67 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
68
69 const uint8_t *u8;
70 int32_t pos;
71 int32_t length; // <0 for NUL-terminated strings
72};
73
74/**
75 * Incrementally checks the input text for FCD and normalizes where necessary.
76 */
77class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
78public:
79 FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
80 const uint8_t *s, int32_t p, int32_t len)
81 : UTF8CollationIterator(data, numeric, s, p, len),
82 state(CHECK_FWD), start(p),
83 nfcImpl(data->nfcImpl) {}
84
85 virtual ~FCDUTF8CollationIterator();
86
87 virtual void resetToOffset(int32_t newOffset);
88
89 virtual int32_t getOffset() const;
90
91 virtual UChar32 nextCodePoint(UErrorCode &errorCode);
92
93 virtual UChar32 previousCodePoint(UErrorCode &errorCode);
94
95protected:
96 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
97
98 virtual UChar handleGetTrailSurrogate();
99
100 virtual UBool foundNULTerminator();
101
102 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
103
104 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
105
106private:
107 UBool nextHasLccc() const;
108 UBool previousHasTccc() const;
109
110 /**
111 * Switches to forward checking if possible.
112 */
113 void switchToForward();
114
115 /**
116 * Extends the FCD text segment forward or normalizes around pos.
117 * @return TRUE if success
118 */
119 UBool nextSegment(UErrorCode &errorCode);
120
121 /**
122 * Switches to backward checking.
123 */
124 void switchToBackward();
125
126 /**
127 * Extends the FCD text segment backward or normalizes around pos.
128 * @return TRUE if success
129 */
130 UBool previousSegment(UErrorCode &errorCode);
131
132 UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
133
134 enum State {
135 /**
136 * The input text [start..pos[ passes the FCD check.
137 * Moving forward checks incrementally.
138 * limit is undefined.
139 */
140 CHECK_FWD,
141 /**
142 * The input text [pos..limit[ passes the FCD check.
143 * Moving backward checks incrementally.
144 * start is undefined.
145 */
146 CHECK_BWD,
147 /**
148 * The input text [start..limit[ passes the FCD check.
149 * pos tracks the current text index.
150 */
151 IN_FCD_SEGMENT,
152 /**
153 * The input text [start..limit[ failed the FCD check and was normalized.
154 * pos tracks the current index in the normalized string.
155 */
156 IN_NORMALIZED
157 };
158
159 State state;
160
161 int32_t start;
162 int32_t limit;
163
164 const Normalizer2Impl &nfcImpl;
165 UnicodeString normalized;
166};
167
168U_NAMESPACE_END
169
170#endif // !UCONFIG_NO_COLLATION
171#endif // __UTF8COLLATIONITERATOR_H__