]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/strmatch.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / strmatch.cpp
1 /*
2 **********************************************************************
3 * Copyright (c) 2001-2011, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 07/23/01 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "strmatch.h"
16 #include "rbt_data.h"
17 #include "util.h"
18 #include "unicode/uniset.h"
19 #include "unicode/utf16.h"
20
21 U_NAMESPACE_BEGIN
22
23 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
24
25 StringMatcher::StringMatcher(const UnicodeString& theString,
26 int32_t start,
27 int32_t limit,
28 int32_t segmentNum,
29 const TransliterationRuleData& theData) :
30 data(&theData),
31 segmentNumber(segmentNum),
32 matchStart(-1),
33 matchLimit(-1)
34 {
35 theString.extractBetween(start, limit, pattern);
36 }
37
38 StringMatcher::StringMatcher(const StringMatcher& o) :
39 UnicodeFunctor(o),
40 UnicodeMatcher(o),
41 UnicodeReplacer(o),
42 pattern(o.pattern),
43 data(o.data),
44 segmentNumber(o.segmentNumber),
45 matchStart(o.matchStart),
46 matchLimit(o.matchLimit)
47 {
48 }
49
50 /**
51 * Destructor
52 */
53 StringMatcher::~StringMatcher() {
54 }
55
56 /**
57 * Implement UnicodeFunctor
58 */
59 UnicodeFunctor* StringMatcher::clone() const {
60 return new StringMatcher(*this);
61 }
62
63 /**
64 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
65 * and return the pointer.
66 */
67 UnicodeMatcher* StringMatcher::toMatcher() const {
68 return (UnicodeMatcher*) this;
69 }
70
71 /**
72 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
73 * and return the pointer.
74 */
75 UnicodeReplacer* StringMatcher::toReplacer() const {
76 return (UnicodeReplacer*) this;
77 }
78
79 /**
80 * Implement UnicodeMatcher
81 */
82 UMatchDegree StringMatcher::matches(const Replaceable& text,
83 int32_t& offset,
84 int32_t limit,
85 UBool incremental) {
86 int32_t i;
87 int32_t cursor = offset;
88 if (limit < cursor) {
89 // Match in the reverse direction
90 for (i=pattern.length()-1; i>=0; --i) {
91 UChar keyChar = pattern.charAt(i);
92 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
93 if (subm == 0) {
94 if (cursor > limit &&
95 keyChar == text.charAt(cursor)) {
96 --cursor;
97 } else {
98 return U_MISMATCH;
99 }
100 } else {
101 UMatchDegree m =
102 subm->matches(text, cursor, limit, incremental);
103 if (m != U_MATCH) {
104 return m;
105 }
106 }
107 }
108 // Record the match position, but adjust for a normal
109 // forward start, limit, and only if a prior match does not
110 // exist -- we want the rightmost match.
111 if (matchStart < 0) {
112 matchStart = cursor+1;
113 matchLimit = offset+1;
114 }
115 } else {
116 for (i=0; i<pattern.length(); ++i) {
117 if (incremental && cursor == limit) {
118 // We've reached the context limit without a mismatch and
119 // without completing our match.
120 return U_PARTIAL_MATCH;
121 }
122 UChar keyChar = pattern.charAt(i);
123 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
124 if (subm == 0) {
125 // Don't need the cursor < limit check if
126 // incremental is TRUE (because it's done above); do need
127 // it otherwise.
128 if (cursor < limit &&
129 keyChar == text.charAt(cursor)) {
130 ++cursor;
131 } else {
132 return U_MISMATCH;
133 }
134 } else {
135 UMatchDegree m =
136 subm->matches(text, cursor, limit, incremental);
137 if (m != U_MATCH) {
138 return m;
139 }
140 }
141 }
142 // Record the match position
143 matchStart = offset;
144 matchLimit = cursor;
145 }
146
147 offset = cursor;
148 return U_MATCH;
149 }
150
151 /**
152 * Implement UnicodeMatcher
153 */
154 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
155 UBool escapeUnprintable) const
156 {
157 result.truncate(0);
158 UnicodeString str, quoteBuf;
159 if (segmentNumber > 0) {
160 result.append((UChar)40); /*(*/
161 }
162 for (int32_t i=0; i<pattern.length(); ++i) {
163 UChar keyChar = pattern.charAt(i);
164 const UnicodeMatcher* m = data->lookupMatcher(keyChar);
165 if (m == 0) {
166 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
167 } else {
168 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
169 TRUE, escapeUnprintable, quoteBuf);
170 }
171 }
172 if (segmentNumber > 0) {
173 result.append((UChar)41); /*)*/
174 }
175 // Flush quoteBuf out to result
176 ICU_Utility::appendToRule(result, -1,
177 TRUE, escapeUnprintable, quoteBuf);
178 return result;
179 }
180
181 /**
182 * Implement UnicodeMatcher
183 */
184 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
185 if (pattern.length() == 0) {
186 return TRUE;
187 }
188 UChar32 c = pattern.char32At(0);
189 const UnicodeMatcher *m = data->lookupMatcher(c);
190 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
191 }
192
193 /**
194 * Implement UnicodeMatcher
195 */
196 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
197 UChar32 ch;
198 for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
199 ch = pattern.char32At(i);
200 const UnicodeMatcher* matcher = data->lookupMatcher(ch);
201 if (matcher == NULL) {
202 toUnionTo.add(ch);
203 } else {
204 matcher->addMatchSetTo(toUnionTo);
205 }
206 }
207 }
208
209 /**
210 * UnicodeReplacer API
211 */
212 int32_t StringMatcher::replace(Replaceable& text,
213 int32_t start,
214 int32_t limit,
215 int32_t& /*cursor*/) {
216
217 int32_t outLen = 0;
218
219 // Copy segment with out-of-band data
220 int32_t dest = limit;
221 // If there was no match, that means that a quantifier
222 // matched zero-length. E.g., x (a)* y matched "xy".
223 if (matchStart >= 0) {
224 if (matchStart != matchLimit) {
225 text.copy(matchStart, matchLimit, dest);
226 outLen = matchLimit - matchStart;
227 }
228 }
229
230 text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
231
232 return outLen;
233 }
234
235 /**
236 * UnicodeReplacer API
237 */
238 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
239 UBool /*escapeUnprintable*/) const {
240 // assert(segmentNumber > 0);
241 rule.truncate(0);
242 rule.append((UChar)0x0024 /*$*/);
243 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
244 return rule;
245 }
246
247 /**
248 * Remove any match info. This must be called before performing a
249 * set of matches with this segment.
250 */
251 void StringMatcher::resetMatch() {
252 matchStart = matchLimit = -1;
253 }
254
255 /**
256 * Union the set of all characters that may output by this object
257 * into the given set.
258 * @param toUnionTo the set into which to union the output characters
259 */
260 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
261 // The output of this replacer varies; it is the source text between
262 // matchStart and matchLimit. Since this varies depending on the
263 // input text, we can't compute it here. We can either do nothing
264 // or we can add ALL characters to the set. It's probably more useful
265 // to do nothing.
266 }
267
268 /**
269 * Implement UnicodeFunctor
270 */
271 void StringMatcher::setData(const TransliterationRuleData* d) {
272 data = d;
273 int32_t i = 0;
274 while (i<pattern.length()) {
275 UChar32 c = pattern.char32At(i);
276 UnicodeFunctor* f = data->lookup(c);
277 if (f != NULL) {
278 f->setData(data);
279 }
280 i += U16_LENGTH(c);
281 }
282 }
283
284 U_NAMESPACE_END
285
286 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
287
288 //eof